diff --git a/Notebooks/String_matching.ipynb b/Notebooks/String_matching.ipynb index bffe310dc69e2fcd762b30381f16e9435f5025b9..90e4af23551082b18b921af02b89403c4b09414b 100644 --- a/Notebooks/String_matching.ipynb +++ b/Notebooks/String_matching.ipynb @@ -115,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 15, "id": "c0f4a42a-7e21-41e8-833c-2dd2f9d1985e", "metadata": { "tags": [] @@ -125,7 +125,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "3\n" + "2\n" ] }, { @@ -162,68 +162,56 @@ " </thead>\n", " <tbody>\n", " <tr>\n", - " <th>133</th>\n", - " <td>14.376</td>\n", - " <td>45</td>\n", - " <td>Theologia</td>\n", - " <td>Critici Sacri</td>\n", + " <th>8398</th>\n", + " <td>14.378</td>\n", + " <td>438</td>\n", + " <td>Paralipomena Historica</td>\n", + " <td>De Re Litterariâ & Bibliographicâ, cum Catalogis Bibliothecarum</td>\n", " <td>Quarto</td>\n", - " <td>21</td>\n", - " <td>14.376_045_00</td>\n", - " <td>Goësij (Willhelmi) Pilatus judex; cui accedunt¬ Theologi cujusdam in Pilatum judicem Stricturæ, cum ejusdem Goësij notis & animadversionibus.¬ Hagæ Comitis. 1677. Ioan. Tongerloo. n. 200.</td>\n", - " <td>Goesij Willhelmi Pilatus judex cui accedunt Theologi cujusdam in Pilatum judicem Stricturae cum ejusdem Goesij notis & animadversionibus Hagae Comitis 1677 Ioan Tongerloo n 200</td>\n", + " <td>1270</td>\n", + " <td>14.378_438_00</td>\n", + " <td>Le Iournal des Savans commencé en 1665 par M.r de Hédouville, & continué d'année en année jusques & compris 172 par des Anonymes. 64 Vol. Paris. 1665. & années Suivantes. n. 2144 Plus les deux derniers vol. del A. 1733, et 1734 n. 
2144.</td>\n", + " <td>Le Iournal des Savans commence en 1665 par Mr de Hedouville & continue d'annee en annee jusques & compris 172 par des Anonymes 64 Vol Paris 1665 & annees Suivantes n 2144 Plus les deux derniers vol del A 1733 et 1734 n 2144</td>\n", " </tr>\n", " <tr>\n", - " <th>338</th>\n", - " <td>14.376</td>\n", - " <td>85</td>\n", - " <td>Theologia</td>\n", - " <td>Sanctissimi Patres Latini</td>\n", + " <th>8446</th>\n", + " <td>14.378</td>\n", + " <td>444</td>\n", + " <td>Paralipomena Historica</td>\n", + " <td>De Re Litterariâ & Bibliographicâ, cum Catalogis Bibliothecarum</td>\n", " <td>Octavo und kleiner</td>\n", - " <td>61</td>\n", - " <td>14.376_085_04</td>\n", - " <td>S. Augustin de la veritable Religion et des moeurs de L'Eglise catholique trad. en Franç. (par M. Dubois) 8.° Paris. 1694. Louis Guerin. n. 200</td>\n", - " <td>S Augustin de la veritable Religion et des moeurs de L'Eglise catholique trad en Franç par M Dubois 8° Paris 1694 Louis Guerin n 200</td>\n", - " </tr>\n", - " <tr>\n", - " <th>930</th>\n", - " <td>14.376</td>\n", - " <td>180</td>\n", - " <td>Iurisprudentia</td>\n", - " <td>Ius Civile, Publicum, & Municipale</td>\n", - " <td>Folio</td>\n", - " <td>148</td>\n", - " <td>14.376_180_00</td>\n", - " <td>243 Sigonij (Car.) de antiquo Iure Populi Rom. Libri XI. nempè, de antiquo jure Civium Romanorum Libri II. de Iure antiquo Italiæ Libri III. de antiquo Iure¬ Provinciarum Libri III. ac de Iudiciis Libri III. Bononiæ 1574. Societas Typographorum. n. 200.</td>\n", - " <td>Sigonij Car de antiquo Iure Populi Rom Libri XI nempe de antiquo jure Civium Romanorum Libri II de Iure antiquo Italiae Libri III de antiquo Iure Provinciarum Libri III ac de Iudiciis Libri III Bononiae 1574 Societas Typographorum n 200</td>\n", + " <td>1276</td>\n", + " <td>14.378_444_00</td>\n", + " <td>Iournal Litteraire composé par un nombre de Savans 12.° 6 Vol. La Haye. 1713. & Suiv. Iohnson. n. 
2252.</td>\n", + " <td>Iournal Litteraire compose par un nombre de Savans 12° 6 Vol La Haye 1713 & Suiv Iohnson n 2252</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " volume page number category subcategory \n", - "133 14.376 45 Theologia Critici Sacri \\\n", - "338 14.376 85 Theologia Sanctissimi Patres Latini \n", - "930 14.376 180 Iurisprudentia Ius Civile, Publicum, & Municipale \n", + " volume page number category \n", + "8398 14.378 438 Paralipomena Historica \\\n", + "8446 14.378 444 Paralipomena Historica \n", + "\n", + " subcategory \n", + "8398 De Re Litterariâ & Bibliographicâ, cum Catalogis Bibliothecarum \\\n", + "8446 De Re Litterariâ & Bibliographicâ, cum Catalogis Bibliothecarum \n", "\n", - " format handwritten page number entry_ID \n", - "133 Quarto 21 14.376_045_00 \\\n", - "338 Octavo und kleiner 61 14.376_085_04 \n", - "930 Folio 148 14.376_180_00 \n", + " format handwritten page number entry_ID \n", + "8398 Quarto 1270 14.378_438_00 \\\n", + "8446 Octavo und kleiner 1276 14.378_444_00 \n", "\n", - " entry \n", - "133 Goësij (Willhelmi) Pilatus judex; cui accedunt¬ Theologi cujusdam in Pilatum judicem Stricturæ, cum ejusdem Goësij notis & animadversionibus.¬ Hagæ Comitis. 1677. Ioan. Tongerloo. n. 200. \\\n", - "338 S. Augustin de la veritable Religion et des moeurs de L'Eglise catholique trad. en Franç. (par M. Dubois) 8.° Paris. 1694. Louis Guerin. n. 200 \n", - "930 243 Sigonij (Car.) de antiquo Iure Populi Rom. Libri XI. nempè, de antiquo jure Civium Romanorum Libri II. de Iure antiquo Italiæ Libri III. de antiquo Iure¬ Provinciarum Libri III. ac de Iudiciis Libri III. Bononiæ 1574. Societas Typographorum. n. 200. \n", + " entry \n", + "8398 Le Iournal des Savans commencé en 1665 par M.r de Hédouville, & continué d'année en année jusques & compris 172 par des Anonymes. 64 Vol. Paris. 1665. & années Suivantes. n. 2144 Plus les deux derniers vol. del A. 1733, et 1734 n. 2144. 
\\\n", + "8446 Iournal Litteraire composé par un nombre de Savans 12.° 6 Vol. La Haye. 1713. & Suiv. Iohnson. n. 2252. \n", "\n", - " cleaned entry \n", - "133 Goesij Willhelmi Pilatus judex cui accedunt Theologi cujusdam in Pilatum judicem Stricturae cum ejusdem Goesij notis & animadversionibus Hagae Comitis 1677 Ioan Tongerloo n 200 \n", - "338 S Augustin de la veritable Religion et des moeurs de L'Eglise catholique trad en Franç par M Dubois 8° Paris 1694 Louis Guerin n 200 \n", - "930 Sigonij Car de antiquo Iure Populi Rom Libri XI nempe de antiquo jure Civium Romanorum Libri II de Iure antiquo Italiae Libri III de antiquo Iure Provinciarum Libri III ac de Iudiciis Libri III Bononiae 1574 Societas Typographorum n 200 " + " cleaned entry \n", + "8398 Le Iournal des Savans commence en 1665 par Mr de Hedouville & continue d'annee en annee jusques & compris 172 par des Anonymes 64 Vol Paris 1665 & annees Suivantes n 2144 Plus les deux derniers vol del A 1733 et 1734 n 2144 \n", + "8446 Iournal Litteraire compose par un nombre de Savans 12° 6 Vol La Haye 1713 & Suiv Iohnson n 2252 " ] }, - "execution_count": 8, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -232,11 +220,94 @@ "def search_in_entry(df, string):\n", " return df[df['cleaned entry'].str.contains(string)]\n", "\n", - "info = search_in_entry(search_in_entry(entry_df, ''), 'n 200\\Z')\n", + "info = search_in_entry(search_in_entry(entry_df, 'Savans'), 'Iournal')\n", "print(len(info))\n", "info" ] }, + { + "cell_type": "code", + "execution_count": 12, + "id": "97b7a8d5-154c-4b7a-8d9e-a6bb3247a3c7", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table 
border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>volume</th>\n", + " <th>page number</th>\n", + " <th>category</th>\n", + " <th>subcategory</th>\n", + " <th>format</th>\n", + " <th>handwritten page number</th>\n", + " <th>entry_ID</th>\n", + " <th>entry</th>\n", + " <th>cleaned entry</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1620</th>\n", + " <td>14.376</td>\n", + " <td>280</td>\n", + " <td>Historia Naturalis</td>\n", + " <td>Plantarum, Arborum, Fruticum & Florum</td>\n", + " <td>Folio</td>\n", + " <td>240</td>\n", + " <td>14.376_280_05</td>\n", + " <td>451 Memoires pour l'Hist. des Plantes de L'acad. des Sciences. &c. vide Imagines Pynacothecæ Regiæ.</td>\n", + " <td>Memoires pour l'Hist des Plantes de L'acad des Sciences &c vide Imagines Pynacothecae Regiae</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume page number category \n", + "1620 14.376 280 Historia Naturalis \\\n", + "\n", + " subcategory format handwritten page number \n", + "1620 Plantarum, Arborum, Fruticum & Florum Folio 240 \\\n", + "\n", + " entry_ID \n", + "1620 14.376_280_05 \\\n", + "\n", + " entry \n", + "1620 451 Memoires pour l'Hist. des Plantes de L'acad. des Sciences. &c. vide Imagines Pynacothecæ Regiæ. 
\\\n", + "\n", + " cleaned entry \n", + "1620 Memoires pour l'Hist des Plantes de L'acad des Sciences &c vide Imagines Pynacothecae Regiae " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "entry_df[entry_df['entry_ID'] == '14.376_280_05']" + ] + }, { "cell_type": "code", "execution_count": 24, diff --git a/Notebooks/XML_Aufbereitung.ipynb b/Notebooks/XML_Aufbereitung.ipynb index 27a17b9af590f57be0f4fcdf1a83f53b5d838c90..eb317fa26e0a7f630b14b5c9065f86c0a67ce35f 100644 --- a/Notebooks/XML_Aufbereitung.ipynb +++ b/Notebooks/XML_Aufbereitung.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 88, "id": "5b24e324-6659-482d-8d82-39c1d604f0d3", "metadata": { "tags": [] @@ -20,6 +20,7 @@ "import requests\n", "import json\n", "import pandas as pd\n", + "import lxml.etree as et\n", "# from scipy.optimize import curve_fit\n", "# import pandas as pd" ] @@ -2483,7 +2484,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 49, "id": "ae4d5368-bda2-4cdf-a170-f7e0eda103c3", "metadata": { "tags": [] @@ -2517,7 +2518,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 50, "id": "b73227d4-a0f1-4113-ad2c-093507aaf3a4", "metadata": { "tags": [] @@ -2562,7 +2563,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 51, "id": "ebb4894b-76db-4146-b62a-86c30f2f8609", "metadata": { "tags": [] @@ -2582,7 +2583,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 52, "id": "ba37d08d-f064-4ba0-aaf5-d33c9893bb1c", "metadata": { "tags": [] @@ -2619,7 +2620,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 53, "id": "629a49b3-75c7-4e3d-b3aa-5c6165c32e06", "metadata": { "tags": [] @@ -2642,7 +2643,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 54, "id": "ec90629a-8afb-4a65-b6bb-782830fd8bcf", "metadata": { "tags": [] @@ -2699,7 +2700,7 
@@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 55, "id": "bd52a621-e46c-49d8-a5c5-a243ee85fba4", "metadata": { "tags": [] @@ -2724,7 +2725,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 56, "id": "a4a7bc48-ec82-4b67-b0dd-055d27dd008e", "metadata": { "tags": [] @@ -2739,7 +2740,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 57, "id": "710a147b-ea34-47ca-a381-5b477d8f12ab", "metadata": { "tags": [] @@ -2756,7 +2757,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 58, "id": "e7e81cbf-e473-4ae4-b846-2aea6b2e5c81", "metadata": { "tags": [] @@ -2781,7 +2782,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 59, "id": "de097669-9180-4510-a9ac-d215433a7fca", "metadata": { "tags": [] @@ -2804,7 +2805,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 60, "id": "4687407c-d205-4aab-8c86-fafa73b997b9", "metadata": { "tags": [] @@ -2832,7 +2833,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 61, "id": "32240f6b-1ecc-47d1-8f2d-0cd14f899b3d", "metadata": { "tags": [] @@ -2859,7 +2860,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 62, "id": "55b131a0-58aa-4eb1-ba47-92b3bb46ffaa", "metadata": { "tags": [] @@ -2895,7 +2896,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 63, "id": "a50d2147-4ee5-4d6b-a950-b4aca1f5e5ca", "metadata": { "tags": [] @@ -2972,8 +2973,7 @@ "</change>\n", "''', \"lxml-xml\")\n", " listChange = tei[key].listChange\n", - " listChange.append(change)\n", - " print(listChange)" + " listChange.append(change)" ] }, { @@ -2986,7 +2986,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 66, "id": "ee793bf1-5a43-4715-8b2b-b5780d6fd8b3", "metadata": { "tags": [] @@ -3001,10 +3001,150 @@ " tei_xml_output.write(tei_3.prettify(formatter='minimal'))" ] }, + { + "cell_type": "markdown", + "id": 
"f92f803b-a4d6-473a-955a-d8c174fc72ad", + "metadata": {}, + "source": [ + "## Fix additional line breaks originating from stripped tags" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "092825b9-9755-4658-8fd5-2183926ed981", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "with open('../../digital-edition/Step2_Matching/Cod. 14.376_tei.xml', 'rb') as tei_xml:\n", + " xml_1 = tei_xml.read()\n", + "with open('../../digital-edition/Step2_Matching/Cod. 14.377_tei.xml', 'rb') as tei_xml:\n", + " xml_2 = tei_xml.read()\n", + "with open('../../digital-edition/Step2_Matching/Cod. 14.378_tei.xml', 'rb') as tei_xml:\n", + " xml_3 = tei_xml.read()\n", + "\n", + "xmls = [xml_1, xml_2, xml_3]\n", + "\n", + "str_xsl = '''<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">\n", + " <xsl:output indent=\"yes\"/>\n", + " <xsl:strip-space elements=\"*\"/>\n", + "\n", + " <!-- IDENTITY TRANSFORM -->\n", + " <xsl:template match=\"@*|node()\">\n", + " <xsl:copy>\n", + " <xsl:apply-templates select=\"@*|node()\"/>\n", + " </xsl:copy>\n", + " </xsl:template>\n", + "\n", + " <!-- RUN normalize-space() ON ALL TEXT NODES -->\n", + " <xsl:template match=\"text()\">\n", + " <xsl:copy-of select=\"normalize-space()\"/>\n", + " </xsl:template> \n", + "</xsl:stylesheet>\n", + "'''\n", + "\n", + "for i, xml in enumerate(xmls):\n", + " str_xml = xml\n", + " doc = et.fromstring(str_xml)\n", + " style = et.fromstring(str_xsl)\n", + " transformer = et.XSLT(style)\n", + " result = transformer(doc)\n", + " str_res = str(result)\n", + " # str_res = re.sub('(?<=[>\\S])<idno', ' <idno', str_res)\n", + " str_res = re.sub('\"n_signature\">', '\"n_signature\"> ', str_res)\n", + " # str_res = re.sub('(?<=[>\\S])<hi', ' <hi', str_res)\n", + " with open(f'../../digital-edition/Step3_Header/Cod. 
14.37{i+6}_tei.xml', 'w') as f:\n", + " f.write(str_res)" + ] + }, + { + "cell_type": "markdown", + "id": "636031ca-c748-478d-8a26-6175a9fd004e", + "metadata": {}, + "source": [ + "## Add `<idno type=\"PID\">o:bed.cod-14.3XX</idno>` and `<msDesc>` to header" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "id": "63477b16-cda4-4eea-a27d-4ec0c5ceea6c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "with open('../../digital-edition/Step3_Header/Cod. 14.376_tei.xml', 'rb') as tei_xml:\n", + " xml_1 = tei_xml.read()\n", + "with open('../../digital-edition/Step3_Header/Cod. 14.377_tei.xml', 'rb') as tei_xml:\n", + " xml_2 = tei_xml.read()\n", + "with open('../../digital-edition/Step3_Header/Cod. 14.378_tei.xml', 'rb') as tei_xml:\n", + " xml_3 = tei_xml.read()\n", + "\n", + "xmls = [xml_1, xml_2, xml_3]\n", + "\n", + "ns = {\n", + " 'xml': 'http://www.tei-c.org/ns/1.0'\n", + "}\n", + "\n", + "msDescIds = [\n", + " {\n", + " 'AC': 'AC13948858',\n", + " 'REPO': '131ABCFD',\n", + " 'DOD': '51202'\n", + " },\n", + " {\n", + " 'AC': 'AC13956022',\n", + " 'REPO': '131ABD07',\n", + " 'DOD': '51184'\n", + " },\n", + " {\n", + " 'AC': 'AC13956023',\n", + " 'REPO': '131ABD79',\n", + " 'DOD': '51219'\n", + " }\n", + "]\n", + "\n", + "for i, xml in enumerate(xmls):\n", + " doc = et.XML(xml)\n", + " pubStmt = doc.find('.//xml:publicationStmt', namespaces=ns)\n", + " pubStmt.append(et.XML(f'<idno type=\"PID\">o:bed.cod-14.37{i+6}</idno>', base_url=ns['xml']))\n", + " \n", + " title = doc.find('.//xml:title[@type=\"main\"]', namespaces=ns)\n", + " title.text = f'Catalogus Librorum Bibliothecae [...] Principis Eugenii [...] 
Tomus {\"I\"*(i+1)}'\n", + " \n", + " msDesc = et.XML(f'''\n", + " <msDesc>\n", + " <msIdentifier>\n", + " <settlement>Wien</settlement>\n", + " <repository>Österreichische Nationalbibliothek</repository>\n", + " <collection type=\"main\">Sammlung von Handschriften und alten Drucken</collection>\n", + " <idno type=\"sn\">Signatur: Cod. 1437{i+6} HAN MAG</idno>\n", + " <altIdentifier>\n", + " <idno type=\"AC\">{msDescIds[i]['AC']}</idno>\n", + " </altIdentifier>\n", + " <altIdentifier>\n", + " <idno type=\"REPO\">{msDescIds[i]['REPO']}</idno>\n", + " </altIdentifier>\n", + " <altIdentifier>\n", + " <idno type=\"DOD\">{msDescIds[i]['DOD']}</idno>\n", + " </altIdentifier>\n", + " </msIdentifier>\n", + " </msDesc>\n", + "''', base_url=ns['xml'])\n", + " sourceDesc = doc.find('.//xml:sourceDesc', namespaces=ns)\n", + " sourceDesc.append(msDesc)\n", + " \n", + " with open(f'../../digital-edition/Step3_Header/Cod. 14.37{i+6}_tei.xml', 'wb') as f:\n", + " f.write(et.tostring(doc, encoding='utf-8', xml_declaration=True))" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "f7d6908c-6008-49f7-b854-3b3f66c34986", + "id": "5850f6ad-c2af-40f5-b4f6-0e1a3329d6d2", "metadata": {}, "outputs": [], "source": []