From 02f99229a38ba8f1b76a02be812374acc37ebd44 Mon Sep 17 00:00:00 2001 From: smayer <simon.mayer@onb.ac.at> Date: Thu, 17 Oct 2024 10:41:58 +0000 Subject: [PATCH] Update Notebooks for TEI-XML changes --- Notebooks/String_matching.ipynb | 171 ++++++++++++++++++++--------- Notebooks/XML_Aufbereitung.ipynb | 180 +++++++++++++++++++++++++++---- 2 files changed, 281 insertions(+), 70 deletions(-) diff --git a/Notebooks/String_matching.ipynb b/Notebooks/String_matching.ipynb index bffe310..90e4af2 100644 --- a/Notebooks/String_matching.ipynb +++ b/Notebooks/String_matching.ipynb @@ -115,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 15, "id": "c0f4a42a-7e21-41e8-833c-2dd2f9d1985e", "metadata": { "tags": [] @@ -125,7 +125,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "3\n" + "2\n" ] }, { @@ -162,68 +162,56 @@ " </thead>\n", " <tbody>\n", " <tr>\n", - " <th>133</th>\n", - " <td>14.376</td>\n", - " <td>45</td>\n", - " <td>Theologia</td>\n", - " <td>Critici Sacri</td>\n", + " <th>8398</th>\n", + " <td>14.378</td>\n", + " <td>438</td>\n", + " <td>Paralipomena Historica</td>\n", + " <td>De Re Litterariâ & Bibliographicâ, cum Catalogis Bibliothecarum</td>\n", " <td>Quarto</td>\n", - " <td>21</td>\n", - " <td>14.376_045_00</td>\n", - " <td>Goësij (Willhelmi) Pilatus judex; cui accedunt¬ Theologi cujusdam in Pilatum judicem Stricturæ, cum ejusdem Goësij notis & animadversionibus.¬ Hagæ Comitis. 1677. Ioan. Tongerloo. n. 200.</td>\n", - " <td>Goesij Willhelmi Pilatus judex cui accedunt Theologi cujusdam in Pilatum judicem Stricturae cum ejusdem Goesij notis & animadversionibus Hagae Comitis 1677 Ioan Tongerloo n 200</td>\n", + " <td>1270</td>\n", + " <td>14.378_438_00</td>\n", + " <td>Le Iournal des Savans commencé en 1665 par M.r de Hédouville, & continué d'année en année jusques & compris 172 par des Anonymes. 64 Vol. Paris. 1665. & années Suivantes. n. 2144 Plus les deux derniers vol. del A. 1733, et 1734 n. 
2144.</td>\n", + " <td>Le Iournal des Savans commence en 1665 par Mr de Hedouville & continue d'annee en annee jusques & compris 172 par des Anonymes 64 Vol Paris 1665 & annees Suivantes n 2144 Plus les deux derniers vol del A 1733 et 1734 n 2144</td>\n", " </tr>\n", " <tr>\n", - " <th>338</th>\n", - " <td>14.376</td>\n", - " <td>85</td>\n", - " <td>Theologia</td>\n", - " <td>Sanctissimi Patres Latini</td>\n", + " <th>8446</th>\n", + " <td>14.378</td>\n", + " <td>444</td>\n", + " <td>Paralipomena Historica</td>\n", + " <td>De Re Litterariâ & Bibliographicâ, cum Catalogis Bibliothecarum</td>\n", " <td>Octavo und kleiner</td>\n", - " <td>61</td>\n", - " <td>14.376_085_04</td>\n", - " <td>S. Augustin de la veritable Religion et des moeurs de L'Eglise catholique trad. en Franç. (par M. Dubois) 8.° Paris. 1694. Louis Guerin. n. 200</td>\n", - " <td>S Augustin de la veritable Religion et des moeurs de L'Eglise catholique trad en Franç par M Dubois 8° Paris 1694 Louis Guerin n 200</td>\n", - " </tr>\n", - " <tr>\n", - " <th>930</th>\n", - " <td>14.376</td>\n", - " <td>180</td>\n", - " <td>Iurisprudentia</td>\n", - " <td>Ius Civile, Publicum, & Municipale</td>\n", - " <td>Folio</td>\n", - " <td>148</td>\n", - " <td>14.376_180_00</td>\n", - " <td>243 Sigonij (Car.) de antiquo Iure Populi Rom. Libri XI. nempè, de antiquo jure Civium Romanorum Libri II. de Iure antiquo Italiæ Libri III. de antiquo Iure¬ Provinciarum Libri III. ac de Iudiciis Libri III. Bononiæ 1574. Societas Typographorum. n. 200.</td>\n", - " <td>Sigonij Car de antiquo Iure Populi Rom Libri XI nempe de antiquo jure Civium Romanorum Libri II de Iure antiquo Italiae Libri III de antiquo Iure Provinciarum Libri III ac de Iudiciis Libri III Bononiae 1574 Societas Typographorum n 200</td>\n", + " <td>1276</td>\n", + " <td>14.378_444_00</td>\n", + " <td>Iournal Litteraire composé par un nombre de Savans 12.° 6 Vol. La Haye. 1713. & Suiv. Iohnson. n. 
2252.</td>\n", + " <td>Iournal Litteraire compose par un nombre de Savans 12° 6 Vol La Haye 1713 & Suiv Iohnson n 2252</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " volume page number category subcategory \n", - "133 14.376 45 Theologia Critici Sacri \\\n", - "338 14.376 85 Theologia Sanctissimi Patres Latini \n", - "930 14.376 180 Iurisprudentia Ius Civile, Publicum, & Municipale \n", + " volume page number category \n", + "8398 14.378 438 Paralipomena Historica \\\n", + "8446 14.378 444 Paralipomena Historica \n", + "\n", + " subcategory \n", + "8398 De Re Litterariâ & Bibliographicâ, cum Catalogis Bibliothecarum \\\n", + "8446 De Re Litterariâ & Bibliographicâ, cum Catalogis Bibliothecarum \n", "\n", - " format handwritten page number entry_ID \n", - "133 Quarto 21 14.376_045_00 \\\n", - "338 Octavo und kleiner 61 14.376_085_04 \n", - "930 Folio 148 14.376_180_00 \n", + " format handwritten page number entry_ID \n", + "8398 Quarto 1270 14.378_438_00 \\\n", + "8446 Octavo und kleiner 1276 14.378_444_00 \n", "\n", - " entry \n", - "133 Goësij (Willhelmi) Pilatus judex; cui accedunt¬ Theologi cujusdam in Pilatum judicem Stricturæ, cum ejusdem Goësij notis & animadversionibus.¬ Hagæ Comitis. 1677. Ioan. Tongerloo. n. 200. \\\n", - "338 S. Augustin de la veritable Religion et des moeurs de L'Eglise catholique trad. en Franç. (par M. Dubois) 8.° Paris. 1694. Louis Guerin. n. 200 \n", - "930 243 Sigonij (Car.) de antiquo Iure Populi Rom. Libri XI. nempè, de antiquo jure Civium Romanorum Libri II. de Iure antiquo Italiæ Libri III. de antiquo Iure¬ Provinciarum Libri III. ac de Iudiciis Libri III. Bononiæ 1574. Societas Typographorum. n. 200. \n", + " entry \n", + "8398 Le Iournal des Savans commencé en 1665 par M.r de Hédouville, & continué d'année en année jusques & compris 172 par des Anonymes. 64 Vol. Paris. 1665. & années Suivantes. n. 2144 Plus les deux derniers vol. del A. 1733, et 1734 n. 2144. 
\\\n", + "8446 Iournal Litteraire composé par un nombre de Savans 12.° 6 Vol. La Haye. 1713. & Suiv. Iohnson. n. 2252. \n", "\n", - " cleaned entry \n", - "133 Goesij Willhelmi Pilatus judex cui accedunt Theologi cujusdam in Pilatum judicem Stricturae cum ejusdem Goesij notis & animadversionibus Hagae Comitis 1677 Ioan Tongerloo n 200 \n", - "338 S Augustin de la veritable Religion et des moeurs de L'Eglise catholique trad en Franç par M Dubois 8° Paris 1694 Louis Guerin n 200 \n", - "930 Sigonij Car de antiquo Iure Populi Rom Libri XI nempe de antiquo jure Civium Romanorum Libri II de Iure antiquo Italiae Libri III de antiquo Iure Provinciarum Libri III ac de Iudiciis Libri III Bononiae 1574 Societas Typographorum n 200 " + " cleaned entry \n", + "8398 Le Iournal des Savans commence en 1665 par Mr de Hedouville & continue d'annee en annee jusques & compris 172 par des Anonymes 64 Vol Paris 1665 & annees Suivantes n 2144 Plus les deux derniers vol del A 1733 et 1734 n 2144 \n", + "8446 Iournal Litteraire compose par un nombre de Savans 12° 6 Vol La Haye 1713 & Suiv Iohnson n 2252 " ] }, - "execution_count": 8, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -232,11 +220,94 @@ "def search_in_entry(df, string):\n", " return df[df['cleaned entry'].str.contains(string)]\n", "\n", - "info = search_in_entry(search_in_entry(entry_df, ''), 'n 200\\Z')\n", + "info = search_in_entry(search_in_entry(entry_df, 'Savans'), 'Iournal')\n", "print(len(info))\n", "info" ] }, + { + "cell_type": "code", + "execution_count": 12, + "id": "97b7a8d5-154c-4b7a-8d9e-a6bb3247a3c7", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table 
border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>volume</th>\n", + " <th>page number</th>\n", + " <th>category</th>\n", + " <th>subcategory</th>\n", + " <th>format</th>\n", + " <th>handwritten page number</th>\n", + " <th>entry_ID</th>\n", + " <th>entry</th>\n", + " <th>cleaned entry</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>1620</th>\n", + " <td>14.376</td>\n", + " <td>280</td>\n", + " <td>Historia Naturalis</td>\n", + " <td>Plantarum, Arborum, Fruticum & Florum</td>\n", + " <td>Folio</td>\n", + " <td>240</td>\n", + " <td>14.376_280_05</td>\n", + " <td>451 Memoires pour l'Hist. des Plantes de L'acad. des Sciences. &c. vide Imagines Pynacothecæ Regiæ.</td>\n", + " <td>Memoires pour l'Hist des Plantes de L'acad des Sciences &c vide Imagines Pynacothecae Regiae</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " volume page number category \n", + "1620 14.376 280 Historia Naturalis \\\n", + "\n", + " subcategory format handwritten page number \n", + "1620 Plantarum, Arborum, Fruticum & Florum Folio 240 \\\n", + "\n", + " entry_ID \n", + "1620 14.376_280_05 \\\n", + "\n", + " entry \n", + "1620 451 Memoires pour l'Hist. des Plantes de L'acad. des Sciences. &c. vide Imagines Pynacothecæ Regiæ. 
\\\n", + "\n", + " cleaned entry \n", + "1620 Memoires pour l'Hist des Plantes de L'acad des Sciences &c vide Imagines Pynacothecae Regiae " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "entry_df[entry_df['entry_ID'] == '14.376_280_05']" + ] + }, { "cell_type": "code", "execution_count": 24, diff --git a/Notebooks/XML_Aufbereitung.ipynb b/Notebooks/XML_Aufbereitung.ipynb index 27a17b9..eb317fa 100644 --- a/Notebooks/XML_Aufbereitung.ipynb +++ b/Notebooks/XML_Aufbereitung.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 88, "id": "5b24e324-6659-482d-8d82-39c1d604f0d3", "metadata": { "tags": [] @@ -20,6 +20,7 @@ "import requests\n", "import json\n", "import pandas as pd\n", + "import lxml.etree as et\n", "# from scipy.optimize import curve_fit\n", "# import pandas as pd" ] @@ -2483,7 +2484,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 49, "id": "ae4d5368-bda2-4cdf-a170-f7e0eda103c3", "metadata": { "tags": [] @@ -2517,7 +2518,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 50, "id": "b73227d4-a0f1-4113-ad2c-093507aaf3a4", "metadata": { "tags": [] @@ -2562,7 +2563,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 51, "id": "ebb4894b-76db-4146-b62a-86c30f2f8609", "metadata": { "tags": [] @@ -2582,7 +2583,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 52, "id": "ba37d08d-f064-4ba0-aaf5-d33c9893bb1c", "metadata": { "tags": [] @@ -2619,7 +2620,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 53, "id": "629a49b3-75c7-4e3d-b3aa-5c6165c32e06", "metadata": { "tags": [] @@ -2642,7 +2643,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 54, "id": "ec90629a-8afb-4a65-b6bb-782830fd8bcf", "metadata": { "tags": [] @@ -2699,7 +2700,7 @@ }, { "cell_type": "code", - "execution_count": 24, + 
"execution_count": 55, "id": "bd52a621-e46c-49d8-a5c5-a243ee85fba4", "metadata": { "tags": [] @@ -2724,7 +2725,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 56, "id": "a4a7bc48-ec82-4b67-b0dd-055d27dd008e", "metadata": { "tags": [] @@ -2739,7 +2740,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 57, "id": "710a147b-ea34-47ca-a381-5b477d8f12ab", "metadata": { "tags": [] @@ -2756,7 +2757,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 58, "id": "e7e81cbf-e473-4ae4-b846-2aea6b2e5c81", "metadata": { "tags": [] @@ -2781,7 +2782,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 59, "id": "de097669-9180-4510-a9ac-d215433a7fca", "metadata": { "tags": [] @@ -2804,7 +2805,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 60, "id": "4687407c-d205-4aab-8c86-fafa73b997b9", "metadata": { "tags": [] @@ -2832,7 +2833,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 61, "id": "32240f6b-1ecc-47d1-8f2d-0cd14f899b3d", "metadata": { "tags": [] @@ -2859,7 +2860,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 62, "id": "55b131a0-58aa-4eb1-ba47-92b3bb46ffaa", "metadata": { "tags": [] @@ -2895,7 +2896,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 63, "id": "a50d2147-4ee5-4d6b-a950-b4aca1f5e5ca", "metadata": { "tags": [] @@ -2972,8 +2973,7 @@ "</change>\n", "''', \"lxml-xml\")\n", " listChange = tei[key].listChange\n", - " listChange.append(change)\n", - " print(listChange)" + " listChange.append(change)" ] }, { @@ -2986,7 +2986,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 66, "id": "ee793bf1-5a43-4715-8b2b-b5780d6fd8b3", "metadata": { "tags": [] @@ -3001,10 +3001,150 @@ " tei_xml_output.write(tei_3.prettify(formatter='minimal'))" ] }, + { + "cell_type": "markdown", + "id": "f92f803b-a4d6-473a-955a-d8c174fc72ad", + "metadata": {}, + 
"source": [ + "## Fix additional line breaks originating from stripped tags" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "092825b9-9755-4658-8fd5-2183926ed981", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "with open('../../digital-edition/Step2_Matching/Cod. 14.376_tei.xml', 'rb') as tei_xml:\n", + " xml_1 = tei_xml.read()\n", + "with open('../../digital-edition/Step2_Matching/Cod. 14.377_tei.xml', 'rb') as tei_xml:\n", + " xml_2 = tei_xml.read()\n", + "with open('../../digital-edition/Step2_Matching/Cod. 14.378_tei.xml', 'rb') as tei_xml:\n", + " xml_3 = tei_xml.read()\n", + "\n", + "xmls = [xml_1, xml_2, xml_3]\n", + "\n", + "str_xsl = '''<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">\n", + " <xsl:output indent=\"yes\"/>\n", + " <xsl:strip-space elements=\"*\"/>\n", + "\n", + " <!-- IDENTITY TRANSFORM -->\n", + " <xsl:template match=\"@*|node()\">\n", + " <xsl:copy>\n", + " <xsl:apply-templates select=\"@*|node()\"/>\n", + " </xsl:copy>\n", + " </xsl:template>\n", + "\n", + " <!-- RUN normalize-space() ON ALL TEXT NODES -->\n", + " <xsl:template match=\"text()\">\n", + " <xsl:copy-of select=\"normalize-space()\"/>\n", + " </xsl:template> \n", + "</xsl:stylesheet>\n", + "'''\n", + "\n", + "for i, xml in enumerate(xmls):\n", + " str_xml = xml\n", + " doc = et.fromstring(str_xml)\n", + " style = et.fromstring(str_xsl)\n", + " transformer = et.XSLT(style)\n", + " result = transformer(doc)\n", + " str_res = str(result)\n", + " # str_res = re.sub('(?<=[>\\S])<idno', ' <idno', str_res)\n", + " str_res = re.sub('\"n_signature\">', '\"n_signature\"> ', str_res)\n", + " # str_res = re.sub('(?<=[>\\S])<hi', ' <hi', str_res)\n", + " with open(f'../../digital-edition/Step3_Header/Cod. 
14.37{i+6}_tei.xml', 'w') as f:\n", + " f.write(str_res)" + ] + }, + { + "cell_type": "markdown", + "id": "636031ca-c748-478d-8a26-6175a9fd004e", + "metadata": {}, + "source": [ + "## Add `<idno type=\"PID\">o:bed.cod-14.3XX</idno>` and `<msDesc>` to header" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "id": "63477b16-cda4-4eea-a27d-4ec0c5ceea6c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "with open('../../digital-edition/Step3_Header/Cod. 14.376_tei.xml', 'rb') as tei_xml:\n", + " xml_1 = tei_xml.read()\n", + "with open('../../digital-edition/Step3_Header/Cod. 14.377_tei.xml', 'rb') as tei_xml:\n", + " xml_2 = tei_xml.read()\n", + "with open('../../digital-edition/Step3_Header/Cod. 14.378_tei.xml', 'rb') as tei_xml:\n", + " xml_3 = tei_xml.read()\n", + "\n", + "xmls = [xml_1, xml_2, xml_3]\n", + "\n", + "ns = {\n", + " 'xml': 'http://www.tei-c.org/ns/1.0'\n", + "}\n", + "\n", + "msDescIds = [\n", + " {\n", + " 'AC': 'AC13948858',\n", + " 'REPO': '131ABCFD',\n", + " 'DOD': '51202'\n", + " },\n", + " {\n", + " 'AC': 'AC13956022',\n", + " 'REPO': '131ABD07',\n", + " 'DOD': '51184'\n", + " },\n", + " {\n", + " 'AC': 'AC13956023',\n", + " 'REPO': '131ABD79',\n", + " 'DOD': '51219'\n", + " }\n", + "]\n", + "\n", + "for i, xml in enumerate(xmls):\n", + " doc = et.XML(xml)\n", + " pubStmt = doc.find('.//xml:publicationStmt', namespaces=ns)\n", + " pubStmt.append(et.XML(f'<idno type=\"PID\">o:bed.cod-14.37{i+6}</idno>', base_url=ns['xml']))\n", + " \n", + " title = doc.find('.//xml:title[@type=\"main\"]', namespaces=ns)\n", + " title.text = f'Catalogus Librorum Bibliothecae [...] Principis Eugenii [...] 
Tomus {\"I\"*(i+1)}'\n", + " \n", + " msDesc = et.XML(f'''\n", + " <msDesc>\n", + " <msIdentifier>\n", + " <settlement>Wien</settlement>\n", + " <repository>Österreichische Nationalbibliothek</repository>\n", + " <collection type=\"main\">Sammlung von Handschriften und alten Drucken</collection>\n", + " <idno type=\"sn\">Signatur: Cod. 1437{i+6} HAN MAG</idno>\n", + " <altIdentifier>\n", + " <idno type=\"AC\">{msDescIds[i]['AC']}</idno>\n", + " </altIdentifier>\n", + " <altIdentifier>\n", + " <idno type=\"REPO\">{msDescIds[i]['REPO']}</idno>\n", + " </altIdentifier>\n", + " <altIdentifier>\n", + " <idno type=\"DOD\">{msDescIds[i]['DOD']}</idno>\n", + " </altIdentifier>\n", + " </msIdentifier>\n", + " </msDesc>\n", + "''', base_url=ns['xml'])\n", + " sourceDesc = doc.find('.//xml:sourceDesc', namespaces=ns)\n", + " sourceDesc.append(msDesc)\n", + " \n", + " with open(f'../../digital-edition/Step3_Header/Cod. 14.37{i+6}_tei.xml', 'wb') as f:\n", + " f.write(et.tostring(doc, encoding='utf-8', xml_declaration=True))" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "f7d6908c-6008-49f7-b854-3b3f66c34986", + "id": "5850f6ad-c2af-40f5-b4f6-0e1a3329d6d2", "metadata": {}, "outputs": [], "source": [] -- GitLab