diff --git a/Notebooks/Completing_BE_data.ipynb b/Notebooks/Completing_BE_data.ipynb index 8a66ccc2ff3e08ab034d3609bb7dd5330c2d2c35..54daadf3725cb558d781069660d5f638c57e55cf 100644 --- a/Notebooks/Completing_BE_data.ipynb +++ b/Notebooks/Completing_BE_data.ipynb @@ -925,7 +925,7 @@ }, { "cell_type": "code", - "execution_count": 272, + "execution_count": 276, "id": "b9ee6630-7122-44d9-9d2f-c8113e5a8e97", "metadata": { "tags": [] @@ -940,7 +940,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 272, + "execution_count": 276, "metadata": {}, "output_type": "execute_result" } @@ -951,7 +951,7 @@ }, { "cell_type": "code", - "execution_count": 273, + "execution_count": 277, "id": "175c1147-0063-45e4-8498-39ee390ea7a0", "metadata": { "tags": [] @@ -967,7 +967,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 273, + "execution_count": 277, "metadata": {}, "output_type": "execute_result" } @@ -990,68 +990,497 @@ }, { "cell_type": "code", - "execution_count": 274, - "id": "dea71f92-ee6d-474c-b655-6936a86a4586", - "metadata": { - "tags": [] - }, + "execution_count": 278, + "id": "bd73ed76-756a-4a6e-b8f6-183ebfe33ae4", + "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Signatur</th>\n", + " <th>Barcode</th>\n", + " <th>Titel</th>\n", + " <th>Autor</th>\n", + " <th>Mitwirkender</th>\n", + " <th>Anfang Veröffentlichungsdatum</th>\n", + " <th>Ende Veröffentlichungsdatum</th>\n", + " <th>Veröffentlichungsdatum</th>\n", + " <th>Veröffentlichungsort</th>\n", + " <th>Veröffentlichungsort (normiert)</th>\n", + " <th>...</th>\n", + " 
<th>Wissensunterklasse</th>\n", + " <th>Formatangabe</th>\n", + " <th>hs. Katalogseite Handschrift</th>\n", + " <th>hs. Katalogeintrag ID</th>\n", + " <th>hs. Katalogeintrag</th>\n", + " <th>hs. Katalog Image URL</th>\n", + " <th>dup_title</th>\n", + " <th>copy_from</th>\n", + " <th>Einfache Klassifizierung</th>\n", + " <th>Komplexe Klassifizierung</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>23376</th>\n", + " <td>*28.A.79.(Vol.1)</td>\n", + " <td>Z222907107</td>\n", + " <td>Histoire des ouvrages des scavans</td>\n", + " <td>Basnage de Beauval, Henri</td>\n", + " <td>NaN</td>\n", + " <td>1687.0</td>\n", + " <td>1709.0</td>\n", + " <td>1687-1709</td>\n", + " <td>Rotterdam</td>\n", + " <td>Rotterdam</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " <td>-1.0</td>\n", + " <td>True</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23383</th>\n", + " <td>*28.A.79.(Vol.10)</td>\n", + " <td>Z222908100</td>\n", + " <td>Histoire des ouvrages des scavans</td>\n", + " <td>Basnage de Beauval, Henri</td>\n", + " <td>NaN</td>\n", + " <td>1687.0</td>\n", + " <td>1709.0</td>\n", + " <td>1687-1709</td>\n", + " <td>Rotterdam</td>\n", + " <td>Rotterdam</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " <td>-1.0</td>\n", + " <td>True</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23384</th>\n", + " <td>*28.A.79.(Vol.12)</td>\n", + " <td>Z222908306</td>\n", + " <td>Histoire des ouvrages des scavans</td>\n", + " <td>Basnage de Beauval, Henri</td>\n", + " <td>NaN</td>\n", + " <td>1687.0</td>\n", + " <td>1709.0</td>\n", + " <td>1687-1709</td>\n", + " <td>Rotterdam</td>\n", + " <td>Rotterdam</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " 
<td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " <td>-1.0</td>\n", + " <td>True</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23385</th>\n", + " <td>*28.A.79.(Vol.13)</td>\n", + " <td>Z222908409</td>\n", + " <td>Histoire des ouvrages des scavans</td>\n", + " <td>Basnage de Beauval, Henri</td>\n", + " <td>NaN</td>\n", + " <td>1687.0</td>\n", + " <td>1709.0</td>\n", + " <td>1687-1709</td>\n", + " <td>Rotterdam</td>\n", + " <td>Rotterdam</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " <td>-1.0</td>\n", + " <td>True</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23386</th>\n", + " <td>*28.A.79.(Vol.14)</td>\n", + " <td>Z222908501</td>\n", + " <td>Histoire des ouvrages des scavans</td>\n", + " <td>Basnage de Beauval, Henri</td>\n", + " <td>NaN</td>\n", + " <td>1687.0</td>\n", + " <td>1709.0</td>\n", + " <td>1687-1709</td>\n", + " <td>Rotterdam</td>\n", + " <td>Rotterdam</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " <td>-1.0</td>\n", + " <td>True</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23636</th>\n", + " <td>Ink 9.F.4</td>\n", + " <td>NaN</td>\n", + " <td>Opera</td>\n", + " <td>Sallustius Crispus, Gaius</td>\n", + " 
<td>NaN</td>\n", + " <td>1481.0</td>\n", + " <td>NaN</td>\n", + " <td>23 Dec. 1481</td>\n", + " <td>Venice</td>\n", + " <td>Venedig</td>\n", + " <td>...</td>\n", + " <td>Historia Romana Sæculorum aliquot, præsertim Imperatorum temporibus</td>\n", + " <td>Folio</td>\n", + " <td>825</td>\n", + " <td>14.377_437_08</td>\n", + " <td>1447.........Ejusdem Historia Eadem. Venetiis. 1481.¬ Baptista de Torris. n. 2217. LIII. R. 12.</td>\n", + " <td>https://iiif.onb.ac.at/images/DOD/51184/00000437.jp2/full/full/0/native.jpg</td>\n", + " <td>True</td>\n", + " <td>-3.0</td>\n", + " <td>True</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22640</th>\n", + " <td>Ink 9.F.5</td>\n", + " <td>1460328-10</td>\n", + " <td>Biblia ; Interpretationes Hebraicorum nominum</td>\n", + " <td>NaN</td>\n", + " <td>Wild, Leonhard</td>\n", + " <td>1481.0</td>\n", + " <td>NaN</td>\n", + " <td>1481</td>\n", + " <td>Venedig</td>\n", + " <td>Venedig</td>\n", + " <td>...</td>\n", + " <td>Textus & Versiones Sacræ Scripturæ</td>\n", + " <td>Folio</td>\n", + " <td>2</td>\n", + " <td>14.376_026_00</td>\n", + " <td>9. Biblia Sacra Latina. Venetiis. 1481. Leonard Wild de Ratisbonâ n. 2302. III. D. 
11.</td>\n", + " <td>https://iiif.onb.ac.at/images/DOD/51202/00000026.jp2/full/full/0/native.jpg</td>\n", + " <td>True</td>\n", + " <td>-3.0</td>\n", + " <td>True</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22650</th>\n", + " <td>SA.71.E.58</td>\n", + " <td>Z252861302</td>\n", + " <td>Dialogue sur la musique des anciens</td>\n", + " <td>Chateauneuf, Francois abbe de</td>\n", + " <td>NaN</td>\n", + " <td>1725.0</td>\n", + " <td>NaN</td>\n", + " <td>1725</td>\n", + " <td>Paris</td>\n", + " <td>Paris</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>False</td>\n", + " <td>-1.0</td>\n", + " <td>True</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22651</th>\n", + " <td>SA.71.F.74</td>\n", + " <td>Z252867808</td>\n", + " <td>Friderici Adolfi Lampe De Cymbalis Veterum Libri Tres</td>\n", + " <td>Ember, Paul</td>\n", + " <td>Hase, Cornelius <<von>>; Röell, Herman Alexander</td>\n", + " <td>1703.0</td>\n", + " <td>NaN</td>\n", + " <td>1703</td>\n", + " <td>Trajecti Ad Rhenum</td>\n", + " <td>Utrecht</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>False</td>\n", + " <td>-1.0</td>\n", + " <td>True</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23439</th>\n", + " <td>SA.73.B.48</td>\n", + " <td>Z25920770X</td>\n", + " <td>Claudii Ptolomaei harmonicorum libri tres. Ex Codd. Mss. Undecim, nunc primum graece editus. Johannes Wallis ... recensuit, ed. 
(etc.)</td>\n", + " <td>Ptolemaeus, Claudius</td>\n", + " <td>Wallis, Johannes</td>\n", + " <td>1682.0</td>\n", + " <td>NaN</td>\n", + " <td>1682</td>\n", + " <td>Oxford</td>\n", + " <td>Oxford</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>False</td>\n", + " <td>-1.0</td>\n", + " <td>True</td>\n", + " <td>2</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>22874 rows × 37 columns</p>\n", + "</div>" + ], "text/plain": [ - "Signatur BE.1.A.14\n", - "Barcode B998501\n", - "Titel Columna Cochlis M. Aurelio Antonio Augusto dicata\n", - "Autor Bellori, Giovanni Pietro\n", - "Mitwirkender NaN\n", - "Anfang Veröffentlichungsdatum 1704.0\n", - "Ende Veröffentlichungsdatum NaN\n", - "Veröffentlichungsdatum 1704\n", - "Veröffentlichungsort Roma\n", - "Veröffentlichungsort (normiert) Rom\n", - "Sprache Italian\n", - "Dateiname NaN\n", - "Wappenklassifizierung NaN\n", - "p_A NaN\n", - "p_B NaN\n", - "p_C NaN\n", - "p_N NaN\n", - "Farbklassifizierung NaN\n", - "p_blue NaN\n", - "p_red NaN\n", - "p_yellow NaN\n", - "IIIF Manifest NaN\n", - "hs. Katalog 1.0\n", - "hs. Katalog Konfidenz sicher | sicher\n", - "hs. Katalogband 14.378 | 14.378\n", - "hs. Katalogseite Digitalisat 371 | 560\n", - "Wissensklasse Paralipomena Historica | Imagines Incisæ\n", - "Wissensunterklasse Antiquitatis Monumenta, seu Ædificia, Amphitheatri, Obelisci, Statuæ, Gemmæ, Lucernæ, Vasa, &c | Imaginum Romæ in æs incisarum Collectio\n", - "Formatangabe Folio | \n", - "hs. Katalogseite Handschrift 1203 | 1380\n", - "hs. Katalogeintrag ID 14.378_371_01 | 14.378_560_03\n", - "hs. Katalogeintrag 2342 Columna Cochlis seu Antoniana; vide Imagines Romæ | CLXXIX Columna Cochlis seu Antoniana a Petro S. Bartholo in æs incisa cum I. Petri Bellorij notis. in fol.° chartâ magnâ, formâ oblongâ. Romæ. 1704. Domin. de Rubeis n. 886.\n", - "hs. 
Katalog Image URL https://iiif.onb.ac.at/images/DOD/51219/00000371.jp2/full/full/0/native.jpg | https://iiif.onb.ac.at/images/DOD/51219/00000560.jp2/full/full/0/native.jpg\n", - "dup_title False\n", - "copy_from -1.0\n", - "Einfache Klassifizierung True\n", - "Komplexe Klassifizierung 2\n", - "Name: 6, dtype: object" + " Signatur Barcode \n", + "23376 *28.A.79.(Vol.1) Z222907107 \\\n", + "23383 *28.A.79.(Vol.10) Z222908100 \n", + "23384 *28.A.79.(Vol.12) Z222908306 \n", + "23385 *28.A.79.(Vol.13) Z222908409 \n", + "23386 *28.A.79.(Vol.14) Z222908501 \n", + "... ... ... \n", + "23636 Ink 9.F.4 NaN \n", + "22640 Ink 9.F.5 1460328-10 \n", + "22650 SA.71.E.58 Z252861302 \n", + "22651 SA.71.F.74 Z252867808 \n", + "23439 SA.73.B.48 Z25920770X \n", + "\n", + " Titel \n", + "23376 Histoire des ouvrages des scavans \\\n", + "23383 Histoire des ouvrages des scavans \n", + "23384 Histoire des ouvrages des scavans \n", + "23385 Histoire des ouvrages des scavans \n", + "23386 Histoire des ouvrages des scavans \n", + "... ... \n", + "23636 Opera \n", + "22640 Biblia ; Interpretationes Hebraicorum nominum \n", + "22650 Dialogue sur la musique des anciens \n", + "22651 Friderici Adolfi Lampe De Cymbalis Veterum Libri Tres \n", + "23439 Claudii Ptolomaei harmonicorum libri tres. Ex Codd. Mss. Undecim, nunc primum graece editus. Johannes Wallis ... recensuit, ed. (etc.) \n", + "\n", + " Autor \n", + "23376 Basnage de Beauval, Henri \\\n", + "23383 Basnage de Beauval, Henri \n", + "23384 Basnage de Beauval, Henri \n", + "23385 Basnage de Beauval, Henri \n", + "23386 Basnage de Beauval, Henri \n", + "... ... \n", + "23636 Sallustius Crispus, Gaius \n", + "22640 NaN \n", + "22650 Chateauneuf, Francois abbe de \n", + "22651 Ember, Paul \n", + "23439 Ptolemaeus, Claudius \n", + "\n", + " Mitwirkender \n", + "23376 NaN \\\n", + "23383 NaN \n", + "23384 NaN \n", + "23385 NaN \n", + "23386 NaN \n", + "... ... 
\n", + "23636 NaN \n", + "22640 Wild, Leonhard \n", + "22650 NaN \n", + "22651 Hase, Cornelius <<von>>; Röell, Herman Alexander \n", + "23439 Wallis, Johannes \n", + "\n", + " Anfang Veröffentlichungsdatum Ende Veröffentlichungsdatum \n", + "23376 1687.0 1709.0 \\\n", + "23383 1687.0 1709.0 \n", + "23384 1687.0 1709.0 \n", + "23385 1687.0 1709.0 \n", + "23386 1687.0 1709.0 \n", + "... ... ... \n", + "23636 1481.0 NaN \n", + "22640 1481.0 NaN \n", + "22650 1725.0 NaN \n", + "22651 1703.0 NaN \n", + "23439 1682.0 NaN \n", + "\n", + " Veröffentlichungsdatum Veröffentlichungsort \n", + "23376 1687-1709 Rotterdam \\\n", + "23383 1687-1709 Rotterdam \n", + "23384 1687-1709 Rotterdam \n", + "23385 1687-1709 Rotterdam \n", + "23386 1687-1709 Rotterdam \n", + "... ... ... \n", + "23636 23 Dec. 1481 Venice \n", + "22640 1481 Venedig \n", + "22650 1725 Paris \n", + "22651 1703 Trajecti Ad Rhenum \n", + "23439 1682 Oxford \n", + "\n", + " Veröffentlichungsort (normiert) ... \n", + "23376 Rotterdam ... \\\n", + "23383 Rotterdam ... \n", + "23384 Rotterdam ... \n", + "23385 Rotterdam ... \n", + "23386 Rotterdam ... \n", + "... ... ... \n", + "23636 Venedig ... \n", + "22640 Venedig ... \n", + "22650 Paris ... \n", + "22651 Utrecht ... \n", + "23439 Oxford ... \n", + "\n", + " Wissensunterklasse \n", + "23376 NaN \\\n", + "23383 NaN \n", + "23384 NaN \n", + "23385 NaN \n", + "23386 NaN \n", + "... ... \n", + "23636 Historia Romana Sæculorum aliquot, præsertim Imperatorum temporibus \n", + "22640 Textus & Versiones Sacræ Scripturæ \n", + "22650 NaN \n", + "22651 NaN \n", + "23439 NaN \n", + "\n", + " Formatangabe hs. Katalogseite Handschrift hs. Katalogeintrag ID \n", + "23376 NaN NaN NaN \\\n", + "23383 NaN NaN NaN \n", + "23384 NaN NaN NaN \n", + "23385 NaN NaN NaN \n", + "23386 NaN NaN NaN \n", + "... ... ... ... 
\n", + "23636 Folio 825 14.377_437_08 \n", + "22640 Folio 2 14.376_026_00 \n", + "22650 NaN NaN NaN \n", + "22651 NaN NaN NaN \n", + "23439 NaN NaN NaN \n", + "\n", + " hs. Katalogeintrag \n", + "23376 NaN \\\n", + "23383 NaN \n", + "23384 NaN \n", + "23385 NaN \n", + "23386 NaN \n", + "... ... \n", + "23636 1447.........Ejusdem Historia Eadem. Venetiis. 1481.¬ Baptista de Torris. n. 2217. LIII. R. 12. \n", + "22640 9. Biblia Sacra Latina. Venetiis. 1481. Leonard Wild de Ratisbonâ n. 2302. III. D. 11. \n", + "22650 NaN \n", + "22651 NaN \n", + "23439 NaN \n", + "\n", + " hs. Katalog Image URL \n", + "23376 NaN \\\n", + "23383 NaN \n", + "23384 NaN \n", + "23385 NaN \n", + "23386 NaN \n", + "... ... \n", + "23636 https://iiif.onb.ac.at/images/DOD/51184/00000437.jp2/full/full/0/native.jpg \n", + "22640 https://iiif.onb.ac.at/images/DOD/51202/00000026.jp2/full/full/0/native.jpg \n", + "22650 NaN \n", + "22651 NaN \n", + "23439 NaN \n", + "\n", + " dup_title copy_from Einfache Klassifizierung Komplexe Klassifizierung \n", + "23376 True -1.0 True 2 \n", + "23383 True -1.0 True 2 \n", + "23384 True -1.0 True 2 \n", + "23385 True -1.0 True 2 \n", + "23386 True -1.0 True 2 \n", + "... ... ... ... ... 
\n", + "23636 True -3.0 True 2 \n", + "22640 True -3.0 True 2 \n", + "22650 False -1.0 True 2 \n", + "22651 False -1.0 True 2 \n", + "23439 False -1.0 True 2 \n", + "\n", + "[22874 rows x 37 columns]" ] }, - "execution_count": 274, + "execution_count": 278, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "BE_with_Ink_df.loc[6]" + "BE_with_Ink_df" ] }, { "cell_type": "code", "execution_count": null, - "id": "bd73ed76-756a-4a6e-b8f6-183ebfe33ae4", + "id": "51e2acf9-1a2b-4503-8423-c091a5244d9b", "metadata": {}, "outputs": [], "source": [] diff --git a/Notebooks/String_matching.ipynb b/Notebooks/String_matching.ipynb index 6b78eec15e66bad8db41f7445b067ef82889f0d8..bffe310dc69e2fcd762b30381f16e9435f5025b9 100644 --- a/Notebooks/String_matching.ipynb +++ b/Notebooks/String_matching.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 17, + "execution_count": 1, "id": "7a3837ac-cced-4e01-bf57-265e40729692", "metadata": { "tags": [] @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 2, "id": "29ca0dc8-cae7-4f12-bd60-fd74ea6ae5ac", "metadata": { "tags": [] @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 3, "id": "c1e1c42a-962f-40bc-bb17-b62e8089feb7", "metadata": { "tags": [] @@ -48,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 4, "id": "50d15898-4687-46b7-b7e0-528d7cf9aec0", "metadata": { "tags": [] @@ -70,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 5, "id": "990dfeee-1141-4acb-8a3d-a7af0573f5be", "metadata": { "tags": [] @@ -85,7 +85,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 6, "id": "bcd301fe-cb80-4b1c-b65f-465fce5ed915", "metadata": { "tags": [] @@ -104,7 +104,7 @@ " 0.0102726686745882]], dtype=object)" ] }, - "execution_count": 22, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -115,7 +115,7 @@ }, { 
"cell_type": "code", - "execution_count": 23, + "execution_count": 8, "id": "c0f4a42a-7e21-41e8-833c-2dd2f9d1985e", "metadata": { "tags": [] @@ -125,7 +125,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "6\n" + "3\n" ] }, { @@ -162,124 +162,68 @@ " </thead>\n", " <tbody>\n", " <tr>\n", - " <th>192</th>\n", + " <th>133</th>\n", " <td>14.376</td>\n", - " <td>58</td>\n", + " <td>45</td>\n", " <td>Theologia</td>\n", - " <td>Concilia, & quæ ad eamdem Rem pertinentia</td>\n", - " <td>Folio</td>\n", - " <td>34</td>\n", - " <td>14.376_058_01</td>\n", - " <td>64 Piccolomini (æneæ Sylvij) Fasciculus expetendarum ac fugiendarum rerum, Seu comment. de Concilij Basiliensis gestis; insunt præterea huic operi aliquot epistolæ, libelli, tractatus, & opuscula quæ, si futurum Concilium celebrari contigerit, summopere tanquam¬ cognitu digna & necessaria ad optimis expostulabunt. omnia ab Orthuio Gratio edita. Coloniæ. 1535. n. 168.</td>\n", - " <td>Piccolomini aeneae Sylvij Fasciculus expetendarum ac fugiendarum rerum Seu comment de Concilij Basiliensis gestis insunt praeterea huic operi aliquot epistolae libelli tractatus & opuscula quae si futurum Concilium celebrari contigerit summopere tanquam cognitu digna & necessaria ad optimis expostulabunt omnia ab Orthuio Gratio edita Coloniae 1535 n 168</td>\n", + " <td>Critici Sacri</td>\n", + " <td>Quarto</td>\n", + " <td>21</td>\n", + " <td>14.376_045_00</td>\n", + " <td>Goësij (Willhelmi) Pilatus judex; cui accedunt¬ Theologi cujusdam in Pilatum judicem Stricturæ, cum ejusdem Goësij notis & animadversionibus.¬ Hagæ Comitis. 1677. Ioan. Tongerloo. n. 
200.</td>\n", + " <td>Goesij Willhelmi Pilatus judex cui accedunt Theologi cujusdam in Pilatum judicem Stricturae cum ejusdem Goesij notis & animadversionibus Hagae Comitis 1677 Ioan Tongerloo n 200</td>\n", " </tr>\n", " <tr>\n", - " <th>193</th>\n", + " <th>338</th>\n", " <td>14.376</td>\n", - " <td>58</td>\n", + " <td>85</td>\n", " <td>Theologia</td>\n", - " <td>Concilia, & quæ ad eamdem Rem pertinentia</td>\n", - " <td>Folio</td>\n", - " <td>34</td>\n", - " <td>14.376_058_02</td>\n", - " <td>65.......... Idem Fasciculus ab innumeris mendis expurgat. vna cum appendice, seu tomo 2.° scriptorum veterum qui Ecclesiæ Rom. abusus ac errores detegunt & damnant, necessitatemque reformationis vrgent. opera & Studio Edwardi Brown. 2 Vol. Lond. 1690. Rich. Chiswel. n. 169.</td>\n", - " <td>Idem Fasciculus ab innumeris mendis expurgat vna cum appendice seu tomo 2° scriptorum veterum qui Ecclesiae Rom abusus ac errores detegunt & damnant necessitatemque reformationis vrgent opera & Studio Edwardi Brown 2 Vol Lond 1690 Rich Chiswel n 169</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4585</th>\n", - " <td>14.377</td>\n", - " <td>221</td>\n", - " <td>Philologia</td>\n", - " <td>Epistolographi Græci & Latini</td>\n", + " <td>Sanctissimi Patres Latini</td>\n", " <td>Octavo und kleiner</td>\n", - " <td>617</td>\n", - " <td>14.377_221_00</td>\n", - " <td>Fasciculus Latinarum Epistolarum Ludovici Molinæi cum interpretatione gallicâ. 12.° Eleutheropoli. 1676. n. 1589.</td>\n", - " <td>Fasciculus Latinarum Epistolarum Ludovici Molinaei cum interpretatione gallica 12° Eleutheropoli 1676 n 1589</td>\n", + " <td>61</td>\n", + " <td>14.376_085_04</td>\n", + " <td>S. Augustin de la veritable Religion et des moeurs de L'Eglise catholique trad. en Franç. (par M. Dubois) 8.° Paris. 1694. Louis Guerin. n. 
200</td>\n", + " <td>S Augustin de la veritable Religion et des moeurs de L'Eglise catholique trad en Franç par M Dubois 8° Paris 1694 Louis Guerin n 200</td>\n", " </tr>\n", " <tr>\n", - " <th>5137</th>\n", - " <td>14.377</td>\n", - " <td>319</td>\n", - " <td>Chronologia</td>\n", - " <td>Chronologia Technica & Historica</td>\n", - " <td>Folio</td>\n", - " <td>711</td>\n", - " <td>14.377_319_00</td>\n", - " <td>1170 Fasciculus Temporum. editio antiqua, sine Loco & Venetijs 1480 XLVIII.R.19 anno. n. 2160. et S. A. n. 2324. XLVIII.R.18</td>\n", - " <td>Fasciculus Temporum editio antiqua sine Loco & Venetijs 1480 XLVIIIR19 anno n 2160 et S A n 2324 XLVIIIR18</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5179</th>\n", - " <td>14.377</td>\n", - " <td>327</td>\n", - " <td>Chronologia</td>\n", - " <td>Chronographi, Seu Chronica, & Historiæ Universales</td>\n", + " <th>930</th>\n", + " <td>14.376</td>\n", + " <td>180</td>\n", + " <td>Iurisprudentia</td>\n", + " <td>Ius Civile, Publicum, & Municipale</td>\n", " <td>Folio</td>\n", - " <td>719</td>\n", - " <td>14.377_327_03</td>\n", - " <td>1192 Chronica qua dicitur Fasciculus temporum per quemdam Carthusiensem edita, nunc emendata cum additionibus ad hæc usque tempora. Venetiis. 1480. Erhardus Ratdolt.</td>\n", - " <td>Chronica qua dicitur Fasciculus temporum per quemdam Carthusiensem edita nunc emendata cum additionibus ad haec usque tempora Venetiis 1480 Erhardus Ratdolt</td>\n", - " </tr>\n", - " <tr>\n", - " <th>8487</th>\n", - " <td>14.378</td>\n", - " <td>448</td>\n", - " <td>Paralipomena Historica</td>\n", - " <td>De Academiis, Universitatibus, Scholis, Colleg. &c</td>\n", - " <td>Quarto</td>\n", - " <td>1280</td>\n", - " <td>14.378_448_02</td>\n", - " <td>Omeisij (Magni Dan.) Academiæ Altdorfinæ Gloria, Sive Orationum Fasciculus, Vniversitatis Noricæ Ortus & Progressus, omniumque ipsius Professorum Vitæ & Scripta; accedunt I. Pauli Felwingeri additamenta quædam. Altdorfi. 1683. Meyerus. n. 1600. 
Fasti Consolari dell'Academia Fiorentina da Salvino Salvini. in Firenze. 1717. Tartini. n. 2151 Dissertationes Ioannis Melchioris Schwimmer de¬ Academicis omnium Facultatum Professoribus. Ienæ. 1671. Mullerus. n. 2152.</td>\n", - " <td>Omeisij Magni Dan Academiae Altdorfinae Gloria Sive Orationum Fasciculus Vniversitatis Noricae Ortus & Progressus omniumque ipsius Professorum Vitae & Scripta accedunt I Pauli Felwingeri additamenta quaedam Altdorfi 1683 Meyerus n 1600 Fasti Consolari dell'Academia Fiorentina da Salvino Salvini in Firenze 1717 Tartini n 2151 Dissertationes Ioannis Melchioris Schwimmer de Academicis omnium Facultatum Professoribus Ienae 1671 Mullerus n 2152</td>\n", + " <td>148</td>\n", + " <td>14.376_180_00</td>\n", + " <td>243 Sigonij (Car.) de antiquo Iure Populi Rom. Libri XI. nempè, de antiquo jure Civium Romanorum Libri II. de Iure antiquo Italiæ Libri III. de antiquo Iure¬ Provinciarum Libri III. ac de Iudiciis Libri III. Bononiæ 1574. Societas Typographorum. n. 200.</td>\n", + " <td>Sigonij Car de antiquo Iure Populi Rom Libri XI nempe de antiquo jure Civium Romanorum Libri II de Iure antiquo Italiae Libri III de antiquo Iure Provinciarum Libri III ac de Iudiciis Libri III Bononiae 1574 Societas Typographorum n 200</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " volume page number category \n", - "192 14.376 58 Theologia \\\n", - "193 14.376 58 Theologia \n", - "4585 14.377 221 Philologia \n", - "5137 14.377 319 Chronologia \n", - "5179 14.377 327 Chronologia \n", - "8487 14.378 448 Paralipomena Historica \n", + " volume page number category subcategory \n", + "133 14.376 45 Theologia Critici Sacri \\\n", + "338 14.376 85 Theologia Sanctissimi Patres Latini \n", + "930 14.376 180 Iurisprudentia Ius Civile, Publicum, & Municipale \n", "\n", - " subcategory format \n", - "192 Concilia, & quæ ad eamdem Rem pertinentia Folio \\\n", - "193 Concilia, & quæ ad eamdem Rem pertinentia Folio \n", - "4585 Epistolographi 
Græci & Latini Octavo und kleiner \n", - "5137 Chronologia Technica & Historica Folio \n", - "5179 Chronographi, Seu Chronica, & Historiæ Universales Folio \n", - "8487 De Academiis, Universitatibus, Scholis, Colleg. &c Quarto \n", - "\n", - " handwritten page number entry_ID \n", - "192 34 14.376_058_01 \\\n", - "193 34 14.376_058_02 \n", - "4585 617 14.377_221_00 \n", - "5137 711 14.377_319_00 \n", - "5179 719 14.377_327_03 \n", - "8487 1280 14.378_448_02 \n", + " format handwritten page number entry_ID \n", + "133 Quarto 21 14.376_045_00 \\\n", + "338 Octavo und kleiner 61 14.376_085_04 \n", + "930 Folio 148 14.376_180_00 \n", "\n", - " entry \n", - "192 64 Piccolomini (æneæ Sylvij) Fasciculus expetendarum ac fugiendarum rerum, Seu comment. de Concilij Basiliensis gestis; insunt præterea huic operi aliquot epistolæ, libelli, tractatus, & opuscula quæ, si futurum Concilium celebrari contigerit, summopere tanquam¬ cognitu digna & necessaria ad optimis expostulabunt. omnia ab Orthuio Gratio edita. Coloniæ. 1535. n. 168. \\\n", - "193 65.......... Idem Fasciculus ab innumeris mendis expurgat. vna cum appendice, seu tomo 2.° scriptorum veterum qui Ecclesiæ Rom. abusus ac errores detegunt & damnant, necessitatemque reformationis vrgent. opera & Studio Edwardi Brown. 2 Vol. Lond. 1690. Rich. Chiswel. n. 169. \n", - "4585 Fasciculus Latinarum Epistolarum Ludovici Molinæi cum interpretatione gallicâ. 12.° Eleutheropoli. 1676. n. 1589. \n", - "5137 1170 Fasciculus Temporum. editio antiqua, sine Loco & Venetijs 1480 XLVIII.R.19 anno. n. 2160. et S. A. n. 2324. XLVIII.R.18 \n", - "5179 1192 Chronica qua dicitur Fasciculus temporum per quemdam Carthusiensem edita, nunc emendata cum additionibus ad hæc usque tempora. Venetiis. 1480. Erhardus Ratdolt. \n", - "8487 Omeisij (Magni Dan.) Academiæ Altdorfinæ Gloria, Sive Orationum Fasciculus, Vniversitatis Noricæ Ortus & Progressus, omniumque ipsius Professorum Vitæ & Scripta; accedunt I. Pauli Felwingeri additamenta quædam. 
Altdorfi. 1683. Meyerus. n. 1600. Fasti Consolari dell'Academia Fiorentina da Salvino Salvini. in Firenze. 1717. Tartini. n. 2151 Dissertationes Ioannis Melchioris Schwimmer de¬ Academicis omnium Facultatum Professoribus. Ienæ. 1671. Mullerus. n. 2152. \n", + " entry \n", + "133 Goësij (Willhelmi) Pilatus judex; cui accedunt¬ Theologi cujusdam in Pilatum judicem Stricturæ, cum ejusdem Goësij notis & animadversionibus.¬ Hagæ Comitis. 1677. Ioan. Tongerloo. n. 200. \\\n", + "338 S. Augustin de la veritable Religion et des moeurs de L'Eglise catholique trad. en Franç. (par M. Dubois) 8.° Paris. 1694. Louis Guerin. n. 200 \n", + "930 243 Sigonij (Car.) de antiquo Iure Populi Rom. Libri XI. nempè, de antiquo jure Civium Romanorum Libri II. de Iure antiquo Italiæ Libri III. de antiquo Iure¬ Provinciarum Libri III. ac de Iudiciis Libri III. Bononiæ 1574. Societas Typographorum. n. 200. \n", "\n", - " cleaned entry \n", - "192 Piccolomini aeneae Sylvij Fasciculus expetendarum ac fugiendarum rerum Seu comment de Concilij Basiliensis gestis insunt praeterea huic operi aliquot epistolae libelli tractatus & opuscula quae si futurum Concilium celebrari contigerit summopere tanquam cognitu digna & necessaria ad optimis expostulabunt omnia ab Orthuio Gratio edita Coloniae 1535 n 168 \n", - "193 Idem Fasciculus ab innumeris mendis expurgat vna cum appendice seu tomo 2° scriptorum veterum qui Ecclesiae Rom abusus ac errores detegunt & damnant necessitatemque reformationis vrgent opera & Studio Edwardi Brown 2 Vol Lond 1690 Rich Chiswel n 169 \n", - "4585 Fasciculus Latinarum Epistolarum Ludovici Molinaei cum interpretatione gallica 12° Eleutheropoli 1676 n 1589 \n", - "5137 Fasciculus Temporum editio antiqua sine Loco & Venetijs 1480 XLVIIIR19 anno n 2160 et S A n 2324 XLVIIIR18 \n", - "5179 Chronica qua dicitur Fasciculus temporum per quemdam Carthusiensem edita nunc emendata cum additionibus ad haec usque tempora Venetiis 1480 Erhardus Ratdolt \n", - "8487 Omeisij Magni Dan 
Academiae Altdorfinae Gloria Sive Orationum Fasciculus Vniversitatis Noricae Ortus & Progressus omniumque ipsius Professorum Vitae & Scripta accedunt I Pauli Felwingeri additamenta quaedam Altdorfi 1683 Meyerus n 1600 Fasti Consolari dell'Academia Fiorentina da Salvino Salvini in Firenze 1717 Tartini n 2151 Dissertationes Ioannis Melchioris Schwimmer de Academicis omnium Facultatum Professoribus Ienae 1671 Mullerus n 2152 " + " cleaned entry \n", + "133 Goesij Willhelmi Pilatus judex cui accedunt Theologi cujusdam in Pilatum judicem Stricturae cum ejusdem Goesij notis & animadversionibus Hagae Comitis 1677 Ioan Tongerloo n 200 \n", + "338 S Augustin de la veritable Religion et des moeurs de L'Eglise catholique trad en Franç par M Dubois 8° Paris 1694 Louis Guerin n 200 \n", + "930 Sigonij Car de antiquo Iure Populi Rom Libri XI nempe de antiquo jure Civium Romanorum Libri II de Iure antiquo Italiae Libri III de antiquo Iure Provinciarum Libri III ac de Iudiciis Libri III Bononiae 1574 Societas Typographorum n 200 " ] }, - "execution_count": 23, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -288,7 +232,7 @@ "def search_in_entry(df, string):\n", " return df[df['cleaned entry'].str.contains(string)]\n", "\n", - "info = search_in_entry(search_in_entry(entry_df, ''), 'Fasciculus')\n", + "info = search_in_entry(search_in_entry(entry_df, ''), 'n 200\\Z')\n", "print(len(info))\n", "info" ] diff --git a/Notebooks/XML_Aufbereitung.ipynb b/Notebooks/XML_Aufbereitung.ipynb index 1f4ee4ac9296f64cf0198bf4d07d6bdbc72617c7..27a17b9af590f57be0f4fcdf1a83f53b5d838c90 100644 --- a/Notebooks/XML_Aufbereitung.ipynb +++ b/Notebooks/XML_Aufbereitung.ipynb @@ -2,27 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, - "id": "8a7c8849-b1a3-4f88-b534-cec8b4c13f09", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - 
] - } - ], - "source": [ - "%pip install -r ../requirements.txt -q" - ] - }, - { - "cell_type": "code", - "execution_count": 380, + "execution_count": 1, "id": "5b24e324-6659-482d-8d82-39c1d604f0d3", "metadata": { "tags": [] @@ -1599,8 +1579,357 @@ }, { "cell_type": "code", - "execution_count": 403, - "id": "4953438b-2426-4827-a6de-51c89e9e3e65", + "execution_count": 404, + "id": "3a779456-4810-4428-85d7-b0c5277717d8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "entry_df.to_excel('../Daten/Katalogabgleich/Einträge.xlsx')" + ] + }, + { + "cell_type": "markdown", + "id": "98bf1e20-d09c-41d5-8bc3-725a851f6ab3", + "metadata": {}, + "source": [ + "# Add matches from handwritten catalog to modern catalog into the TEI_XMLs" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "id": "0d8fa911-2986-4a4d-af5b-2adf3bddada3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "entry_df = pd.read_excel('../Daten/Katalogabgleich/Einträge.xlsx', index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "id": "0a86cfbe-33b3-4a3a-b384-270b5228359a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "BE_df = pd.read_excel('../Daten/Vorhersagen/WIP_final_BE_4.xlsx', index_col=0)\n", + "\n", + "# drop signature duplicates?!\n", + "to_drop = [2292, 13801, 18647]\n", + "\n", + "BE_df.drop(to_drop, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "d0c055e8-59c4-41c7-b0d7-f709c2734e5a", + "metadata": {}, + "source": [ + "## Download all available IIIF manifests" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "2e24b3fe-2908-4d88-8df1-6045815bd4ae", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0e6779a4cc604b988daac74fa308d6c3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/21356 [00:00<?, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" 
+ } + ], + "source": [ + "from tqdm.notebook import tqdm\n", + "\n", + "urls = BE_df[['IIIF Manifest', 'Barcode']].dropna(subset=['IIIF Manifest'])\n", + "\n", + "for i, url in tqdm(urls.iterrows(), total=len(urls)):\n", + " filename = url['Barcode'] + '.json'\n", + " man = requests.get(url['IIIF Manifest']).content\n", + " with open(f'data/iiif_manifests/{filename}', 'wb') as fh:\n", + " fh.write(man)" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "id": "fdff3c2f-0326-412f-8347-da864f306322", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "21356\n" + ] + } + ], + "source": [ + "import glob\n", + "\n", + "mans = glob.glob('data/iiif_manifests/*.json')\n", + "\n", + "print(len(mans))" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "id": "a0c928c7-8bc2-4e89-997b-34be5a9d2a54", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6d335c8424de4d169a44b9e1aeeb0079", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/22871 [00:00<?, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def get_AC_num(BE_row):\n", + " bc = BE_row['Barcode']\n", + " fn = f'data/iiif_manifests/{bc}.json'\n", + " try:\n", + " with open(fn, 'r') as fh:\n", + " man = fh.read()\n", + " metadata = json.loads(man)['metadata']\n", + " ac = ''\n", + " for dic in metadata:\n", + " if dic['label'] == 'IDNR':\n", + " ac = dic['value']\n", + " return ac\n", + " except FileNotFoundError as e:\n", + " return ''\n", + "\n", + "for i, BE_row in tqdm(BE_df.iterrows(), total=len(BE_df)):\n", + " BE_df.at[i, 'AC Nummer'] = get_AC_num(BE_row)" + ] + }, + { + "cell_type": "markdown", + "id": "71d603b3-5ead-4ecd-9039-961325a60973", + "metadata": {}, + "source": [ + "## Obtain AC number from ASTOR repository via signature" + ] + }, + { + "cell_type": "code", + 
"execution_count": 210, + "id": "28fc3270-5d3e-4b20-a149-37d433afc197", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "BE_no_AC = BE_df[BE_df['AC Nummer'] == '']" + ] + }, + { + "cell_type": "code", + "execution_count": 207, + "id": "7fd80561-9e5d-498c-b2f9-428e293fd3a2", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "16d72df27a6548f6a148f3dd4158d2f2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1515 [00:00<?, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "ASTOR_API_KEY = os.getenv('ASTOR_API_KEY')\n", + "\n", + "for i, BE_row in tqdm(BE_no_AC.iterrows(), total=len(BE_no_AC)):\n", + " sig = BE_row['Signatur']\n", + " sig_esc = sig.replace('.', '\\.').replace('(', '\\(').replace(')', '\\)')\n", + " sig_sru = json.loads(requests.get(f'https://astor.onb.ac.at/discovery/internal/search?query=signature:{sig_esc}&from=1&rows=200&apikey={ASTOR_API_KEY}').content)\n", + " if 'documents' in sig_sru.keys():\n", + " docs = sig_sru['documents']\n", + " for doc in docs:\n", + " if doc['signature'] == sig:\n", + " BE_df.at[i, 'AC Nummer'] = doc['idnr']\n", + " break" + ] + }, + { + "cell_type": "markdown", + "id": "30500e79-5227-4c21-9825-365bf6992c2a", + "metadata": { + "tags": [] + }, + "source": [ + "## Obtain AC number from catalogue via signature and SRU" + ] + }, + { + "cell_type": "code", + "execution_count": 211, + "id": "c293013f-b44d-44ec-9529-c40a2863d9f3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7fa10904058e425c9670c9d5f09b1713", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/67 [00:00<?, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from lxml import
etree\n", + "\n", + "ns = {\n", + " 'srw': 'http://www.loc.gov/zing/srw/',\n", + " 'marc': 'http://www.loc.gov/MARC21/slim'\n", + "}\n", + "\n", + "lang_data = pd.read_csv('data/iso-639-3.tab', sep='\\t')\n", + "\n", + "def english_language_from_code(lang_code):\n", + " find_by_Id = lang_data[lang_data['Id'] == lang_code]\n", + " find_by_Part2b = lang_data[lang_data['Part2b'] == lang_code]\n", + " if len(find_by_Id):\n", + " name = find_by_Id['Ref_Name'].values[0]\n", + " elif len(find_by_Part2b):\n", + " name = find_by_Part2b['Ref_Name'].values[0]\n", + " else:\n", + " name = ''\n", + " return name\n", + "\n", + "def extract_catalog_data_from_signature(sig):\n", + " metadata_lis = []\n", + " sru = f'https://obv-at-oenb.alma.exlibrisgroup.com/view/sru/43ACC_ONB?version=1.2&query=alma.accessionNumber%3D%22{sig}%22&operation=searchRetrieve'\n", + " sru_request = requests.get(sru)\n", + " marcxml = sru_request.content\n", + " tree = etree.fromstring(marcxml)\n", + " records = tree.xpath('.//marc:record', namespaces=ns)\n", + " for rec in records:\n", + " metadata = {}\n", + " marc_paths = {\n", + " 'Titel': './/marc:datafield[@tag=\"245\"]/marc:subfield[@code=\"a\"]',\n", + " 'Autor': './/marc:datafield[@tag=\"100\"]/marc:subfield[@code=\"a\"]',\n", + " 'Mitwirkender': './/marc:datafield[@tag=\"700\"]/marc:subfield[@code=\"a\"]',\n", + " 'Signatur': './/marc:datafield[@tag=\"AVA\"]/marc:subfield[@code=\"d\"]',\n", + " 'Veröffentlichungsdatum': './/marc:datafield[@tag=\"264\"]/marc:subfield[@code=\"c\"]',\n", + " 'Veröffentlichungsort': './/marc:datafield[@tag=\"264\"]/marc:subfield[@code=\"a\"]',\n", + " 'Sprache': './/marc:datafield[@tag=\"041\"]/marc:subfield[@code=\"a\"]',\n", + " 'AC Nummer': './/marc:controlfield[@tag=\"009\"]'\n", + " }\n", + "\n", + " for key, path in marc_paths.items():\n", + " values = [elm.text for elm in rec.xpath(path, namespaces=ns)]\n", + " if key == 'Sprache':\n", + " values = [english_language_from_code(val) for val in values]\n", + 
"\n", + " metadata[key] = '; '.join(values)\n", + " # metadata['Signatur'] = sig\n", + " metadata_lis.append(metadata)\n", + " return metadata_lis\n", + "\n", + "for i, BE_row in tqdm(BE_no_AC.iterrows(), total=len(BE_no_AC)):\n", + " if BE_row['AC Nummer'] != '':\n", + " continue\n", + " sig = BE_row['Signatur']\n", + " sig_esc = sig.replace('.', '\\.').replace('(', '\\(').replace(')', '\\)').replace('-', '\\-').replace(',', '\\,')\n", + " metadata_lis = extract_catalog_data_from_signature(sig_esc)\n", + " for dic in metadata_lis:\n", + " if '; ' in sig:\n", + " siglis = dic['Signatur'].split('; ')\n", + " else:\n", + " siglis = [sig]\n", + " for s in siglis:\n", + " if sig == s:\n", + " BE_df.at[i, 'AC Nummer'] = dic['AC Nummer']\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be259674-5257-43e4-ba19-cbd12b3bfc29", + "metadata": {}, + "outputs": [], + "source": [ + "useless_entries = [2074, 4200, 5976]\n", + "\n", + "BE_df.drop(useless_entries, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 228, + "id": "805140d6-8b72-40ac-b297-e0ff24bc74a5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "BE_df.at[11481, 'AC Nummer'] = 'AC10374727'\n", + "BE_df.at[12810, 'AC Nummer'] = 'AC11979464'\n", + "BE_df.at[19399, 'AC Nummer'] = 'AC10075950'\n", + "BE_df.at[19785, 'AC Nummer'] = 'AC10103649'\n", + "BE_df.at[19958, 'AC Nummer'] = 'AC10058904'" + ] + }, + { + "cell_type": "code", + "execution_count": 229, + "id": "4a8d583c-f5a2-4626-a133-c48f80f0a2fc", "metadata": { "tags": [] }, @@ -1626,211 +1955,1056 @@ " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", - " <th>volume</th>\n", - " <th>page number</th>\n", - " <th>category</th>\n", - " <th>subcategory</th>\n", - " <th>format</th>\n", - " <th>handwritten page number</th>\n", - " <th>entry_ID</th>\n", - " <th>entry</th>\n", + " <th>Signatur</th>\n", + " <th>Barcode</th>\n", + " <th>Titel</th>\n", + " <th>Autor</th>\n", + " 
<th>Mitwirkender</th>\n", + " <th>Anfang Veröffentlichungsdatum</th>\n", + " <th>Ende Veröffentlichungsdatum</th>\n", + " <th>Veröffentlichungsdatum</th>\n", + " <th>Veröffentlichungsort</th>\n", + " <th>Veröffentlichungsort (normiert)</th>\n", + " <th>...</th>\n", + " <th>Formatangabe</th>\n", + " <th>hs. Katalogseite Handschrift</th>\n", + " <th>hs. Katalogeintrag ID</th>\n", + " <th>hs. Katalogeintrag</th>\n", + " <th>hs. Katalog Image URL</th>\n", + " <th>dup_title</th>\n", + " <th>copy_from</th>\n", + " <th>Einfache Klassifizierung</th>\n", + " <th>Komplexe Klassifizierung</th>\n", + " <th>AC Nummer</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", - " <th>0</th>\n", - " <td>14.376</td>\n", - " <td>25</td>\n", - " <td>Theologia</td>\n", - " <td>Textus & Versiones Sacræ Scripturæ</td>\n", - " <td>Folio</td>\n", - " <td>1</td>\n", - " <td>14.376_025_00</td>\n", - " <td>1 Biblia Sacra Polyglotta curis Cardinalis Xim...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>14.376</td>\n", - " <td>25</td>\n", - " <td>Theologia</td>\n", - " <td>Textus & Versiones Sacræ Scripturæ</td>\n", - " <td>Folio</td>\n", - " <td>1</td>\n", - " <td>14.376_025_01</td>\n", - " <td>2 Biblia Sacra Polyglotta Philippi II. 
Regis C...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>14.376</td>\n", - " <td>25</td>\n", - " <td>Theologia</td>\n", - " <td>Textus & Versiones Sacræ Scripturæ</td>\n", - " <td>Folio</td>\n", - " <td>1</td>\n", - " <td>14.376_025_02</td>\n", - " <td>3 Biblia Sacra Polyglotta Studio & curâ Briani...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>14.376</td>\n", - " <td>25</td>\n", - " <td>Theologia</td>\n", - " <td>Textus & Versiones Sacræ Scripturæ</td>\n", - " <td>Folio</td>\n", + " <th>657</th>\n", + " <td>BE.1.N.75.(Teil.1)</td>\n", + " <td>NaN</td>\n", + " <td>Prvi [Drugi] del Novoga Teslamenta [!], vatom ...</td>\n", + " <td>Trubar, Primož 1508-1586 [Bearb.]</td>\n", + " <td>Ungnad, Hans von; Maximilian II. Heiliges Römi...</td>\n", + " <td>1563.0</td>\n", + " <td>NaN</td>\n", + " <td>1562-1563</td>\n", + " <td>V Tubingi [Urach]</td>\n", + " <td>Tübingen</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>1.0</td>\n", + " <td>-1.0</td>\n", + " <td>False</td>\n", " <td>1</td>\n", - " <td>14.376_025_03</td>\n", - " <td>4 Biblia Sacra Latina Moguntina dicta, prima o...</td>\n", + " <td></td>\n", " </tr>\n", " <tr>\n", - " <th>4</th>\n", - " <td>14.376</td>\n", - " <td>25</td>\n", - " <td>Theologia</td>\n", - " <td>Textus & Versiones Sacræ Scripturæ</td>\n", - " <td>Folio</td>\n", - " <td>1</td>\n", - " <td>14.376_025_04</td>\n", - " <td>5 Biblia Sacra Latina Moguntina, editio altera...</td>\n", + " <th>12124</th>\n", + " <td>BE.4.S.81</td>\n", + " <td>B1572545</td>\n", + " <td>Description des Monumens Musulmans du Cabinet ...</td>\n", + " <td>Reinaud, Joseph Toussaint 1795-1867</td>\n", + " <td>NaN</td>\n", + " <td>1828.0</td>\n", + " <td>NaN</td>\n", + " <td>1828</td>\n", + " <td>Paris</td>\n", + " <td>Paris</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " 
<td>NaN</td>\n", + " <td>0.0</td>\n", + " <td>-1.0</td>\n", + " <td>False</td>\n", + " <td>0</td>\n", + " <td></td>\n", " </tr>\n", " <tr>\n", - " <th>...</th>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", + " <th>16795</th>\n", + " <td>BE.7.E.7.(2)</td>\n", + " <td>B1782971</td>\n", + " <td>Appendix ad historiam literariam Gulielmi cave...</td>\n", + " <td>Wharton, Henricus</td>\n", + " <td>Gere, Robertus</td>\n", + " <td>1743.0</td>\n", + " <td>NaN</td>\n", + " <td>1743</td>\n", + " <td>Oxonii</td>\n", + " <td>Oxford</td>\n", " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>0.0</td>\n", + " <td>-1.0</td>\n", + " <td>False</td>\n", + " <td>0</td>\n", + " <td></td>\n", " </tr>\n", " <tr>\n", - " <th>9398</th>\n", - " <td>14.378</td>\n", - " <td>582</td>\n", - " <td>Imaginum Delineatarum Collectio</td>\n", - " <td></td>\n", + " <th>17957</th>\n", + " <td>BE.7.T.64</td>\n", + " <td>NaN</td>\n", + " <td>Journal Universel, Ou Mémoires Pour servir à l...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>1743.0</td>\n", + " <td>1748.0</td>\n", + " <td>1743-1748</td>\n", + " <td>La Haye</td>\n", + " <td>Den Haag</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>1.0</td>\n", + " <td>-1.0</td>\n", + " <td>False</td>\n", + " <td>0</td>\n", " <td></td>\n", - " <td>1402</td>\n", - " <td>14.378_582_00</td>\n", - " <td>CCCXXXI Vn Portefeüilles contenant des Dessein...</td>\n", " </tr>\n", " <tr>\n", - " <th>9399</th>\n", - " <td>14.378</td>\n", - " <td>582</td>\n", - " <td>Imaginum Delineatarum Collectio</td>\n", - " <td></td>\n", + " <th>18105</th>\n", + " <td>BE.7.V.58.(Adl)</td>\n", + " <td>B1380793</td>\n", + " <td>Leben und letzte Stunden Christinä von Munk we...</td>\n", + " 
<td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>1757.0</td>\n", + " <td>NaN</td>\n", + " <td>1757</td>\n", + " <td>Kopenhagen usw.</td>\n", + " <td>Kopenhagen</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>0.0</td>\n", + " <td>-1.0</td>\n", + " <td>False</td>\n", + " <td>0</td>\n", " <td></td>\n", - " <td>1402</td>\n", - " <td>14.378_582_01</td>\n", - " <td>CCCXXXII Vn Recueil des Portraits peints en m...</td>\n", " </tr>\n", " <tr>\n", - " <th>9400</th>\n", - " <td>14.378</td>\n", - " <td>582</td>\n", - " <td>Imaginum Delineatarum Collectio</td>\n", - " <td></td>\n", + " <th>19065</th>\n", + " <td>BE.8.K.58.(Vol.Tab.,1)</td>\n", + " <td>B1633235</td>\n", + " <td>Geschichte der Griechischen Litteratur, von de...</td>\n", + " <td>Schoell, Friedrich 1766-1833</td>\n", + " <td>Schoell, Maximilien Samson Frederic; Pinder, M...</td>\n", + " <td>1828.0</td>\n", + " <td>1830.0</td>\n", + " <td>1828-1830</td>\n", + " <td>Berlin</td>\n", + " <td>Berlin</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>1.0</td>\n", + " <td>-1.0</td>\n", + " <td>False</td>\n", + " <td>0</td>\n", " <td></td>\n", - " <td>1402</td>\n", - " <td>14.378_582_02</td>\n", - " <td>CCCXXXIII Dix Vol. 
de Plantes peintes en mini...</td>\n", " </tr>\n", " <tr>\n", - " <th>9401</th>\n", - " <td>14.378</td>\n", - " <td>582</td>\n", - " <td>Imaginum Delineatarum Collectio</td>\n", - " <td></td>\n", + " <th>19066</th>\n", + " <td>BE.8.K.58.(Vol.Tab.,2)</td>\n", + " <td>3461960-50</td>\n", + " <td>Geschichte der Griechischen Litteratur, von de...</td>\n", + " <td>Schoell, Friedrich 1766-1833</td>\n", + " <td>Schoell, Maximilien Samson Frederic; Pinder, M...</td>\n", + " <td>1828.0</td>\n", + " <td>1830.0</td>\n", + " <td>1828-1830</td>\n", + " <td>Berlin</td>\n", + " <td>Berlin</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>1.0</td>\n", + " <td>-1.0</td>\n", + " <td>False</td>\n", + " <td>0</td>\n", " <td></td>\n", - " <td>1402</td>\n", - " <td>14.378_582_03</td>\n", - " <td>CCCXXXIV Cinq Vol. d'Oiseaux peints en Miniat...</td>\n", " </tr>\n", " <tr>\n", - " <th>9402</th>\n", - " <td>14.378</td>\n", - " <td>582</td>\n", - " <td>Imaginum Delineatarum Collectio</td>\n", - " <td></td>\n", + " <th>20641</th>\n", + " <td>BE.9.C.4.(Vol.2-5)</td>\n", + " <td>B1657002</td>\n", + " <td>Fauna Japonica Sive Descriptio animalium, quae...</td>\n", + " <td>Siebold, Philipp Franz <<von>> 1796-1866</td>\n", + " <td>Temminck, Coenraad Jacob; Schlegel, Hermann; H...</td>\n", + " <td>1833.0</td>\n", + " <td>1850.0</td>\n", + " <td>1833-1850</td>\n", + " <td>Lugduni Batavorum</td>\n", + " <td>Leiden</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>1.0</td>\n", + " <td>-1.0</td>\n", + " <td>False</td>\n", + " <td>0</td>\n", " <td></td>\n", - " <td>1402</td>\n", - " <td>14.378_582_04</td>\n", - " <td>CCCXXXV Divers Portraits, Ceremonies, Marches ...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", - "<p>9403 rows × 8 columns</p>\n", + "<p>8 rows × 38 columns</p>\n", "</div>" ], "text/plain": [ - " 
volume page number category \n", - "0 14.376 25 Theologia \\\n", - "1 14.376 25 Theologia \n", - "2 14.376 25 Theologia \n", - "3 14.376 25 Theologia \n", - "4 14.376 25 Theologia \n", - "... ... ... ... \n", - "9398 14.378 582 Imaginum Delineatarum Collectio \n", - "9399 14.378 582 Imaginum Delineatarum Collectio \n", - "9400 14.378 582 Imaginum Delineatarum Collectio \n", - "9401 14.378 582 Imaginum Delineatarum Collectio \n", - "9402 14.378 582 Imaginum Delineatarum Collectio \n", + " Signatur Barcode \n", + "657 BE.1.N.75.(Teil.1) NaN \\\n", + "12124 BE.4.S.81 B1572545 \n", + "16795 BE.7.E.7.(2) B1782971 \n", + "17957 BE.7.T.64 NaN \n", + "18105 BE.7.V.58.(Adl) B1380793 \n", + "19065 BE.8.K.58.(Vol.Tab.,1) B1633235 \n", + "19066 BE.8.K.58.(Vol.Tab.,2) 3461960-50 \n", + "20641 BE.9.C.4.(Vol.2-5) B1657002 \n", + "\n", + " Titel \n", + "657 Prvi [Drugi] del Novoga Teslamenta [!], vatom ... \\\n", + "12124 Description des Monumens Musulmans du Cabinet ... \n", + "16795 Appendix ad historiam literariam Gulielmi cave... \n", + "17957 Journal Universel, Ou Mémoires Pour servir à l... \n", + "18105 Leben und letzte Stunden Christinä von Munk we... \n", + "19065 Geschichte der Griechischen Litteratur, von de... \n", + "19066 Geschichte der Griechischen Litteratur, von de... \n", + "20641 Fauna Japonica Sive Descriptio animalium, quae... \n", + "\n", + " Autor \n", + "657 Trubar, Primož 1508-1586 [Bearb.] \\\n", + "12124 Reinaud, Joseph Toussaint 1795-1867 \n", + "16795 Wharton, Henricus \n", + "17957 NaN \n", + "18105 NaN \n", + "19065 Schoell, Friedrich 1766-1833 \n", + "19066 Schoell, Friedrich 1766-1833 \n", + "20641 Siebold, Philipp Franz <<von>> 1796-1866 \n", + "\n", + " Mitwirkender \n", + "657 Ungnad, Hans von; Maximilian II. Heiliges Römi... \\\n", + "12124 NaN \n", + "16795 Gere, Robertus \n", + "17957 NaN \n", + "18105 NaN \n", + "19065 Schoell, Maximilien Samson Frederic; Pinder, M... \n", + "19066 Schoell, Maximilien Samson Frederic; Pinder, M... 
\n", + "20641 Temminck, Coenraad Jacob; Schlegel, Hermann; H... \n", + "\n", + " Anfang Veröffentlichungsdatum Ende Veröffentlichungsdatum \n", + "657 1563.0 NaN \\\n", + "12124 1828.0 NaN \n", + "16795 1743.0 NaN \n", + "17957 1743.0 1748.0 \n", + "18105 1757.0 NaN \n", + "19065 1828.0 1830.0 \n", + "19066 1828.0 1830.0 \n", + "20641 1833.0 1850.0 \n", + "\n", + " Veröffentlichungsdatum Veröffentlichungsort \n", + "657 1562-1563 V Tubingi [Urach] \\\n", + "12124 1828 Paris \n", + "16795 1743 Oxonii \n", + "17957 1743-1748 La Haye \n", + "18105 1757 Kopenhagen usw. \n", + "19065 1828-1830 Berlin \n", + "19066 1828-1830 Berlin \n", + "20641 1833-1850 Lugduni Batavorum \n", "\n", - " subcategory format handwritten page number \n", - "0 Textus & Versiones Sacræ Scripturæ Folio 1 \\\n", - "1 Textus & Versiones Sacræ Scripturæ Folio 1 \n", - "2 Textus & Versiones Sacræ Scripturæ Folio 1 \n", - "3 Textus & Versiones Sacræ Scripturæ Folio 1 \n", - "4 Textus & Versiones Sacræ Scripturæ Folio 1 \n", - "... ... ... ... \n", - "9398 1402 \n", - "9399 1402 \n", - "9400 1402 \n", - "9401 1402 \n", - "9402 1402 \n", + " Veröffentlichungsort (normiert) ... Formatangabe \n", + "657 Tübingen ... NaN \\\n", + "12124 Paris ... NaN \n", + "16795 Oxford ... NaN \n", + "17957 Den Haag ... NaN \n", + "18105 Kopenhagen ... NaN \n", + "19065 Berlin ... NaN \n", + "19066 Berlin ... NaN \n", + "20641 Leiden ... NaN \n", "\n", - " entry_ID entry \n", - "0 14.376_025_00 1 Biblia Sacra Polyglotta curis Cardinalis Xim... \n", - "1 14.376_025_01 2 Biblia Sacra Polyglotta Philippi II. Regis C... \n", - "2 14.376_025_02 3 Biblia Sacra Polyglotta Studio & curâ Briani... \n", - "3 14.376_025_03 4 Biblia Sacra Latina Moguntina dicta, prima o... \n", - "4 14.376_025_04 5 Biblia Sacra Latina Moguntina, editio altera... \n", - "... ... ... \n", - "9398 14.378_582_00 CCCXXXI Vn Portefeüilles contenant des Dessein... \n", - "9399 14.378_582_01 CCCXXXII Vn Recueil des Portraits peints en m... 
\n", - "9400 14.378_582_02 CCCXXXIII Dix Vol. de Plantes peintes en mini... \n", - "9401 14.378_582_03 CCCXXXIV Cinq Vol. d'Oiseaux peints en Miniat... \n", - "9402 14.378_582_04 CCCXXXV Divers Portraits, Ceremonies, Marches ... \n", + " hs. Katalogseite Handschrift hs. Katalogeintrag ID hs. Katalogeintrag \n", + "657 NaN NaN NaN \\\n", + "12124 NaN NaN NaN \n", + "16795 NaN NaN NaN \n", + "17957 NaN NaN NaN \n", + "18105 NaN NaN NaN \n", + "19065 NaN NaN NaN \n", + "19066 NaN NaN NaN \n", + "20641 NaN NaN NaN \n", "\n", - "[9403 rows x 8 columns]" + " hs. Katalog Image URL dup_title copy_from Einfache Klassifizierung \n", + "657 NaN 1.0 -1.0 False \\\n", + "12124 NaN 0.0 -1.0 False \n", + "16795 NaN 0.0 -1.0 False \n", + "17957 NaN 1.0 -1.0 False \n", + "18105 NaN 0.0 -1.0 False \n", + "19065 NaN 1.0 -1.0 False \n", + "19066 NaN 1.0 -1.0 False \n", + "20641 NaN 1.0 -1.0 False \n", + "\n", + " Komplexe Klassifizierung AC Nummer \n", + "657 1 \n", + "12124 0 \n", + "16795 0 \n", + "17957 0 \n", + "18105 0 \n", + "19065 0 \n", + "19066 0 \n", + "20641 0 \n", + "\n", + "[8 rows x 38 columns]" ] }, - "execution_count": 403, + "execution_count": 229, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "entry_df" + "BE_df[BE_df['AC Nummer'] == '']" ] }, { "cell_type": "code", - "execution_count": 404, - "id": "3a779456-4810-4428-85d7-b0c5277717d8", - "metadata": { - "tags": [] - }, + "execution_count": 230, + "id": "3f61076e-8384-4080-9003-06b9ea774fe6", + "metadata": {}, "outputs": [], "source": [ - "entry_df.to_excel('../Daten/Katalogabgleich/Einträge.xlsx')" + "matches_sig = BE_df[['AC Nummer', 'hs. Katalogeintrag ID', 'hs. Katalog Konfidenz']].dropna(subset=['hs. Katalogeintrag ID'])\n", + "entry_df['AC number'] = -1\n", + "entry_df['AC cert'] = -1\n", + "\n", + "for i, m in matches_sig.iterrows():\n", + " ac_num = m['AC Nummer']\n", + " if ac_num == '':\n", + " continue\n", + " hs_id = m['hs. Katalogeintrag ID']\n", + " hs_cert = m['hs. 
Katalog Konfidenz']\n", + " \n", + " if ' | ' in hs_id:\n", + " hs_id = hs_id.split(' | ')\n", + " hs_cert = hs_cert.split(' | ')\n", + " else:\n", + " hs_id = [hs_id]\n", + " hs_cert = [hs_cert]\n", + " \n", + " for hs_i, hs_c in zip(hs_id, hs_cert):\n", + " ind = entry_df[entry_df['entry_ID'] == hs_i].index.values[0]\n", + " \n", + " if entry_df.at[ind, 'AC number'] == -1:\n", + " entry_df.at[ind, 'AC number'] = [ac_num]\n", + " entry_df.at[ind, 'AC cert'] = [hs_c]\n", + " else:\n", + " entry_df.at[ind, 'AC number'] += [ac_num]\n", + " entry_df.at[ind, 'AC cert'] += [hs_c]" ] }, { "cell_type": "code", - "execution_count": null, - "id": "0a86cfbe-33b3-4a3a-b384-270b5228359a", + "execution_count": 248, + "id": "f83d1d13-5dee-4bf3-a06d-ed58b643adfa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "BE_df.to_excel('data/wip_BE_data/BE_df_WIP.xlsx')\n", + "entry_df.to_excel('data/wip_BE_data/entry_df_WIP.xlsx')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "a21f6db7-19c0-4bf4-a185-d94c647750a2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "BE_df = pd.read_excel('data/wip_BE_data/BE_df_WIP.xlsx', index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "3aa2e989-255e-42d9-aa9f-986772b478cc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "entry_df = pd.read_excel('data/wip_BE_data/entry_df_WIP.xlsx', index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0746de2c-3343-48b1-b921-4215f594ab79", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "entry_df.at[8510, 'AC number'] = \"['AC07010383']\"\n", + "entry_df.at[8510, 'AC cert'] = \"['sicher']\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "7a448905-e284-44b9-bdb1-38710466b341", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-1\n", + "['AC09762517']\n" + ] + } + ], + 
"source": [ + "def read_list_from_str(s):\n", + " if s != -1:\n", + " lis = eval(s)\n", + " return lis\n", + " else:\n", + " return s\n", + "\n", + "print(read_list_from_str(\"-1\"))\n", + "print(read_list_from_str(\"['AC09762517']\"))\n", + "\n", + "entry_df['AC number'] = entry_df['AC number'].apply(lambda x: read_list_from_str(x))\n", + "entry_df['AC cert'] = entry_df['AC cert'].apply(lambda x: read_list_from_str(x))" + ] + }, + { + "cell_type": "markdown", + "id": "27eda982-6931-4c31-addb-912bbe743cdf", + "metadata": {}, + "source": [ + "## Add `<idno>` tags for n signature using simple string replacement (ignoring nested occurrences)" + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "id": "02ccd717-a6e4-4754-b617-485b16b73ca7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "with open('../../digital-edition/Step1_Aufbereitung/Cod. 14.376_tei.xml', 'r') as tei_xml_input:\n", + " f1 = tei_xml_input.readlines()\n", + "\n", + "with open('../../digital-edition/Step1_Aufbereitung/Cod. 14.377_tei.xml', 'r') as tei_xml_input:\n", + " f2 = tei_xml_input.readlines()\n", + "\n", + "with open('../../digital-edition/Step1_Aufbereitung/Cod. 14.378_tei.xml', 'r') as tei_xml_input:\n", + " f3 = tei_xml_input.readlines()\n", + "\n", + "raw_files = [f1, f2, f3]" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "id": "2792c546-aafc-4224-97a3-4f8db5fc585c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "for file in raw_files:\n", + " for i, line in enumerate(file):\n", + " n_re = re.compile(' ([nN]\\.? 
?\\d{1,4})')\n", + " match = re.search(n_re, line)\n", + " if match:\n", + " n_tag = f'<idno type=\"n_signature\">{match[1]}</idno>'\n", + " file[i] = line.replace(match[1], n_tag)" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "id": "c4baa4ed-65de-4c20-99a3-275fb49ed6e9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "with open('../../digital-edition/Step1_Aufbereitung/Cod. 14.376_tei.xml', 'w') as out:\n", + " out.writelines(f1)\n", + "\n", + "with open('../../digital-edition/Step1_Aufbereitung/Cod. 14.377_tei.xml', 'w') as out:\n", + " out.writelines(f2)\n", + "\n", + "with open('../../digital-edition/Step1_Aufbereitung/Cod. 14.378_tei.xml', 'w') as out:\n", + " out.writelines(f3)" + ] + }, + { + "cell_type": "markdown", + "id": "3ac728dd-4fe8-4d8d-9049-5891b556a19e", + "metadata": {}, + "source": [ + "## Parse files as XML" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "ae4d5368-bda2-4cdf-a170-f7e0eda103c3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "with open('../../digital-edition/Step1_Aufbereitung/Cod. 14.376_tei.xml', 'r') as tei_xml_input:\n", + " content = tei_xml_input.read()\n", + " tei_1 = bs(content, \"lxml-xml\")\n", + "\n", + "with open('../../digital-edition/Step1_Aufbereitung/Cod. 14.377_tei.xml', 'r') as tei_xml_input:\n", + " content = tei_xml_input.read()\n", + " tei_2 = bs(content, \"lxml-xml\")\n", + "\n", + "with open('../../digital-edition/Step1_Aufbereitung/Cod. 
14.378_tei.xml', 'r') as tei_xml_input:\n", + " content = tei_xml_input.read()\n", + " tei_3 = bs(content, \"lxml-xml\")\n", + "\n", + "dod_ids = [51202, 51184, 51219]\n", + "cod_prefixes = {51202: '14.376', 51184: '14.377', 51219: '14.378'}\n", + "tei = {51202: tei_1, 51184: tei_2, 51219: tei_3}" + ] + }, + { + "cell_type": "markdown", + "id": "d59c6a3a-e06b-4beb-a273-9594f8473cbf", + "metadata": {}, + "source": [ + "## Add `<div type=\"knowledge_class\"></div>` tags and modify pagenumber tag" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "b73227d4-a0f1-4113-ad2c-093507aaf3a4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "for key in tei:\n", + " headers = tei[key].find_all(\"div\", type=\"head\")\n", + " for head in headers:\n", + " cat = head.find(type=\"category\")\n", + " subcat = head.find(type=\"subcategory\")\n", + " if cat is not None:\n", + " cat['subtype'] = cat['value']\n", + " del cat['value']\n", + " cat = cat.extract()\n", + " cat_lis = [cat]\n", + " if subcat is not None:\n", + " subcat['subtype'] = subcat['value']\n", + " del subcat['value']\n", + " subcat = subcat.extract()\n", + " cat_lis.append(subcat)\n", + " new_div = bs('''<div type=\"knowledge_class\"></div>''', \"lxml-xml\")\n", + " new_div.div.extend(cat_lis)\n", + " head.insert(1, new_div)\n", + "\n", + " form = head.find(type=\"format\")\n", + " if form is not None:\n", + " form['subtype'] = form['value']\n", + " del form['value']\n", + " pnum = head.find(type=\"pagenumber\")\n", + " if pnum is not None:\n", + " pnum['type'] = 'pageNum'" + ] + }, + { + "cell_type": "markdown", + "id": "46340680-bc79-4bcb-a4ab-d17c9883ba86", + "metadata": {}, + "source": [ + "## Add links to modern catalog" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "ebb4894b-76db-4146-b62a-86c30f2f8609", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def zip_without_m1(row):\n", + " if row[0] == -1:\n", + " return -1\n", + " 
else:\n", + " return list(zip(row[0], row[1]))\n", + "\n", + "entry_df['AC comb'] = entry_df[['AC number', 'AC cert']].apply(lambda x: zip_without_m1(x), axis=1)\n", + "entry_df['AC comb set'] = entry_df['AC comb'].apply(lambda x: list(set(x)) if x != -1 else -1)\n", + "entry_df['AC comb set len'] = entry_df['AC comb set'].apply(lambda x: len(x) if x != -1 else -1)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "ba37d08d-f064-4ba0-aaf5-d33c9893bb1c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "links = entry_df[entry_df['AC number'] != -1]\n", + "p1 = tei_1.find_all('pb')\n", + "p2 = tei_2.find_all('pb')\n", + "p3 = tei_3.find_all('pb')\n", + "\n", + "ps = {\n", + " \"14.376\": p1,\n", + " \"14.377\": p2,\n", + " \"14.378\": p3\n", + "}\n", + "\n", + "for i, row in links.iterrows():\n", + " entry = ps[str(row['volume'])][row['page number'] - 1].find_next('p').find_all(type='entry')[int(row['entry_ID'][-2:])]\n", + " ref_lis = []\n", + " for ref in row['AC comb set']:\n", + " ref_tag = bs(f'''<ref cert=\"{'high' if ref[1] == 'sicher' else 'low'}\" target=\"https://data.onb.ac.at/rec/{ref[0]}\" type=\"catalog\"/>''', \"lxml-xml\")\n", + " ref_lis.append(ref_tag)\n", + " entry.extend(ref_lis)" + ] + }, + { + "cell_type": "markdown", + "id": "deadc616-1dc6-4d7e-9392-5a92923026fc", + "metadata": {}, + "source": [ + "## Remove `choice`, `expan` and `abbr` tags" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "629a49b3-75c7-4e3d-b3aa-5c6165c32e06", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "for key in tei:\n", + " tags = tei[key].find_all('choice') + tei[key].find_all('expan') + tei[key].find_all('abbr')\n", + " for tag in tags:\n", + " tag.unwrap()" + ] + }, + { + "cell_type": "markdown", + "id": "b5c447ce-40c1-4f5b-b8f5-41d5d4bb5558", + "metadata": {}, + "source": [ + "## Move `note` tags to `ref` tags if there is no corresponding `ref` already" + ] + }, + { + "cell_type": 
"code", + "execution_count": 23, + "id": "ec90629a-8afb-4a65-b6bb-782830fd8bcf", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Falsche Jahresangabe\n", + " \n", + "\n", + " Guinee, altes Synonym für Pfund?\n", + " \n", + "\n", + " Falsche Nummerierung (139 wäre richtig)\n", + " \n" + ] + } + ], + "source": [ + "for key in tei:\n", + " notes = tei[key].find_all('note')\n", + " for note in notes:\n", + " re_ac = re.compile('AC\\d{8}')\n", + " match_ac = re.search(re_ac, str(note.string))\n", + " if not match_ac:\n", + " print(note.string)\n", + " else:\n", + " note_ac = match_ac[0]\n", + " entry_div = note.find_previous('div', type='entry')\n", + " refs = entry_div.find_all('ref')\n", + " if refs is not None:\n", + " is_new_ac = True\n", + " for ref in refs:\n", + " ref_ac = ref['target'].split('/')[-1]\n", + " if note_ac == ref_ac:\n", + " is_new_ac = False\n", + " break\n", + " if is_new_ac:\n", + " new_ref_tag = bs(f'''<ref cert=\"high\" target=\"https://data.onb.ac.at/rec/{note_ac}\" type=\"catalog\"/>''', \"lxml-xml\")\n", + " entry_div.append(new_ref_tag)\n", + " note.decompose()" + ] + }, + { + "cell_type": "markdown", + "id": "aa8a809c-adb7-44c0-9c8b-4a762bbeace3", + "metadata": {}, + "source": [ + "## Add information to `<idno>` tags" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "bd52a621-e46c-49d8-a5c5-a243ee85fba4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "for key in tei:\n", + " idnos = tei[key].find_all('idno')\n", + " for id_tag in idnos:\n", + " num = re.search('\\d{1,4}', id_tag.string)[0]\n", + " form = id_tag.find_previous(type='format')['subtype']\n", + " id_tag['subtype'] = f'{form}, {num}'" + ] + }, + { + "cell_type": "markdown", + "id": "d15e8da2-da31-4068-8aae-320f5534c523", + "metadata": {}, + "source": [ + "## Change `roetel`, `bleistift` and `add` tags" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + 
"id": "a4a7bc48-ec82-4b67-b0dd-055d27dd008e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "for key in tei:\n", + " adds = tei[key].find_all('add')\n", + " for tag in adds:\n", + " tag['hand'] = 'other'" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "710a147b-ea34-47ca-a381-5b477d8f12ab", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "for key in tei:\n", + " bls = tei[key].find_all('bleistift')\n", + " for tag in bls:\n", + " tag.name = 'add'\n", + " tag['hand'] = 'other'\n", + " tag['rend'] = 'bleistift'" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "e7e81cbf-e473-4ae4-b846-2aea6b2e5c81", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "for key in tei:\n", + " roet = tei[key].find_all('roetel')\n", + " for tag in roet:\n", + " tag.name = 'add'\n", + " tag['hand'] = 'other'\n", + " tag['rend'] = 'roetel'" + ] + }, + { + "cell_type": "markdown", + "id": "9ef5fcf1-74fe-4c55-95a8-a2a8ffe4e265", + "metadata": {}, + "source": [ + "## Remove `<continued/>` tags" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "de097669-9180-4510-a9ac-d215433a7fca", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "for key in tei:\n", + " contd = tei[key].find_all('continued')\n", + " for tag in contd:\n", + " tag.decompose()" + ] + }, + { + "cell_type": "markdown", + "id": "8b699159-7ac2-4ced-9cb0-4a5e7e10df34", + "metadata": {}, + "source": [ + "## Combine nested `<add>` tags" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "4687407c-d205-4aab-8c86-fafa73b997b9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "for key in tei:\n", + " adds = tei[key].find_all('add')\n", + " for tag in adds:\n", + " for child in tag.children:\n", + " if child.name == 'add':\n", + " if (('rend' not in tag.attrs) and ('rend' in child.attrs)):\n", + " tag.unwrap()\n", + " if (('rend' in tag.attrs) and ('rend' not in 
child.attrs)):\n", + " child.unwrap()" + ] + }, + { + "cell_type": "markdown", + "id": "f4a47880-5ecf-41e1-b2ea-624bfb301c99", + "metadata": {}, + "source": [ + "## Change `<num>` tag" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "32240f6b-1ecc-47d1-8f2d-0cd14f899b3d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "for key in tei:\n", + " num_lis = tei[key].find_all('num')\n", + " for tag in num_lis:\n", + " num = re.search('\\d{1,4}', tag.string)[0]\n", + " form = tag.find_previous(type='format')['subtype']\n", + " tag.name = 'idno'\n", + " tag['type'] = 'n_signature'\n", + " tag['subtype'] = f'{form}, {num}'" + ] + }, + { + "cell_type": "markdown", + "id": "4a34042b-f93a-4072-884f-66fef48d6a30", + "metadata": {}, + "source": [ + "## List all tags in the documents" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "55b131a0-58aa-4eb1-ba47-92b3bb46ffaa", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "34\n", + "['TEI', 'ab', 'add', 'bibl', 'body', 'change', 'corr', 'div', 'encodingDesc', 'facsimile', 'fileDesc', 'graphic', 'hi', 'idno', 'l', 'listChange', 'note', 'p', 'pb', 'profileDesc', 'publicationStmt', 'publisher', 'ref', 'revisionDesc', 'sic', 'sourceDesc', 'surface', 'teiHeader', 'text', 'title', 'titleStmt', 'unclear', 'xenoData', 'zone']\n" + ] + } + ], + "source": [ + "all_tags = set()\n", + "for key in tei:\n", + " tag_lis = tei[key].find_all()\n", + " for tag in tag_lis:\n", + " all_tags.add(tag.name)\n", + "\n", + "print(len(all_tags))\n", + "print(sorted(list(all_tags)))" + ] + }, + { + "cell_type": "markdown", + "id": "85b00fab-7faa-43c4-a294-b0e5febc8024", + "metadata": {}, + "source": [ + "## Add description of changes for second step" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "a50d2147-4ee5-4d6b-a950-b4aca1f5e5ca", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "<listChange>\n", + "<change when=\"2022-12-12\" who=\"#AT\">\n", + " Import of facsimiles to Transkribus\n", + " </change>\n", + "<change from=\"2022-12-13\" to=\"2023-01-26\" who=\"#AT #SM #MK\">\n", + " Manual transcription of pages to be used as Ground truth for training process in Transkribus\n", + " </change>\n", + "<change from=\"2023-02-20\" to=\"2023-02-28\" who=\"#SM\">\n", + " Automated transcription of the first three volumes Cod. 14.376-14.378\n", + " </change>\n", + "<change from=\"2023-03-01\" to=\"2023-12-13\" who=\"#AT #SM #MK #GF #PE #TG #AR\">\n", + " Correction of automated transcripts in Transkribus, adding custom tags (add, corr, bleistift, roetel, continued, comment)\n", + " </change>\n", + "<change when=\"2023-12-18\" who=\"#SM\">\n", + " Export from Transkribus to TEI-XML\n", + " </change>\n", + "<change from=\"2024-01-18\" to=\"2024-10-10\" who=\"#SM\">\n", + "Layout analysis of pages to obtain entries. Create header for all pages with knowledge classes, page number, and format specification if applicable. Added matching data for entries connecting them to the modern catalog. Add tags for n signature.\n", + "</change></listChange>\n", + "<listChange>\n", + "<change when=\"2022-12-12\" who=\"#AT\">\n", + " Import of facsimiles to Transkribus\n", + " </change>\n", + "<change from=\"2022-12-13\" to=\"2023-01-26\" who=\"#AT #SM #MK\">\n", + " Manual transcription of pages to be used as Ground truth for training process in Transkribus\n", + " </change>\n", + "<change from=\"2023-02-20\" to=\"2023-02-28\" who=\"#SM\">\n", + " Automated transcription of the first three volumes Cod. 
14.376-14.378\n", + " </change>\n", + "<change from=\"2023-03-01\" to=\"2023-12-13\" who=\"#AT #SM #MK #GF #PE #TG #AR\">\n", + " Correction of automated transcripts in Transkribus, adding custom tags (add, corr, bleistift, roetel, continued, comment)\n", + " </change>\n", + "<change when=\"2023-12-18\" who=\"#SM\">\n", + " Export from Transkribus to TEI-XML\n", + " </change>\n", + "<change from=\"2024-01-18\" to=\"2024-10-10\" who=\"#SM\">\n", + "Layout analysis of pages to obtain entries. Create header for all pages with knowledge classes, page number, and format specification if applicable. Added matching data for entries connecting them to the modern catalog. Add tags for n signature.\n", + "</change></listChange>\n", + "<listChange>\n", + "<change when=\"2022-12-12\" who=\"#AT\">\n", + " Import of facsimiles to Transkribus\n", + " </change>\n", + "<change from=\"2022-12-13\" to=\"2023-01-26\" who=\"#AT #SM #MK\">\n", + " Manual transcription of pages to be used as Ground truth for training process in Transkribus\n", + " </change>\n", + "<change from=\"2023-02-20\" to=\"2023-02-28\" who=\"#SM\">\n", + " Automated transcription of the first three volumes Cod. 14.376-14.378\n", + " </change>\n", + "<change from=\"2023-03-01\" to=\"2023-12-13\" who=\"#AT #SM #MK #GF #PE #TG #AR\">\n", + " Correction of automated transcripts in Transkribus, adding custom tags (add, corr, bleistift, roetel, continued, comment)\n", + " </change>\n", + "<change when=\"2023-12-18\" who=\"#SM\">\n", + " Export from Transkribus to TEI-XML\n", + " </change>\n", + "<change from=\"2024-01-18\" to=\"2024-10-10\" who=\"#SM\">\n", + "Layout analysis of pages to obtain entries. Create header for all pages with knowledge classes, page number, and format specification if applicable. Added matching data for entries connecting them to the modern catalog. 
Add tags for n signature.\n", + "</change></listChange>\n" + ] + } + ], + "source": [ + "for key in tei:\n", + " change = bs('''<change from=\"2024-01-18\" to=\"2024-10-10\" who=\"#SM\">\n", + "Layout analysis of pages to obtain entries. Create header for all pages with knowledge classes, page number, and format specification if applicable. Added matching data for entries connecting them to the modern catalog. Add tags for n signature.\n", + "</change>\n", + "''', \"lxml-xml\")\n", + " listChange = tei[key].listChange\n", + " listChange.append(change)\n", + " print(listChange)" + ] + }, + { + "cell_type": "markdown", + "id": "ed89defa-7231-42e7-bab8-2b235eee5b24", + "metadata": {}, + "source": [ + "## Export TEI_XML files to Step 2" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "ee793bf1-5a43-4715-8b2b-b5780d6fd8b3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "with open('../../digital-edition/Step2_Matching/Cod. 14.376_tei.xml', 'w') as tei_xml_output:\n", + " tei_xml_output.write(tei_1.prettify(formatter='minimal'))\n", + "with open('../../digital-edition/Step2_Matching/Cod. 14.377_tei.xml', 'w') as tei_xml_output:\n", + " tei_xml_output.write(tei_2.prettify(formatter='minimal'))\n", + "with open('../../digital-edition/Step2_Matching/Cod. 14.378_tei.xml', 'w') as tei_xml_output:\n", + " tei_xml_output.write(tei_3.prettify(formatter='minimal'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7d6908c-6008-49f7-b854-3b3f66c34986", "metadata": {}, "outputs": [], "source": []