diff --git a/Daten/Vorhersagen/WIP_final_BE_3.xlsx b/Daten/Vorhersagen/WIP_final_BE_3.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..1052487927814fa10aae0202113c456fd09eed02 Binary files /dev/null and b/Daten/Vorhersagen/WIP_final_BE_3.xlsx differ diff --git a/Notebooks/Albertina_Bestaende.ipynb b/Notebooks/Albertina_Bestaende.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..8c2d83d16eb51be34fce0f0a1d492a20b187638e --- /dev/null +++ b/Notebooks/Albertina_Bestaende.ipynb @@ -0,0 +1,530 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 11, + "id": "4234a6f9-d208-4e5c-ba1d-114470811b67", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import re\n", + "import numpy as np\n", + "from tqdm.notebook import tqdm\n", + "import matplotlib.pyplot as plt\n", + "import requests\n", + "import json\n", + "from lxml import etree\n", + "from IPython.display import display\n", + "\n", + "pd.set_option('display.max_colwidth', None)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "f1e3a8e5-dee4-4ed1-ac72-5bf1dc2d868b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "onb_sig = [13, 15, 16, 17, 29, 43, 51, 52, 65, 81, 83]\n", + "\n", + "alb_df = pd.read_excel('../Daten/Katalog/BibliothecaEugeniana_StandortAlbertina.xlsx')\n", + "alb_df['ÖNB Signatur'] = False\n", + "\n", + "for ind in onb_sig:\n", + " alb_df.at[ind, 'ÖNB Signatur'] = True\n", + " \n", + "onb_baende = alb_df[alb_df['ÖNB Signatur']]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "9a59727f-3174-40b4-86fe-651509359c6e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "lang_data = pd.read_csv('data/iso-639-3.tab', sep='\\t')\n", + "\n", + "def english_language_from_code(lang_code):\n", + " find_by_Id = lang_data[lang_data['Id'] == lang_code]\n", + " find_by_Part2b = lang_data[lang_data['Part2b'] == lang_code]\n", + " if 
len(find_by_Id):\n", + " name = find_by_Id['Ref_Name'].values[0]\n", + " elif len(find_by_Part2b):\n", + " name = find_by_Part2b['Ref_Name'].values[0]\n", + " else:\n", + " name = ''\n", + " return name\n", + "\n", + "ns = {\n", + " 'srw': 'http://www.loc.gov/zing/srw/',\n", + " 'marc': 'http://www.loc.gov/MARC21/slim'\n", + "}\n", + "\n", + "def extract_catalog_data_from_signature(sig):\n", + " metadata_lis = []\n", + " sru = f'https://obv-at-oenb.alma.exlibrisgroup.com/view/sru/43ACC_ONB?version=1.2&query=alma.accessionNumber%3D%22{sig}%22&operation=searchRetrieve'\n", + " sru_request = requests.get(sru)\n", + " marcxml = sru_request.content\n", + " tree = etree.fromstring(marcxml)\n", + " records = tree.xpath('.//marc:record', namespaces=ns)\n", + " for rec in records:\n", + " metadata = {}\n", + " marc_paths = {\n", + " 'Titel': './/marc:datafield[@tag=\"245\"]/marc:subfield[@code=\"a\"]',\n", + " 'Autor': './/marc:datafield[@tag=\"100\"]/marc:subfield[@code=\"a\"]',\n", + " 'Mitwirkender': './/marc:datafield[@tag=\"700\"]/marc:subfield[@code=\"a\"]',\n", + " 'Signatur': './/marc:datafield[@tag=\"AVA\"]/marc:subfield[@code=\"d\"]',\n", + " 'Veröffentlichungsdatum': './/marc:datafield[@tag=\"264\"]/marc:subfield[@code=\"c\"]',\n", + " 'Veröffentlichungsort': './/marc:datafield[@tag=\"264\"]/marc:subfield[@code=\"a\"]',\n", + " 'Sprache': './/marc:datafield[@tag=\"041\"]/marc:subfield[@code=\"a\"]'\n", + " }\n", + "\n", + " for key, path in marc_paths.items():\n", + " values = [elm.text for elm in rec.xpath(path, namespaces=ns)]\n", + " if key == 'Sprache':\n", + " values = [english_language_from_code(val) for val in values]\n", + "\n", + " metadata[key] = '; '.join(values)\n", + " # metadata['Signatur'] = sig\n", + " metadata_lis.append(metadata)\n", + " return metadata_lis" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "d336f2e0-87c7-40c4-ba62-30fae7d21ceb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + 
"onb_baende_metadata = [extract_catalog_data_from_signature(sig) for sig in onb_baende['Signatur (rot: ÖNB)']]" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "6fd96da5-7b09-426a-83ff-56ba41de5af3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def is_exact_signature(cand):\n", + " sigs = onb_baende['Signatur (rot: ÖNB)'].values\n", + " return cand in sigs" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "384857bb-2720-4699-92b3-a26e0e21bf9e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "onb_md = []\n", + "\n", + "for md in onb_baende_metadata:\n", + " filtered_md = [d for d in md if is_exact_signature(d['Signatur'])]\n", + " onb_md.append(filtered_md)" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "5632416d-ad96-4e1f-8563-11bec3025f60", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[[{'Titel': 'Aigentliche wahrhaffte Delineatio vnnd Abbildung aller fürstl. Auufzug vnd Ritterspilen in der fürstlichen Hauptstadt Stuttgart. (Tübingen 1618).',\n", + " 'Autor': 'Hulsen, Esaias von',\n", + " 'Mitwirkender': '',\n", + " 'Signatur': '24.D.6',\n", + " 'Veröffentlichungsdatum': '',\n", + " 'Veröffentlichungsort': 'Tübingen',\n", + " 'Sprache': 'German'},\n", + " {'Titel': 'Aigentliche wahrhaffte Delineatio vund Abbildung aller fürstlichen Auffzüg vnd Ritterspilen ... in der fürstlichen Hauptstatt Stuttgart (etc.)',\n", + " 'Autor': '',\n", + " 'Mitwirkender': '',\n", + " 'Signatur': '24.D.6',\n", + " 'Veröffentlichungsdatum': '1618',\n", + " 'Veröffentlichungsort': 'Tübingen',\n", + " 'Sprache': 'German'}],\n", + " [],\n", + " [],\n", + " [],\n", + " [{'Titel': \"Pompe funebre du tres-pieux et tres puissant prince Albert, archiduc d'Autriche (etc.) representee au naturel en tailles 12, dessinees par Jacques Francqvart et gravees par Corneille Galle. Avec unde diss. 
historique et morale d'Eryce Puteanus (etc.)\",\n", + " 'Autor': 'Francquart, Jacques',\n", + " 'Mitwirkender': 'Galle, Corneille; Puteanus, Erycius',\n", + " 'Signatur': '24.D.7',\n", + " 'Veröffentlichungsdatum': '1729',\n", + " 'Veröffentlichungsort': 'Bruxelles',\n", + " 'Sprache': 'French'}],\n", + " [{'Titel': 'Funera Caroli V. (tit. fict.)',\n", + " 'Autor': '',\n", + " 'Mitwirkender': '',\n", + " 'Signatur': '24.D.8',\n", + " 'Veröffentlichungsdatum': '1619',\n", + " 'Veröffentlichungsort': 'Hagae-Comit',\n", + " 'Sprache': ''}],\n", + " [{'Titel': 'Certamen equestre caeteraque solemnia. Holmiae Suecorum av 1672 celebrata cum Carolus XI. omnium cum applausu avati regnii regimen capescret. s. Das grosse Carrosel (Carrousel) und prächtige Ring-Rännen nebst dem, was sonsten fürtreffliches zu sehen war, alß König ... Carl der Elffte die Regierung seines Väterlichen Erb. Königreichs anno 1672 den 18. Dezembris in seiner kgl. Residenz zu Stockholm antratt.',\n", + " 'Autor': '',\n", + " 'Mitwirkender': '',\n", + " 'Signatur': '24.D.4',\n", + " 'Veröffentlichungsdatum': '1672',\n", + " 'Veröffentlichungsort': 'Stockholm',\n", + " 'Sprache': 'German'},\n", + " {'Titel': 'Das große Carrosel (Carroussel) und prächtige Ring-Rännen nebst dem, was sonsten fürtreffliches zu sehen war, alß ... König ... Carl der Eylffte die Regierung ... antratt',\n", + " 'Autor': '',\n", + " 'Mitwirkender': '',\n", + " 'Signatur': '24.D.4',\n", + " 'Veröffentlichungsdatum': '1672',\n", + " 'Veröffentlichungsort': 'Stockholm',\n", + " 'Sprache': 'German'}],\n", + " [{'Titel': 'Aufzüge und Ritterspiele, so bey des duchlauchtigsten, hochgebornen Fürsten und Herrn Herrn Friedrich Wilhelms Hertzogen zu Sachsen ... gehalten worden auf S. H. durchl. 
Residentz Vestung zu Altenburg in Monat Junio 1654',\n", + " 'Autor': '',\n", + " 'Mitwirkender': '',\n", + " 'Signatur': '24.D.5',\n", + " 'Veröffentlichungsdatum': '1658',\n", + " 'Veröffentlichungsort': 'Schleßwig',\n", + " 'Sprache': 'German'}],\n", + " [],\n", + " [],\n", + " [{'Titel': \"Feste fatte sopra l'Arno in Fiorenza per le nozze del Sno Pre di Toscana l'anno 1608 disegnata da Giulio Parigi.\",\n", + " 'Autor': 'Parigi, Giulio',\n", + " 'Mitwirkender': '',\n", + " 'Signatur': '839505-E',\n", + " 'Veröffentlichungsdatum': '1608',\n", + " 'Veröffentlichungsort': 'Florenz',\n", + " 'Sprache': 'Italian'}]]" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "onb_md" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "0fac2293-33f2-41dc-b6da-fc17cb4c2209", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Standort</th>\n", + " <th>Regal</th>\n", + " <th>Systematik</th>\n", + " <th>Signatur (rot: ÖNB)</th>\n", + " <th>Kurztitel</th>\n", + " <th>Anmerkungen</th>\n", + " <th>ÖNB Signatur</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>UG3 Depot</td>\n", + " <td>NaN</td>\n", + " <td>Galeriewerke</td>\n", + " <td>24.D.6</td>\n", + " <td>Festa eqvestria stvtgardiae celebrat</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>UG3 Depot</td>\n", + " <td>NaN</td>\n", + " <td>Galeriewerke</td>\n", + " <td>587972-F</td>\n", + " <td>Architet 
di L. B. Alberti, Tom. I</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>UG3 Depot</td>\n", + " <td>NaN</td>\n", + " <td>Galeriewerke</td>\n", + " <td>587972-F</td>\n", + " <td>Architetv di L. B. Alberti, Tom. II</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>UG3 Depot</td>\n", + " <td>NaN</td>\n", + " <td>Galeriewerke</td>\n", + " <td>587972-F</td>\n", + " <td>Architet di L. B. Alberti, Tom. III</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>UG3 Depot</td>\n", + " <td>NaN</td>\n", + " <td>Galeriewerke</td>\n", + " <td>24.D.7</td>\n", + " <td>Pompe funèbre du très-pieux et très-puissant Prince Albert</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>43</th>\n", + " <td>UG3 Depot</td>\n", + " <td>NaN</td>\n", + " <td>Galeriewerke</td>\n", + " <td>24.D.8</td>\n", + " <td>Fvner Car. V. Imper</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>51</th>\n", + " <td>UG3 Depot</td>\n", + " <td>NaN</td>\n", + " <td>Galeriewerke</td>\n", + " <td>24.D.4</td>\n", + " <td>Certam eqvest Caroli XI. holmiae</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>52</th>\n", + " <td>UG3 Depot</td>\n", + " <td>NaN</td>\n", + " <td>Galeriewerke</td>\n", + " <td>24.D.5</td>\n", + " <td>Certam eqvestr in Saxonia</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>65</th>\n", + " <td>UG3 Depot</td>\n", + " <td>NaN</td>\n", + " <td>Galeriewerke</td>\n", + " <td>60.120-E</td>\n", + " <td>Academie de le spee par G. Thibvat</td>\n", + " <td>Etikett: Nicht ausheben! 
Umsignieren auf MF 3106</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>81</th>\n", + " <td>UG5 Depot</td>\n", + " <td>95.0</td>\n", + " <td>Illustrierte Bücher Thulden</td>\n", + " <td>392.023-E</td>\n", + " <td>Travavx d'Vlisse par Theodore van Thvlden</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>83</th>\n", + " <td>UG3 Depot</td>\n", + " <td>NaN</td>\n", + " <td>Galeriewerke</td>\n", + " <td>839505-E</td>\n", + " <td>Feste fatte sopral arno in fior</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Standort Regal Systematik Signatur (rot: ÖNB) \n", + "13 UG3 Depot NaN Galeriewerke 24.D.6 \\\n", + "15 UG3 Depot NaN Galeriewerke 587972-F \n", + "16 UG3 Depot NaN Galeriewerke 587972-F \n", + "17 UG3 Depot NaN Galeriewerke 587972-F \n", + "29 UG3 Depot NaN Galeriewerke 24.D.7 \n", + "43 UG3 Depot NaN Galeriewerke 24.D.8 \n", + "51 UG3 Depot NaN Galeriewerke 24.D.4 \n", + "52 UG3 Depot NaN Galeriewerke 24.D.5 \n", + "65 UG3 Depot NaN Galeriewerke 60.120-E \n", + "81 UG5 Depot 95.0 Illustrierte Bücher Thulden 392.023-E \n", + "83 UG3 Depot NaN Galeriewerke 839505-E \n", + "\n", + " Kurztitel \n", + "13 Festa eqvestria stvtgardiae celebrat \\\n", + "15 Architet di L. B. Alberti, Tom. I \n", + "16 Architetv di L. B. Alberti, Tom. II \n", + "17 Architet di L. B. Alberti, Tom. III \n", + "29 Pompe funèbre du très-pieux et très-puissant Prince Albert \n", + "43 Fvner Car. V. Imper \n", + "51 Certam eqvest Caroli XI. holmiae \n", + "52 Certam eqvestr in Saxonia \n", + "65 Academie de le spee par G. 
Thibvat \n", + "81 Travavx d'Vlisse par Theodore van Thvlden \n", + "83 Feste fatte sopral arno in fior \n", + "\n", + " Anmerkungen ÖNB Signatur \n", + "13 NaN True \n", + "15 NaN True \n", + "16 NaN True \n", + "17 NaN True \n", + "29 NaN True \n", + "43 NaN True \n", + "51 NaN True \n", + "52 NaN True \n", + "65 Etikett: Nicht ausheben! Umsignieren auf MF 3106 True \n", + "81 NaN True \n", + "83 NaN True " + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "onb_baende" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "dac1e7bd-793c-4841-876c-09ce04c66e33", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Standort</th>\n", + " <th>Regal</th>\n", + " <th>Systematik</th>\n", + " <th>Signatur (rot: ÖNB)</th>\n", + " <th>Kurztitel</th>\n", + " <th>Anmerkungen</th>\n", + " <th>ÖNB Signatur</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>93</th>\n", + " <td>UG3 Depot</td>\n", + " <td>NaN</td>\n", + " <td>Galeriewerke</td>\n", + " <td>K.S.E-201</td>\n", + " <td>Bidloo Corpor Human Anatomia</td>\n", + " <td>NaN</td>\n", + " <td>False</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Standort Regal Systematik Signatur (rot: ÖNB) \n", + "93 UG3 Depot NaN Galeriewerke K.S.E-201 \\\n", + "\n", + " Kurztitel Anmerkungen ÖNB Signatur \n", + "93 Bidloo Corpor Human Anatomia NaN False " + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + 
], + "source": [ + "alb_df[alb_df['Kurztitel'].str.contains('Human')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a869f088-af09-462e-ad10-0d201ee982a6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Notebooks/String_matching.ipynb b/Notebooks/String_matching.ipynb index de8428e3b3d8e30a0b4454b01845a84ee1313966..b3aa06cc5cf5ed438de913e387d6adfc1849c6e9 100644 --- a/Notebooks/String_matching.ipynb +++ b/Notebooks/String_matching.ipynb @@ -115,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 1189, "id": "c0f4a42a-7e21-41e8-833c-2dd2f9d1985e", "metadata": { "tags": [] @@ -125,7 +125,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "7\n" + "1\n" ] }, { @@ -162,141 +162,36 @@ " </thead>\n", " <tbody>\n", " <tr>\n", - " <th>3207</th>\n", - " <td>14.377</td>\n", - " <td>72</td>\n", - " <td>Poëtica</td>\n", - " <td>Poëtæ Gallici unà cum Dramaticis</td>\n", - " <td>Folio</td>\n", - " <td>480</td>\n", - " <td>14.377_072_00</td>\n", - " <td>912 Les oeuvres Poëtiques du Sr. Rousseau, vide Codd. Mss.</td>\n", - " <td>Les oeuvres Poetiques du Sr Rousseau vide Codd Mss</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3253</th>\n", + " <th>4652</th>\n", " <td>14.377</td>\n", - " <td>77</td>\n", - " <td>Poëtica</td>\n", - " <td>Poëtæ Gallici unà cum Dramaticis</td>\n", - " <td>Quarto</td>\n", - " <td>485</td>\n", - " <td>14.377_077_02</td>\n", - " <td>Les Oeuvres Poëtiques du S. Rousseau. 2 Vol. grand Pap. Londres. 1723. Tonson. n. 638. NB. V. 
inter illos in fol.</td>\n", - " <td>Les Oeuvres Poetiques du S Rousseau 2 Vol grand Pap Londres 1723 Tonson n 638 NB V inter illos in fol</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3402</th>\n", - " <td>14.377</td>\n", - " <td>90</td>\n", - " <td>Poëtica</td>\n", - " <td>Poëtæ Gallici unà cum Dramaticis</td>\n", + " <td>231</td>\n", + " <td>Philologia</td>\n", + " <td>Epistolographi Gallici, & Italici</td>\n", " <td>Octavo und kleiner</td>\n", - " <td>494</td>\n", - " <td>14.377_090_07</td>\n", - " <td>Les Oeuvres Poëtiques du S. Rouseau, avec l'anti-Rousseau par (Gâcon ) 12.° 3 Vol. Rotterdam. 1712. n. 1231</td>\n", - " <td>Les Oeuvres Poetiques du S Rouseau avec l'anti-Rousseau par Gacon 12° 3 Vol Rotterdam 1712 n 1231</td>\n", - " </tr>\n", - " <tr>\n", - " <th>8930</th>\n", - " <td>14.378</td>\n", - " <td>513</td>\n", - " <td>Codices Manuscripti</td>\n", - " <td>NaN</td>\n", - " <td>Folio</td>\n", - " <td>1341</td>\n", - " <td>14.378_513_02</td>\n", - " <td>149 Les Oeuvres du S. Rousseau. Mss.</td>\n", - " <td>Les Oeuvres du S Rousseau Mss</td>\n", - " </tr>\n", - " <tr>\n", - " <th>9014</th>\n", - " <td>14.378</td>\n", - " <td>522</td>\n", - " <td>Codices Manuscripti</td>\n", - " <td>NaN</td>\n", - " <td>Quarto</td>\n", - " <td>1348</td>\n", - " <td>14.378_522_02</td>\n", - " <td>50 Les Epigrammes du S.r Rousseau approuvées par l'autheur. Mss. ce sont ? set aences jusque au mois de Fevrier 1710. n. CCVI</td>\n", - " <td>Les Epigrammes du Sr Rousseau approuvees par l'autheur Mss ce sont ? set aences jusque au mois de Fevrier 1710 n CCVI</td>\n", - " </tr>\n", - " <tr>\n", - " <th>9044</th>\n", - " <td>14.378</td>\n", - " <td>524</td>\n", - " <td>Codices Manuscripti</td>\n", - " <td>NaN</td>\n", - " <td>Quarto</td>\n", - " <td>1348*</td>\n", - " <td>14.378_524_07</td>\n", - " <td>80. Odes sacrées ou cantiques de Pseaumes par le S. Rousseau, Mspt. n. 
CCVII.</td>\n", - " <td>Odes sacrees ou cantiques de Pseaumes par le S Rousseau Mspt n CCVII</td>\n", - " </tr>\n", - " <tr>\n", - " <th>9065</th>\n", - " <td>14.378</td>\n", - " <td>530</td>\n", - " <td>Codices Manuscripti</td>\n", - " <td>NaN</td>\n", - " <td>Octavo und kleiner</td>\n", - " <td>1352</td>\n", - " <td>14.378_530_02</td>\n", - " <td>28 89. Rousseau Sr. Epigrammes approuvées par lui memne. chart: Sæc: XVIII.</td>\n", - " <td>89 Rousseau Sr Epigrammes approuvees par lui memne chart Saec XVIII</td>\n", + " <td>627</td>\n", + " <td>14.377_231_03</td>\n", + " <td>Lettres edifiantes & curieuses des Missionaires Iesuites. 12.° 21 Vol. Paris. 1717. & Ann. Suivantes n. 1622.</td>\n", + " <td>Lettres edifiantes & curieuses des Missionaires Iesuites 12° 21 Vol Paris 1717 & Ann Suivantes n 1622</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " volume page number category \n", - "3207 14.377 72 Poëtica \\\n", - "3253 14.377 77 Poëtica \n", - "3402 14.377 90 Poëtica \n", - "8930 14.378 513 Codices Manuscripti \n", - "9014 14.378 522 Codices Manuscripti \n", - "9044 14.378 524 Codices Manuscripti \n", - "9065 14.378 530 Codices Manuscripti \n", - "\n", - " subcategory format \n", - "3207 Poëtæ Gallici unà cum Dramaticis Folio \\\n", - "3253 Poëtæ Gallici unà cum Dramaticis Quarto \n", - "3402 Poëtæ Gallici unà cum Dramaticis Octavo und kleiner \n", - "8930 NaN Folio \n", - "9014 NaN Quarto \n", - "9044 NaN Quarto \n", - "9065 NaN Octavo und kleiner \n", + " volume page number category subcategory \n", + "4652 14.377 231 Philologia Epistolographi Gallici, & Italici \\\n", "\n", - " handwritten page number entry_ID \n", - "3207 480 14.377_072_00 \\\n", - "3253 485 14.377_077_02 \n", - "3402 494 14.377_090_07 \n", - "8930 1341 14.378_513_02 \n", - "9014 1348 14.378_522_02 \n", - "9044 1348* 14.378_524_07 \n", - "9065 1352 14.378_530_02 \n", + " format handwritten page number entry_ID \n", + "4652 Octavo und kleiner 627 14.377_231_03 
\\\n", "\n", - " entry \n", - "3207 912 Les oeuvres Poëtiques du Sr. Rousseau, vide Codd. Mss. \\\n", - "3253 Les Oeuvres Poëtiques du S. Rousseau. 2 Vol. grand Pap. Londres. 1723. Tonson. n. 638. NB. V. inter illos in fol. \n", - "3402 Les Oeuvres Poëtiques du S. Rouseau, avec l'anti-Rousseau par (Gâcon ) 12.° 3 Vol. Rotterdam. 1712. n. 1231 \n", - "8930 149 Les Oeuvres du S. Rousseau. Mss. \n", - "9014 50 Les Epigrammes du S.r Rousseau approuvées par l'autheur. Mss. ce sont ? set aences jusque au mois de Fevrier 1710. n. CCVI \n", - "9044 80. Odes sacrées ou cantiques de Pseaumes par le S. Rousseau, Mspt. n. CCVII. \n", - "9065 28 89. Rousseau Sr. Epigrammes approuvées par lui memne. chart: Sæc: XVIII. \n", + " entry \n", + "4652 Lettres edifiantes & curieuses des Missionaires Iesuites. 12.° 21 Vol. Paris. 1717. & Ann. Suivantes n. 1622. \\\n", "\n", - " cleaned entry \n", - "3207 Les oeuvres Poetiques du Sr Rousseau vide Codd Mss \n", - "3253 Les Oeuvres Poetiques du S Rousseau 2 Vol grand Pap Londres 1723 Tonson n 638 NB V inter illos in fol \n", - "3402 Les Oeuvres Poetiques du S Rouseau avec l'anti-Rousseau par Gacon 12° 3 Vol Rotterdam 1712 n 1231 \n", - "8930 Les Oeuvres du S Rousseau Mss \n", - "9014 Les Epigrammes du Sr Rousseau approuvees par l'autheur Mss ce sont ? 
set aences jusque au mois de Fevrier 1710 n CCVI \n", - "9044 Odes sacrees ou cantiques de Pseaumes par le S Rousseau Mspt n CCVII \n", - "9065 89 Rousseau Sr Epigrammes approuvees par lui memne chart Saec XVIII " + " cleaned entry \n", + "4652 Lettres edifiantes & curieuses des Missionaires Iesuites 12° 21 Vol Paris 1717 & Ann Suivantes n 1622 " ] }, - "execution_count": 114, + "execution_count": 1189, "metadata": {}, "output_type": "execute_result" } @@ -305,7 +200,7 @@ "def search_in_entry(df, string):\n", " return df[df['cleaned entry'].str.contains(string)]\n", "\n", - "info = search_in_entry(search_in_entry(entry_df, 'Rousseau'), '')\n", + "info = search_in_entry(search_in_entry(entry_df, 'edifiantes'), '')\n", "print(len(info))\n", "info" ] @@ -344,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 1176, "id": "20facf5d-d609-498e-9907-7ebdffc09e15", "metadata": { "tags": [] @@ -354,55 +249,55 @@ "name": "stdout", "output_type": "stream", "text": [ - "BE.1.M.2.(2)\n", - "B1762317\n", - "Appendix Ignatiana in qua continentur Ignatii epistolae germinae Ignatiique martyrium a Philone, Agathopode et aliis (etc.)\n", - "Usher, Jacobus\n", - "Londini\n", - "1647\n", - "nan\n" + "BE.5.V.45\n", + "+Z175390902\n", + "Principes de philosophie ou preuves naturelles de l'existence de Dieu et de l'immortalite de l'ame\n", + "Genest, Charles-Claude\n", + "Paris\n", + "1716\n", + "B\n" ] }, { "data": { "text/plain": [ - "Signatur BE.1.M.2.(2)\n", - "Barcode B1762317\n", - "Titel Appendix Ignatiana in qua continentur Ignatii epistolae germinae Ignatiique martyrium a Philone, Agathopode et aliis (etc.)\n", - "Autor Usher, Jacobus\n", - "Mitwirkender NaN\n", - "Anfang Veröffentlichungsdatum 1647.0\n", - "Ende Veröffentlichungsdatum NaN\n", - "Veröffentlichungsdatum 1647\n", - "Veröffentlichungsort Londini\n", - "Veröffentlichungsort (normiert) NaN\n", - "Sprache Latin\n", - "Schlagwörter Ignatius--Antiochenus---110\n", - "Schlagwörter (mit GND) 
Ignatius--Antiochenus---110$Dp--(DE-588)118555340;AT-OBV--ONB-AK\n", - "Vorbesitzer NaN\n", - "Typ NaN\n", - "Bemerkungen NaN\n", - "Gültiger Barcode NaN\n", - "Dateiname NaN\n", - "Wappenklassifizierung NaN\n", - "p_A NaN\n", - "p_B NaN\n", - "p_C NaN\n", - "p_N NaN\n", - "Farbklassifizierung NaN\n", - "p_blue NaN\n", - "p_red NaN\n", - "p_yellow NaN\n", - "Name: 463, dtype: object" + "Signatur BE.5.V.45\n", + "Barcode +Z175390902\n", + "Titel Principes de philosophie ou preuves naturelles de l'existence de Dieu et de l'immortalite de l'ame\n", + "Autor Genest, Charles-Claude\n", + "Mitwirkender NaN\n", + "Anfang Veröffentlichungsdatum 1716.0\n", + "Ende Veröffentlichungsdatum NaN\n", + "Veröffentlichungsdatum 1716\n", + "Veröffentlichungsort Paris\n", + "Veröffentlichungsort (normiert) NaN\n", + "Sprache French\n", + "Schlagwörter Gedicht; Gottesbeweis; Seele; Unsterblichkeit; Belletristische Darstellung; Lyrik; Französisch\n", + "Schlagwörter (mit GND) Gottesbeweis$Ds--(DE-588)4021668-8;Belletristische Darstellung$Af;AT-OBV--ONB-AK;Seele$Ds--(DE-588)4054146-0;Unsterblichkeit$Ds--(DE-588)4061874-2;Belletristische Darstellung$Af;AT-OBV--ONB-AK;Lyrik$Ds--(DE-588)4036774-5;Französisch$Ds--(DE-588)4113615-9;AT-OBV--ONB-AK\n", + "Vorbesitzer NaN\n", + "Typ Gedicht--bellobv\n", + "Bemerkungen NaN\n", + "Gültiger Barcode Z175390902\n", + "Dateiname Z175390902_00000001.jpg\n", + "Wappenklassifizierung B\n", + "p_A 0.000108\n", + "p_B 0.99959\n", + "p_C 0.000184\n", + "p_N 0.000118\n", + "Farbklassifizierung red\n", + "p_blue 0.000051\n", + "p_red 0.999859\n", + "p_yellow 0.00009\n", + "Name: 14220, dtype: object" ] }, - "execution_count": 11, + "execution_count": 1176, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "BE_entry = BE_df.loc[463]\n", + "BE_entry = BE_df.loc[14220]\n", "print(BE_entry['Signatur'])\n", "print(BE_entry['Barcode'])\n", "print(BE_entry['Titel'])\n", @@ -3087,8 +2982,647 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 156, "id": "a5507d98-a6be-4108-a70f-c09404f8de79", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Zwischenstand speichern\n", + "comp_BE_no_dup.to_excel('../Daten/Vorhersagen/WIP_final_BE_2.xlsx')" + ] + }, + { + "cell_type": "code", + "execution_count": 724, + "id": "315500ea-0225-4b2c-b568-082f5250dd4e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "comp_BE_no_dup = pd.read_excel('../Daten/Vorhersagen/WIP_final_BE_2.xlsx', index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 772, + "id": "ef48a633-fb96-4ae2-b024-9f8e6f7beb01", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Signatur', 'Barcode', 'Titel', 'Autor', 'Mitwirkender',\n", + " 'Anfang Veröffentlichungsdatum', 'Ende Veröffentlichungsdatum',\n", + " 'Veröffentlichungsdatum', 'Veröffentlichungsort',\n", + " 'Veröffentlichungsort (normiert)', 'Sprache', 'Dateiname',\n", + " 'Wappenklassifizierung', 'p_A', 'p_B', 'p_C', 'p_N',\n", + " 'Farbklassifizierung', 'p_blue', 'p_red', 'p_yellow', 'IIIF Manifest',\n", + " 'hs. Katalog', 'hs. Katalog Konfidenz', 'hs. Katalogband',\n", + " 'hs. Katalogseite Digitalisat', 'Wissensklasse', 'Wissensunterklasse',\n", + " 'Formatangabe', 'hs. Katalogseite Handschrift', 'hs. Katalogeintrag ID',\n", + " 'hs. Katalogeintrag', 'hs. 
Katalog Image URL', 'dup_title',\n", + " 'copy_from'],\n", + " dtype='object')" + ] + }, + "execution_count": 772, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comp_BE_no_dup.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 870, + "id": "ffc781e2-21f4-48c8-9e01-f57f677b11a6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "22614 https://iiif.onb.ac.at/presentation/ABO/Z22196790X/manifest\n", + "897 https://iiif.onb.ac.at/presentation/ABO/Z165135008/manifest\n", + "898 https://iiif.onb.ac.at/presentation/ABO/Z165135100/manifest\n", + "900 https://iiif.onb.ac.at/presentation/ABO/Z165135203/manifest\n", + "901 https://iiif.onb.ac.at/presentation/ABO/Z165135306/manifest\n", + " ... \n", + "22236 https://iiif.onb.ac.at/presentation/ABO/Z200809106/manifest\n", + "22237 https://iiif.onb.ac.at/presentation/ABO/Z200809209/manifest\n", + "22238 https://iiif.onb.ac.at/presentation/ABO/Z200809301/manifest\n", + "22239 https://iiif.onb.ac.at/presentation/ABO/Z200809404/manifest\n", + "22240 https://iiif.onb.ac.at/presentation/ABO/Z200809507/manifest\n", + "Name: IIIF Manifest, Length: 3364, dtype: object" + ] + }, + "execution_count": 870, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comp_BE_no_dup[comp_BE_no_dup['hs. 
Katalogeintrag ID'].isin(entry_df[entry_df['format'] == 'Octavo und kleiner']['entry_ID'])]['IIIF Manifest'].dropna()" + ] + }, + { + "cell_type": "markdown", + "id": "67d13156-4cc6-4db4-92f3-41031203ce7c", + "metadata": {}, + "source": [ + "# Cluster im Regal ermitteln" + ] + }, + { + "cell_type": "code", + "execution_count": 882, + "id": "3c414e9c-021d-4172-87ce-32fb6a08c9a6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7950\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_4673/1024163498.py:6: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " has_hw_catalog['first_num'] = has_hw_catalog['control'].apply(lambda x: int(match.search(str(x))[0]))\n" + ] + } + ], + "source": [ + "s = '7950, 9247'\n", + "\n", + "match = re.compile('\\d{1,4}\\Z|\\d{1,4}|\\d{1,4}\\?,')\n", + "print(match.search(s)[0])\n", + "\n", + "has_hw_catalog['first_num'] = has_hw_catalog['control'].apply(lambda x: int(match.search(str(x))[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 905, + "id": "cc5e0d88-a88e-4926-8f1a-65d0415ea3dc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "no_sort_m = has_hw_catalog['first_num'].values" + ] + }, + { + "cell_type": "code", + "execution_count": 1060, + "id": "acff6cc5-e585-47b2-a91f-f1191d47de04", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def get_longest_sequence(lis):\n", + " differences = [abs(lis[n+1] - lis[n]) for n in range(len(lis) - 1)]\n", + " ids = [entry_df.loc[n]['entry_ID'] for n in lis]\n", + " \n", + " is_nearby = [n < 10 for n in differences]\n", + " \n", + " c = 0\n", + " seq_length = []\n", + " 
for i, num in enumerate(is_nearby):\n", + " if num:\n", + " c += 1\n", + " else:\n", + " c = 0\n", + " seq_length.append((i, c))\n", + " \n", + " sort_seq = sorted(seq_length, key=lambda x: x[1], reverse=True)\n", + " places = pd.Series([n[0] - n[1] + 1 for n in sort_seq], name='location')\n", + " places_no_dup = places.drop_duplicates()\n", + " seq_df = pd.DataFrame(places_no_dup)\n", + " seq_df['length'] = [sort_seq[x][1] for x in seq_df.index]\n", + " seq_df['first_BE_location'] = seq_df['location'].apply(lambda x: has_hw_catalog.iloc[x]['input_id'])\n", + " seq_df['last_location'] = seq_df['location'] + seq_df['length']\n", + " seq_df['last_BE_location'] = seq_df['last_location'].apply(lambda x: has_hw_catalog.iloc[x]['input_id'])\n", + " seq_df['first_hw_id'] = seq_df['location'].apply(lambda x: ids[x])\n", + " seq_df['last_hw_id'] = seq_df['last_location'].apply(lambda x: ids[x])\n", + " \n", + " return seq_df" + ] + }, + { + "cell_type": "code", + "execution_count": 1061, + "id": "bbb4d50a-72f8-4368-9966-0d0ea590c06f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "seq = get_longest_sequence(no_sort_m)\n", + "seq['erste Signatur'] = seq['first_BE_location'].apply(lambda x: BE_df.loc[x]['Signatur'])\n", + "seq['letzte Signatur'] = seq['last_BE_location'].apply(lambda x: BE_df.loc[x]['Signatur'])" + ] + }, + { + "cell_type": "code", + "execution_count": 1089, + "id": "81ed055b-34f7-420f-8391-9ffc8f8efc89", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>location</th>\n", + " 
<th>length</th>\n", + " <th>first_BE_location</th>\n", + " <th>last_location</th>\n", + " <th>last_BE_location</th>\n", + " <th>first_hw_id</th>\n", + " <th>last_hw_id</th>\n", + " <th>erste Signatur</th>\n", + " <th>letzte Signatur</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>931</td>\n", + " <td>24</td>\n", + " <td>4261</td>\n", + " <td>955</td>\n", + " <td>4285</td>\n", + " <td>14.377_079_01</td>\n", + " <td>14.377_079_23</td>\n", + " <td>BE.11.L.31</td>\n", + " <td>BE.11.L.35.(4)</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>4116</td>\n", + " <td>22</td>\n", + " <td>15231</td>\n", + " <td>4138</td>\n", + " <td>15275</td>\n", + " <td>14.378_211_03</td>\n", + " <td>14.378_215_01</td>\n", + " <td>BE.6.M.1-16.(Vol.1)</td>\n", + " <td>BE.6.M.39</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>907</td>\n", + " <td>19</td>\n", + " <td>4237</td>\n", + " <td>926</td>\n", + " <td>4256</td>\n", + " <td>14.377_078_01</td>\n", + " <td>14.377_078_19</td>\n", + " <td>BE.11.L.18</td>\n", + " <td>BE.11.L.22.(3)</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>3524</td>\n", + " <td>18</td>\n", + " <td>13400</td>\n", + " <td>3542</td>\n", + " <td>13418</td>\n", + " <td>14.376_325_03</td>\n", + " <td>14.376_328_01</td>\n", + " <td>BE.5.N.30</td>\n", + " <td>BE.5.N.39</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>2599</td>\n", + " <td>17</td>\n", + " <td>10388</td>\n", + " <td>2616</td>\n", + " <td>10405</td>\n", + " <td>14.376_259_05</td>\n", + " <td>14.376_262_01</td>\n", + " <td>BE.3.W.14</td>\n", + " <td>BE.3.W.29</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>3674</td>\n", + " <td>17</td>\n", + " <td>13828</td>\n", + " <td>3691</td>\n", + " <td>13846</td>\n", + " <td>14.376_435_04</td>\n", + " <td>14.376_438_01</td>\n", + " <td>BE.5.R.30</td>\n", + " <td>BE.5.R.48</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>4240</td>\n", + " 
<td>17</td>\n", + " <td>15600</td>\n", + " <td>4257</td>\n", + " <td>15618</td>\n", + " <td>14.377_080_00</td>\n", + " <td>14.377_081_08</td>\n", + " <td>BE.6.Q.21</td>\n", + " <td>BE.6.Q.29</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>1536</td>\n", + " <td>16</td>\n", + " <td>6440</td>\n", + " <td>1552</td>\n", + " <td>6456</td>\n", + " <td>14.378_425_01</td>\n", + " <td>14.378_426_06</td>\n", + " <td>BE.12.Q.27</td>\n", + " <td>BE.12.Q.41.(Adl)</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>2508</td>\n", + " <td>16</td>\n", + " <td>10085</td>\n", + " <td>2524</td>\n", + " <td>10139</td>\n", + " <td>14.378_109_02</td>\n", + " <td>14.378_111_05</td>\n", + " <td>BE.3.S.10-12.(Vol.1)</td>\n", + " <td>BE.3.S.44</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>701</td>\n", + " <td>15</td>\n", + " <td>3500</td>\n", + " <td>716</td>\n", + " <td>3515</td>\n", + " <td>14.377_158_06</td>\n", + " <td>14.377_161_06</td>\n", + " <td>BE.10.X.39</td>\n", + " <td>BE.10.X.54</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35</th>\n", + " <td>1942</td>\n", + " <td>15</td>\n", + " <td>8153</td>\n", + " <td>1957</td>\n", + " <td>8177</td>\n", + " <td>14.377_433_09</td>\n", + " <td>14.377_435_04</td>\n", + " <td>BE.2.R.27-32.(Vol.1)</td>\n", + " <td>BE.2.R.48-53.(Vol.1)</td>\n", + " </tr>\n", + " <tr>\n", + " <th>38</th>\n", + " <td>3094</td>\n", + " <td>15</td>\n", + " <td>12064</td>\n", + " <td>3109</td>\n", + " <td>12084</td>\n", + " <td>14.376_385_02</td>\n", + " <td>14.376_387_05</td>\n", + " <td>BE.4.S.28</td>\n", + " <td>BE.4.S.49</td>\n", + " </tr>\n", + " <tr>\n", + " <th>43</th>\n", + " <td>339</td>\n", + " <td>14</td>\n", + " <td>1455</td>\n", + " <td>353</td>\n", + " <td>1469</td>\n", + " <td>14.376_110_02</td>\n", + " <td>14.376_107_02</td>\n", + " <td>BE.1.X.35</td>\n", + " <td>BE.1.X.48</td>\n", + " </tr>\n", + " <tr>\n", + " <th>52</th>\n", + " <td>3226</td>\n", + " <td>14</td>\n", + " <td>12534</td>\n", + " 
<td>3240</td>\n", + " <td>12650</td>\n", + " <td>14.377_164_00</td>\n", + " <td>14.377_166_03</td>\n", + " <td>BE.4.Y.1.(Vol.1)</td>\n", + " <td>BE.4.Z.37</td>\n", + " </tr>\n", + " <tr>\n", + " <th>55</th>\n", + " <td>3906</td>\n", + " <td>14</td>\n", + " <td>14472</td>\n", + " <td>3920</td>\n", + " <td>14490</td>\n", + " <td>14.377_170_02</td>\n", + " <td>14.377_171_04</td>\n", + " <td>BE.5.Y.10</td>\n", + " <td>BE.5.Y.29</td>\n", + " </tr>\n", + " <tr>\n", + " <th>58</th>\n", + " <td>4900</td>\n", + " <td>14</td>\n", + " <td>19260</td>\n", + " <td>4914</td>\n", + " <td>19275</td>\n", + " <td>14.377_279_01</td>\n", + " <td>14.377_283_01</td>\n", + " <td>BE.8.N.15.(Vol.1)</td>\n", + " <td>BE.8.N.25</td>\n", + " </tr>\n", + " <tr>\n", + " <th>59</th>\n", + " <td>5360</td>\n", + " <td>14</td>\n", + " <td>20942</td>\n", + " <td>5374</td>\n", + " <td>20973</td>\n", + " <td>14.377_349_02</td>\n", + " <td>14.377_352_04</td>\n", + " <td>BE.9.K.10.11.(Vol.1)</td>\n", + " <td>BE.9.K.36.37.(Vol.1)</td>\n", + " </tr>\n", + " <tr>\n", + " <th>69</th>\n", + " <td>3188</td>\n", + " <td>13</td>\n", + " <td>12443</td>\n", + " <td>3201</td>\n", + " <td>12466</td>\n", + " <td>14.377_120_05</td>\n", + " <td>14.377_122_03</td>\n", + " <td>BE.4.X.17</td>\n", + " <td>BE.4.X.29.(Vol.1)</td>\n", + " </tr>\n", + " <tr>\n", + " <th>82</th>\n", + " <td>1108</td>\n", + " <td>12</td>\n", + " <td>4822</td>\n", + " <td>1120</td>\n", + " <td>4835</td>\n", + " <td>14.378_480_06</td>\n", + " <td>14.378_481_06</td>\n", + " <td>BE.11.Q.10</td>\n", + " <td>BE.11.Q.19</td>\n", + " </tr>\n", + " <tr>\n", + " <th>85</th>\n", + " <td>2170</td>\n", + " <td>12</td>\n", + " <td>8764</td>\n", + " <td>2182</td>\n", + " <td>8777</td>\n", + " <td>14.376_234_04</td>\n", + " <td>14.376_238_01</td>\n", + " <td>BE.2.X.52</td>\n", + " <td>BE.2.X.65-68.(Vol.1)</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " location length first_BE_location last_location last_BE_location 
\n", + "0 931 24 4261 955 4285 \\\n", + "3 4116 22 15231 4138 15275 \n", + "8 907 19 4237 926 4256 \n", + "13 3524 18 13400 3542 13418 \n", + "17 2599 17 10388 2616 10405 \n", + "19 3674 17 13828 3691 13846 \n", + "21 4240 17 15600 4257 15618 \n", + "24 1536 16 6440 1552 6456 \n", + "25 2508 16 10085 2524 10139 \n", + "31 701 15 3500 716 3515 \n", + "35 1942 15 8153 1957 8177 \n", + "38 3094 15 12064 3109 12084 \n", + "43 339 14 1455 353 1469 \n", + "52 3226 14 12534 3240 12650 \n", + "55 3906 14 14472 3920 14490 \n", + "58 4900 14 19260 4914 19275 \n", + "59 5360 14 20942 5374 20973 \n", + "69 3188 13 12443 3201 12466 \n", + "82 1108 12 4822 1120 4835 \n", + "85 2170 12 8764 2182 8777 \n", + "\n", + " first_hw_id last_hw_id erste Signatur letzte Signatur \n", + "0 14.377_079_01 14.377_079_23 BE.11.L.31 BE.11.L.35.(4) \n", + "3 14.378_211_03 14.378_215_01 BE.6.M.1-16.(Vol.1) BE.6.M.39 \n", + "8 14.377_078_01 14.377_078_19 BE.11.L.18 BE.11.L.22.(3) \n", + "13 14.376_325_03 14.376_328_01 BE.5.N.30 BE.5.N.39 \n", + "17 14.376_259_05 14.376_262_01 BE.3.W.14 BE.3.W.29 \n", + "19 14.376_435_04 14.376_438_01 BE.5.R.30 BE.5.R.48 \n", + "21 14.377_080_00 14.377_081_08 BE.6.Q.21 BE.6.Q.29 \n", + "24 14.378_425_01 14.378_426_06 BE.12.Q.27 BE.12.Q.41.(Adl) \n", + "25 14.378_109_02 14.378_111_05 BE.3.S.10-12.(Vol.1) BE.3.S.44 \n", + "31 14.377_158_06 14.377_161_06 BE.10.X.39 BE.10.X.54 \n", + "35 14.377_433_09 14.377_435_04 BE.2.R.27-32.(Vol.1) BE.2.R.48-53.(Vol.1) \n", + "38 14.376_385_02 14.376_387_05 BE.4.S.28 BE.4.S.49 \n", + "43 14.376_110_02 14.376_107_02 BE.1.X.35 BE.1.X.48 \n", + "52 14.377_164_00 14.377_166_03 BE.4.Y.1.(Vol.1) BE.4.Z.37 \n", + "55 14.377_170_02 14.377_171_04 BE.5.Y.10 BE.5.Y.29 \n", + "58 14.377_279_01 14.377_283_01 BE.8.N.15.(Vol.1) BE.8.N.25 \n", + "59 14.377_349_02 14.377_352_04 BE.9.K.10.11.(Vol.1) BE.9.K.36.37.(Vol.1) \n", + "69 14.377_120_05 14.377_122_03 BE.4.X.17 BE.4.X.29.(Vol.1) \n", + "82 14.378_480_06 14.378_481_06 BE.11.Q.10 BE.11.Q.19 
# Notebook cell: show the 20 longest runs.
seq[:20]

# --- Follow-up work on duplicated titles ------------------------------------

# Rows with a duplicated title that carry their own catalogue match
# ('copy_from' == -1) rather than one copied from another row.
dup_candidates = comp_BE_no_dup[
    comp_BE_no_dup['dup_title']
    & (comp_BE_no_dup['hs. Katalog'] == 1)
    & (comp_BE_no_dup['copy_from'] == -1)
]
low_sim = []

for _, row in dup_candidates.iterrows():
    title = row['Titel']
    sig = row['Signatur']
    dup_group = comp_BE_no_dup[(comp_BE_no_dup['Titel'] == title) & (comp_BE_no_dup['copy_from'] != -1)]
    for _, dup in dup_group.iterrows():
        # `fuzz` comes from an earlier cell - presumably a fuzzy string
        # matching library; verify which one.
        sim_score = fuzz.ratio(sig, dup['Signatur'])
        if 80 < sim_score < 90:  # shelf marks are similar, but only moderately
            dup['sim_score'] = sim_score
            dup['other_sig'] = sig
            low_sim.append(dup)

low_sim_df = pd.DataFrame(low_sim)

# Number used in the IIIF image URL for each catalogue volume.
dod_id = {
    '14.376': 51202,
    '14.377': 51184,
    '14.378': 51219
}


def _set_hw_catalog_match(frame, idx, entry):
    """Write the handwritten-catalogue fields of ``entry`` into row ``idx`` of ``frame``."""
    frame.at[idx, 'hs. Katalog'] = 1
    frame.at[idx, 'hs. Katalog Konfidenz'] = 'sicher'
    frame.at[idx, 'hs. Katalogband'] = entry['volume']
    frame.at[idx, 'hs. Katalogseite Digitalisat'] = str(entry['page number'])
    frame.at[idx, 'Wissensklasse'] = entry['category']
    frame.at[idx, 'Wissensunterklasse'] = entry['subcategory']
    frame.at[idx, 'Formatangabe'] = entry['format']
    frame.at[idx, 'hs. Katalogseite Handschrift'] = entry['handwritten page number']
    frame.at[idx, 'hs. Katalogeintrag ID'] = entry['entry_ID']
    frame.at[idx, 'hs. Katalogeintrag'] = entry['entry']
    frame.at[idx, 'hs. Katalog Image URL'] = f"https://iiif.onb.ac.at/images/DOD/{dod_id[str(entry['volume'])]}/{entry['page number']:08}.jp2/full/full/0/native.jpg"


def _clear_hw_catalog_match(frame, idx):
    """Remove the handwritten-catalogue match from row ``idx`` of ``frame``."""
    frame.at[idx, 'hs. Katalog'] = 0
    frame.at[idx, 'hs. Katalog Konfidenz'] = ''
    frame.at[idx, 'hs. Katalogband'] = ''
    frame.at[idx, 'hs. Katalogseite Digitalisat'] = ''
    frame.at[idx, 'Wissensklasse'] = ''
    frame.at[idx, 'Wissensunterklasse'] = ''
    frame.at[idx, 'Formatangabe'] = ''
    frame.at[idx, 'hs. Katalogseite Handschrift'] = ''
    frame.at[idx, 'hs. Katalogeintrag ID'] = ''
    frame.at[idx, 'hs. Katalogeintrag'] = ''
    frame.at[idx, 'hs. Katalog Image URL'] = ''


# Every row with this exact title gets the same catalogue entry (label 4652).
lettres_edifiantes = comp_BE_no_dup[comp_BE_no_dup['Titel'] == 'Lettres Edifiantes Et Curieuses, Ecrites Des Missions Etrangeres, par quelques Missionaires de la Compagnie de Jesus']
l_m = entry_df.loc[4652]

for i in lettres_edifiantes.index:
    _set_hw_catalog_match(comp_BE_no_dup, i, l_m)

# Manual corrections: rows of comp_BE_no_dup paired with the entry_df label
# to use instead; '-' means the row keeps no catalogue match at all.
delete_hw_match_for = [14220, 13616, 15587]
take_from_entry_df = [3411, '-', 3247]

for num, entry_label in zip(delete_hw_match_for, take_from_entry_df):
    if entry_label != '-':
        _set_hw_catalog_match(comp_BE_no_dup, num, entry_df.loc[entry_label])
    else:
        _clear_hw_catalog_match(comp_BE_no_dup, num)
        # NOTE(review): the indentation of this statement was not recoverable
        # from the notebook diff; it is assumed to belong to the "no match"
        # branch only - confirm against the notebook.
        comp_BE_no_dup.at[num, 'copy_from'] = -1

comp_BE_no_dup.to_excel('../Daten/Vorhersagen/WIP_final_BE_3.xlsx')