{ "cells": [ { "cell_type": "code", "execution_count": 73, "id": "7a3837ac-cced-4e01-bf57-265e40729692", "metadata": { "tags": [] }, "outputs": [], "source": [ "import pandas as pd\n", "import re\n", "import numpy as np\n", "import thefuzz\n", "import tqdm\n", "import matplotlib.pyplot as plt\n", "import requests\n", "import json\n", "\n", "pd.set_option('display.max_colwidth', None)" ] }, { "cell_type": "code", "execution_count": 74, "id": "29ca0dc8-cae7-4f12-bd60-fd74ea6ae5ac", "metadata": { "tags": [] }, "outputs": [], "source": [ "BE_df = pd.read_excel('../Daten/Vorhersagen/Katalogauszug und Vorhersagen.xlsx', index_col=0)" ] }, { "cell_type": "code", "execution_count": 75, "id": "c1e1c42a-962f-40bc-bb17-b62e8089feb7", "metadata": { "tags": [] }, "outputs": [], "source": [ "entry_df = pd.read_excel('../Daten/Katalogabgleich/Einträge.xlsx', index_col=0)" ] }, { "cell_type": "code", "execution_count": 76, "id": "50d15898-4687-46b7-b7e0-528d7cf9aec0", "metadata": { "tags": [] }, "outputs": [], "source": [ "def prepare_string(string):\n", " new = re.sub(r'[àáâãå]', 'a', string)\n", " new = re.sub(r'[èéêë]', 'e', new)\n", " new = re.sub(r'[ìíîï]', 'i', new)\n", " new = re.sub(r'[òóôõ]', 'o', new)\n", " new = re.sub(r'[ùúû]', 'u', new)\n", " new = re.sub(r'æ', 'ae', new)\n", " new = re.sub('[.,:;()¬]|^[CLXVI]+? |^\\d+? |^\\d+?\\.+? |^\\.+ ?|= |# ', '', new)\n", " return new\n", "\n", "entry_df['cleaned entry'] = entry_df['entry'].apply(lambda x: prepare_string(x))" ] }, { "cell_type": "code", "execution_count": 6, "id": "990dfeee-1141-4acb-8a3d-a7af0573f5be", "metadata": { "tags": [] }, "outputs": [], "source": [ "BE_early = BE_df[BE_df['Anfang Veröffentlichungsdatum'] < 1736]\n", "BE_early_no_dup = BE_early.drop_duplicates('Titel')\n", "BE_no_year = BE_df[BE_df['Anfang Veröffentlichungsdatum'].isna()].drop_duplicates('Titel')\n", "BE_manual = BE_df.loc[BE_early_no_dup.index.union(BE_no_year.index)]" ] }, { "cell_type": "code", "execution_count": 7, "id": "bcd301fe-cb80-4b1c-b65f-465fce5ed915", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "array([['BE.1.T.33', '+Z167651701',\n", " \"Les deux livres de S. Augustin, de la veritable religion et des moeurs de l'eglise catholique, traduits en francois, sur l'edition latine des Peres Benedictins de la congregation de S. Maur\",\n", " 'Augustinus, Aurelius Heiliger 354-430', nan, 1693.0, nan,\n", " '1693', 'Paris', 'Paris', 'French', nan, nan, nan, nan, nan,\n", " 'Z167651701', 'Z167651701_00000001.jpg', 'B', 0.0008314285660162,\n", " 0.9985696077346802, 0.0003245634725317, 0.0002743884397204,\n", " 'blue', 0.9836959838867188, 0.0060312687419354,\n", " 0.0102726686745882]], dtype=object)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "BE_manual[BE_manual['Signatur'].str.contains('BE.1.T.33')].values" ] }, { "cell_type": "code", "execution_count": 77, "id": "c0f4a42a-7e21-41e8-833c-2dd2f9d1985e", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
volumepage numbercategorysubcategoryformathandwritten page numberentry_IDentrycleaned entry
350814.37798PoëticaPoëtæ Gallici unà cum DramaticisOctavo und kleiner50214.377_098_09Poësies de Jacques Poille. Paris. 1623. Th. Blaise n. 1257.Poesies de Jacques Poille Paris 1623 Th Blaise n 1257
\n", "
" ], "text/plain": [ " volume page number category subcategory \n", "3508 14.377 98 Poëtica Poëtæ Gallici unà cum Dramaticis \\\n", "\n", " format handwritten page number entry_ID \n", "3508 Octavo und kleiner 502 14.377_098_09 \\\n", "\n", " entry \n", "3508 Poësies de Jacques Poille. Paris. 1623. Th. Blaise n. 1257. \\\n", "\n", " cleaned entry \n", "3508 Poesies de Jacques Poille Paris 1623 Th Blaise n 1257 " ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def search_in_entry(df, string):\n", " return df[df['cleaned entry'].str.contains(string)]\n", "\n", "info = search_in_entry(search_in_entry(entry_df, 'Poille'), '')\n", "print(len(info))\n", "info" ] }, { "cell_type": "code", "execution_count": 368, "id": "c4394718-cfd3-459e-8923-3ef255a41012", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "volume 14.377\n", "page number 28\n", "category Poëtica\n", "subcategory Poëtæ Græci\n", "format Octavo und kleiner\n", "handwritten page number 436\n", "entry_ID 14.377_028_02\n", "entry ..........Le Plutus & Les Nuéces du même trad. du grec par Mad.elle Anne le Fevre. 12.° Paris. 1684. Thierry. n. 1062.\n", "cleaned entry Le Plutus & Les Nueces du meme trad du grec par Madelle Anne le Fevre 12° Paris 1684 Thierry n 1062\n", "Name: 2857, dtype: object" ] }, "execution_count": 368, "metadata": {}, "output_type": "execute_result" } ], "source": [ "entry_df.loc[2857]" ] }, { "cell_type": "code", "execution_count": 414, "id": "20facf5d-d609-498e-9907-7ebdffc09e15", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BE.6.S.66\n", "+Z201401009\n", "Florus Danicus ; Otthiniae Impressus a Christiano Schrödero, Anno 1698\n", "Bering, Vitus 1617-1675\n", "Tyrnaviae\n", "1716\n", "N\n" ] }, { "data": { "text/plain": [ "Signatur BE.6.S.66\n", "Barcode +Z201401009\n", "Titel Florus Danicus ; Otthiniae Impressus a Christiano Schrödero, Anno 1698\n", "Autor Bering, Vitus 1617-1675\n", "Mitwirkender NaN\n", "Anfang Veröffentlichungsdatum 1716.0\n", "Ende Veröffentlichungsdatum NaN\n", "Veröffentlichungsdatum 1716\n", "Veröffentlichungsort Tyrnaviae\n", "Veröffentlichungsort (normiert) Tyrnau\n", "Sprache Latin\n", "Schlagwörter Dänemark; Geschichte\n", "Schlagwörter (mit GND) Dänemark$Dg--(DE-588)4010877-6;Geschichte$Az;AT-OBV--ONB-AK\n", "Vorbesitzer NaN\n", "Typ NaN\n", "Bemerkungen NaN\n", "Gültiger Barcode Z201401009\n", "Dateiname Z201401009_00000001.jpg\n", "Wappenklassifizierung N\n", "p_A 0.022146\n", "p_B 0.008077\n", "p_C 0.014462\n", "p_N 0.955316\n", "Farbklassifizierung NaN\n", "p_blue NaN\n", "p_red NaN\n", "p_yellow NaN\n", "Name: 15975, dtype: object" ] }, "execution_count": 414, "metadata": {}, "output_type": "execute_result" } ], "source": [ "BE_entry = BE_df.loc[15975]\n", "print(BE_entry['Signatur'])\n", "print(BE_entry['Barcode'])\n", "print(BE_entry['Titel'])\n", "print(BE_entry['Autor'])\n", "print(BE_entry['Veröffentlichungsort'])\n", "print(BE_entry['Veröffentlichungsdatum'])\n", "print(BE_entry['Wappenklassifizierung'])\n", "BE_entry" ] }, { "cell_type": "markdown", "id": "a499f1aa-20d8-4e5c-9386-abd2b7023974", "metadata": {}, "source": [ "# Create Matching data\n", "\n", "## Match from existing catalogue data for BE signatures. Combine author (+ printer etc), title, year and place and try to match it with an entry in the handwritten catalogue." ] }, { "cell_type": "code", "execution_count": null, "id": "f5388945-7294-420a-a524-c3372e3c761d", "metadata": { "tags": [] }, "outputs": [], "source": [ "better_matches = []\n", "scorer = thefuzz.fuzz.token_set_ratio\n", "\n", "for index, row in tqdm.notebook.tqdm(BE_manual.iterrows(), total=len(BE_manual)):\n", " keys = ['Autor', 'Mitwirkender', 'Titel', 'Veröffentlichungsort', 'Anfang Veröffentlichungsdatum']\n", " comb_string = ''\n", " for key in keys:\n", " val = row[key]\n", " if not pd.isna(val):\n", " if key == 'Autor' or key == 'Mitwirkender':\n", " if ',' in val: # falls name, vorname\n", " val = val.split(',')[0]\n", " val = val.split(' ')[0]\n", " elif key == 'Titel':\n", " val = prepare_string(val)\n", " elif key == 'Anfang Veröffentlichungsdatum':\n", " val = str(int(val))\n", " else: # key == 'Veröffentlichungsort'\n", " pass\n", " comb_string += val + ' '\n", " \n", " matches_lis = thefuzz.process.extract(comb_string, entry_df['cleaned entry'], scorer=scorer, limit=5)\n", " flat_matches = []\n", " for match in matches_lis:\n", " flat_matches.append(match[0])\n", " flat_matches.append(match[1])\n", " flat_matches.append(match[2])\n", " better_matches.append([comb_string] + flat_matches)\n", "\n", "matches_df = pd.DataFrame(better_matches, columns=['input', 'match_1', 'score_1', 'id_1', 'match_2', 'score_2', 'id_2', 'match_3', 'score_3', 'id_3', 'match_4', 'score_4', 'id_4', 'match_5', 'score_5', 'id_5'])\n", "matches_df['control'] = ''\n", "matches_df" ] }, { "cell_type": "code", "execution_count": null, "id": "e0f42259-6df9-4e3b-80e1-1dbe4642545c", "metadata": { "tags": [] }, "outputs": [], "source": [ "matches_df_no_score = matches_df.drop(['score_1', 'score_2', 'score_3', 'score_4', 'score_5'], axis=1)\n", "matches_df_no_score.insert(1, 'input_id', BE_manual.index)\n", "matches_df_no_score" ] }, { "cell_type": "code", "execution_count": 749, "id": "51aae223-d87e-4fa0-8a97-25c33b7c8d65", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Export in batches of 1000\n", "matches_df_no_score[:1000].to_excel('../Daten/Katalogabgleich/Kandidaten/Kandidaten_1000.xlsx', index=False)\n", "matches_df_no_score[1000:2000].to_excel('../Daten/Katalogabgleich/Kandidaten/Kandidaten_2000.xlsx', index=False)\n", "matches_df_no_score[2000:3000].to_excel('../Daten/Katalogabgleich/Kandidaten/Kandidaten_3000.xlsx', index=False)\n", "matches_df_no_score[3000:4000].to_excel('../Daten/Katalogabgleich/Kandidaten/Kandidaten_4000.xlsx', index=False)\n", "matches_df_no_score[4000:5000].to_excel('../Daten/Katalogabgleich/Kandidaten/Kandidaten_5000.xlsx', index=False)\n", "matches_df_no_score[5000:6000].to_excel('../Daten/Katalogabgleich/Kandidaten/Kandidaten_6000.xlsx', index=False)\n", "matches_df_no_score[6000:7000].to_excel('../Daten/Katalogabgleich/Kandidaten/Kandidaten_7000.xlsx', index=False)\n", "matches_df_no_score[7000:8000].to_excel('../Daten/Katalogabgleich/Kandidaten/Kandidaten_8000.xlsx', index=False)\n", "matches_df_no_score[8000:].to_excel('../Daten/Katalogabgleich/Kandidaten/Kandidaten_8756.xlsx', index=False)" ] }, { "cell_type": "code", "execution_count": 3, "id": "68eb5b1e-a23b-4014-aa7d-903878c14a3d", "metadata": {}, "outputs": [], "source": [ "non_BE = BE_df[~BE_df['Signatur'].str.contains('BE')]\n", "non_BE_wappen = non_BE[~non_BE['Wappenklassifizierung'].isna()]" ] }, { "cell_type": "code", "execution_count": 34, "id": "d34f7a91-6f09-4c93-8376-105dc804764f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SignaturWappenklassifizierungBarcodeMitwirkender
22595Be.10.J.38.(Vol.1,1)N+Z17136510XCotta; Nees von Esenbeck, Christian Gottfried Daniel 1776-1858; Eschweiler, Franz Gerhard 1796-1831
22599Be.5.Q.69.(Vol.5)N+Z197357203NaN
2261222.M.23B+Z43165504Manuzio, Aldo Pio 1450-1515; Philostratus, Flavius 160-245; Eusebius Caesariensis 260-339; Rinuccinus, Alemannus 1426-1504; Acciaiuoli, Zanobi 1461-1519; Ευσεβιος Καισαρειας
2261322.N.14B+Z221977709Manuzio, Paolo; Manuzio, Paolo 1512-1574
2261422.N.27 (Vol.1)B+Z43162801Manuzio, Aldo Pio 1450-1515; Theophanes Confessor 752-817; Cosmas Hierosolymitanus -781; Marcus de Idronto ca. um 770; Johannes Damascenus 675-749; Clarius, Daniel ca. 15./16. Jh.
2261522.N.67B+Z221954400Manuzio, Paolo 1512-1574; Auria, Andrea <<d'>>
2261622.Q.37N+Z22196790XAsulanus, Franciscus -1546; Aldo Manuzio Senior Haeredes; Carloni, Antonio
2261860164-D.1A+Z194791901NaN
2261960164-D.2A+Z254932002NaN
\n", "
" ], "text/plain": [ " Signatur Wappenklassifizierung Barcode \\\n", "22595 Be.10.J.38.(Vol.1,1) N +Z17136510X \n", "22599 Be.5.Q.69.(Vol.5) N +Z197357203 \n", "22612 22.M.23 B +Z43165504 \n", "22613 22.N.14 B +Z221977709 \n", "22614 22.N.27 (Vol.1) B +Z43162801 \n", "22615 22.N.67 B +Z221954400 \n", "22616 22.Q.37 N +Z22196790X \n", "22618 60164-D.1 A +Z194791901 \n", "22619 60164-D.2 A +Z254932002 \n", "\n", " Mitwirkender \n", "22595 Cotta; Nees von Esenbeck, Christian Gottfried Daniel 1776-1858; Eschweiler, Franz Gerhard 1796-1831 \n", "22599 NaN \n", "22612 Manuzio, Aldo Pio 1450-1515; Philostratus, Flavius 160-245; Eusebius Caesariensis 260-339; Rinuccinus, Alemannus 1426-1504; Acciaiuoli, Zanobi 1461-1519; Ευσεβιος Καισαρειας \n", "22613 Manuzio, Paolo; Manuzio, Paolo 1512-1574 \n", "22614 Manuzio, Aldo Pio 1450-1515; Theophanes Confessor 752-817; Cosmas Hierosolymitanus -781; Marcus de Idronto ca. um 770; Johannes Damascenus 675-749; Clarius, Daniel ca. 15./16. Jh. \n", "22615 Manuzio, Paolo 1512-1574; Auria, Andrea <> \n", "22616 Asulanus, Franciscus -1546; Aldo Manuzio Senior Haeredes; Carloni, Antonio \n", "22618 NaN \n", "22619 NaN " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "non_BE_wappen[['Signatur', 'Wappenklassifizierung', 'Barcode', 'Mitwirkender']]" ] }, { "cell_type": "code", "execution_count": 33, "id": "512515e9-84a4-4e97-a16d-2f28df90abde", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Signatur', 'Barcode', 'Titel', 'Autor', 'Mitwirkender',\n", " 'Anfang Veröffentlichungsdatum', 'Ende Veröffentlichungsdatum',\n", " 'Veröffentlichungsdatum', 'Veröffentlichungsort',\n", " 'Veröffentlichungsort (normiert)', 'Sprache', 'Schlagwörter',\n", " 'Schlagwörter (mit GND)', 'Vorbesitzer', 'Typ', 'Bemerkungen',\n", " 'Gültiger Barcode', 'Dateiname', 'Wappenklassifizierung', 'p_A', 'p_B',\n", " 'p_C', 'p_N', 'Farbklassifizierung', 'p_blue', 'p_red', 'p_yellow'],\n", " dtype='object')" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "non_BE_wappen.columns" ] }, { "cell_type": "code", "execution_count": 79, "id": "b8de2583-c260-46df-ab7c-15b4feff0a33", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['filename', 'prediction', 'p_A', 'p_B', 'p_C', 'p_N', 'color', 'p_blue',\n", " 'p_red', 'p_yellow', 'man_prediction', 'man_color', 'valid_bc',\n", " 'Permanent Call Number', '992', '993', '695', '866',\n", " 'Library Code (Active)', 'Location Code', 'Barcode', 'Item Call Number',\n", " 'Title', 'Author', 'Author (contributor)', 'Begin Publication Date',\n", " 'End Publication Date', 'Publication Date', 'Publication Place',\n", " 'Publisher', 'Unnamed: 17', 'MMS Id', '856', 'Subjects',\n", " 'Subjects (Names)'],\n", " dtype='object')" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pred = pd.read_csv('../Daten/Vorhersagen/catalogue_pred_combined.csv')\n", "pred.columns" ] }, { "cell_type": "code", "execution_count": 80, "id": "042eb3f3-6d53-4403-a230-513d36ea8b44", "metadata": {}, "outputs": [], "source": [ "no_sign_wappen = pred[pred['Permanent Call Number'].isna()]" ] }, { "cell_type": "code", "execution_count": 81, "id": "87462bd0-ad17-4ea5-9486-f4b23605d74d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "803" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(no_sign_wappen)" ] }, { "cell_type": "code", "execution_count": 82, "id": "d7326106-193b-4841-ac3c-becae1b27ddf", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
filenamepredictionp_Ap_Bp_Cp_Ncolorp_bluep_redp_yellow...Begin Publication DateEnd Publication DatePublication DatePublication PlacePublisherUnnamed: 17MMS Id856SubjectsSubjects (Names)
0Z103519105_00000001.jpgA0.9985980.0004450.0004000.000558red0.0004190.9977300.001851...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4Z10353460X_00000001.jpgA0.9947110.0023870.0016790.001224red0.0025590.9825440.014897...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
13Z105561605_00000001.jpgB0.0001420.9995900.0000700.000198red0.0005020.9955070.003991...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
16Z137115906_00000001.jpgA0.9977220.0013120.0004610.000505blue0.9931580.0037760.003066...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
17Z13711690X_00000001.jpgA0.9994520.0001850.0001490.000213blue0.9942730.0053530.000374...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
..................................................................
6154Z260077107_00000001.jpgB0.0002480.9984840.0004420.000827red0.0002130.9943170.005469...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
6155Z43162801_00000001.jpgB0.0006840.9976340.0011800.000502red0.0014500.9971980.001353...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
6156Z43163301_00000001.jpgB0.0008720.9981390.0005830.000406red0.0003310.9984730.001197...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
6157Z43165504_00000001.jpgB0.0006700.9983240.0005600.000445red0.0011880.9963740.002438...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
6158Z43169509_00000001.jpgA0.9951560.0017110.0021910.000941red0.0001200.9997100.000170...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "

803 rows × 35 columns

\n", "
" ], "text/plain": [ " filename prediction p_A p_B p_C \\\n", "0 Z103519105_00000001.jpg A 0.998598 0.000445 0.000400 \n", "4 Z10353460X_00000001.jpg A 0.994711 0.002387 0.001679 \n", "13 Z105561605_00000001.jpg B 0.000142 0.999590 0.000070 \n", "16 Z137115906_00000001.jpg A 0.997722 0.001312 0.000461 \n", "17 Z13711690X_00000001.jpg A 0.999452 0.000185 0.000149 \n", "... ... ... ... ... ... \n", "6154 Z260077107_00000001.jpg B 0.000248 0.998484 0.000442 \n", "6155 Z43162801_00000001.jpg B 0.000684 0.997634 0.001180 \n", "6156 Z43163301_00000001.jpg B 0.000872 0.998139 0.000583 \n", "6157 Z43165504_00000001.jpg B 0.000670 0.998324 0.000560 \n", "6158 Z43169509_00000001.jpg A 0.995156 0.001711 0.002191 \n", "\n", " p_N color p_blue p_red p_yellow ... \\\n", "0 0.000558 red 0.000419 0.997730 0.001851 ... \n", "4 0.001224 red 0.002559 0.982544 0.014897 ... \n", "13 0.000198 red 0.000502 0.995507 0.003991 ... \n", "16 0.000505 blue 0.993158 0.003776 0.003066 ... \n", "17 0.000213 blue 0.994273 0.005353 0.000374 ... \n", "... ... ... ... ... ... ... \n", "6154 0.000827 red 0.000213 0.994317 0.005469 ... \n", "6155 0.000502 red 0.001450 0.997198 0.001353 ... \n", "6156 0.000406 red 0.000331 0.998473 0.001197 ... \n", "6157 0.000445 red 0.001188 0.996374 0.002438 ... \n", "6158 0.000941 red 0.000120 0.999710 0.000170 ... \n", "\n", " Begin Publication Date End Publication Date Publication Date \\\n", "0 NaN NaN NaN \n", "4 NaN NaN NaN \n", "13 NaN NaN NaN \n", "16 NaN NaN NaN \n", "17 NaN NaN NaN \n", "... ... ... ... \n", "6154 NaN NaN NaN \n", "6155 NaN NaN NaN \n", "6156 NaN NaN NaN \n", "6157 NaN NaN NaN \n", "6158 NaN NaN NaN \n", "\n", " Publication Place Publisher Unnamed: 17 MMS Id 856 Subjects \\\n", "0 NaN NaN NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN NaN NaN \n", "13 NaN NaN NaN NaN NaN NaN \n", "16 NaN NaN NaN NaN NaN NaN \n", "17 NaN NaN NaN NaN NaN NaN \n", "... ... ... ... ... ... ... \n", "6154 NaN NaN NaN NaN NaN NaN \n", "6155 NaN NaN NaN NaN NaN NaN \n", "6156 NaN NaN NaN NaN NaN NaN \n", "6157 NaN NaN NaN NaN NaN NaN \n", "6158 NaN NaN NaN NaN NaN NaN \n", "\n", " Subjects (Names) \n", "0 NaN \n", "4 NaN \n", "13 NaN \n", "16 NaN \n", "17 NaN \n", "... ... \n", "6154 NaN \n", "6155 NaN \n", "6156 NaN \n", "6157 NaN \n", "6158 NaN \n", "\n", "[803 rows x 35 columns]" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "no_sign_wappen" ] }, { "cell_type": "code", "execution_count": 98, "id": "07da4f64-6dbd-4a57-9122-338a9c66bf5b", "metadata": {}, "outputs": [], "source": [ "nsw = no_sign_wappen.dropna(axis=1)" ] }, { "cell_type": "code", "execution_count": 99, "id": "73c74495-9522-427a-88db-d8f1cca96cdd", "metadata": {}, "outputs": [], "source": [ "def get_content(url):\n", " retries = 0\n", " while retries < 3:\n", " try:\n", " resp = requests.get(url)\n", " if resp.status_code == 200:\n", " return resp.content\n", " else:\n", " retries += 1\n", " except requests.exceptions.Timeout:\n", " retries += 1\n", " except (requests.exceptions.TooManyRedirects, requests.exceptions.RequestException) as e:\n", " raise SystemExit(e)\n", "\n", "def extract_signature_from_manifest(bc):\n", " man_url = f'https://iiif.onb.ac.at/presentation/ABO/{bc}/manifest'\n", " metadata = json.loads(get_content(man_url))['metadata']\n", " metadata_str = json.dumps(metadata)\n", " loc_s = re.compile('(.+?)')\n", " ac_s = re.compile('AC[0-9]+')\n", " loc = loc_s.findall(metadata_str)\n", " if len(loc) > 0:\n", " loc = loc[0]\n", " else:\n", " loc = ''\n", " acnr = ac_s.findall(metadata_str)\n", " if len(acnr) > 0:\n", " acnr = acnr[0]\n", " else:\n", " acnr = ''\n", " return (loc, acnr)" ] }, { "cell_type": "code", "execution_count": 100, "id": "f4ef590d-775f-41c6-bd35-384aa9e5dd8e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('', 'AC09912489')" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "extract_signature_from_manifest('Z150809200')" ] }, { "cell_type": "code", "execution_count": 101, "id": "8920d0a5-6ead-4db8-8407-cd6d4c8657e7", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_7044/501467936.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " nsw.loc[i, 'Signatur'] = sig\n", "/tmp/ipykernel_7044/501467936.py:5: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " nsw.loc[i, 'AC-Nummer'] = ac\n" ] } ], "source": [ "for i in nsw.index:\n", " bc = nsw.loc[i, 'valid_bc']\n", " sig, ac = extract_signature_from_manifest(bc)\n", " nsw.loc[i, 'Signatur'] = sig\n", " nsw.loc[i, 'AC-Nummer'] = ac" ] }, { "cell_type": "code", "execution_count": 105, "id": "0ae7c215-f811-4b6d-897f-55328337ead4", "metadata": {}, "outputs": [], "source": [ "nsw_no_dup = nsw.drop_duplicates('valid_bc')" ] }, { "cell_type": "code", "execution_count": 110, "id": "e3916d07-7d22-40c3-aff2-eb9823840f96", "metadata": {}, "outputs": [], "source": [ "keine_BE = nsw_no_dup.drop(['filename', 'prediction', 'p_A', 'p_B', 'p_C', 'p_N', 'color', 'p_blue', 'p_red', 'p_yellow'], axis=1)" ] }, { "cell_type": "code", "execution_count": 114, "id": "62d169f7-2bcb-431b-8f6b-76d37194abc9", "metadata": {}, "outputs": [], "source": [ "keine_BE.rename(columns={'man_prediction': 'Wappen', 'man_color': 'Einbandfarbe', 'valid_bc': 'Barcode'}, inplace=True)" ] }, { "cell_type": "code", "execution_count": 116, "id": "3cc27a7c-b960-4360-81c8-6873fc39a95e", "metadata": {}, "outputs": [], "source": [ "keine_BE = keine_BE.reindex(columns=['Barcode', 'Signatur', 'AC-Nummer', 'Wappen', 'Einbandfarbe'])" ] }, { "cell_type": "code", "execution_count": 121, "id": "f2c1a4aa-3312-4353-9767-ef0009bf35da", "metadata": {}, "outputs": [], "source": [ "keine_BE.to_excel('../Daten/Vorhersagen/Eugeniana aber nicht BE.xlsx', index=None)" ] }, { "cell_type": "markdown", "id": "054a6f0d-251d-4cc5-84b9-0d777b7d1f7c", "metadata": {}, "source": [ "# Analyze manual matching data" ] }, { "cell_type": "code", "execution_count": 78, "id": "d6efcff5-0393-4835-b673-001e85877f13", "metadata": { "tags": [] }, "outputs": [], "source": [ "manual_file_names = [\n", " 'Ausgefüllt_1000.xlsx',\n", " 'Ausgefüllt_2000.xlsx',\n", " 'Ausgefüllt_3000.xlsx',\n", " 'Ausgefüllt_4000.xlsx',\n", " 'Ausgefüllt_5000.xlsx',\n", " 'Ausgefüllt_6000.xlsx',\n", " 'Ausgefüllt_7000.xlsx',\n", " 'Ausgefüllt_8000.xlsx',\n", " 'Ausgefüllt_8756.xlsx',\n", "]\n", "\n", "manual_matches_dfs = []\n", "\n", "for fname in manual_file_names:\n", " df = pd.read_excel(f'../Daten/Katalogabgleich/Ausgefüllt/{fname}')\n", " manual_matches_dfs.append(df)\n", "\n", "man_matches = pd.concat(manual_matches_dfs, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 79, "id": "7a39c6a7-d81b-4f79-89a0-bafc96411d93", "metadata": { "tags": [] }, "outputs": [], "source": [ "has_hw_catalog = man_matches.dropna(subset=['control'])" ] }, { "cell_type": "markdown", "id": "881ea2ed-277b-46da-8f23-e3f3f56aae86", "metadata": {}, "source": [ "## Matching statistics" ] }, { "cell_type": "code", "execution_count": 81, "id": "cb7b6815-c782-4e00-bd7f-9abcf7f523f5", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "75 \n", "75 \n", "True\n" ] } ], "source": [ "def num_in_lis(num, lis_str):\n", " if pd.isna(lis_str):\n", " return False \n", " else:\n", " if isinstance(lis_str, int):\n", " lis = [lis_str]\n", " if isinstance(lis_str, str):\n", " if '?' in lis_str:\n", " return False\n", " lis = lis_str.split(', ')\n", " lis = [int(n) for n in lis]\n", " return num in lis\n", "\n", "example = man_matches.loc[9]\n", "print(example['id_1'], type(example['id_1']))\n", "print(example['control'], type(example['control']))\n", "print(num_in_lis(example['id_1'], example['control']))" ] }, { "cell_type": "code", "execution_count": 108, "id": "836d1b5e-ef2f-4ff8-9c03-298b029f73b2", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5778 out of 8756 BE signatures from the appropriate time period were matched\n", "For 4564 matches the first one was correct\n", "For 5072 matches one of the five given ones in the table was correct\n", "124 matches have two or more mentions in the handrwitten catalog\n" ] } ], "source": [ "BE_signatures_matched = len(man_matches.dropna(subset=['control']))\n", "print(BE_signatures_matched, 'out of', len(man_matches), 'BE signatures from the appropriate time period were matched')\n", "\n", "first_matches = sum(man_matches.apply(lambda x: num_in_lis(x['id_1'], x['control']), axis=1))\n", "print('For', first_matches, 'matches the first one was correct')\n", "any_matches = sum(man_matches.apply(lambda x: num_in_lis(x['id_1'], x['control']) \n", " or num_in_lis(x['id_2'], x['control']) \n", " or num_in_lis(x['id_3'], x['control']) \n", " or num_in_lis(x['id_4'], x['control']) \n", " or num_in_lis(x['id_5'], x['control']), axis=1))\n", "print('For', any_matches, 'matches one of the five given ones in the table was correct')\n", "\n", "double_matches = man_matches['control'].str.contains(',').sum()\n", "print(double_matches, 'matches have two or more mentions in the handrwitten catalog')" ] }, { "cell_type": "code", "execution_count": 83, "id": "9f7bd1bf-a4cd-427a-b65a-5e2fce029212", "metadata": { "tags": [] }, "outputs": [], "source": [ "num_re = re.compile('\\d{1,4}(?!\\?)')\n", "all_matches = []\n", "for row in man_matches['control']:\n", " match = num_re.findall(str(row))\n", " if match:\n", " all_matches.append(match)\n", "\n", "flat_matches = [el for sublis in all_matches for el in sublis]\n", "set_matches = set(flat_matches)" ] }, { "cell_type": "code", "execution_count": 84, "id": "17434971-7462-4f00-8370-d5573f3ea72c", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5086 entries out of 9403 entries from the handwritten catalog were matched (at least once)\n" ] }, { "data": { "text/plain": [ "[2, 4, 7, 9, 10, 11, 12, 14, 16, 18, 19, 20, 22, 25, 26, 28, 29, 32, 34, 35]" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unique_matches_int = sorted([int(n) for n in set_matches])\n", "\n", "print(len(unique_matches_int), 'entries out of', len(entry_df), 'entries from the handwritten catalog were matched (at least once)')\n", "unique_matches_int[:20]" ] }, { "cell_type": "code", "execution_count": 111, "id": "25fcabf9-fb1c-4fe7-838a-8b595f0a2673", "metadata": { "tags": [] }, "outputs": [], "source": [ "no_matches_df = entry_df.loc[entry_df.index.difference(unique_matches_int)]\n", "matches_df = entry_df.loc[unique_matches_int]" ] }, { "cell_type": "code", "execution_count": 136, "id": "24bb9a74-0e99-4760-a3d5-05f8df71f85a", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'Historia Nova Europæ': 0.48382838283828383, 'Poëtica': 0.5410273515677118, 'Paralipomena Historica': 0.5508870214752568, 'Theologia': 0.7555555555555555, 'Philologia': 0.596694214876033, 'Historia Naturalis': 0.6634615384615384, 'Historia Ecclesiastica': 0.689119170984456, 'Artes': 0.5623342175066313, 'Philosophia': 0.6542857142857142, 'Historia Antiqua': 0.3862928348909657, 'Codices Manuscripti': 0.0034129692832764505, 'Imagines Incisæ': 0.04330708661417323, 'Geographia': 0.5317460317460317, 'Iurisprudentia': 0.6715686274509803, 'Historia Nova extra Europam': 0.5577889447236181, 'Rhetorica': 0.5864197530864198, 'Mathematica': 0.7559055118110236, 'Grammatica': 0.5819672131147541, 'Medicina': 0.6330275229357798, 'Chronologia': 0.5612244897959183, 'Miscellanea Historica': 0.3877551020408163, 'Imagines Variæ Incisæ': 0.0851063829787234, 'Effigierum Incisarum Collectio': 0.0, 'Imaginum Delineatarum Collectio': 0.0}\n" ] } ], "source": [ "all_counts = entry_df['category'].value_counts()\n", "match_counts = matches_df['category'].value_counts()\n", "no_match_counts = no_matches_df['category'].value_counts()\n", "\n", "ratios = {}\n", "for cat, count in all_counts.items():\n", " matched = 0\n", " if cat in match_counts.index:\n", " matched = match_counts[cat]\n", " ratios[cat] = matched / count\n", "\n", "print(ratios)" ] }, { "cell_type": "code", "execution_count": 170, "id": "6c952698-2bc7-4511-a688-3bcd30ed8196", "metadata": { "tags": [] }, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sorted_ratios = sorted(ratios.items(), key=lambda x: x[1], reverse=True)\n", "n, r = list(zip(*sorted_ratios))\n", "\n", "fix, ax = plt.subplots()\n", "ax.bar(n, [100 * ratio for ratio in r])\n", "plt.xticks(rotation=45, ha='right')\n", "ax.set_ylabel('Vollständigkeit [%]')\n", "ax.set_title('Verbindung von hs. Katalog und mod. Katalog')\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "b49f3686-efd3-478b-b493-b69b60ce3ad4", "metadata": {}, "source": [ "## Add new matching data to existing catalogue data" ] }, { "cell_type": "code", "execution_count": 171, "id": "c52f2098-221b-4912-841a-d54e25788143", "metadata": { "tags": [] }, "outputs": [], "source": [ "BE_df = pd.read_excel('../Daten/Vorhersagen/WIP_complete_BE.xlsx', index_col=0)" ] }, { "cell_type": "code", "execution_count": 172, "id": "32d281c0-4015-4913-a5ec-da22c6a9c8ad", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "Index(['Barcode', 'Titel', 'Autor', 'Mitwirkender',\n", " 'Anfang Veröffentlichungsdatum', 'Ende Veröffentlichungsdatum',\n", " 'Veröffentlichungsdatum', 'Veröffentlichungsort',\n", " 'Veröffentlichungsort (normiert)', 'Sprache', 'Schlagwörter',\n", " 'Schlagwörter (mit GND)', 'Vorbesitzer', 'Typ', 'Bemerkungen',\n", " 'Gültiger Barcode', 'Dateiname', 'Wappenklassifizierung', 'p_A', 'p_B',\n", " 'p_C', 'p_N', 'Farbklassifizierung', 'p_blue', 'p_red', 'p_yellow',\n", " 'hs. Katalog', 'hs. Katalog Konfidenz', 'hs. Katalogband',\n", " 'hs. Katalogseite Digitalisat', 'Wissensklasse', 'Wissensunterklasse',\n", " 'Formatangabe', 'hs. Katalogseite Handschrift', 'hs. Katalogeintrag ID',\n", " 'hs. Katalogeintrag', 'hs. Katalog Image URL', 'identifier',\n", " 'ABO-Barcode'],\n", " dtype='object')" ] }, "execution_count": 172, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# volume \tpage number \tcategory \tsubcategory \tformat \thandwritten page number \tentry_ID \tentry \t\n", "BE_df['hs. Katalog'] = 0\n", "\n", "BE_df.columns" ] }, { "cell_type": "code", "execution_count": 173, "id": "0ddd0ffe-3546-43fb-90ff-1d7206af138e", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "Index(['volume', 'page number', 'category', 'subcategory', 'format',\n", " 'handwritten page number', 'entry_ID', 'entry', 'cleaned entry'],\n", " dtype='object')" ] }, "execution_count": 173, "metadata": {}, "output_type": "execute_result" } ], "source": [ "entry_df.columns" ] }, { "cell_type": "code", "execution_count": null, "id": "b8d40359-5eba-49e6-b2a6-3bc37786dd4c", "metadata": { "tags": [] }, "outputs": [], "source": [ "dod_id = {\n", " '14.376': 51202, \n", " '14.377': 51184,\n", " '14.378': 51219\n", "}\n", "\n", "for i, entry in has_hw_catalog.iterrows():\n", " man_match = str(entry['control'])\n", " match_regex = re.compile('\\d{1,4}')\n", " matches = match_regex.findall(man_match)\n", " if matches:\n", " first_match_id = int(matches[0])\n", " corr_entry = entry_df.loc[first_match_id]\n", " BE_id = entry['input_id']\n", " BE_df.at[BE_id, 'hs. Katalog'] = 1\n", " if '?' not in man_match:\n", " BE_df.at[BE_id, 'hs. Katalog Konfidenz'] = 'sicher'\n", " else:\n", " BE_df.at[BE_id, 'hs. Katalog Konfidenz'] = 'unsicher'\n", " BE_df.at[BE_id, 'hs. Katalogband'] = str(corr_entry['volume'])\n", " BE_df.at[BE_id, 'hs. Katalogseite Digitalisat'] = corr_entry['page number']\n", " BE_df.at[BE_id, 'Wissensklasse'] = corr_entry['category']\n", " BE_df.at[BE_id, 'Wissensunterklasse'] = corr_entry['subcategory']\n", " BE_df.at[BE_id, 'Formatangabe'] = corr_entry['format']\n", " BE_df.at[BE_id, 'hs. Katalogseite Handschrift'] = corr_entry['handwritten page number']\n", " BE_df.at[BE_id, 'hs. Katalogeintrag ID'] = corr_entry['entry_ID']\n", " BE_df.at[BE_id, 'hs. Katalogeintrag'] = corr_entry['entry']\n", " BE_df.at[BE_id, 'hs. Katalog Image URL'] = f\"https://iiif.onb.ac.at/images/DOD/{dod_id[str(corr_entry['volume'])]}/{corr_entry['page number']:08}.jp2/full/full/0/native.jpg\"" ] }, { "cell_type": "code", "execution_count": 91, "id": "4ed3c187-ab7a-419a-8530-386a69143b3d", "metadata": { "tags": [] }, "outputs": [], "source": [ "# BE_df.to_excel('../Daten/Vorhersagen/Katalogauszug, Vorhersagen und hs. Katalogverbindungen.xlsx', index=False)\n", "BE_df.to_excel('../Daten/Vorhersagen/Complete_BE.xlsx', index=False)" ] }, { "cell_type": "code", "execution_count": 92, "id": "7f5dabc1-49bf-469a-9abe-c15fa20204ce", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "Index(['Signatur', 'Barcode', 'Titel', 'Autor', 'Mitwirkender',\n", " 'Anfang Veröffentlichungsdatum', 'Ende Veröffentlichungsdatum',\n", " 'Veröffentlichungsdatum', 'Veröffentlichungsort',\n", " 'Veröffentlichungsort (normiert)', 'Sprache', 'Schlagwörter',\n", " 'Schlagwörter (mit GND)', 'Vorbesitzer', 'Typ', 'Bemerkungen',\n", " 'Gültiger Barcode', 'Dateiname', 'Wappenklassifizierung', 'p_A', 'p_B',\n", " 'p_C', 'p_N', 'Farbklassifizierung', 'p_blue', 'p_red', 'p_yellow',\n", " 'hs. Katalog', 'hs. Katalog Konfidenz', 'hs. Katalogband',\n", " 'hs. Katalogseite Digitalisat', 'Wissensklasse', 'Wissensunterklasse',\n", " 'Formatangabe', 'hs. Katalogseite Handschrift', 'hs. Katalogeintrag ID',\n", " 'hs. Katalogeintrag', 'hs. Katalog Image URL'],\n", " dtype='object')" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "BE_df.columns" ] }, { "cell_type": "markdown", "id": "88b825b9-8557-441d-b387-697db4ea2a5b", "metadata": {}, "source": [ "# Eugeniana-Inkunabeln aus dem handschriftlichen Katalog extrahieren" ] }, { "cell_type": "code", "execution_count": 513, "id": "bc7c8cf5-7afa-43c1-8427-ab7cd58323b3", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[1661]\n", "[1628]\n", "[1703, 5926]\n", "[1662]\n" ] } ], "source": [ "def extract_four_digit_number(entry):\n", " four_dig_re = re.compile('(?\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
volumepage numbercategorysubcategoryformathandwritten page numberentry_IDentrycleaned entryyears
314.37625TheologiaTextus & Versiones Sacræ ScripturæFolio114.376_025_034 Biblia Sacra Latina Moguntina dicta, prima omnium editio in Membranis. 2 Vol. Moguntiæ. 1462.¬ Ioan. Fust, & Petr. Schoiffer de Gerneshem. n. 2088. II.O.5.Biblia Sacra Latina Moguntina dicta prima omnium editio in Membranis 2 Vol Moguntiae 1462 Ioan Fust & Petr Schoiffer de Gerneshem n 2088 IIO5[1462]
414.37625TheologiaTextus & Versiones Sacræ ScripturæFolio114.376_025_045 Biblia Sacra Latina Moguntina, editio altera 2 Vol. chartâ Magnâ. Moguntiæ. 1472. Petr. Schoiffer. II.O.7. de Gerneshem. n. 2089.Biblia Sacra Latina Moguntina editio altera 2 Vol charta Magna Moguntiae 1472 Petr Schoiffer IIO7 de Gerneshem n 2089[1472]
614.37625TheologiaTextus & Versiones Sacræ ScripturæFolio114.376_025_067 Biblia Sacra latina vulgatæ Editionis. 2 Vol. chartâ III.B.3. Magnâ. 1474. absque loco Editionis n. 2090.Biblia Sacra latina vulgatae Editionis 2 Vol charta IIIB3 Magna 1474 absque loco Editionis n 2090[1474]
714.37625TheologiaTextus & Versiones Sacræ ScripturæFolio114.376_025_078 Biblia Sacra Latina cum Evangelistarum Canonibus III.B.7. & concordantiis. chartâ M. Coloniæ. 1479. de Homborch. n. 2093.Biblia Sacra Latina cum Evangelistarum Canonibus IIIB7 & concordantiis charta M Coloniae 1479 de Homborch n 2093[1479]
814.37626TheologiaTextus & Versiones Sacræ ScripturæFolio214.376_026_009. Biblia Sacra Latina. Venetiis. 1481. Leonard Wild de Ratisbonâ n. 2302. III. D. 11.Biblia Sacra Latina Venetiis 1481 Leonard Wild de Ratisbona n 2302 III D 11[1481]
.................................
901714.378522Codices ManuscriptiNaNQuarto134814.378_522_0553 Tractatus de Matrimonio à Fr. Lùcano Parmensi compilat. anno 1468. Ms. in Membranis. n. CCIX.Tractatus de Matrimonio a Fr Lucano Parmensi compilat anno 1468 Ms in Membranis n CCIX[1468]
903314.378523Codices ManuscriptiNaNQuarto1347*14.378_523_0968 Monita Agapæti Batilica ad Imper. Iustinian. Ms. græcè A 1500Monita Agapaeti Batilica ad Imper Iustinian Ms graece A 1500[1500]
905114.378527Codices ManuscriptiNaNOctavo und kleiner134914.378_527_0510 Succineta & accurata de scriptio Civitatis Constantinopol. a Constantino Magno anno 331. ad Sult. Mehemet II. anno 1453. cum descriptione Canalis Maris Nigri.¬ autore Ioan. Adamo Zizla. Ms. in 8.° germanicè.Succineta & accurata de scriptio Civitatis Constantinopol a Constantino Magno anno 331 ad Sult Mehemet II anno 1453 cum descriptione Canalis Maris Nigri autore Ioan Adamo Zizla Ms in 8° germanice[1453]
905214.378528Codices ManuscriptiNaNOctavo und kleiner135014.378_528_0084. I.Trionfi di Francesco Petrarca con miniature Ms. 8.° 11. Questo Ms.° è del tempo del medesimo Petrarca n. CCXXXIV. obüt Petrarca an. 1374. & codex in fine habet 1459. p. 519ITrionfi di Francesco Petrarca con miniature Ms 8° 11 Questo Ms° e del tempo del medesimo Petrarca n CCXXXIV obüt Petrarca an 1374 & codex in fine habet 1459 p 519[1374, 1459]
905914.378529Codices ManuscriptiNaNOctavo und kleiner135114.378_529_03Descriptio civitatis Constantinopoleos a Constantino M. anno 331. ad Sultanum Mehemet II. anno 1453. cum descriptione Canalis Maris Nigri. autore I. Adamo Zizla. 8.° Ms. germanicèDescriptio civitatis Constantinopoleos a Constantino M anno 331 ad Sultanum Mehemet II anno 1453 cum descriptione Canalis Maris Nigri autore I Adamo Zizla 8° Ms germanice[1453]
\n", "

366 rows × 10 columns

\n", "" ], "text/plain": [ " volume page number category \n", "3 14.376 25 Theologia \\\n", "4 14.376 25 Theologia \n", "6 14.376 25 Theologia \n", "7 14.376 25 Theologia \n", "8 14.376 26 Theologia \n", "... ... ... ... \n", "9017 14.378 522 Codices Manuscripti \n", "9033 14.378 523 Codices Manuscripti \n", "9051 14.378 527 Codices Manuscripti \n", "9052 14.378 528 Codices Manuscripti \n", "9059 14.378 529 Codices Manuscripti \n", "\n", " subcategory format \n", "3 Textus & Versiones Sacræ Scripturæ Folio \\\n", "4 Textus & Versiones Sacræ Scripturæ Folio \n", "6 Textus & Versiones Sacræ Scripturæ Folio \n", "7 Textus & Versiones Sacræ Scripturæ Folio \n", "8 Textus & Versiones Sacræ Scripturæ Folio \n", "... ... ... \n", "9017 NaN Quarto \n", "9033 NaN Quarto \n", "9051 NaN Octavo und kleiner \n", "9052 NaN Octavo und kleiner \n", "9059 NaN Octavo und kleiner \n", "\n", " handwritten page number entry_ID \n", "3 1 14.376_025_03 \\\n", "4 1 14.376_025_04 \n", "6 1 14.376_025_06 \n", "7 1 14.376_025_07 \n", "8 2 14.376_026_00 \n", "... ... ... \n", "9017 1348 14.378_522_05 \n", "9033 1347* 14.378_523_09 \n", "9051 1349 14.378_527_05 \n", "9052 1350 14.378_528_00 \n", "9059 1351 14.378_529_03 \n", "\n", " entry \n", "3 4 Biblia Sacra Latina Moguntina dicta, prima omnium editio in Membranis. 2 Vol. Moguntiæ. 1462.¬ Ioan. Fust, & Petr. Schoiffer de Gerneshem. n. 2088. II.O.5. \\\n", "4 5 Biblia Sacra Latina Moguntina, editio altera 2 Vol. chartâ Magnâ. Moguntiæ. 1472. Petr. Schoiffer. II.O.7. de Gerneshem. n. 2089. \n", "6 7 Biblia Sacra latina vulgatæ Editionis. 2 Vol. chartâ III.B.3. Magnâ. 1474. absque loco Editionis n. 2090. \n", "7 8 Biblia Sacra Latina cum Evangelistarum Canonibus III.B.7. & concordantiis. chartâ M. Coloniæ. 1479. de Homborch. n. 2093. \n", "8 9. Biblia Sacra Latina. Venetiis. 1481. Leonard Wild de Ratisbonâ n. 2302. III. D. 11. \n", "... ... \n", "9017 53 Tractatus de Matrimonio à Fr. Lùcano Parmensi compilat. anno 1468. Ms. in Membranis. n. CCIX. \n", "9033 68 Monita Agapæti Batilica ad Imper. Iustinian. Ms. græcè A 1500 \n", "9051 10 Succineta & accurata de scriptio Civitatis Constantinopol. a Constantino Magno anno 331. ad Sult. Mehemet II. anno 1453. cum descriptione Canalis Maris Nigri.¬ autore Ioan. Adamo Zizla. Ms. in 8.° germanicè. \n", "9052 84. I.Trionfi di Francesco Petrarca con miniature Ms. 8.° 11. Questo Ms.° è del tempo del medesimo Petrarca n. CCXXXIV. obüt Petrarca an. 1374. & codex in fine habet 1459. p. 519 \n", "9059 Descriptio civitatis Constantinopoleos a Constantino M. anno 331. ad Sultanum Mehemet II. anno 1453. cum descriptione Canalis Maris Nigri. autore I. Adamo Zizla. 8.° Ms. germanicè \n", "\n", " cleaned entry \n", "3 Biblia Sacra Latina Moguntina dicta prima omnium editio in Membranis 2 Vol Moguntiae 1462 Ioan Fust & Petr Schoiffer de Gerneshem n 2088 IIO5 \\\n", "4 Biblia Sacra Latina Moguntina editio altera 2 Vol charta Magna Moguntiae 1472 Petr Schoiffer IIO7 de Gerneshem n 2089 \n", "6 Biblia Sacra latina vulgatae Editionis 2 Vol charta IIIB3 Magna 1474 absque loco Editionis n 2090 \n", "7 Biblia Sacra Latina cum Evangelistarum Canonibus IIIB7 & concordantiis charta M Coloniae 1479 de Homborch n 2093 \n", "8 Biblia Sacra Latina Venetiis 1481 Leonard Wild de Ratisbona n 2302 III D 11 \n", "... ... \n", "9017 Tractatus de Matrimonio a Fr Lucano Parmensi compilat anno 1468 Ms in Membranis n CCIX \n", "9033 Monita Agapaeti Batilica ad Imper Iustinian Ms graece A 1500 \n", "9051 Succineta & accurata de scriptio Civitatis Constantinopol a Constantino Magno anno 331 ad Sult Mehemet II anno 1453 cum descriptione Canalis Maris Nigri autore Ioan Adamo Zizla Ms in 8° germanice \n", "9052 ITrionfi di Francesco Petrarca con miniature Ms 8° 11 Questo Ms° e del tempo del medesimo Petrarca n CCXXXIV obüt Petrarca an 1374 & codex in fine habet 1459 p 519 \n", "9059 Descriptio civitatis Constantinopoleos a Constantino M anno 331 ad Sultanum Mehemet II anno 1453 cum descriptione Canalis Maris Nigri autore I Adamo Zizla 8° Ms germanice \n", "\n", " years \n", "3 [1462] \n", "4 [1472] \n", "6 [1474] \n", "7 [1479] \n", "8 [1481] \n", "... ... \n", "9017 [1468] \n", "9033 [1500] \n", "9051 [1453] \n", "9052 [1374, 1459] \n", "9059 [1453] \n", "\n", "[366 rows x 10 columns]" ] }, "execution_count": 526, "metadata": {}, "output_type": "execute_result" } ], "source": [ "before_1501" ] }, { "cell_type": "code", "execution_count": 527, "id": "740e1929-f137-4853-82db-3d90905beed4", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
volumepage numbercategorysubcategoryformathandwritten page numberentry_IDentrycleaned entryyears
514.37625TheologiaTextus & Versiones Sacræ ScripturæFolio114.376_025_056 Biblia Sacra Latina cum glossa & Comment. 4 Vol. chartâ Magnâ. Editio perantiqua sine loco & II.O.1. anno n. 2086.Biblia Sacra Latina cum glossa & Comment 4 Vol charta Magna Editio perantiqua sine loco & IIO1 anno n 2086[]
1714.37627TheologiaTextus & Versiones Sacræ ScripturæFolio314.376_027_0218 La S.e Bible historiée dite d'Escholatre avec des fig. gravées en bois. 2 Vol. sans datte. Barthel. Verard. n. 165.La Se Bible historiee dite d'Escholatre avec des fig gravees en bois 2 Vol sans datte Barthel Verard n 165[]
2114.37627TheologiaTextus & Versiones Sacræ ScripturæFolio314.376_027_0622 Biblia Sacra germanicè Mss. in Membranis cum figuris pictis. n. II.Biblia Sacra germanice Mss in Membranis cum figuris pictis n II[]
2214.37627TheologiaTextus & Versiones Sacræ ScripturæFolio314.376_027_0723 Biblia Sacra Hollandicé Mss. in Membranis cum fig. pictis. 2 Vol. n. III.Biblia Sacra Hollandice Mss in Membranis cum fig pictis 2 Vol n III[]
2414.37627TheologiaTextus & Versiones Sacræ ScripturæFolio314.376_027_0925 Historia Veteris ac Novi Testamenti MS. in Membranis cum innumeris ferè Imaginibus miniatè depictis. n. 1.Historia Veteris ac Novi Testamenti MS in Membranis cum innumeris fere Imaginibus miniate depictis n 1[]
.................................
939814.378582Imaginum Delineatarum CollectioNaNNaN140214.378_582_00CCCXXXI Vn Portefeüilles contenant des Desseins de plusieurs Villes d'Espagne par Ant. Van-den-Wingarde. dont il n'y a pas de Catalogue. n. 990.Vn Portefeüilles contenant des Desseins de plusieurs Villes d'Espagne par Ant Van-den-Wingarde dont il n'y a pas de Catalogue n 990[]
939914.378582Imaginum Delineatarum CollectioNaNNaN140214.378_582_01CCCXXXII Vn Recueil des Portraits peints en miniature sur velain au nombre de 34. dont le Premier est celuy de Philippe le Hardy. n. CCIV.Vn Recueil des Portraits peints en miniature sur velain au nombre de 34 dont le Premier est celuy de Philippe le Hardy n CCIV[]
940014.378582Imaginum Delineatarum CollectioNaNNaN140214.378_582_02CCCXXXIII Dix Vol. de Plantes peintes en miniature par Nicol. Robert. vide. Hist. PlantarumDix Vol de Plantes peintes en miniature par Nicol Robert vide Hist Plantarum[]
940114.378582Imaginum Delineatarum CollectioNaNNaN140214.378_582_03CCCXXXIV Cinq Vol. d'Oiseaux peints en Miniature par Nic. Robert. vide. Histor. Animalium.Cinq Vol d'Oiseaux peints en Miniature par Nic Robert vide Histor Animalium[]
940214.378582Imaginum Delineatarum CollectioNaNNaN140214.378_582_04CCCXXXV Divers Portraits, Ceremonies, Marches &c. des Turcs & d'autres nations du Levant, peints en miniature n. CCLXX.Divers Portraits Ceremonies Marches &c des Turcs & d'autres nations du Levant peints en miniature n CCLXX[]
\n", "

1324 rows × 10 columns

\n", "
" ], "text/plain": [ " volume page number category \n", "5 14.376 25 Theologia \\\n", "17 14.376 27 Theologia \n", "21 14.376 27 Theologia \n", "22 14.376 27 Theologia \n", "24 14.376 27 Theologia \n", "... ... ... ... \n", "9398 14.378 582 Imaginum Delineatarum Collectio \n", "9399 14.378 582 Imaginum Delineatarum Collectio \n", "9400 14.378 582 Imaginum Delineatarum Collectio \n", "9401 14.378 582 Imaginum Delineatarum Collectio \n", "9402 14.378 582 Imaginum Delineatarum Collectio \n", "\n", " subcategory format handwritten page number \n", "5 Textus & Versiones Sacræ Scripturæ Folio 1 \\\n", "17 Textus & Versiones Sacræ Scripturæ Folio 3 \n", "21 Textus & Versiones Sacræ Scripturæ Folio 3 \n", "22 Textus & Versiones Sacræ Scripturæ Folio 3 \n", "24 Textus & Versiones Sacræ Scripturæ Folio 3 \n", "... ... ... ... \n", "9398 NaN NaN 1402 \n", "9399 NaN NaN 1402 \n", "9400 NaN NaN 1402 \n", "9401 NaN NaN 1402 \n", "9402 NaN NaN 1402 \n", "\n", " entry_ID \n", "5 14.376_025_05 \\\n", "17 14.376_027_02 \n", "21 14.376_027_06 \n", "22 14.376_027_07 \n", "24 14.376_027_09 \n", "... ... \n", "9398 14.378_582_00 \n", "9399 14.378_582_01 \n", "9400 14.378_582_02 \n", "9401 14.378_582_03 \n", "9402 14.378_582_04 \n", "\n", " entry \n", "5 6 Biblia Sacra Latina cum glossa & Comment. 4 Vol. chartâ Magnâ. Editio perantiqua sine loco & II.O.1. anno n. 2086. \\\n", "17 18 La S.e Bible historiée dite d'Escholatre avec des fig. gravées en bois. 2 Vol. sans datte. Barthel. Verard. n. 165. \n", "21 22 Biblia Sacra germanicè Mss. in Membranis cum figuris pictis. n. II. \n", "22 23 Biblia Sacra Hollandicé Mss. in Membranis cum fig. pictis. 2 Vol. n. III. \n", "24 25 Historia Veteris ac Novi Testamenti MS. in Membranis cum innumeris ferè Imaginibus miniatè depictis. n. 1. \n", "... ... \n", "9398 CCCXXXI Vn Portefeüilles contenant des Desseins de plusieurs Villes d'Espagne par Ant. Van-den-Wingarde. dont il n'y a pas de Catalogue. n. 990. \n", "9399 CCCXXXII Vn Recueil des Portraits peints en miniature sur velain au nombre de 34. dont le Premier est celuy de Philippe le Hardy. n. CCIV. \n", "9400 CCCXXXIII Dix Vol. de Plantes peintes en miniature par Nicol. Robert. vide. Hist. Plantarum \n", "9401 CCCXXXIV Cinq Vol. d'Oiseaux peints en Miniature par Nic. Robert. vide. Histor. Animalium. \n", "9402 CCCXXXV Divers Portraits, Ceremonies, Marches &c. des Turcs & d'autres nations du Levant, peints en miniature n. CCLXX. \n", "\n", " cleaned entry \n", "5 Biblia Sacra Latina cum glossa & Comment 4 Vol charta Magna Editio perantiqua sine loco & IIO1 anno n 2086 \\\n", "17 La Se Bible historiee dite d'Escholatre avec des fig gravees en bois 2 Vol sans datte Barthel Verard n 165 \n", "21 Biblia Sacra germanice Mss in Membranis cum figuris pictis n II \n", "22 Biblia Sacra Hollandice Mss in Membranis cum fig pictis 2 Vol n III \n", "24 Historia Veteris ac Novi Testamenti MS in Membranis cum innumeris fere Imaginibus miniate depictis n 1 \n", "... ... \n", "9398 Vn Portefeüilles contenant des Desseins de plusieurs Villes d'Espagne par Ant Van-den-Wingarde dont il n'y a pas de Catalogue n 990 \n", "9399 Vn Recueil des Portraits peints en miniature sur velain au nombre de 34 dont le Premier est celuy de Philippe le Hardy n CCIV \n", "9400 Dix Vol de Plantes peintes en miniature par Nicol Robert vide Hist Plantarum \n", "9401 Cinq Vol d'Oiseaux peints en Miniature par Nic Robert vide Histor Animalium \n", "9402 Divers Portraits Ceremonies Marches &c des Turcs & d'autres nations du Levant peints en miniature n CCLXX \n", "\n", " years \n", "5 [] \n", "17 [] \n", "21 [] \n", "22 [] \n", "24 [] \n", "... ... \n", "9398 [] \n", "9399 [] \n", "9400 [] \n", "9401 [] \n", "9402 [] \n", "\n", "[1324 rows x 10 columns]" ] }, "execution_count": 527, "metadata": {}, "output_type": "execute_result" } ], "source": [ "without_years" ] }, { "cell_type": "code", "execution_count": 530, "id": "1bacd9fd-af4a-4f50-b0bc-10a4a5af1ee8", "metadata": { "tags": [] }, "outputs": [], "source": [ "before_1501.to_excel('data/man_catalog/Vor 1501.xlsx')" ] }, { "cell_type": "code", "execution_count": 531, "id": "dbd5e8bc-a4a0-46f7-b0ca-8fe84de50e19", "metadata": { "tags": [] }, "outputs": [], "source": [ "without_years.to_excel('data/man_catalog/Ohne Jahresangabe.xlsx')" ] }, { "cell_type": "code", "execution_count": null, "id": "e0c65ab2-9951-4cab-a859-55b7d5427d57", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.10" } }, "nbformat": 4, "nbformat_minor": 5 }