diff --git a/Daten/Vorhersagen/WIP_final_BE_4.xlsx b/Daten/Vorhersagen/WIP_final_BE_4.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..924f96a707e2989dc1a5073c0fc333ea351b3cb8 Binary files /dev/null and b/Daten/Vorhersagen/WIP_final_BE_4.xlsx differ diff --git a/Notebooks/Completing_BE_data.ipynb b/Notebooks/Completing_BE_data.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..5258ed79b43b8bf322fddbf7435f0a8ccd817c3a --- /dev/null +++ b/Notebooks/Completing_BE_data.ipynb @@ -0,0 +1,980 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 20, + "id": "a910c4c5-3a61-462b-ac07-c9545fe7ae40", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import re\n", + "import numpy as np\n", + "from thefuzz import fuzz, process\n", + "from tqdm.notebook import tqdm\n", + "import matplotlib.pyplot as plt\n", + "import requests\n", + "import json\n", + "from IPython.display import display\n", + "\n", + "pd.set_option('display.max_colwidth', None)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "83013484-2a55-4819-8b30-b2f8cbbe7981", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "entry_df = pd.read_excel('../Daten/Katalogabgleich/Einträge.xlsx', index_col=0)\n", + "\n", + "def prepare_string(string):\n", + " new = re.sub(r'[à áâãå]', 'a', string)\n", + " new = re.sub(r'[èéêë]', 'e', new)\n", + " new = re.sub(r'[ìÃîï]', 'i', new)\n", + " new = re.sub(r'[òóôõ]', 'o', new)\n", + " new = re.sub(r'[ùúû]', 'u', new)\n", + " new = re.sub(r'æ', 'ae', new)\n", + " new = re.sub('[.,:;()¬]|^[CLXVI]+? |^\\d+? |^\\d+?\\.+? |^\\.+ ?|= |# ', '', new)\n", + " return new\n", + "\n", + "entry_df['cleaned entry'] = entry_df['entry'].apply(lambda x: prepare_string(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "id": "1279d6ea-48a2-4f65-9cfa-b1f92eac16f1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "BE_df = pd.read_excel('../Daten/Vorhersagen/WIP_final_BE_3.xlsx', index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "id": "ddf5d11c-5f72-4bc8-ab8f-0a1e0f01e60d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# fix 22871 without metadata\n", + "i = 22871\n", + "BE_df.at[i, 'Titel'] = 'De La coronica general de toda Espana y especialmente del Reyno de Valencia. etc'\n", + "BE_df.at[i, 'Autor'] = 'Beuter, Pero-Anton'\n", + "BE_df.at[i, 'Mitwirkender'] = ''\n", + "BE_df.at[i, 'Anfang Veröffentlichungsdatum'] = '1546'\n", + "BE_df.at[i, 'Ende Veröffentlichungsdatum'] = '1551'\n", + "BE_df.at[i, 'Veröffentlichungsdatum'] = '1546-1551'\n", + "BE_df.at[i, 'Veröffentlichungsort'] = 'Valencia'\n", + "BE_df.at[i, 'Veröffentlichungsort (normiert)'] = 'Valencia'\n", + "BE_df.at[i, 'Sprache'] = 'Spanish'" + ] + }, + { + "cell_type": "markdown", + "id": "317bb2ea-42b9-4b34-b13e-d25b4c66da2b", + "metadata": {}, + "source": [ + "# nicht-BE-Signaturen matchen" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "e9c00ca9-c051-4e3a-93cf-133031ca9e7f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "no_BE = BE_df[~(BE_df['Signatur'].str.contains('BE') | BE_df['Signatur'].str.contains('Ink'))]" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "ca242a1c-baf8-4183-a565-a3797d6f4747", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a7b5d48f5fcd4cbbbf56291a871746c5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/804 [00:00<?, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "better_matches = []\n", + "scorer = fuzz.token_set_ratio\n", + "\n", + "for index, row in tqdm(no_BE.iterrows(), total=len(no_BE)):\n", + " keys = ['Autor', 'Mitwirkender', 'Titel', 'Veröffentlichungsort', 'Anfang Veröffentlichungsdatum']\n", + " comb_string = ''\n", + " for key in keys:\n", + " val = row[key]\n", + " if not pd.isna(val):\n", + " if key == 'Autor' or key == 'Mitwirkender':\n", + " if ',' in val: # falls name, vorname\n", + " val = val.split(',')[0]\n", + " val = val.split(' ')[0]\n", + " elif key == 'Titel':\n", + " val = prepare_string(val)\n", + " elif key == 'Anfang Veröffentlichungsdatum':\n", + " val = str(int(val))\n", + " else: # key == 'Veröffentlichungsort'\n", + " pass\n", + " comb_string += val + ' '\n", + " \n", + " matches_lis = process.extract(comb_string, entry_df['cleaned entry'], scorer=scorer, limit=5)\n", + " flat_matches = []\n", + " for match in matches_lis:\n", + " flat_matches.append(match[0])\n", + " flat_matches.append(match[1])\n", + " flat_matches.append(match[2])\n", + " better_matches.append([comb_string] + flat_matches)\n", + "\n", + "matches_df = pd.DataFrame(better_matches, columns=['input', 'match_1', 'score_1', 'id_1', 'match_2', 'score_2', 'id_2', 'match_3', 'score_3', 'id_3', 'match_4', 'score_4', 'id_4', 'match_5', 'score_5', 'id_5'])\n", + "matches_df['control'] = ''" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "e9dd6e64-a45d-4f25-9ad1-624cfc5268fb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "matches_df_no_score = matches_df.drop(['score_1', 'score_2', 'score_3', 'score_4', 'score_5'], axis=1)\n", + "matches_df_no_score.insert(1, 'input_id', no_BE.index)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "b1f7e1a3-b886-496c-a9ca-ec484d196c24", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "matches_df_no_score.to_excel('../Daten/Katalogabgleich/Kandidaten/no_BE_Kandidaten.xlsx', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "a4639759-8344-452e-96ce-cfca485165a4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9591" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(BE_df[BE_df['Wappenklassifizierung'].isin(['A', 'B', 'C']) | (BE_df['hs. Katalog'] == 1)])" + ] + }, + { + "cell_type": "markdown", + "id": "c964166c-dfad-4c55-bffd-0174e430e0db", + "metadata": {}, + "source": [ + "# Fehlende Inkunabeln in BE_df Tabelle eintragen" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "id": "88ad2906-9c95-4033-b922-791360ec3d6c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "23670\n" + ] + } + ], + "source": [ + "ink_corrections = [\n", + " {'Signatur': 'Ink 12.F.7', 'Titel': 'Expositio super toto psalterio', 'Autor': 'Turrecremata, Johannes de', 'Mitwirkender': 'Udalricus Gallus', 'Veröffentlichungsdatum': '4. Oktober 1470', 'Anfang Veröffentlichungsdatum': 1470, 'Veröffentlichungsort': 'Rome', 'Veröffentlichungsort (normiert)': 'Rom', 'Sprache': 'Latin'},\n", + " {'Signatur': 'Ink 2.E.13', 'Titel': 'Psalterium latinum', 'Autor': '', 'Mitwirkender': 'Reyser, Georg', 'Veröffentlichungsdatum': 'nicht nach 1489', 'Anfang Veröffentlichungsdatum': 1489, 'Veröffentlichungsort': 'Würzburg', 'Veröffentlichungsort (normiert)': 'Würzburg', 'Sprache': 'Latin'},\n", + " {'Signatur': 'Ink 3.F.32', 'Titel': 'Erotemata', 'Autor': 'Chalcondylas, Demetrius', 'Mitwirkender': 'Uldericus Scinzenzeler', 'Veröffentlichungsdatum': 'um 1493', 'Anfang Veröffentlichungsdatum': 1493, 'Veröffentlichungsort': 'Milan', 'Veröffentlichungsort (normiert)': 'Mailand', 'Sprache': 'Ancient Greek (to 1453)'},\n", + " {'Signatur': 'Ink 32-248', 'Titel': 'De animalibus', 'Autor': 'Avicenna', 'Mitwirkender': 'Scotus, Michael; Gregori, Giovanni de', 'Veröffentlichungsdatum': 'um 1500', 'Anfang Veröffentlichungsdatum': 1500, 'Veröffentlichungsort': 'Venedig', 'Veröffentlichungsort (normiert)': 'Venedig', 'Sprache': 'Latin'},\n", + " {'Signatur': 'Ink 4.F.8', 'Titel': 'Opera', 'Autor': 'Sallustius Crispus, Gaius', 'Mitwirkender': 'Vindelinus de Spira', 'Veröffentlichungsdatum': 'um 1470', 'Anfang Veröffentlichungsdatum': 1470, 'Veröffentlichungsort': 'Venice', 'Veröffentlichungsort (normiert)': 'Venedig', 'Sprache': 'Latin'},\n", + " {'Signatur': 'Ink 5.B.1', 'Titel': 'Historia naturalis', 'Autor': 'Plinius Secundus, Gaius', 'Mitwirkender': 'Jenson, Nicolaus', 'Veröffentlichungsdatum': '1472', 'Anfang Veröffentlichungsdatum': 1472, 'Veröffentlichungsort': 'Venice', 'Veröffentlichungsort (normiert)': 'Venedig', 'Sprache': 'Latin'},\n", + " {'Signatur': 'Ink 5.C.11', 'Barcode': 'dtl_5304244', 'Titel': 'Lancelot du Lac', 'Autor': '', 'Mitwirkender': 'Vérard, Antoine', 'Veröffentlichungsdatum': '1494', 'Anfang Veröffentlichungsdatum': 1494, 'Veröffentlichungsort': 'Paris', 'Veröffentlichungsort (normiert)': 'Paris', 'Sprache': 'French'},\n", + " {'Signatur': 'Ink 5.E.16', 'Titel': 'Missale mixtum secundum regulam Beati Isidori, dictum Mozarabes', 'Autor': '', 'Mitwirkender': 'Hagembach, Peter', 'Veröffentlichungsdatum': '9. Jan. 1500', 'Anfang Veröffentlichungsdatum': 1500, 'Veröffentlichungsort': 'Toledo', 'Veröffentlichungsort (normiert)': 'Toledo', 'Sprache': 'Latin'},\n", + " {'Signatur': 'Ink 5.E.19', 'Titel': 'Commentarii', 'Autor': 'Caesar, Gaius Julius', 'Mitwirkender': 'Zarotus, Antonius', 'Veröffentlichungsdatum': '10. Feb. 1477', 'Anfang Veröffentlichungsdatum': 1477, 'Veröffentlichungsort': 'Milan', 'Veröffentlichungsort (normiert)': 'Mailand', 'Sprache': 'Latin'},\n", + " {'Signatur': 'Ink 5.E.24', 'Titel': 'Orthographia', 'Autor': 'Tortellius, Johannes', 'Mitwirkender': 'Pincius, Philippus', 'Veröffentlichungsdatum': '12. Apr. 1493', 'Anfang Veröffentlichungsdatum': 1493, 'Veröffentlichungsort': 'Venice', 'Veröffentlichungsort (normiert)': 'Venedig', 'Sprache': 'Latin'},\n", + " {'Signatur': 'Ink 5.E.25', 'Titel': 'Metamorphosis sive De asino aureo', 'Autor': 'Apuleius, Madaurensis', 'Mitwirkender': 'Faelli, Benedictus Hectoris', 'Veröffentlichungsdatum': '1. Aug. 1500', 'Anfang Veröffentlichungsdatum': 1500, 'Veröffentlichungsort': 'Bologna', 'Veröffentlichungsort (normiert)': 'Bologna', 'Sprache': 'Latin'},\n", + " {'Signatur': 'Ink 7.E.12', 'Titel': 'Scriptores rei rusticae', 'Autor': '', 'Mitwirkender': 'Beroaldus, Philippus; Benedictus Hectoris', 'Veröffentlichungsdatum': '19. Sept. 1494', 'Anfang Veröffentlichungsdatum': 1494, 'Veröffentlichungsort': 'Bologna', 'Veröffentlichungsort (normiert)': 'Bologna', 'Sprache': 'Latin'},\n", + " {'Signatur': 'Ink 9.F.2', 'Titel': 'Liber de vita christi ac pontificum omnium', 'Autor': 'Platina, Bartholomaeus', 'Mitwirkender': 'Koberger, Anton', 'Veröffentlichungsdatum': '11. Aug. 1481', 'Anfang Veröffentlichungsdatum': 1481, 'Veröffentlichungsort': 'Nürnberg', 'Veröffentlichungsort (normiert)': 'Nürnberg', 'Sprache': 'Latin'}\n", + "]\n", + "\n", + "max_BE_index = max(BE_df.index) + 1\n", + "print(max_BE_index)\n", + "ink_add_df = pd.DataFrame(ink_corrections)\n", + "ink_add_df.index = ink_add_df.index + max_BE_index" + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "id": "9408275f-b159-454a-ab52-d77d86ef3ce9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Signatur</th>\n", + " <th>Barcode</th>\n", + " <th>Titel</th>\n", + " <th>Autor</th>\n", + " <th>Mitwirkender</th>\n", + " <th>Anfang Veröffentlichungsdatum</th>\n", + " <th>Ende Veröffentlichungsdatum</th>\n", + " <th>Veröffentlichungsdatum</th>\n", + " <th>Veröffentlichungsort</th>\n", + " <th>Veröffentlichungsort (normiert)</th>\n", + " <th>...</th>\n", + " <th>hs. Katalogseite Digitalisat</th>\n", + " <th>Wissensklasse</th>\n", + " <th>Wissensunterklasse</th>\n", + " <th>Formatangabe</th>\n", + " <th>hs. Katalogseite Handschrift</th>\n", + " <th>hs. Katalogeintrag ID</th>\n", + " <th>hs. Katalogeintrag</th>\n", + " <th>hs. Katalog Image URL</th>\n", + " <th>dup_title</th>\n", + " <th>copy_from</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>23376</th>\n", + " <td>*28.A.79.(Vol.1)</td>\n", + " <td>Z222907107</td>\n", + " <td>Histoire des ouvrages des scavans</td>\n", + " <td>Basnage de Beauval, Henri</td>\n", + " <td>NaN</td>\n", + " <td>1687.0</td>\n", + " <td>1709.0</td>\n", + " <td>1687-1709</td>\n", + " <td>Rotterdam</td>\n", + " <td>Rotterdam</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " <td>-1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23383</th>\n", + " <td>*28.A.79.(Vol.10)</td>\n", + " <td>Z222908100</td>\n", + " <td>Histoire des ouvrages des scavans</td>\n", + " <td>Basnage de Beauval, Henri</td>\n", + " <td>NaN</td>\n", + " <td>1687.0</td>\n", + " <td>1709.0</td>\n", + " <td>1687-1709</td>\n", + " <td>Rotterdam</td>\n", + " <td>Rotterdam</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " <td>-1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23384</th>\n", + " <td>*28.A.79.(Vol.12)</td>\n", + " <td>Z222908306</td>\n", + " <td>Histoire des ouvrages des scavans</td>\n", + " <td>Basnage de Beauval, Henri</td>\n", + " <td>NaN</td>\n", + " <td>1687.0</td>\n", + " <td>1709.0</td>\n", + " <td>1687-1709</td>\n", + " <td>Rotterdam</td>\n", + " <td>Rotterdam</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " <td>-1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23385</th>\n", + " <td>*28.A.79.(Vol.13)</td>\n", + " <td>Z222908409</td>\n", + " <td>Histoire des ouvrages des scavans</td>\n", + " <td>Basnage de Beauval, Henri</td>\n", + " <td>NaN</td>\n", + " <td>1687.0</td>\n", + " <td>1709.0</td>\n", + " <td>1687-1709</td>\n", + " <td>Rotterdam</td>\n", + " <td>Rotterdam</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " <td>-1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23386</th>\n", + " <td>*28.A.79.(Vol.14)</td>\n", + " <td>Z222908501</td>\n", + " <td>Histoire des ouvrages des scavans</td>\n", + " <td>Basnage de Beauval, Henri</td>\n", + " <td>NaN</td>\n", + " <td>1687.0</td>\n", + " <td>1709.0</td>\n", + " <td>1687-1709</td>\n", + " <td>Rotterdam</td>\n", + " <td>Rotterdam</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " <td>-1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23636</th>\n", + " <td>Ink 9.F.4</td>\n", + " <td>NaN</td>\n", + " <td>Opera</td>\n", + " <td>Sallustius Crispus, Gaius</td>\n", + " <td>NaN</td>\n", + " <td>1481.0</td>\n", + " <td>NaN</td>\n", + " <td>23 Dec. 1481</td>\n", + " <td>Venice</td>\n", + " <td>Venedig</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " <td>-3.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22640</th>\n", + " <td>Ink 9.F.5</td>\n", + " <td>1460328-10</td>\n", + " <td>Biblia ; Interpretationes Hebraicorum nominum</td>\n", + " <td>NaN</td>\n", + " <td>Wild, Leonhard</td>\n", + " <td>1481.0</td>\n", + " <td>NaN</td>\n", + " <td>1481</td>\n", + " <td>Venedig</td>\n", + " <td>Venedig</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>True</td>\n", + " <td>-3.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22650</th>\n", + " <td>SA.71.E.58</td>\n", + " <td>Z252861302</td>\n", + " <td>Dialogue sur la musique des anciens</td>\n", + " <td>Chateauneuf, Francois abbe de</td>\n", + " <td>NaN</td>\n", + " <td>1725.0</td>\n", + " <td>NaN</td>\n", + " <td>1725</td>\n", + " <td>Paris</td>\n", + " <td>Paris</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>False</td>\n", + " <td>-1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22651</th>\n", + " <td>SA.71.F.74</td>\n", + " <td>Z252867808</td>\n", + " <td>Friderici Adolfi Lampe De Cymbalis Veterum Libri Tres</td>\n", + " <td>Ember, Paul</td>\n", + " <td>Hase, Cornelius <<von>>; Röell, Herman Alexander</td>\n", + " <td>1703.0</td>\n", + " <td>NaN</td>\n", + " <td>1703</td>\n", + " <td>Trajecti Ad Rhenum</td>\n", + " <td>Utrecht</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>False</td>\n", + " <td>-1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23439</th>\n", + " <td>SA.73.B.48</td>\n", + " <td>Z25920770X</td>\n", + " <td>Claudii Ptolomaei harmonicorum libri tres. Ex Codd. Mss. Undecim, nunc primum graece editus. Johannes Wallis ... recensuit, ed. (etc.)</td>\n", + " <td>Ptolemaeus, Claudius</td>\n", + " <td>Wallis, Johannes</td>\n", + " <td>1682.0</td>\n", + " <td>NaN</td>\n", + " <td>1682</td>\n", + " <td>Oxford</td>\n", + " <td>Oxford</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>False</td>\n", + " <td>-1.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>22874 rows × 35 columns</p>\n", + "</div>" + ], + "text/plain": [ + " Signatur Barcode \n", + "23376 *28.A.79.(Vol.1) Z222907107 \\\n", + "23383 *28.A.79.(Vol.10) Z222908100 \n", + "23384 *28.A.79.(Vol.12) Z222908306 \n", + "23385 *28.A.79.(Vol.13) Z222908409 \n", + "23386 *28.A.79.(Vol.14) Z222908501 \n", + "... ... ... \n", + "23636 Ink 9.F.4 NaN \n", + "22640 Ink 9.F.5 1460328-10 \n", + "22650 SA.71.E.58 Z252861302 \n", + "22651 SA.71.F.74 Z252867808 \n", + "23439 SA.73.B.48 Z25920770X \n", + "\n", + " Titel \n", + "23376 Histoire des ouvrages des scavans \\\n", + "23383 Histoire des ouvrages des scavans \n", + "23384 Histoire des ouvrages des scavans \n", + "23385 Histoire des ouvrages des scavans \n", + "23386 Histoire des ouvrages des scavans \n", + "... ... \n", + "23636 Opera \n", + "22640 Biblia ; Interpretationes Hebraicorum nominum \n", + "22650 Dialogue sur la musique des anciens \n", + "22651 Friderici Adolfi Lampe De Cymbalis Veterum Libri Tres \n", + "23439 Claudii Ptolomaei harmonicorum libri tres. Ex Codd. Mss. Undecim, nunc primum graece editus. Johannes Wallis ... recensuit, ed. (etc.) \n", + "\n", + " Autor \n", + "23376 Basnage de Beauval, Henri \\\n", + "23383 Basnage de Beauval, Henri \n", + "23384 Basnage de Beauval, Henri \n", + "23385 Basnage de Beauval, Henri \n", + "23386 Basnage de Beauval, Henri \n", + "... ... \n", + "23636 Sallustius Crispus, Gaius \n", + "22640 NaN \n", + "22650 Chateauneuf, Francois abbe de \n", + "22651 Ember, Paul \n", + "23439 Ptolemaeus, Claudius \n", + "\n", + " Mitwirkender \n", + "23376 NaN \\\n", + "23383 NaN \n", + "23384 NaN \n", + "23385 NaN \n", + "23386 NaN \n", + "... ... \n", + "23636 NaN \n", + "22640 Wild, Leonhard \n", + "22650 NaN \n", + "22651 Hase, Cornelius <<von>>; Röell, Herman Alexander \n", + "23439 Wallis, Johannes \n", + "\n", + " Anfang Veröffentlichungsdatum Ende Veröffentlichungsdatum \n", + "23376 1687.0 1709.0 \\\n", + "23383 1687.0 1709.0 \n", + "23384 1687.0 1709.0 \n", + "23385 1687.0 1709.0 \n", + "23386 1687.0 1709.0 \n", + "... ... ... \n", + "23636 1481.0 NaN \n", + "22640 1481.0 NaN \n", + "22650 1725.0 NaN \n", + "22651 1703.0 NaN \n", + "23439 1682.0 NaN \n", + "\n", + " Veröffentlichungsdatum Veröffentlichungsort \n", + "23376 1687-1709 Rotterdam \\\n", + "23383 1687-1709 Rotterdam \n", + "23384 1687-1709 Rotterdam \n", + "23385 1687-1709 Rotterdam \n", + "23386 1687-1709 Rotterdam \n", + "... ... ... \n", + "23636 23 Dec. 1481 Venice \n", + "22640 1481 Venedig \n", + "22650 1725 Paris \n", + "22651 1703 Trajecti Ad Rhenum \n", + "23439 1682 Oxford \n", + "\n", + " Veröffentlichungsort (normiert) ... hs. Katalogseite Digitalisat \n", + "23376 Rotterdam ... NaN \\\n", + "23383 Rotterdam ... NaN \n", + "23384 Rotterdam ... NaN \n", + "23385 Rotterdam ... NaN \n", + "23386 Rotterdam ... NaN \n", + "... ... ... ... \n", + "23636 Venedig ... NaN \n", + "22640 Venedig ... NaN \n", + "22650 Paris ... NaN \n", + "22651 Utrecht ... NaN \n", + "23439 Oxford ... NaN \n", + "\n", + " Wissensklasse Wissensunterklasse Formatangabe \n", + "23376 NaN NaN NaN \\\n", + "23383 NaN NaN NaN \n", + "23384 NaN NaN NaN \n", + "23385 NaN NaN NaN \n", + "23386 NaN NaN NaN \n", + "... ... ... ... \n", + "23636 NaN NaN NaN \n", + "22640 NaN NaN NaN \n", + "22650 NaN NaN NaN \n", + "22651 NaN NaN NaN \n", + "23439 NaN NaN NaN \n", + "\n", + " hs. Katalogseite Handschrift hs. Katalogeintrag ID \n", + "23376 NaN NaN \\\n", + "23383 NaN NaN \n", + "23384 NaN NaN \n", + "23385 NaN NaN \n", + "23386 NaN NaN \n", + "... ... ... \n", + "23636 NaN NaN \n", + "22640 NaN NaN \n", + "22650 NaN NaN \n", + "22651 NaN NaN \n", + "23439 NaN NaN \n", + "\n", + " hs. Katalogeintrag hs. Katalog Image URL dup_title copy_from \n", + "23376 NaN NaN True -1.0 \n", + "23383 NaN NaN True -1.0 \n", + "23384 NaN NaN True -1.0 \n", + "23385 NaN NaN True -1.0 \n", + "23386 NaN NaN True -1.0 \n", + "... ... ... ... ... \n", + "23636 NaN NaN True -3.0 \n", + "22640 NaN NaN True -3.0 \n", + "22650 NaN NaN False -1.0 \n", + "22651 NaN NaN False -1.0 \n", + "23439 NaN NaN False -1.0 \n", + "\n", + "[22874 rows x 35 columns]" + ] + }, + "execution_count": 185, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "BE_with_Ink_df = pd.concat([BE_df, ink_add_df])\n", + "BE_with_Ink_df.sort_values(by='Signatur', inplace=True)\n", + "BE_with_Ink_df" + ] + }, + { + "cell_type": "markdown", + "id": "bd4ecfac-f115-4f61-bb8c-557505db00e6", + "metadata": {}, + "source": [ + "# hs. Matches der Inkunabeln eintragen" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "id": "f8f8d054-aa6d-4629-8cdc-ddfae03e812e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ink_matches = pd.read_excel('../Daten/Katalog/Inkunabeln-Eugeniana.xlsx')\n", + "ink_matches_dropna = ink_matches.dropna(subset='entry_ID')" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "id": "41d1a6ef-6751-4efb-bb19-38edfbe6b254", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "entries = pd.read_excel('../Daten/Katalogabgleich/Einträge.xlsx', index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "id": "51d82520-cf1f-4369-81ea-2f146be81b65", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dod_id = {\n", + " '14.376': 51202, \n", + " '14.377': 51184,\n", + " '14.378': 51219\n", + "}\n", + "\n", + "BE_ids_matches = {}\n", + "\n", + "for i, row in ink_matches_dropna.iterrows():\n", + " sig = row['Signatur'].replace('.', '').replace(' ', '')\n", + " entry_id = row['entry_ID']\n", + " entry_index = entries[entries['entry_ID'] == entry_id].index\n", + " BE_index = BE_with_Ink_df[BE_with_Ink_df['Signatur'].str.replace(r'\\.| ', '', regex=True) == sig].index\n", + " if len(BE_index) == 0:\n", + " print('Signature', sig, 'not found')\n", + " elif len(BE_index) == 1:\n", + " i = BE_index.values[0]\n", + " if i not in BE_ids_matches.keys():\n", + " BE_ids_matches[i] = [entry_index.values[0]]\n", + " else:\n", + " BE_ids_matches[i] += [entry_index.values[0]]\n", + " else:\n", + " print('More than one match for signature', sig)" + ] + }, + { + "cell_type": "code", + "execution_count": 214, + "id": "2b30e44b-fcb1-4256-9213-a57d59b90a57", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys([23450, 23509, 23595, 23543, 23488, 23475, 23508, 23573, 23463, 23468, 23566, 23496, 23478, 23531, 23557, 23621, 23670, 23638, 22618, 23671, 23552, 23600, 23614, 23539, 23479, 23452, 23592, 23513, 23563, 23629, 23560, 23480, 23446, 23519, 23669, 23481, 23447, 23593, 23565, 23529, 23536, 23538, 23641, 23672, 23632, 23589, 23646, 23532, 23547, 23516, 23472, 22619, 23612, 23630, 23476, 23576, 23673, 22621, 23515, 23618, 22622, 23619, 23572, 23651, 23489, 23527, 23665, 23455, 23533, 23631, 23564, 23449, 23492, 23640, 23659, 23483, 23652, 23554, 23570, 23521, 23627, 22623, 23626, 23548, 23490, 23502, 23577, 23517, 23493, 23556, 23603, 23654, 23500, 23674, 23465, 23474, 22625, 23491, 23616, 23608, 23507, 23551, 23610, 23540, 23562, 23571, 23494, 23663, 23473, 23530, 23645, 23585, 23624, 23588, 23458, 23578, 22626, 22627, 23469, 22628, 23526, 23675, 22629, 23445, 23676, 22630, 23657, 23598, 23639, 22631, 23510, 23511, 23512, 23459, 23499, 23451, 23653, 23622, 23677, 23597, 23678, 23544, 23582, 23628, 23666, 23679, 23680, 22632, 23574, 23537, 23524, 23606, 23604, 23485, 23581, 23613, 23609, 23466, 23504, 23460, 23546, 23664, 23497, 23471, 23584, 23528, 23607, 23575, 23611, 23482, 23656, 23586, 23506, 23561, 23658, 22633, 23569, 23545, 23596, 22634, 23580, 23462, 23643, 23650, 23525, 23623, 23542, 23467, 23487, 22635, 23464, 23501, 22636, 23681, 23649, 23662, 23520, 23587, 23590, 23448, 23503, 23484, 23498, 23615, 23661, 23591, 23559, 23602, 22637, 23594, 23642, 23523, 23568, 23535, 23605, 23617, 23668, 23456, 23634, 23549, 22638, 23599, 23555, 23637, 23682, 22639, 23660, 23583, 23461, 23636, 22640])" + ] + }, + "execution_count": 214, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "BE_ids_matches.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 216, + "id": "c8b7b940-ef03-49f0-8fc4-d174faee86fe", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "for BE_id, matches in BE_ids_matches.items():\n", + " match_dict = {\n", + " 'hs. Katalog': 1,\n", + " 'hs. Katalog Konfidenz': [],\n", + " 'hs. Katalogband': [],\n", + " 'hs. Katalogseite Digitalisat': [],\n", + " 'Wissensklasse': [],\n", + " 'Wissensunterklasse': [],\n", + " 'Formatangabe': [],\n", + " 'hs. Katalogseite Handschrift': [],\n", + " 'hs. Katalogeintrag ID': [],\n", + " 'hs. Katalogeintrag': [],\n", + " 'hs. Katalog Image URL': []\n", + " }\n", + " for m in matches:\n", + " hw_entry = entries.loc[m]\n", + " match_dict['hs. Katalog Konfidenz'] += ['sicher']\n", + " match_dict['hs. Katalogband'] += [str(hw_entry['volume'])]\n", + " match_dict['hs. Katalogseite Digitalisat'] += [str(hw_entry['page number'])]\n", + " match_dict['Wissensklasse'] += [hw_entry['category']]\n", + " match_dict['Wissensunterklasse'] += [hw_entry['subcategory'] if not pd.isna(hw_entry['subcategory']) else '']\n", + " match_dict['Formatangabe'] += [hw_entry['format'] if not pd.isna(hw_entry['format']) else '']\n", + " match_dict['hs. Katalogseite Handschrift'] += [hw_entry['handwritten page number']]\n", + " match_dict['hs. Katalogeintrag ID'] += [hw_entry['entry_ID']]\n", + " match_dict['hs. Katalogeintrag'] += [hw_entry['entry']]\n", + " match_dict['hs. Katalog Image URL'] += [f\"https://iiif.onb.ac.at/images/DOD/{dod_id[str(hw_entry['volume'])]}/{hw_entry['page number']:08}.jp2/full/full/0/native.jpg\"]\n", + " \n", + " for key, val in match_dict.items():\n", + " if key != 'hs. Katalog':\n", + " val = ' | '.join(val)\n", + " BE_with_Ink_df.at[BE_id, key] = val" + ] + }, + { + "cell_type": "markdown", + "id": "3288bfdc-876f-4aa7-89ea-448584339183", + "metadata": {}, + "source": [ + "# simple und komplexe Klassifizierung der Eugeniana-Daten" + ] + }, + { + "cell_type": "code", + "execution_count": 244, + "id": "06fbcb36-61b6-44d9-bb5c-79df81e02f89", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "determined: 2 expected: 2\n", + "determined: 1 expected: 1\n", + "determined: 1 expected: 1\n", + "determined: 0 expected: 0\n" + ] + } + ], + "source": [ + "def complex_classify(BE_row):\n", + " if (BE_row['Wappenklassifizierung'] in ['A', 'B', 'C']) or (BE_row['hs. Katalog Konfidenz'] == 'sicher'):\n", + " return 2\n", + " elif (BE_row['hs. Katalog Konfidenz'] == 'unsicher') or (BE_row['Anfang Veröffentlichungsdatum'] <= 1736):\n", + " return 1\n", + " else:\n", + " return 0\n", + "\n", + "sample_ids = [22871, 2954, 6695, 9396]\n", + "expected_class = [2, 1, 1, 0]\n", + "for ind in sample_ids:\n", + " print('determined:', complex_classify(non_BE_df.loc[ind]), 'expected:', expected_class[sample_ids.index(ind)])\n", + " \n", + "BE_with_Ink_df['Einfache Klassifizierung'] = BE_with_Ink_df['Wappenklassifizierung'].isin(['A', 'B', 'C']) | (BE_with_Ink_df['hs. Katalog'] == 1)\n", + "BE_with_Ink_df['Komplexe Klassifizierung'] = BE_with_Ink_df.apply(lambda x: complex_classify(x), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 256, + "id": "b9ee6630-7122-44d9-9d2f-c8113e5a8e97", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Einfache Klassifizierung\n", + "False 13069\n", + "True 9805\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 256, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "BE_with_Ink_df['Einfache Klassifizierung'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 255, + "id": "175c1147-0063-45e4-8498-39ee390ea7a0", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Komplexe Klassifizierung\n", + "0 10131\n", + "2 9568\n", + "1 3175\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 255, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "BE_with_Ink_df['Komplexe Klassifizierung'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ecc028b-9b46-4bf8-ae24-2aa9c58366bb", + "metadata": {}, + "outputs": [], + "source": [ + "BE_with_Ink_df.to_excel('../Daten/Vorhersagen/WIP_final_BE_4.xlsx', " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Notebooks/Non_BE_matching.ipynb b/Notebooks/Non_BE_matching.ipynb deleted file mode 100644 index d3e0d151be1294683383470eb20e4450188dffc1..0000000000000000000000000000000000000000 --- a/Notebooks/Non_BE_matching.ipynb +++ /dev/null @@ -1,207 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 20, - "id": "a910c4c5-3a61-462b-ac07-c9545fe7ae40", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import re\n", - "import numpy as np\n", - "from thefuzz import fuzz, process\n", - "from tqdm.notebook import tqdm\n", - "import matplotlib.pyplot as plt\n", - "import requests\n", - "import json\n", - "from IPython.display import display\n", - "\n", - "pd.set_option('display.max_colwidth', None)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "83013484-2a55-4819-8b30-b2f8cbbe7981", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "entry_df = pd.read_excel('../Daten/Katalogabgleich/Einträge.xlsx', index_col=0)\n", - "\n", - "def prepare_string(string):\n", - " new = re.sub(r'[à áâãå]', 'a', string)\n", - " new = re.sub(r'[èéêë]', 'e', new)\n", - " new = re.sub(r'[ìÃîï]', 'i', new)\n", - " new = re.sub(r'[òóôõ]', 'o', new)\n", - " new = re.sub(r'[ùúû]', 'u', new)\n", - " new = re.sub(r'æ', 'ae', new)\n", - " new = re.sub('[.,:;()¬]|^[CLXVI]+? |^\\d+? |^\\d+?\\.+? |^\\.+ ?|= |# ', '', new)\n", - " return new\n", - "\n", - "entry_df['cleaned entry'] = entry_df['entry'].apply(lambda x: prepare_string(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "1279d6ea-48a2-4f65-9cfa-b1f92eac16f1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "non_BE_df = pd.read_excel('../Daten/Vorhersagen/WIP_final_BE_3.xlsx', index_col=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "ddf5d11c-5f72-4bc8-ab8f-0a1e0f01e60d", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# fix 22871 without metadata\n", - "i = 22871\n", - "non_BE_df.at[i, 'Titel'] = 'De La coronica general de toda Espana y especialmente del Reyno de Valencia. etc'\n", - "non_BE_df.at[i, 'Autor'] = 'Beuter, Pero-Anton'\n", - "non_BE_df.at[i, 'Mitwirkender'] = ''\n", - "non_BE_df.at[i, 'Anfang Veröffentlichungsdatum'] = '1546'\n", - "non_BE_df.at[i, 'Ende Veröffentlichungsdatum'] = '1551'\n", - "non_BE_df.at[i, 'Veröffentlichungsdatum'] = '1546-1551'\n", - "non_BE_df.at[i, 'Veröffentlichungsort'] = 'Valencia'\n", - "non_BE_df.at[i, 'Veröffentlichungsort (normiert)'] = 'Valencia'\n", - "non_BE_df.at[i, 'Sprache'] = 'Spanish'" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "e9c00ca9-c051-4e3a-93cf-133031ca9e7f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "no_BE = non_BE_df[~(non_BE_df['Signatur'].str.contains('BE') | non_BE_df['Signatur'].str.contains('Ink'))]" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "ca242a1c-baf8-4183-a565-a3797d6f4747", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a7b5d48f5fcd4cbbbf56291a871746c5", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/804 [00:00<?, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "better_matches = []\n", - "scorer = fuzz.token_set_ratio\n", - "\n", - "for index, row in tqdm(no_BE.iterrows(), total=len(no_BE)):\n", - " keys = ['Autor', 'Mitwirkender', 'Titel', 'Veröffentlichungsort', 'Anfang Veröffentlichungsdatum']\n", - " comb_string = ''\n", - " for key in keys:\n", - " val = row[key]\n", - " if not pd.isna(val):\n", - " if key == 'Autor' or key == 'Mitwirkender':\n", - " if ',' in val: # falls name, vorname\n", - " val = val.split(',')[0]\n", - " val = val.split(' ')[0]\n", - " elif key == 'Titel':\n", - " val = prepare_string(val)\n", - " elif key == 'Anfang Veröffentlichungsdatum':\n", - " val = str(int(val))\n", - " else: # key == 'Veröffentlichungsort'\n", - " pass\n", - " comb_string += val + ' '\n", - " \n", - " matches_lis = process.extract(comb_string, entry_df['cleaned entry'], scorer=scorer, limit=5)\n", - " flat_matches = []\n", - " for match in matches_lis:\n", - " flat_matches.append(match[0])\n", - " flat_matches.append(match[1])\n", - " flat_matches.append(match[2])\n", - " better_matches.append([comb_string] + flat_matches)\n", - "\n", - "matches_df = pd.DataFrame(better_matches, columns=['input', 'match_1', 'score_1', 'id_1', 'match_2', 'score_2', 'id_2', 'match_3', 'score_3', 'id_3', 'match_4', 'score_4', 'id_4', 'match_5', 'score_5', 'id_5'])\n", - "matches_df['control'] = ''" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "e9dd6e64-a45d-4f25-9ad1-624cfc5268fb", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "matches_df_no_score = matches_df.drop(['score_1', 'score_2', 'score_3', 'score_4', 'score_5'], axis=1)\n", - "matches_df_no_score.insert(1, 'input_id', no_BE.index)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "b1f7e1a3-b886-496c-a9ca-ec484d196c24", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "matches_df_no_score.to_excel('../Daten/Katalogabgleich/Kandidaten/no_BE_Kandidaten.xlsx', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4639759-8344-452e-96ce-cfca485165a4", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}