# ---
# jupyter:
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Catalogue matching for records without a BE/Ink shelf mark
#
# For every record whose `Signatur` contains neither `BE` nor `Ink`, a short
# query string is built from its metadata and fuzzy-matched against the cleaned
# catalogue entries. The best candidates per record are written to an Excel
# sheet together with an empty `control` column (presumably filled in during
# manual checking).

# %%
import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from IPython.display import display
from thefuzz import fuzz, process
from tqdm.notebook import tqdm

pd.set_option('display.max_colwidth', None)

# %% [markdown]
# ## Configuration

# %%
ENTRIES_PATH = '../Daten/Katalogabgleich/Einträge.xlsx'
RECORDS_PATH = '../Daten/Vorhersagen/WIP_final_BE_3.xlsx'
CANDIDATES_PATH = '../Daten/Katalogabgleich/Kandidaten/no_BE_Kandidaten.xlsx'

# Metadata columns that are concatenated (in this order) into the query string.
QUERY_KEYS = ['Autor', 'Mitwirkender', 'Titel', 'Veröffentlichungsort', 'Anfang Veröffentlichungsdatum']

# Number of catalogue candidates kept per record.
N_CANDIDATES = 5

# Scoring function handed to thefuzz.
scorer = fuzz.token_set_ratio

# %% [markdown]
# ## Helpers

# %%
# Lower-case accented letters folded to their base letter; 'æ' becomes 'ae'.
_ACCENT_TABLE = str.maketrans({
    **dict.fromkeys('àáâãå', 'a'),
    **dict.fromkeys('èéêë', 'e'),
    **dict.fromkeys('ìíîï', 'i'),
    **dict.fromkeys('òóôõ', 'o'),
    **dict.fromkeys('ùúû', 'u'),
    'æ': 'ae',
})

# Punctuation anywhere in the string, a leading roman or arabic numeral
# (optionally followed by dots), leading dots, and the markers '= ' and '# '.
# Raw string so that \d and \. are regex escapes, not string escapes.
_NOISE_RE = re.compile(r'[.,:;()¬]|^[CLXVI]+? |^\d+? |^\d+?\.+? |^\.+ ?|= |# ')


def prepare_string(string):
    """Normalise a catalogue string for fuzzy matching.

    Accented lower-case vowels are replaced by their base letter, 'æ' by
    'ae', and the noise described by ``_NOISE_RE`` is removed.

    Parameters
    ----------
    string : str
        Raw entry or title.

    Returns
    -------
    str
        The cleaned string.
    """
    return _NOISE_RE.sub('', string.translate(_ACCENT_TABLE))


def build_query(row):
    """Concatenate the ``QUERY_KEYS`` fields of one record into a query string.

    Missing values are skipped. Every value that is used is followed by a
    single space, so a non-empty result ends with a space.

    Parameters
    ----------
    row : pandas.Series
        One record; must provide every column named in ``QUERY_KEYS``.

    Returns
    -------
    str
        The query string passed to the fuzzy matcher.
    """
    comb_string = ''
    for key in QUERY_KEYS:
        val = row[key]
        if pd.isna(val):
            continue
        if key in ('Autor', 'Mitwirkender'):
            if ',' in val:  # in case of "surname, forename": keep the surname
                val = val.split(',')[0]
            # NOTE(review): confirm that the first-token cut is meant to apply
            # to names without a comma as well.
            val = val.split(' ')[0]
        elif key == 'Titel':
            val = prepare_string(val)
        elif key == 'Anfang Veröffentlichungsdatum':
            # Drop a trailing ".0" when the year is stored as a float.
            val = str(int(val))
        # 'Veröffentlichungsort' is used unchanged.
        comb_string += val + ' '
    return comb_string


# %% [markdown]
# ## Load data

# %%
entry_df = pd.read_excel(ENTRIES_PATH, index_col=0)

# Empty cells are cleaned as '' instead of raising inside prepare_string.
entry_df['cleaned entry'] = entry_df['entry'].fillna('').astype(str).map(prepare_string)

# %%
non_BE_df = pd.read_excel(RECORDS_PATH, index_col=0)

# %% [markdown]
# ## Patch record 22871, which has no metadata

# %%
RECORD_ID = 22871
RECORD_PATCH = {
    'Titel': 'De La coronica general de toda Espana y especialmente del Reyno de Valencia. etc',
    'Autor': 'Beuter, Pero-Anton',
    'Mitwirkender': '',
    'Anfang Veröffentlichungsdatum': '1546',
    'Ende Veröffentlichungsdatum': '1551',
    'Veröffentlichungsdatum': '1546-1551',
    'Veröffentlichungsort': 'Valencia',
    'Veröffentlichungsort (normiert)': 'Valencia',
    'Sprache': 'Spanish',
}

# .at would silently append a new row for an unknown label.
assert RECORD_ID in non_BE_df.index, f'record {RECORD_ID} not found'
for column, value in RECORD_PATCH.items():
    non_BE_df.at[RECORD_ID, column] = value

# %% [markdown]
# ## Select records without a BE/Ink shelf mark

# %%
# na=False: a record without a shelf mark counts as "not BE/Ink" and is kept.
has_be_or_ink = non_BE_df['Signatur'].str.contains('BE|Ink', na=False)
no_BE = non_BE_df[~has_be_or_ink]

# %% [markdown]
# ## Fuzzy matching against the catalogue entries

# %%
better_matches = []

for index, row in tqdm(no_BE.iterrows(), total=len(no_BE)):
    comb_string = build_query(row)
    # With a Series as choices each hit is (matched text, score, index label).
    matches_lis = process.extract(comb_string, entry_df['cleaned entry'], scorer=scorer, limit=N_CANDIDATES)
    flat_matches = [field for match in matches_lis for field in (match[0], match[1], match[2])]
    # Pad so every row has the same width even with fewer than N_CANDIDATES hits.
    flat_matches += [None] * (3 * N_CANDIDATES - len(flat_matches))
    better_matches.append([comb_string] + flat_matches)

match_columns = ['input'] + [
    f'{name}_{rank}'
    for rank in range(1, N_CANDIDATES + 1)
    for name in ('match', 'score', 'id')
]
matches_df = pd.DataFrame(better_matches, columns=match_columns)
matches_df['control'] = ''

assert len(matches_df) == len(no_BE)

# %% [markdown]
# ## Export candidates

# %%
score_columns = [f'score_{rank}' for rank in range(1, N_CANDIDATES + 1)]
matches_df_no_score = matches_df.drop(score_columns, axis=1)
# Rows of matches_df are in the same order as no_BE, so the labels line up.
matches_df_no_score.insert(1, 'input_id', no_BE.index)

# %%
matches_df_no_score.to_excel(CANDIDATES_PATH, index=False)