Skip to content
Snippets Groups Projects
Commit 7d520b2d authored by smayer's avatar smayer
Browse files

Add no BE matching data

parent 517c1446
No related branches found
No related tags found
No related merge requests found
File added
%% Cell type:code id:a910c4c5-3a61-462b-ac07-c9545fe7ae40 tags:
``` python
import pandas as pd
import re
import numpy as np
from thefuzz import fuzz, process
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import requests
import json
from IPython.display import display
pd.set_option('display.max_colwidth', None)
```
%% Cell type:code id:83013484-2a55-4819-8b30-b2f8cbbe7981 tags:
``` python
# Read the entries spreadsheet, using its first column as the index.
entry_df = pd.read_excel('../Daten/Katalogabgleich/Einträge.xlsx', index_col=0)
def prepare_string(string):
    """Normalise a string so it can be compared by fuzzy matching.

    The listed accented vowels are folded to their base letter and ``æ``
    becomes ``ae``; characters outside those sets (for example ``ä``, ``ö``,
    ``ü``) pass through unchanged. Afterwards the punctuation characters
    ``.,:;()¬``, a leading Roman or Arabic numeral followed by a space, and
    every ``= `` / ``# `` marker are removed.

    Args:
        string: The text to clean.

    Returns:
        The cleaned text.
    """
    # Applied in order; none of the replacements produces a character that a
    # later pattern would touch.
    accent_folds = (
        (r'[àáâãå]', 'a'),
        (r'[èéêë]', 'e'),
        (r'[ìíîï]', 'i'),
        (r'[òóôõ]', 'o'),
        (r'[ùúû]', 'u'),
        (r'æ', 'ae'),
    )
    new = string
    for pattern, replacement in accent_folds:
        new = re.sub(pattern, replacement, new)
    # Raw string: `\d` and `\.` are regex escapes, not Python string escapes.
    # Without the `r` prefix they trigger an invalid-escape-sequence warning.
    return re.sub(r'[.,:;()¬]|^[CLXVI]+? |^\d+? |^\d+?\.+? |^\.+ ?|= |# ', '', new)
# Store the normalised form of every entry alongside the original text.
entry_df['cleaned entry'] = entry_df['entry'].apply(prepare_string)
```
%% Cell type:code id:1279d6ea-48a2-4f65-9cfa-b1f92eac16f1 tags:
``` python
# Read WIP_final_BE_3.xlsx, using its first column as the index.
non_BE_df = pd.read_excel('../Daten/Vorhersagen/WIP_final_BE_3.xlsx', index_col=0)
```
%% Cell type:code id:ddf5d11c-5f72-4bc8-ab8f-0a1e0f01e60d tags:
``` python
# Row 22871 comes without metadata, so its fields are filled in by hand.
i = 22871
manual_metadata = {
    'Titel': 'De La coronica general de toda Espana y especialmente del Reyno de Valencia. etc',
    'Autor': 'Beuter, Pero-Anton',
    'Mitwirkender': '',
    'Anfang Veröffentlichungsdatum': '1546',
    'Ende Veröffentlichungsdatum': '1551',
    'Veröffentlichungsdatum': '1546-1551',
    'Veröffentlichungsort': 'Valencia',
    'Veröffentlichungsort (normiert)': 'Valencia',
    'Sprache': 'Spanish',
}
# Written cell by cell, in the order listed above.
for column, value in manual_metadata.items():
    non_BE_df.at[i, column] = value
```
%% Cell type:code id:e9c00ca9-c051-4e3a-93cf-133031ca9e7f tags:
``` python
# Keep only the rows whose 'Signatur' contains neither 'BE' nor 'Ink'.
signatures = non_BE_df['Signatur']
has_be = signatures.str.contains('BE')
has_ink = signatures.str.contains('Ink')
no_BE = non_BE_df[~(has_be | has_ink)]
```
%% Cell type:code id:ca242a1c-baf8-4183-a565-a3797d6f4747 tags:
``` python
# For every row of no_BE, build a search string from its metadata and collect
# the five best fuzzy matches from entry_df['cleaned entry'].
# NOTE(review): indentation is not visible in this view of the notebook, so
# the nesting of the statements below must be confirmed against the original.
# In particular, confirm whether the split on ' ' applies only to values that
# contain a comma or to every 'Autor' / 'Mitwirkender' value.
better_matches = []
scorer = fuzz.token_set_ratio
for index, row in tqdm(no_BE.iterrows(), total=len(no_BE)):
# Metadata columns that are concatenated, in this order, into the search string.
keys = ['Autor', 'Mitwirkender', 'Titel', 'Veröffentlichungsort', 'Anfang Veröffentlichungsdatum']
comb_string = ''
for key in keys:
val = row[key]
# Check for a missing value before processing the field.
if not pd.isna(val):
if key == 'Autor' or key == 'Mitwirkender':
if ',' in val: # in case of "surname, forename"
val = val.split(',')[0]
val = val.split(' ')[0]
elif key == 'Titel':
# Titles get the same normalisation as the entries they are matched against.
val = prepare_string(val)
elif key == 'Anfang Veröffentlichungsdatum':
# Converted via int so the year is appended as a string of digits.
val = str(int(val))
else: # key == 'Veröffentlichungsort'
pass
comb_string += val + ' '
# Up to five best candidates, scored with fuzz.token_set_ratio.
matches_lis = process.extract(comb_string, entry_df['cleaned entry'], scorer=scorer, limit=5)
# Each match is unpacked by position into the match_n, score_n and id_n
# columns named in the DataFrame constructor below.
flat_matches = []
for match in matches_lis:
flat_matches.append(match[0])
flat_matches.append(match[1])
flat_matches.append(match[2])
better_matches.append([comb_string] + flat_matches)
# One row per input: the search string followed by five (match, score, id) triples.
matches_df = pd.DataFrame(better_matches, columns=['input', 'match_1', 'score_1', 'id_1', 'match_2', 'score_2', 'id_2', 'match_3', 'score_3', 'id_3', 'match_4', 'score_4', 'id_4', 'match_5', 'score_5', 'id_5'])
# Empty 'control' column appended to the result.
matches_df['control'] = ''
```
%% Output
%% Cell type:code id:e9dd6e64-a45d-4f25-9ad1-624cfc5268fb tags:
``` python
# Drop the five score columns and place the index labels of no_BE right after
# the 'input' column.
score_columns = [f'score_{rank}' for rank in range(1, 6)]
matches_df_no_score = matches_df.drop(columns=score_columns)
matches_df_no_score.insert(1, 'input_id', no_BE.index)
```
%% Cell type:code id:b1f7e1a3-b886-496c-a9ca-ec484d196c24 tags:
``` python
# Write the candidate table to Excel without the DataFrame's own index column.
matches_df_no_score.to_excel('../Daten/Katalogabgleich/Kandidaten/no_BE_Kandidaten.xlsx', index=False)
```
%% Cell type:code id:a4639759-8344-452e-96ce-cfca485165a4 tags:
``` python
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment