Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
BED-general
Manage
Activity
Members
Plan
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Package Registry
Model registry
Operate
Terraform modules
Analyze
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
BED
BED-general
Commits
7d520b2d
Commit
7d520b2d
authored
6 months ago
by
smayer
Browse files
Options
Downloads
Patches
Plain Diff
Add no BE matching data
parent
517c1446
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
Daten/Katalogabgleich/Kandidaten/no_BE_Kandidaten.xlsx
+0
-0
0 additions, 0 deletions
Daten/Katalogabgleich/Kandidaten/no_BE_Kandidaten.xlsx
Notebooks/Non_BE_matching.ipynb
+207
-0
207 additions, 0 deletions
Notebooks/Non_BE_matching.ipynb
with
207 additions
and
0 deletions
Daten/Katalogabgleich/Kandidaten/no_BE_Kandidaten.xlsx
0 → 100644
+
0
−
0
View file @
7d520b2d
File added
This diff is collapsed.
Click to expand it.
Notebooks/Non_BE_matching.ipynb
0 → 100644
+
207
−
0
View file @
7d520b2d
{
"cells": [
{
"cell_type": "code",
"execution_count": 20,
"id": "a910c4c5-3a61-462b-ac07-c9545fe7ae40",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import re\n",
"import numpy as np\n",
"from thefuzz import fuzz, process\n",
"from tqdm.notebook import tqdm\n",
"import matplotlib.pyplot as plt\n",
"import requests\n",
"import json\n",
"from IPython.display import display\n",
"\n",
"pd.set_option('display.max_colwidth', None)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "83013484-2a55-4819-8b30-b2f8cbbe7981",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"entry_df = pd.read_excel('../Daten/Katalogabgleich/Einträge.xlsx', index_col=0)\n",
"\n",
"def prepare_string(string):\n",
" new = re.sub(r'[àáâãå]', 'a', string)\n",
" new = re.sub(r'[èéêë]', 'e', new)\n",
" new = re.sub(r'[ìíîï]', 'i', new)\n",
" new = re.sub(r'[òóôõ]', 'o', new)\n",
" new = re.sub(r'[ùúû]', 'u', new)\n",
" new = re.sub(r'æ', 'ae', new)\n",
" new = re.sub('[.,:;()¬]|^[CLXVI]+? |^\\d+? |^\\d+?\\.+? |^\\.+ ?|= |# ', '', new)\n",
" return new\n",
"\n",
"entry_df['cleaned entry'] = entry_df['entry'].apply(lambda x: prepare_string(x))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1279d6ea-48a2-4f65-9cfa-b1f92eac16f1",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"non_BE_df = pd.read_excel('../Daten/Vorhersagen/WIP_final_BE_3.xlsx', index_col=0)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "ddf5d11c-5f72-4bc8-ab8f-0a1e0f01e60d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# fix 22871 without metadata\n",
"i = 22871\n",
"non_BE_df.at[i, 'Titel'] = 'De La coronica general de toda Espana y especialmente del Reyno de Valencia. etc'\n",
"non_BE_df.at[i, 'Autor'] = 'Beuter, Pero-Anton'\n",
"non_BE_df.at[i, 'Mitwirkender'] = ''\n",
"non_BE_df.at[i, 'Anfang Veröffentlichungsdatum'] = '1546'\n",
"non_BE_df.at[i, 'Ende Veröffentlichungsdatum'] = '1551'\n",
"non_BE_df.at[i, 'Veröffentlichungsdatum'] = '1546-1551'\n",
"non_BE_df.at[i, 'Veröffentlichungsort'] = 'Valencia'\n",
"non_BE_df.at[i, 'Veröffentlichungsort (normiert)'] = 'Valencia'\n",
"non_BE_df.at[i, 'Sprache'] = 'Spanish'"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "e9c00ca9-c051-4e3a-93cf-133031ca9e7f",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"no_BE = non_BE_df[~(non_BE_df['Signatur'].str.contains('BE') | non_BE_df['Signatur'].str.contains('Ink'))]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "ca242a1c-baf8-4183-a565-a3797d6f4747",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a7b5d48f5fcd4cbbbf56291a871746c5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/804 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"better_matches = []\n",
"scorer = fuzz.token_set_ratio\n",
"\n",
"for index, row in tqdm(no_BE.iterrows(), total=len(no_BE)):\n",
" keys = ['Autor', 'Mitwirkender', 'Titel', 'Veröffentlichungsort', 'Anfang Veröffentlichungsdatum']\n",
" comb_string = ''\n",
" for key in keys:\n",
" val = row[key]\n",
" if not pd.isna(val):\n",
" if key == 'Autor' or key == 'Mitwirkender':\n",
" if ',' in val: # falls name, vorname\n",
" val = val.split(',')[0]\n",
" val = val.split(' ')[0]\n",
" elif key == 'Titel':\n",
" val = prepare_string(val)\n",
" elif key == 'Anfang Veröffentlichungsdatum':\n",
" val = str(int(val))\n",
" else: # key == 'Veröffentlichungsort'\n",
" pass\n",
" comb_string += val + ' '\n",
" \n",
" matches_lis = process.extract(comb_string, entry_df['cleaned entry'], scorer=scorer, limit=5)\n",
" flat_matches = []\n",
" for match in matches_lis:\n",
" flat_matches.append(match[0])\n",
" flat_matches.append(match[1])\n",
" flat_matches.append(match[2])\n",
" better_matches.append([comb_string] + flat_matches)\n",
"\n",
"matches_df = pd.DataFrame(better_matches, columns=['input', 'match_1', 'score_1', 'id_1', 'match_2', 'score_2', 'id_2', 'match_3', 'score_3', 'id_3', 'match_4', 'score_4', 'id_4', 'match_5', 'score_5', 'id_5'])\n",
"matches_df['control'] = ''"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "e9dd6e64-a45d-4f25-9ad1-624cfc5268fb",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"matches_df_no_score = matches_df.drop(['score_1', 'score_2', 'score_3', 'score_4', 'score_5'], axis=1)\n",
"matches_df_no_score.insert(1, 'input_id', no_BE.index)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "b1f7e1a3-b886-496c-a9ca-ec484d196c24",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"matches_df_no_score.to_excel('../Daten/Katalogabgleich/Kandidaten/no_BE_Kandidaten.xlsx', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4639759-8344-452e-96ce-cfca485165a4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
%% Cell type:code id:a910c4c5-3a61-462b-ac07-c9545fe7ae40 tags:
```
python
import
pandas
as
pd
import
re
import
numpy
as
np
from
thefuzz
import
fuzz
,
process
from
tqdm.notebook
import
tqdm
import
matplotlib.pyplot
as
plt
import
requests
import
json
from
IPython.display
import
display
pd
.
set_option
(
'
display.max_colwidth
'
,
None
)
```
%% Cell type:code id:83013484-2a55-4819-8b30-b2f8cbbe7981 tags:
```
python
# Load the catalogue entries workbook; its first column becomes the index.
entry_df = pd.read_excel(
    '../Daten/Katalogabgleich/Einträge.xlsx',
    index_col=0,
)
# One-pass transliteration table for the accented characters handled here.
_ACCENT_TABLE = str.maketrans({
    **dict.fromkeys('àáâãå', 'a'),
    **dict.fromkeys('èéêë', 'e'),
    **dict.fromkeys('ìíîï', 'i'),
    **dict.fromkeys('òóôõ', 'o'),
    **dict.fromkeys('ùúû', 'u'),
    'æ': 'ae',
})

# Raw string: ``\d`` and ``\.`` are regex escapes, not string escapes, and a
# non-raw literal triggers "invalid escape sequence" warnings.
_NOISE_PATTERN = re.compile(
    r'[.,:;()¬]|^[CLXVI]+? |^\d+? |^\d+?\.+? |^\.+ ?|= |# '
)


def prepare_string(string):
    """Return *string* transliterated and stripped of numbering/punctuation.

    Two steps are applied in order:

    1. The lowercase accented vowels ``àáâãå èéêë ìíîï òóôõ ùúû`` are
       replaced by their base letter and ``æ`` by ``ae``. Uppercase accented
       letters and umlauts are left untouched.
    2. Every match of the noise pattern is deleted: the characters
       ``. , : ; ( ) ¬`` anywhere; at the very start of the string a run of
       the letters ``C L X V I`` followed by a space, a run of digits
       followed by a space, a run of digits followed by dots and a space, or
       leading dots; and the sequences ``"= "`` and ``"# "`` anywhere.

    Args:
        string: The text to clean. Must be a ``str``.

    Returns:
        The cleaned text.
    """
    return _NOISE_PATTERN.sub('', string.translate(_ACCENT_TABLE))
# Store the cleaned form of every catalogue entry in a separate column.
entry_df['cleaned entry'] = entry_df['entry'].apply(prepare_string)
```
%% Cell type:code id:1279d6ea-48a2-4f65-9cfa-b1f92eac16f1 tags:
```
python
# Load the predictions workbook; its first column becomes the index.
non_BE_df = pd.read_excel(
    '../Daten/Vorhersagen/WIP_final_BE_3.xlsx',
    index_col=0,
)
```
%% Cell type:code id:ddf5d11c-5f72-4bc8-ab8f-0a1e0f01e60d tags:
```
python
# Row 22871 has no metadata: fill the fields in by hand.
i = 22871
manual_metadata = {
    'Titel': 'De La coronica general de toda Espana y especialmente del Reyno de Valencia. etc',
    'Autor': 'Beuter, Pero-Anton',
    'Mitwirkender': '',
    'Anfang Veröffentlichungsdatum': '1546',
    'Ende Veröffentlichungsdatum': '1551',
    'Veröffentlichungsdatum': '1546-1551',
    'Veröffentlichungsort': 'Valencia',
    'Veröffentlichungsort (normiert)': 'Valencia',
    'Sprache': 'Spanish',
}
# NOTE(review): the values are written as strings, including the two year
# fields; confirm this matches the dtype of those columns.
for column, value in manual_metadata.items():
    non_BE_df.at[i, column] = value
```
%% Cell type:code id:e9c00ca9-c051-4e3a-93cf-133031ca9e7f tags:
```
python
# Keep only the rows whose 'Signatur' value contains neither 'BE' nor 'Ink'.
signatur = non_BE_df['Signatur']
has_be = signatur.str.contains('BE')
has_ink = signatur.str.contains('Ink')
no_BE = non_BE_df[~(has_be | has_ink)]
```
%% Cell type:code id:ca242a1c-baf8-4183-a565-a3797d6f4747 tags:
```
python
# Build, for every row of no_BE, a query string from selected metadata fields
# and look up the five best fuzzy matches among the cleaned catalogue entries.
# NOTE(review): indentation was lost in this rendering, so the nesting of the
# statements below must be checked against the original notebook.
better_matches
=
[]
scorer
=
fuzz
.
token_set_ratio
# One query per no_BE row; tqdm only provides the progress bar.
for
index
,
row
in
tqdm
(
no_BE
.
iterrows
(),
total
=
len
(
no_BE
)):
# Fields that are concatenated into the query, in this order.
keys
=
[
'
Autor
'
,
'
Mitwirkender
'
,
'
Titel
'
,
'
Veröffentlichungsort
'
,
'
Anfang Veröffentlichungsdatum
'
]
comb_string
=
''
for
key
in
keys
:
val
=
row
[
key
]
# Missing (NaN) values are tested for here.
if
not
pd
.
isna
(
val
):
if
key
==
'
Autor
'
or
key
==
'
Mitwirkender
'
:
if
'
,
'
in
val
:
# in case of "surname, forename": keep the part before the comma
val
=
val
.
split
(
'
,
'
)[
0
]
val
=
val
.
split
(
'
'
)[
0
]
elif
key
==
'
Titel
'
:
val
=
prepare_string
(
val
)
elif
key
==
'
Anfang Veröffentlichungsdatum
'
:
val
=
str
(
int
(
val
))
else
:
# key == 'Veröffentlichungsort'
pass
comb_string
+=
val
+
'
'
# Query the cleaned entries with the chosen scorer, keeping the top five.
matches_lis
=
process
.
extract
(
comb_string
,
entry_df
[
'
cleaned entry
'
],
scorer
=
scorer
,
limit
=
5
)
# Flatten each result's three elements into one row; they end up in the
# match_k, score_k and id_k columns named below.
flat_matches
=
[]
for
match
in
matches_lis
:
flat_matches
.
append
(
match
[
0
])
flat_matches
.
append
(
match
[
1
])
flat_matches
.
append
(
match
[
2
])
better_matches
.
append
([
comb_string
]
+
flat_matches
)
# One row per query: the query string followed by five (match, score, id) triples.
matches_df
=
pd
.
DataFrame
(
better_matches
,
columns
=
[
'
input
'
,
'
match_1
'
,
'
score_1
'
,
'
id_1
'
,
'
match_2
'
,
'
score_2
'
,
'
id_2
'
,
'
match_3
'
,
'
score_3
'
,
'
id_3
'
,
'
match_4
'
,
'
score_4
'
,
'
id_4
'
,
'
match_5
'
,
'
score_5
'
,
'
id_5
'
])
# Empty 'control' column; presumably filled in during manual review (confirm).
matches_df
[
'
control
'
]
=
''
```
%% Output
%% Cell type:code id:e9dd6e64-a45d-4f25-9ad1-624cfc5268fb tags:
```
python
# Remove the five score columns, then place the no_BE index in second
# position as 'input_id'.
score_columns = [f'score_{rank}' for rank in range(1, 6)]
matches_df_no_score = matches_df.drop(columns=score_columns)
matches_df_no_score.insert(1, 'input_id', no_BE.index)
```
%% Cell type:code id:b1f7e1a3-b886-496c-a9ca-ec484d196c24 tags:
```
python
# Write the candidate table to Excel without the DataFrame index column.
matches_df_no_score.to_excel(
    '../Daten/Katalogabgleich/Kandidaten/no_BE_Kandidaten.xlsx',
    index=False,
)
```
%% Cell type:code id:a4639759-8344-452e-96ce-cfca485165a4 tags:
```
python
```
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment