diff --git a/Daten/Vorhersagen/Complete_BE_Years_Places.xlsx b/Daten/Vorhersagen/Complete_BE_Years_Places.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..53d6ad84f33f4ad36672f82cab2b9131433d1bf5 Binary files /dev/null and b/Daten/Vorhersagen/Complete_BE_Years_Places.xlsx differ diff --git a/Notebooks/Normierte_Orte/places_regex.json b/Notebooks/Normierte_Orte/places_regex.json new file mode 100644 index 0000000000000000000000000000000000000000..048fe6a4970aea9970e1110a496e0bba8fa15f50 --- /dev/null +++ b/Notebooks/Normierte_Orte/places_regex.json @@ -0,0 +1 @@ +{"Aachen": ["Aachen", "Ai[ax]-la-Chapelle"], "Aarau": ["Aarau"], "Agen": ["Agen"], "Aix": ["Aix"], "Alcala de Henares": ["Complut", "Alcala"], "Alcobaca": ["Alcobaca"], "Altdorf bei Nürnberg": ["Altdorfii", "Altdorf"], "Altenburg": ["Altenburg"], "Altona": ["Altona"], "Amsterdam": ["Amste"], "Angers": ["Angers", "Angiers"], "Antwerpen": ["Antwerpen", "Antverpiæ", "Anvers", "Antverpiae", "Amberes", "Anueres"], "Amiens": ["Amiens"], "Arezzo": ["Arezzo"], "Arles": ["Arles"], "Arnheim": ["Arnheim", "Arnhemi", "Aernhemiae"], "Aschaffenburg": ["Aschaffenburg"], "Aschersleben": ["Aschersleben"], "Augsburg": ["Augs[bp]urg", "Augustae Vindel", "Augstae Vindelicorum", "Augusto-Vindelicorum"], "Avignon": ["Avignon"], "Bad Langensalza": ["Langensalza"], "Baden": ["Baden"], "Bamberg": ["Bamberg"], "Barcelona": ["Barcelona", "Barcinone"], "Basel": ["Basel", "Basil", "Basle"], "Bassano del Grappa": ["Bassano"], "Bautzen": ["Bautzen", "Budißin"], "Beauvais": ["Beauvais"], "Bayreuth": ["Bayreuth", "Baireuth"], "Besançon": ["Besançon"], "Benevento": ["Benevento"], "Bergamo": ["Bergamo", "Bergomi"], "Berlin": ["Berlin", "Berolini"], "Bern": ["Bern"], "Beziers": ["Beziers"], "Biel": ["Biel"], "Birmingham": ["Birmingham"], "Blois": ["Blois"], "Bologna": ["Bonon", "Bologna"], "Bonn": ["Bonn"], "Bordeaux": ["Bou?rdeaux"], "Bouillon": ["Bouillon"], "Bourges": ["Bourges"], "Boston": ["Boston"], 
"Bratislava": ["Bratislava", "Posonii", "Pozs[öo]ny", "Poszony", "P[óo]sonyban", "[VW]ratislaviae"], "Breslau": ["Breslau", "Breßlau"], "Budapest": ["Budapest", "Pesth?", "Budán", "B[uv]d[aæ]", "Ofen"], "Bukarest": ["Bukarest"], "Braunschweig": ["Braunschweig"], "Bremen": ["Bremen"], "Brescia": ["Brescia"], "Brügge": ["Brugis", "Brügge", "Bruges", "Brugensis", "Bruge en Flandre", "Brugge"], "Brünn": ["Brünn", "Brno"], "Brüssel": ["Brüssel", "Bruxellae", "Broselas", "Bruxell"], "Bytom": ["Bytom", "Beuthen", "Bethaniae ad Viadrum"], "Cadiz": ["Cadiz"], "Caen": ["Caen"], "Cambridge": ["Cambridge", "Cantabrigiae"], "Casale Monferrato": ["Casale"], "Casalmaggiore": ["Casalmaggiore"], "Charkiw": ["Charkiw", "Charkoviae"], "Chartres": ["Chartres"], "Chemnitz": ["Chemnitz"], "Chiari": ["Chiari"], "Cluj-Napoca": ["Cluj-Napoca", "Klausenburg", "Kolozsv[aá]r"], "Coburg": ["Coburg"], "Colle": ["Colle", "Collae"], "Colombo": ["[CK]olombo"], "Commercy": ["Commercy"], "Como": ["Como"], "Cremona": ["Cremona?"], "Danzig": ["Danzig"], "Darmstadt": ["Darmstadt"], "Delft": ["Delft", "Delphis"], "Den Haag": ["Den Haag", "[lL]a Haye", "Hagae Comitum", "Hagæ Comitum", "Hagae-Comitum", "Hagae-Comitis", "Gravenhaa?ge", "Haye", "Nell'Haya"], "Dessau": ["Dessau"], "Deventer": ["Deventer", "Daventriae"], "Digne-les-Bains": ["Digne"], "Dijon": ["Dijon"], "Dole": ["Dole"], "Dordrecht": ["Dordrecht"], "Douai": ["Douai", "Douay", "Duaci"], "Dresden": ["Dre[sß]den"], "Düsseldorf": ["Düsseldorf"], "Dublin": ["Dublin"], "Écija": ["E[cd]ija"], "Edinburgh": ["Eding?burgh", "Edimburgi"], "Eger": ["Eger"], "Einsiedeln": ["Einsiedeln", "Ensidlense"], "Eisenach": ["Eisenach"], "Elberfeld": ["Elberfeld"], "Erfurt": ["Erff?urt"], "Erlangen": ["Erlangen"], "Esztergom": ["Esztergom", "Strigonii"], "Evora": ["Evora", "Eborae"], "Exeter": ["Exeter", "Isc[aæ]e? 
Dunmoniorum"], "Faenza": ["Faenza"], "Ferrara": ["Ferrar"], "Fiesole": ["Poligrafia Fiesolana"], "Florenz": ["Florenz", "Firenze", "Florentiae", "Fiorenz", "Florence"], "Foligno": ["Foligno"], "Franeker": ["Franeker", "Franequerae"], "Frankfurt/M.": ["Frankfurt", "Franc"], "Frauenfeld": ["Frauenfeld"], "Freiberg": ["Fre[iy]berg"], "Freiburg im Breisgau": ["Fre[iy]burg", "Fribourg\\Z", "Friburgi Brisgoviae"], "Freiburg im Üechtland": ["Fribourg en Suisse"], "Genf": ["Genf", "Genev"], "Gent": ["Gent", "Gand", "Gend"], "Genua": ["Genua", "Genova", "Genuae"], "Gießen": ["Gießen", "Giessen", "Giessae"], "Girona": ["Girona", "Gerona", "Gerone"], "Glasgow": ["Glasgow", "Glasguae"], "Glogau": ["Glogau", "Glogow", "Głogów"], "Görlitz": ["Görlitz"], "Göttingen": ["Göttingen", "Gottingae"], "Gotha": ["Gotha"], "Gouda": ["Gouda"], "Granada": ["Granada", "Granatae"], "Graz": ["Graz", "Grätz", "Graecij"], "Greifswald": ["Greifswald", "Gryphiswaldiae"], "Grenoble": ["Grenoble", "Gratianopoli"], "Groningen": ["Groningen"], "Güns": ["Güns"], "Hagenau": ["Hagenau", "Haguenau", "Haganoae"], "Halberstadt": ["Halberstadt", "Halberstadii"], "Halle an der Saale": ["Halle", "Halis Saxonum", "Halae"], "Hamburg": ["Hamburg", "Amburgo"], "Hamm": ["Hamm"], "Hannover": ["Hanoviæ", "Hanoviae", "Hannover"], "Harderwijk": ["Harderwijk", "Hardervici Gelrorum"], "Heidelberg": ["Heidelberg"], "Helmstedt": ["Helmstedt", "Helmstadii", "H[ae]lmaestadii"], "Hildesheim": ["Hildesheim"], "Hof": ["Hof"], "Ingolstadt": ["Ingolstadt"], "Innsbruck": ["Oeniponti", "Innsbruck"], "Jaen": ["Ja[eé]n"], "Jena": ["Jena"], "Kaliningrad": ["Königsberg", "Regimontii Prussorum", "Regiomonti Prussorum"], "Karlsbad": ["Karlsbad"], "Karlsruhe": ["[KC]arlsruhe"], "Kassel": ["Kassel", "Cassel"], "Kiel": ["Kiel"], "Klagenfurt": ["Klagenfurt"], "Köln": ["Colo", "Köln"], "Köthen": ["[CK]öthen"], "Kolkata": ["Kolkata", "Calcutta"], "Kopenhagen": ["Kopenhagen", "Ha[fvu]niae", "K[ij]obenhavn", "Köbenhavn"], "Kosice": 
["Ko[sš]ice", "Kaschau", "Kassa"], "Hafod Uchtryd": ["Hafod"], "Krakau": ["Krakau", "Cracoviae"], "La Fleche": ["La Fleche"], "La Rochelle": ["La[ -]Rochelle"], "Lambeth": ["Lambeth"], "Landshut": ["Landshut"], "Lauingen": ["Lauingen"], "Lausanne": ["Lausann[eæ]"], "Lecce": ["Lecce"], "Leeds": ["Leeds"], "Leeuwarden": ["Leeuwarden", "Leovardi[aæ]e?"], "Leiden": ["Leide", "Lugduni Bat", "Leyde", "Lugduni [Ii]n Batavis", "Leidae"], "Leipzig": ["Leipzig", "Lipsia", "Lipcse"], "Lemgo": ["Lemgo"], "Levoca": ["Levo[cč]a", "Leutschau", "Leutsovia"], "Lille": ["Lille", "Insulis"], "Lima": ["Lima"], "Linz": ["Linz"], "Lissabon": ["Lissabon", "Lisboa"], "Livorno": ["Livorno"], "Ljubljana": ["Ljubljana", "Laibach"], "Lodi": ["Lodi"], "Löwen": ["Löwen", "Leuven", "Louvain", "Lovanii"], "London": ["Lond", "Albionspolis"], "Lübeck": ["Lübeck"], "Lüneburg": ["Lüneburg"], "Lüttich": ["Lüttich", "Liege", "Leodii"], "Lucca": ["Lucca"], "Lund": ["Lund"], "Lugny": ["Lugny"], "Luxemburg": ["Luxemburg"], "Lwiw": ["Lwiw", "Lemberg", "Leopol"], "Lyon": ["Lugduni\\Z", "Lvgdvni\\Z", "Lyon", "Lione"], "Macerata": ["Macerata"], "Madrid": ["Madrid"], "Magdeburg": ["Magdeburg"], "Mailand": ["Mediolani", "Mailand", "Milan"], "Maille": ["Maille"], "Malaga": ["Malaga"], "Mainz": ["Mainz", "Mogunti[ao]e?"], "Mannheim": ["Mannheim"], "Mantua": ["Mantua", "Mantova"], "Marburg": ["Marpurgi", "Marburg"], "Marseille": ["Marseille"], "Medina del Campo": ["Medina del Campo"], "Messina": ["Messina"], "Mexiko-Stadt": ["Mexico"], "Middelburg": ["Middelburg", "Meidelboug"], "Mirandola": ["Mirandola"], "Modena": ["Modena"], "Montargis": ["Montargis"], "Monte Chiaro": ["Monte Chiaro"], "Montpellier": ["Montpellier"], "Monza": ["Monza"], "Moskau": ["Moskau", "Moskwa", "Mosquae"], "Moulins": ["Moulins"], "München": ["München", "Monachii"], "Münster": ["Münster"], "Mumbai": ["Mumbai", "Bombay"], "Nancy": ["Nanc[iy]"], "Neapel": ["Neapel", "Napoli", "Neapoli", "Naples"], "Neuchatel": ["Neuchatel", "Neuenburg"], 
"Neustadt an der Orla": ["Neustadt an der Orla", "Neustadt a. ?d. O."], "Neustrelitz": ["Neustrelitz", "Neu-Strelitz"], "New York": ["New York"], "Newcastle upon Tyne": ["Newcastle upon Tyne"], "Nottingham": ["Nottingham"], "Nürnberg": ["Nürnberg", "Norimberg", "Nuremberg"], "Odense": ["Odense", "Otthiniae"], "Offenbach am Main": ["Offenbach am Main", "Offenbach a. M."], "Olmütz": ["Olmütz", "Olomouc"], "Örebro": ["Örebro"], "Orleans": ["Aurelianae", "Orleans"], "Oxford": ["Oxford", "Oxoni"], "Paderborn": ["Paderborn"], "Padua": ["Padua", "Padova", "Pat?avii"], "Palermo": ["Palermo", "Panormi", "Panorami"], "Palma": ["Palma", "Ciudad de Mallorca", "Mallorca"], "Pamplona": ["Pamplona"], "Paris": ["Par", "paris", "Lutetia"], "Passau": ["Passau"], "Pau": ["Pau"], "Pavia": ["Pavi[ae]"], "Perpignan": ["Perpignan", "Perpiñan"], "Perugia": ["Perugia"], "Philadelphia, PA": ["Philadelphia"], "Piacenza": ["Piacenza"], "Pisa": ["Pisa", "Pisis"], "Pistoia": ["Pisto[ij]a"], "Poitiers": ["Poictiers", "Poitiers"], "Portogruaro": ["Portogruaro"], "Potsdam": ["Potsdam"], "Prag": ["Prag"], "Prostějov": ["Prostějov", "Prostějov", "Prostannae"], "Ragusa": ["Ragusa"], "Randers": ["Randers"], "Rastatt": ["Rastatt"], "Regensburg": ["Regensburg", "Ratisbonae"], "Reims": ["Reims", "Remis"], "Rennes": ["Rennes"], "Riga": ["Riga"], "Rom": ["Rom"], "Rostock": ["Rostock", "Rostochii"], "Rouen": ["Rouen", "Rouan"], "Rotterdam": ["Rotterd"], "Rudolstadt": ["Rudolstadt"], "Saint-Jean-d’Angely": ["Saint-Jean-d’Angely", "Saint Jean d'Angely"], "Saint-Malo": ["Saint-Malo"], "Saint-Vincent": ["Saint-Vincent", "S. Vincent"], "Saint-Jean": ["Saint-Jean"], "Saint-Omer": ["Saint-Omer"], "Salamanca": ["Salamanca"], "Sankt Gallen": ["S. Galli", "Sankt Gallen", "St.? 
Gallen"], "Saumur": ["Saumur"], "Saragossa": ["Çaragoça", "Saragossa", "Zaragoza", "Caragoca", "Caragoza", "Caesaraugustae"], "Schaffhausen": ["Schaffhausen"], "Schleswig": ["Schleswig", "Slesvigae"], "Schwelm": ["Schwelm"], "Schwerin": ["Schwerin"], "Sevilla": ["Sevill[ae]"], "Shrewsbury": ["Shrewsbury"], "Siena": ["Siena"], "Sibiu": ["Sibiu", "Hermannstadt", "Cibinii", "Nagy-Szeben"], "Speyer": ["Speyer", "Spirae"], "Southampton": ["Southampton"], "St. Blasien": ["St. Blasien", "Sankt Blasien", "St Blasien", "San Blasii"], "St. Petersburg": ["St.? Petersbo?urg", "Saint-Petersbourg", "Sankt Petersburg", "Petropoli", "St. Petersbourg", "Pietroburgo"], "Stendal": ["Stendal"], "Stockholm": ["Stockholm", "Holmiae", "Holmiæ"], "Stralsund": ["Stralsund"], "Straßburg": ["Argentorati", "Argentina", "Strassburg", "Straßburg", "Strasbourg"], "Stuttgart": ["Stuttgart"], "Sulechow": ["Sulech[óo]w", "Züllichau"], "Sulzbach": ["Sult?zbach"], "Tartu": ["Tartu\\Z", "Dorpat"], "Tegernsee": ["Tegernsee"], "Tharangambadi": ["Tharangambadi", "Tranquebariae"], "Tiflis": ["Tiflis"], "Toledo": ["Toledo", "Toleti"], "Tortosa": ["T[ao]rtosa"], "Torun": ["Thorunii"], "Toulouse": ["Tolose", "Toulouse"], "Tours": ["Tours"], "Trient": ["Trient", "Trento?"], "Trentschin": ["Trentschin"], "Treviso": ["Treviso"], "Trevoux": ["Trevoux"], "Trier": ["Trie"], "Trnava": ["Trnava", "T[iy]rnaviae"], "Trogen": ["Trogen"], "Troyes": ["Troyes"], "Tübingen": ["Tübingen", "Tubingae"], "Turin": ["Torino", "Turin", "Taurini"], "Udine": ["Udine", "Utini"], "Ulm": ["Ulm"], "Uppsala": ["Upp?sala"], "Urbino": ["Urbino"], "Utrecht": ["Utrecht", "Ultrajecti", "Trajecti"], "Valencia": ["Valencia", "Valentiæ"], "Valenciennes": ["Valenciennes"], "Valladolid": ["Valladolid"], "Venedig": ["Ven", "Vine"], "Verona": ["Verona"], "Versailles": ["Versailles"], "Vicenza": ["Vicenza"], "Villefranche": ["Villefranche"], "Vitry": ["Vitry"], "Warschau": ["Warschau", "Varsaviae"], "Weimar": ["Weimar"], "Wesel": ["Wesel", "Vesaliae 
Clivorum"], "Wien": ["Wien", "Vienna", "Viennæ", "Becs", "Vindobona"], "Wiesbaden": ["Wiesbaden"], "Wittenberg": ["Wittenberg"], "Wolfenbüttel": ["Wolff?enbüttel"], "Würzburg": ["Würzburg"], "Ypern": ["Ypern", "Ypre"], "Zadar": ["Zadar", "Zara"], "Zagreb": ["Zagreb", "Agram"], "Zell": ["Zell"], "Zürich": ["Zürich", "Tiguri"], "Zweibrücken": ["Zweibrücken", "Biponti"]} \ No newline at end of file diff --git a/Notebooks/Remaining_barcodes_and_metadata.ipynb b/Notebooks/Remaining_barcodes_and_metadata.ipynb index 0ab9f09d2efe7d9db938cc34633a7b082974db79..4229c69bf14fc732c709e3dc913374f537c80488 100644 --- a/Notebooks/Remaining_barcodes_and_metadata.ipynb +++ b/Notebooks/Remaining_barcodes_and_metadata.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "be4bad89-ee87-4748-9049-f82e77a16417", "metadata": { "tags": [] @@ -11,10 +11,10 @@ { "data": { "text/plain": [ - "<contextlib.ExitStack at 0x7fdf17346530>" + "<contextlib.ExitStack at 0x7f0c803aba00>" ] }, - "execution_count": 1, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -7806,8 +7806,380 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "85e24615-b3f9-4e5f-b1a7-a1e4eedd18ef", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "comp_BE_no_dup = pd.read_excel('../Daten/Vorhersagen/Complete_BE_Years.xlsx', index_col=0)" + ] + }, + { + "cell_type": "markdown", + "id": "071e1c67-c3de-4d27-b718-b845bfea02a4", + "metadata": { + "tags": [] + }, + "source": [ + "# Ortsangabe normieren" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4df117e2-7c1e-418a-a1b6-5170f8411df0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "comp_BE_no_dup.at[6088, 'Veröffentlichungsort'] = 'Paris'\n", + "comp_BE_no_dup.loc[9106:9109, 'Veröffentlichungsort (normiert)'] = 'Paris'\n", + "comp_BE_no_dup.loc[3395:3477, 'Veröffentlichungsort (normiert)'] = 'Amsterdam'" + 
] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d7a04b40-456c-4e2b-bdc5-748889d7ecf5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def normalize_place(entry, normed_places):\n", + " for place in normed_places:\n", + " for s in normed_places[place]:\n", + " match = re.search(s, entry)\n", + " if match:\n", + " return place\n", + " return ''" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "0c4c9202-d042-4ec6-a2b4-fe066d8148ec", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "with open('Normierte_Orte/places_regex.json', 'r') as fh:\n", + " normed = json.load(fh)\n", + "place_df = pd.DataFrame(comp_BE_no_dup[['Veröffentlichungsort', 'Veröffentlichungsort (normiert)']])\n", + "place_df['has_normed_place'] = 0\n", + "\n", + "for i, row in place_df.iterrows():\n", + " if pd.isna(row['Veröffentlichungsort (normiert)']):\n", + " if not pd.isna(row['Veröffentlichungsort']):\n", + " norm = normalize_place(row['Veröffentlichungsort'], normed)\n", + " if len(norm):\n", + " place_df.at[i, 'Veröffentlichungsort (normiert)'] = norm\n", + " place_df.at[i, 'has_normed_place'] = 1\n", + " else:\n", + " place_df.at[i, 'has_normed_place'] = 1" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "f68981d7-edc0-4c23-88a1-b9d156fa2bce", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Veröffentlichungsort</th>\n", + " <th>Veröffentlichungsort (normiert)</th>\n", + " <th>has_normed_place</th>\n", + " </tr>\n", + " </thead>\n", + " 
<tbody>\n", + " <tr>\n", + " <th>18388</th>\n", + " <td>Bengodi</td>\n", + " <td>NaN</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13256</th>\n", + " <td>Crisopoli</td>\n", + " <td>NaN</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8421</th>\n", + " <td>Doregnal (fing.)</td>\n", + " <td>NaN</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5215</th>\n", + " <td>Fridenstad</td>\n", + " <td>NaN</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10153</th>\n", + " <td>Grensing im Gänsserich</td>\n", + " <td>NaN</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14713</th>\n", + " <td>Monasterium Benedictino Casinate</td>\n", + " <td>NaN</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18226</th>\n", + " <td>[S l.]</td>\n", + " <td>NaN</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19742</th>\n", + " <td>[S.l.]</td>\n", + " <td>NaN</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22618</th>\n", + " <td>[Süddeutschland]</td>\n", + " <td>NaN</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>566</th>\n", + " <td>[s.l.]</td>\n", + " <td>NaN</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8620</th>\n", + " <td>o. O.</td>\n", + " <td>NaN</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21996</th>\n", + " <td>o.O.</td>\n", + " <td>NaN</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22909</th>\n", + " <td>s.l</td>\n", + " <td>NaN</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2492</th>\n", + " <td>s.l.</td>\n", + " <td>NaN</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18626</th>\n", + " <td>s.n.</td>\n", + " <td>NaN</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Veröffentlichungsort Veröffentlichungsort (normiert) \n", + "18388 Bengodi NaN \\\n", + "13256 Crisopoli NaN \n", + "8421 Doregnal (fing.) 
NaN \n", + "5215 Fridenstad NaN \n", + "10153 Grensing im Gänsserich NaN \n", + "14713 Monasterium Benedictino Casinate NaN \n", + "18226 [S l.] NaN \n", + "19742 [S.l.] NaN \n", + "22618 [Süddeutschland] NaN \n", + "566 [s.l.] NaN \n", + "8620 o. O. NaN \n", + "21996 o.O. NaN \n", + "22909 s.l NaN \n", + "2492 s.l. NaN \n", + "18626 s.n. NaN \n", + "\n", + " has_normed_place \n", + "18388 0 \n", + "13256 0 \n", + "8421 0 \n", + "5215 0 \n", + "10153 0 \n", + "14713 0 \n", + "18226 0 \n", + "19742 0 \n", + "22618 0 \n", + "566 0 \n", + "8620 0 \n", + "21996 0 \n", + "22909 0 \n", + "2492 0 \n", + "18626 0 " + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "place_df[place_df['has_normed_place'] == 0].dropna(subset='Veröffentlichungsort').sort_values('Veröffentlichungsort').drop_duplicates('Veröffentlichungsort')" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "905a6ceb-7c43-443a-b4af-855d7b296478", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Veröffentlichungsort Lipcse, Budapest, Becs\n", + "Veröffentlichungsort (normiert) Budapest\n", + "has_normed_place 1\n", + "Name: 29, dtype: object" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "place_df.loc[29]" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "72ffff8e-4b14-4410-aeac-e19f3603c3d4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "438" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(place_df['has_normed_place'] == 0).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "977f3d35-4677-49a1-ac8a-63b0d3b964a5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "comp_BE_no_dup['Veröffentlichungsort (normiert)'] = place_df['Veröffentlichungsort (normiert)']" + ] + }, 
+ { + "cell_type": "markdown", + "id": "d0c91647-f9c1-4ab0-a1b5-2944865727e7", + "metadata": { + "tags": [] + }, + "source": [ + "# IIIF Manifest URL hinzufuegen" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "id": "de21377a-9cf6-4738-8799-842f9370ed95", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def get_iiif_manifest_url(bc):\n", + " if not pd.isna(bc):\n", + " abo_match = re.search('Z\\d{8,9}', bc)\n", + " dtl_match = re.search('dtl_(\\d+)', bc)\n", + " if abo_match:\n", + " return f'https://iiif.onb.ac.at/presentation/ABO/{abo_match[0]}/manifest'\n", + " elif dtl_match:\n", + " return f'https://iiif.onb.ac.at/presentation/REPO/{dtl_match[0]}/manifest'\n", + " else:\n", + " return ''\n", + " else:\n", + " return ''" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "67fe94d8-3509-4312-9912-803f4aec8796", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "comp_BE_no_dup['IIIF Manifest'] = comp_BE_no_dup['Barcode'].apply(lambda x: get_iiif_manifest_url(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "id": "5a01e1f7-b15a-4699-969b-0952e6595e09", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Zwischenstand abspeichern\n", + "comp_BE_no_dup.to_excel('../Daten/Vorhersagen/Complete_BE_Years_Places.xlsx')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce82ba73-7b0a-454d-8b8b-033569614b75", "metadata": {}, "outputs": [], "source": [] diff --git a/Notebooks/String_matching.ipynb b/Notebooks/String_matching.ipynb index a10ecb03d9c63c7f466d40c12eb2995322d6b319..12c8995e69e0f8ec234684aff5e9eb94ad857985 100644 --- a/Notebooks/String_matching.ipynb +++ b/Notebooks/String_matching.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 2, "id": "29ca0dc8-cae7-4f12-bd60-fd74ea6ae5ac", "metadata": { "tags": [] @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 10, + 
"execution_count": 3, "id": "c1e1c42a-962f-40bc-bb17-b62e8089feb7", "metadata": { "tags": [] @@ -47,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 4, "id": "50d15898-4687-46b7-b7e0-528d7cf9aec0", "metadata": { "tags": [] @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 5, "id": "990dfeee-1141-4acb-8a3d-a7af0573f5be", "metadata": { "tags": [] @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 6, "id": "bcd301fe-cb80-4b1c-b65f-465fce5ed915", "metadata": { "tags": [] @@ -103,7 +103,7 @@ " 0.0102726686745882]], dtype=object)" ] }, - "execution_count": 22, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -114,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 13, "id": "c0f4a42a-7e21-41e8-833c-2dd2f9d1985e", "metadata": { "tags": [] @@ -161,39 +161,39 @@ " </thead>\n", " <tbody>\n", " <tr>\n", - " <th>2147</th>\n", - " <td>14.376</td>\n", - " <td>364</td>\n", - " <td>Artes</td>\n", - " <td>Ars Militaris, cum tractatibus de Tormentis Bellicis</td>\n", - " <td>Folio</td>\n", - " <td>318</td>\n", - " <td>14.376_364_00</td>\n", - " <td>611 Carte generale de l'Histoire Militaire de France depuis Clovis jusqu'a la XV.e Année du Regne de Louis XV. par le S. Lemau de la Iaisse. 1733. chez l'Autheur. n. CCXLVIII</td>\n", - " <td>Carte generale de l'Histoire Militaire de France depuis Clovis jusqu'a la XVe Annee du Regne de Louis XV par le S Lemau de la Iaisse 1733 chez l'Autheur n CCXLVIII</td>\n", + " <th>7182</th>\n", + " <td>14.378</td>\n", + " <td>226</td>\n", + " <td>Historia Nova Europæ</td>\n", + " <td>Hispaniæ Regnorum vel Provinciarum & Urbium Historia</td>\n", + " <td>Quarto</td>\n", + " <td>1064</td>\n", + " <td>14.378_226_03</td>\n", + " <td>Suplica de la Ciudad de Tortosa en occasion de las Alteraciones del Prencipado de Cataluña, y Condados de Rosellon, Zerdana &c. en Tortosa. 1640. 
Martorell. n. 796.</td>\n", + " <td>Suplica de la Ciudad de Tortosa en occasion de las Alteraciones del Prencipado de Cataluña y Condados de Rosellon Zerdana &c en Tortosa 1640 Martorell n 796</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " volume page number category \n", - "2147 14.376 364 Artes \\\n", + " volume page number category \n", + "7182 14.378 226 Historia Nova Europæ \\\n", "\n", - " subcategory format \n", - "2147 Ars Militaris, cum tractatibus de Tormentis Bellicis Folio \\\n", + " subcategory format \n", + "7182 Hispaniæ Regnorum vel Provinciarum & Urbium Historia Quarto \\\n", "\n", " handwritten page number entry_ID \n", - "2147 318 14.376_364_00 \\\n", + "7182 1064 14.378_226_03 \\\n", "\n", - " entry \n", - "2147 611 Carte generale de l'Histoire Militaire de France depuis Clovis jusqu'a la XV.e Année du Regne de Louis XV. par le S. Lemau de la Iaisse. 1733. chez l'Autheur. n. CCXLVIII \\\n", + " entry \n", + "7182 Suplica de la Ciudad de Tortosa en occasion de las Alteraciones del Prencipado de Cataluña, y Condados de Rosellon, Zerdana &c. en Tortosa. 1640. Martorell. n. 796. 
\\\n", "\n", - " cleaned entry \n", - "2147 Carte generale de l'Histoire Militaire de France depuis Clovis jusqu'a la XVe Annee du Regne de Louis XV par le S Lemau de la Iaisse 1733 chez l'Autheur n CCXLVIII " + " cleaned entry \n", + "7182 Suplica de la Ciudad de Tortosa en occasion de las Alteraciones del Prencipado de Cataluña y Condados de Rosellon Zerdana &c en Tortosa 1640 Martorell n 796 " ] }, - "execution_count": 114, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -202,7 +202,7 @@ "def search_in_entry(df, string):\n", " return df[df['cleaned entry'].str.contains(string)]\n", "\n", - "info = search_in_entry(search_in_entry(entry_df, 'Carte'), '1733')\n", + "info = search_in_entry(search_in_entry(entry_df, ''), 'Tortos')\n", "print(len(info))\n", "info" ] @@ -1194,7 +1194,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 14, "id": "d6efcff5-0393-4835-b673-001e85877f13", "metadata": { "tags": [] @@ -1224,7 +1224,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 15, "id": "7a39c6a7-d81b-4f79-89a0-bafc96411d93", "metadata": { "tags": [] @@ -1244,7 +1244,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 16, "id": "cb7b6815-c782-4e00-bd7f-9abcf7f523f5", "metadata": { "tags": [] @@ -1282,7 +1282,7 @@ }, { "cell_type": "code", - "execution_count": 185, + "execution_count": 17, "id": "836d1b5e-ef2f-4ff8-9c03-298b029f73b2", "metadata": { "tags": [] @@ -1322,7 +1322,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 18, "id": "9f7bd1bf-a4cd-427a-b65a-5e2fce029212", "metadata": { "tags": [] @@ -1342,7 +1342,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 19, "id": "17434971-7462-4f00-8370-d5573f3ea72c", "metadata": { "tags": [] @@ -1361,7 +1361,7 @@ "[2, 4, 7, 9, 10, 11, 12, 14, 16, 18, 19, 20, 22, 25, 26, 28, 29, 32, 34, 35]" ] }, - "execution_count": 84, + "execution_count": 19, "metadata": {}, 
"output_type": "execute_result" } @@ -1375,7 +1375,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 20, "id": "25fcabf9-fb1c-4fe7-838a-8b595f0a2673", "metadata": { "tags": [] @@ -1388,7 +1388,7 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 21, "id": "24bb9a74-0e99-4760-a3d5-05f8df71f85a", "metadata": { "tags": [] @@ -1419,7 +1419,7 @@ }, { "cell_type": "code", - "execution_count": 170, + "execution_count": 22, "id": "6c952698-2bc7-4511-a688-3bcd30ed8196", "metadata": { "tags": [] @@ -1456,6 +1456,41 @@ "## Add new matching data to existing catalogue data" ] }, + { + "cell_type": "code", + "execution_count": 23, + "id": "5ef1c335-c990-4f8c-aa47-bd967e73c7f6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "BE_df = pd.read_excel('../Daten/Vorhersagen/Katalogauszug, Vorhersagen und hs. Katalogverbindungen.xlsx')" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "8720472d-c222-45b5-b1af-66d579b0405e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "5777" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "BE_df['hs. Katalog'].sum()" + ] + }, { "cell_type": "code", "execution_count": 172, @@ -2382,13 +2417,151 @@ "id": "36866c95-917c-48a6-9c2d-0ce1f9f9f8b2", "metadata": {}, "source": [ - "# String matching vom hs. 
Katalog ausgehend" + "# Mehrfache matches hinzufuegen" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 69, "id": "b8870950-aee8-42ab-9cc5-07f9d2e419ab", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "comp_BE_no_dup = pd.read_excel('../Daten/Vorhersagen/Complete_BE_Years_Places.xlsx', index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "64685c07-6cc4-490f-8a56-44798693c039", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('7950', ''), ('9247', '')]\n" + ] + }, + { + "ename": "TypeError", + "evalue": "int() argument must be a string, a bytes-like object or a real number, not 'tuple'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[36], line 15\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28mprint\u001b[39m(matches)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m match_id \u001b[38;5;129;01min\u001b[39;00m matches:\n\u001b[0;32m---> 15\u001b[0m matches_data\u001b[38;5;241m.\u001b[39mappend(entry_df\u001b[38;5;241m.\u001b[39mloc[\u001b[38;5;28;43mint\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mmatch_id\u001b[49m\u001b[43m)\u001b[49m])\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 18\u001b[0m \u001b[38;5;66;03m# BE_id = entry['input_id']\u001b[39;00m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# comp_BE_no_dup.at[BE_id, 'hs. Katalog'] = 1\u001b[39;00m\n\u001b[1;32m 20\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;66;03m# comp_BE_no_dup.at[BE_id, 'hs. 
Katalog Image URL'] = f\"https://iiif.onb.ac.at/images/DOD/{dod_id[str(corr_entry['volume'])]}/{corr_entry['page number']:08}.jp2/full/full/0/native.jpg\"\u001b[39;00m\n\u001b[1;32m 34\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "\u001b[0;31mTypeError\u001b[0m: int() argument must be a string, a bytes-like object or a real number, not 'tuple'" + ] + } + ], + "source": [ + "dod_id = {\n", + " '14.376': 51202, \n", + " '14.377': 51184,\n", + " '14.378': 51219\n", + "}\n", + "\n", + "for i, entry in has_hw_catalog.iterrows():\n", + " man_match = str(entry['control'])\n", + " match_regex = re.compile('(\\d{1,4})(?<!\\?)|(\\d{1,4})\\?')\n", + " matches = match_regex.findall(man_match)\n", + " if matches:\n", + " matches_data = []\n", + " print(matches)\n", + " for match_id in matches:\n", + " matches_data.append(entry_df.loc[int(match_id)])\n", + " break\n", + " \n", + "# BE_id = entry['input_id']\n", + "# comp_BE_no_dup.at[BE_id, 'hs. Katalog'] = 1\n", + "\n", + "# if '?' not in man_match:\n", + "# comp_BE_no_dup.at[BE_id, 'hs. Katalog Konfidenz'] = 'sicher'\n", + "# else:\n", + "# comp_BE_no_dup.at[BE_id, 'hs. Katalog Konfidenz'] = 'unsicher'\n", + "# comp_BE_no_dup.at[BE_id, 'hs. Katalogband'] = str(corr_entry['volume'])\n", + "# comp_BE_no_dup.at[BE_id, 'hs. Katalogseite Digitalisat'] = corr_entry['page number']\n", + "# comp_BE_no_dup.at[BE_id, 'Wissensklasse'] = corr_entry['category']\n", + "# comp_BE_no_dup.at[BE_id, 'Wissensunterklasse'] = corr_entry['subcategory']\n", + "# comp_BE_no_dup.at[BE_id, 'Formatangabe'] = corr_entry['format']\n", + "# comp_BE_no_dup.at[BE_id, 'hs. Katalogseite Handschrift'] = corr_entry['handwritten page number']\n", + "# comp_BE_no_dup.at[BE_id, 'hs. Katalogeintrag ID'] = corr_entry['entry_ID']\n", + "# comp_BE_no_dup.at[BE_id, 'hs. Katalogeintrag'] = corr_entry['entry']\n", + "# comp_BE_no_dup.at[BE_id, 'hs. 
Katalog Image URL'] = f\"https://iiif.onb.ac.at/images/DOD/{dod_id[str(corr_entry['volume'])]}/{corr_entry['page number']:08}.jp2/full/full/0/native.jpg\"\n", + " else:\n", + " print('no match found for', man_match, 'with id', i)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "49a2de57-0529-4f3d-bd54-7ce1665b05f5", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['1234,']\n", + "['4564?']\n", + "[]\n", + "['2?']\n", + "['234']\n", + "['345?,']\n", + "['234,', '345']\n", + "[]\n", + "['44']\n", + "[]\n", + "[]\n", + "['2?,', '55?']\n" + ] + } + ], + "source": [ + "match_examples = ['1234, 4564?', '2?', '345?,234', '234,345', '44', '2?, 55?']\n", + "for man_match in match_examples:\n", + " match_regex = re.compile('\\d{1,4}\\Z|\\d{1,4},')\n", + " matches = match_regex.findall(man_match)\n", + " print(matches)\n", + " uncertain_regex = re.compile('\\d{1,4}\\?\\Z|\\d{1,4}\\?,')\n", + " uncertain_matches = uncertain_regex.findall(man_match)\n", + " print(uncertain_matches)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "1bf5d4cc-a77f-46c1-93a5-92636c4a19b1", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "29" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "has_hw_catalog[has_hw_catalog['control_string'].str.contains(',')]['control_string'].apply(lambda x: len(x)).max()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b9fc869-d681-4612-b86d-e23963d1f239", "metadata": {}, "outputs": [], "source": []