diff --git a/Notebooks/Analyze_ALMA_export.ipynb b/Notebooks/Analyze_ALMA_export.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..9fbd5e674b2b87176f1c8b52d4230248d3204a83 --- /dev/null +++ b/Notebooks/Analyze_ALMA_export.ipynb @@ -0,0 +1,2889 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "fa2c0c2d-356b-4ea7-ac83-d1a0246b1139", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: openpyxl in /opt/conda/lib/python3.10/site-packages (3.1.2)\n", + "Requirement already satisfied: et-xmlfile in /opt/conda/lib/python3.10/site-packages (from openpyxl) (1.1.0)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install openpyxl" + ] + }, + { + "cell_type": "code", + "execution_count": 619, + "id": "6ec73b33-1f5d-4445-9d84-a85545cd5733", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import re\n", + "import requests\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 882, + "id": "4773a522-d048-4bc1-80b1-55e1d8c595a5", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "False\n" + ] + } + ], + "source": [ + "p = re.compile('Z[0-9X]+')\n", + "print(bool(p.search('+Z203753604')))\n", + "print(bool(p.search('B1575290')))\n", + "\n", + "def extract_valid_bc(bc):\n", + " pattern = 'Z[0-9X]+'\n", + " match = re.search(pattern, str(bc))\n", + " if match:\n", + " return match.group(0)\n", + " return None\n", + "\n", + "def get_iiif_manifest(bcs):\n", + " mans = []\n", + " for bc in bcs:\n", + " r = requests.get(f'https://iiif.onb.ac.at/presentation/ABO/{bc}/manifest/')\n", + " if r.status_code == 200:\n", + " man = r.content\n", + " mans.append(man)\n", + " return mans" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 169, + "id": "424debb0-4153-4b21-8ac5-b754d4df2008", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df_1 = pd.read_excel('data/catalogue/BE ANAv4.xlsx')\n", + "df_2 = pd.read_excel('data/catalogue/notBE-Eugeniana ANA.xlsx')\n", + "df_3 = pd.read_excel('data/catalogue/NichtBE-Eugeniana SW6xx-ME.xlsx')\n", + "df_4 = pd.read_excel('data/catalogue/BE ANA ohne Item.xlsx')\n", + "df_5 = pd.read_excel('data/catalogue/Signatur BE2 ME SW etc + ALMA.xlsx')" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "id": "c1a776c7-6b78-4a2e-aa28-eda36f9d35d1", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n", + "22412\n", + "Index([ 'Permanent Call Number', 992,\n", + " 993, 695,\n", + " 866, 'Library Code (Active)',\n", + " 'Location Code', 'Barcode',\n", + " 'Item Call Number', 'Title',\n", + " 'Author', 'Author (contributor)',\n", + " 'Begin Publication Date', 'End Publication Date',\n", + " 'Publication Date', 'Publication Place',\n", + " 'Publisher', 'Unnamed: 17',\n", + " 'MMS Id', 856,\n", + " 'Subjects', 'Subjects (Names)'],\n", + " dtype='object')\n", + "2\n", + "31\n", + "Index([ 'Permanent Call Number', 992,\n", + " 993, 695,\n", + " 866, 'Library Code (Active)',\n", + " 'Location Code', 'Barcode',\n", + " 'Item Call Number', 'Author',\n", + " 'Author (contributor)', 'Begin Publication Date',\n", + " 'MMS Id', 'Publication Date',\n", + " 'Publication Place', 'Publisher',\n", + " 'Title', 'End Publication Date',\n", + " 856, 'Subjects',\n", + " 'Subjects (Names)'],\n", + " dtype='object')\n", + "3\n", + "31\n", + "Index(['001', 600, 610, 611, 630, 648, 650, 651, 653, 655, 689], dtype='object')\n", + "4\n", + "22496\n", + "Index([ 'Permanent Call Number', '992 HOL',\n", + " '993 HOL', '695 HOL',\n", + " '866 HOL', 'Library Code (Active)',\n", + " 'Location Code', 'Author',\n", + " 'Author (contributor)', 'Title',\n", + " 'Begin Publication Date', 
'End Publication Date',\n", + " 'Publication Date', 'Publication Place',\n", + " 'Publisher', 856,\n", + " 'Subjects', 'Subjects (Names)',\n", + " 'Unnamed: 18', 'MMS Id'],\n", + " dtype='object')\n", + "5\n", + "20706\n", + "Index([ '001', 600,\n", + " 610, 611,\n", + " 630, 648,\n", + " 650, 651,\n", + " 653, '655',\n", + " '689', '130$a',\n", + " '245$a', '245$c',\n", + " '500$a', '751$a',\n", + " '982$z', 'Type / Creator / Imprint',\n", + " 'Unnamed: 18', 'Subject',\n", + " 'Title', 'Series',\n", + " 'Availability', 'Modification Date',\n", + " 'Edition', 'Record number',\n", + " 'Language', 'MMS ID'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "for i in range(1, 6):\n", + " print(i)\n", + " print(len(eval(f'df_{i}')))\n", + " print(eval(f'df_{i}').columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 379, + "id": "2376519e-bac0-44c0-9365-542845f3a6bc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 740, + "id": "0a39cd9f-8687-4f82-84b0-836fe40ac366", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "merge_23 = df_2.merge(df_3, how='outer', left_on='MMS Id', right_on='001')\n", + "merge_23['Begin Publication Date'] = merge_23['Begin Publication Date'].astype(object)\n", + "merge_23['End Publication Date'] = merge_23['End Publication Date'].astype(object)\n", + "merge_23['Publication Date'] = merge_23['Publication Date'].astype(object)\n", + "\n", + "merge_123 = df_1.merge(merge_23, how='outer', on=['Begin Publication Date', 'End Publication Date', 'Publication Date', 'Barcode', 'Permanent Call Number', 992, 695, 'Library Code (Active)', 'Location Code', 'Title', 'Author', 'Author (contributor)', 'Publication Place', 'Publisher', 'MMS Id', 856, 'Subjects'])\n", + "merge_45 = df_4.merge(df_5, how='outer', left_on='MMS Id', right_on='MMS ID')\n", + "merge_45['Title'] = merge_45.apply(lambda x: combine_entries(x['Title_x'], x['Title_y']), axis=1)\n", 
+ "merge_12345 = merge_45.merge(merge_123, how='outer', on=['Begin Publication Date', 'End Publication Date', 'Publication Date', 'Permanent Call Number', 'Library Code (Active)', 'Location Code', 'Title', 'Author', 'Author (contributor)', 'Publication Place', 'Publisher', 'MMS Id', 856, 'Subjects'])" + ] + }, + { + "cell_type": "code", + "execution_count": 745, + "id": "018d5635-7b0a-496f-97a7-44e04afe4c39", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index([ 'Permanent Call Number', '992 HOL',\n", + " '993 HOL', '695 HOL',\n", + " '866 HOL', 'Library Code (Active)',\n", + " 'Location Code', 'Author',\n", + " 'Author (contributor)', 'Title_x',\n", + " 'Begin Publication Date', 'End Publication Date',\n", + " 'Publication Date', 'Publication Place',\n", + " 'Publisher', 856,\n", + " 'Subjects', 'Subjects (Names)',\n", + " 'MMS Id', '001_x',\n", + " '600_x', '610_x',\n", + " '650_x', '653_x',\n", + " '655', '689',\n", + " '130$a', '245$a',\n", + " '245$c', '500$a',\n", + " '751$a', '982$z',\n", + " 'Type / Creator / Imprint', 'Unnamed: 18_y',\n", + " 'Subject', 'Title_y',\n", + " 'Series', 'Availability',\n", + " 'Modification Date', 'Edition',\n", + " 'Record number', 'Language',\n", + " 'MMS ID', 'Title',\n", + " 992, '993_x',\n", + " 695, '866_x',\n", + " 'Barcode', 'Item Call Number_x',\n", + " 'Subjects (Names)_x', '001_y',\n", + " 655, 689],\n", + " dtype='object')" + ] + }, + "execution_count": 745, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merge_12345_dropna = merge_12345.dropna(axis=1, how='all', ignore_index=True)\n", + "merge_12345_dropna.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 746, + "id": "d8b8ed91-fb6b-4eb8-87f3-2bb45f32c144", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['001_x',\n", + " '001_y',\n", + " '130$a',\n", + " '245$a',\n", + " '245$c',\n", + " '500$a',\n", + " '600_x',\n", + " '610_x',\n", + " 
'650_x',\n", + " '653_x',\n", + " '655',\n", + " '655',\n", + " '689',\n", + " '689',\n", + " '695',\n", + " '695 HOL',\n", + " '751$a',\n", + " '856',\n", + " '866 HOL',\n", + " '866_x',\n", + " '982$z',\n", + " '992',\n", + " '992 HOL',\n", + " '993 HOL',\n", + " '993_x',\n", + " 'Author',\n", + " 'Author (contributor)',\n", + " 'Availability',\n", + " 'Barcode',\n", + " 'Begin Publication Date',\n", + " 'Edition',\n", + " 'End Publication Date',\n", + " 'Item Call Number_x',\n", + " 'Language',\n", + " 'Library Code (Active)',\n", + " 'Location Code',\n", + " 'MMS ID',\n", + " 'MMS Id',\n", + " 'Modification Date',\n", + " 'Permanent Call Number',\n", + " 'Publication Date',\n", + " 'Publication Place',\n", + " 'Publisher',\n", + " 'Record number',\n", + " 'Series',\n", + " 'Subject',\n", + " 'Subjects',\n", + " 'Subjects (Names)',\n", + " 'Subjects (Names)_x',\n", + " 'Title',\n", + " 'Title_x',\n", + " 'Title_y',\n", + " 'Type / Creator / Imprint',\n", + " 'Unnamed: 18_y']" + ] + }, + "execution_count": 746, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted([ 'Permanent Call Number', '992 HOL',\n", + " '993 HOL', '695 HOL',\n", + " '866 HOL', 'Library Code (Active)',\n", + " 'Location Code', 'Author',\n", + " 'Author (contributor)', 'Title_x',\n", + " 'Begin Publication Date', 'End Publication Date',\n", + " 'Publication Date', 'Publication Place',\n", + " 'Publisher', '856',\n", + " 'Subjects', 'Subjects (Names)',\n", + " 'MMS Id', '001_x',\n", + " '600_x', '610_x',\n", + " '650_x', '653_x',\n", + " '655', '689',\n", + " '130$a', '245$a',\n", + " '245$c', '500$a',\n", + " '751$a', '982$z',\n", + " 'Type / Creator / Imprint', 'Unnamed: 18_y',\n", + " 'Subject', 'Title_y',\n", + " 'Series', 'Availability',\n", + " 'Modification Date', 'Edition',\n", + " 'Record number', 'Language',\n", + " 'MMS ID', 'Title',\n", + " '992', '993_x',\n", + " '695', '866_x',\n", + " 'Barcode', 'Item Call Number_x',\n", + " 'Subjects (Names)_x', 
'001_y',\n", + " '655', '689'])" + ] + }, + { + "cell_type": "code", + "execution_count": 963, + "id": "4868e5a1-cdb3-4b3c-9d1d-248cadbcb4a0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def combine_ids(lis):\n", + " int_lis = []\n", + " for e in lis:\n", + " if not np.isnan(e):\n", + " int_lis.append(int(e))\n", + " return list(set(int_lis))[0]\n", + "\n", + "def combine_entries(a, b):\n", + " if a == b:\n", + " return a\n", + " else:\n", + " if isinstance(a, str) and isinstance(b, str):\n", + " return a + b\n", + " elif isinstance(a, str):\n", + " return a\n", + " else:\n", + " return b\n", + "\n", + "def get_signature(a, b):\n", + " if isinstance(a, str):\n", + " return a\n", + " else:\n", + " match = re.search('BE.*;|\\d{6}.*;', b)\n", + " if match:\n", + " return match.group(0)\n", + " else:\n", + " return a\n", + "\n", + "merge_IDs = merge_12345_dropna.apply(lambda x: combine_ids([x['001_x'], x['001_y'], x['MMS Id'], x['MMS ID']]), axis=1)\n", + "merge_655 = merge_12345_dropna.apply(lambda x: combine_entries(x[655], x['655']), axis=1)\n", + "merge_689 = merge_12345_dropna.apply(lambda x: combine_entries(x[689], x['689']), axis=1)\n", + "merge_695 = merge_12345_dropna.apply(lambda x: combine_entries(x[695], x['695 HOL']), axis=1)\n", + "merge_866 = merge_12345_dropna.apply(lambda x: combine_entries(x['866 HOL'], x['866_x']), axis=1)\n", + "merge_992 = merge_12345_dropna.apply(lambda x: combine_entries(x[992], x['992 HOL']), axis=1)\n", + "merge_993 = merge_12345_dropna.apply(lambda x: combine_entries(x['993_x'], x['993 HOL']), axis=1)\n", + "merge_992_993 = pd.concat([merge_992, merge_993], axis=1)\n", + "merge_992_993['Bemerkungen'] = merge_992_993.apply(lambda x: combine_entries(x[0], x[1]), axis=1)\n", + "merge_subject_names = merge_12345_dropna.apply(lambda x: combine_entries(x['Subjects (Names)'], x['Subjects (Names)_x']), axis=1)\n", + "merge_begin_date = merge_12345_dropna['Begin Publication Date'][(merge_12345_dropna['Begin 
Publication Date'] != '####') & (merge_12345_dropna['Begin Publication Date'] != '9999')]\n", + "merge_end_date = merge_12345_dropna['End Publication Date'][(merge_12345_dropna['End Publication Date'] != '####') & (merge_12345_dropna['End Publication Date'] != '9999')]\n", + "merge_date = merge_12345_dropna['Publication Date'][(merge_12345_dropna['Publication Date'] != '####') & (merge_12345_dropna['Publication Date'] != '9999')]\n", + "merge_signatur = merge_12345_dropna.apply(lambda x: get_signature(x['Permanent Call Number'], x['Availability']), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 999, + "id": "c5813ff3-1734-4273-b49b-3f5d4bc6964f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "c_df = pd.concat([merge_signatur,\n", + " merge_12345_dropna['Barcode'],\n", + " merge_12345_dropna['Title'], \n", + " merge_12345_dropna['Author'], \n", + " merge_12345_dropna['Author (contributor)'], \n", + " merge_begin_date, \n", + " merge_end_date, \n", + " merge_date, \n", + " merge_12345_dropna['Publication Place'], \n", + " merge_12345_dropna['751$a'],\n", + " merge_12345_dropna['Subjects'],\n", + " merge_689,\n", + " merge_695,\n", + " merge_655,\n", + " merge_992_993['Bemerkungen'],\n", + " ], axis=1)\n", + "c_df.rename(columns={0: 'Signatur', 'Title': 'Titel', 'Author': 'Autor', 'Author (contributor)': 'Mitwirkender', 'Begin Publication Date': 'Anfang Veröffentlichungsdatum', 'End Publication Date': 'Ende Veröffentlichungsdatum', 'Publication Date': 'Veröffentlichungsdatum', 'Publication Place': 'Veröffentlichungsort', '751$a': 'Veröffentlichungsort (normiert)', 'Subjects': 'Schlagwörter', 1: 'Schlagwörter (mit GND)', 2: 'Vorbesitzer', 3: 'Typ'}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 1000, + "id": "43bf5dc7-b9e0-40e9-9e27-244ed09e508f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SignaturBarcodeTitelAutorMitwirkenderAnfang VeröffentlichungsdatumEnde VeröffentlichungsdatumVeröffentlichungsdatumVeröffentlichungsortVeröffentlichungsort (normiert)SchlagwörterSchlagwörter (mit GND)VorbesitzerTypBemerkungenvalid_bc
0BE.1.A.13537581-20Flora Napolitana, ossia descrizione delle pian...Tenore, MicheleNaN181118361811-36NapoliNaNNeapel; PflanzenNeapel$Dg--(DE-588)4041476-0;Pflanzen$Ds--(DE-...NaNNaNNaNNone
1BE.1.C.10B1711164Flora Napolitana, ossia descrizione delle pian...Tenore, MicheleNaN181118361811-36NapoliNaNNeapel; PflanzenNeapel$Dg--(DE-588)4041476-0;Pflanzen$Ds--(DE-...NaNNaNNaNNone
2BE.1.A.10B1214205Antiquites du Bosphore Cimmerien conservees au...NaNGosudarstvennyj ĖrmitažNaNNaNNaNSt. PetersburgNaNSankt Petersburg; Museum; Eremitage; Straße vo...Antiquität$Ds--(DE-588)4002325-4;Straße von Ke...NaNNaNNaNNone
3BE.1.A.11B1475984Musee des antiques dessine et grave par P. Bou...Bouillon, PierreSaint-Victor, Jacques Maximilien Renjamin Bins de181118271811-1827ParisNaNAltertümerAltertümer$Ds--(DE-588)4201096-2;AT-OBV--ONB-AKNaNNaNNaNNone
4BE.1.A.12B1771764Divers works of early masters in christian dec...Weale, JohnNaN1846NaN1846LondonNaNDekoration; ChristentumDekoration$Ds--(DE-588)4149033-2;Christentum$D...NaNNaNNaNNone
...................................................
22638Ink 7.E.11566517-10Biblia Mit Postilla litteralis von Nicolaus de...NaNKoberger, Anton1485NaN7. Mai 1485; [1485.05.07]NürnbergNaNInkunabelInkunabel$Af$$a Savoyen-Carignan, Eugen <von>; $$b [Vorbes...NaN$$c Rote Rubriken und Lombarden (Punktverdicku...None
22639Ink 8.E.26+Z158726101La nef des fols du monde Aus dem Lat. des Jako...Brant, Sebastian 1458-1521Jean Lambert; Rivière, Pierre -14991497NaN[nicht vor Dez.] 1497ParisNaNInkunabelInkunabel$Af$$a Savoyen-Carignan, Eugen <von>; $$b [Vorbes...NaN$$d Barock-Einband für Eugen von Savoyen: rote...Z158726101
22640Ink 9.D.5+Z35095803Thesaurus Cornu copiae et Horti Adonidis <grie...NaNManuzio, Aldo Pio; Bolzanio, Urbano 1443-1524;...1496NaNAug. 1496; [1496.08]VenedigNaNInkunabelInkunabel$Af$$a Savoyen-Carignan, Eugen <von>; $$b [Vorbes...NaN$$f Ink 9.D.5; $$m Vereinzelt griechische Marg...Z35095803
22641Ink 9.F.22+Z96101306Comoediae ; Francesco Petrarca: Vita TerentiiTerentius Afer, Publius v195-v159Zarotto, Antonio1476NaN23. Feb. 1476; [1476.02.23]MailandNaNInkunabelInkunabel$Af$$a Wappen; $$b [Vorbesitzer, 15./16. Jh?]; $$...NaN$$c Wappenschild (Bl.3a, a1a) mit Wasserfarben...Z96101306
22642Ink 9.F.51460328-10Biblia ; Interpretationes Hebraicorum nominumNaNWild, Leonhard1481NaN1481VenedigNaNInkunabelInkunabel$Af$$a Savoyen-Carignan, Eugen <von>; $$b [Vorbes...NaN$$c Rote und blaue Lombarden (verwischt), rote...None
\n", + "

22643 rows × 16 columns

\n", + "
" + ], + "text/plain": [ + " Signatur Barcode \n", + "0 BE.1.A.1 3537581-20 \\\n", + "1 BE.1.C.10 B1711164 \n", + "2 BE.1.A.10 B1214205 \n", + "3 BE.1.A.11 B1475984 \n", + "4 BE.1.A.12 B1771764 \n", + "... ... ... \n", + "22638 Ink 7.E.1 1566517-10 \n", + "22639 Ink 8.E.26 +Z158726101 \n", + "22640 Ink 9.D.5 +Z35095803 \n", + "22641 Ink 9.F.22 +Z96101306 \n", + "22642 Ink 9.F.5 1460328-10 \n", + "\n", + " Titel \n", + "0 Flora Napolitana, ossia descrizione delle pian... \\\n", + "1 Flora Napolitana, ossia descrizione delle pian... \n", + "2 Antiquites du Bosphore Cimmerien conservees au... \n", + "3 Musee des antiques dessine et grave par P. Bou... \n", + "4 Divers works of early masters in christian dec... \n", + "... ... \n", + "22638 Biblia Mit Postilla litteralis von Nicolaus de... \n", + "22639 La nef des fols du monde Aus dem Lat. des Jako... \n", + "22640 Thesaurus Cornu copiae et Horti Adonidis ; $$b [Vorbes... NaN \n", + "22639 $$a Savoyen-Carignan, Eugen ; $$b [Vorbes... NaN \n", + "22640 $$a Savoyen-Carignan, Eugen ; $$b [Vorbes... NaN \n", + "22641 $$a Wappen; $$b [Vorbesitzer, 15./16. Jh?]; $$... NaN \n", + "22642 $$a Savoyen-Carignan, Eugen ; $$b [Vorbes... NaN \n", + "\n", + " Bemerkungen valid_bc \n", + "0 NaN None \n", + "1 NaN None \n", + "2 NaN None \n", + "3 NaN None \n", + "4 NaN None \n", + "... ... ... \n", + "22638 $$c Rote Rubriken und Lombarden (Punktverdicku... None \n", + "22639 $$d Barock-Einband für Eugen von Savoyen: rote... Z158726101 \n", + "22640 $$f Ink 9.D.5; $$m Vereinzelt griechische Marg... Z35095803 \n", + "22641 $$c Wappenschild (Bl.3a, a1a) mit Wasserfarben... Z96101306 \n", + "22642 $$c Rote und blaue Lombarden (verwischt), rote... 
None \n", + "\n", + "[22643 rows x 16 columns]" + ] + }, + "execution_count": 1000, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c_df['valid_bc'] = c_df['Barcode'].apply(lambda x: extract_valid_bc(x))\n", + "c_df" + ] + }, + { + "cell_type": "code", + "execution_count": 988, + "id": "ea675efe-3005-4052-9290-ec4b3262b7bb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "final = pd.read_csv('data/predictions/final_predictions.csv')\n", + "final['valid_bc'] = final['filename'].apply(lambda x: extract_valid_bc(x))\n", + "final_color_preds = final.drop_duplicates(subset='valid_bc')" + ] + }, + { + "cell_type": "code", + "execution_count": 1004, + "id": "8d681b59-7f88-4747-9923-11223e79305d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "merged_col_pred = c_df.merge(final_color_preds, how='left', on='valid_bc')" + ] + }, + { + "cell_type": "code", + "execution_count": 209, + "id": "3ad5557e-e3c9-4505-8589-79f2b1118aa6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pred = pd.read_csv('data/predictions/combined_predictions.csv')\n", + "pred = pred.drop('Unnamed: 0', axis=1)\n", + "pred['valid_bc'] = pred['filename'].apply(lambda x: extract_valid_bc(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 1057, + "id": "f9536ee4-4f27-4047-b4dc-cdac170b8400", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pred_drop = pred.drop_duplicates(subset=['prediction', 'valid_bc'])" + ] + }, + { + "cell_type": "code", + "execution_count": 1065, + "id": "6906dc88-62e9-4adb-9ca1-e22c4de78c13", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "20574" + ] + }, + "execution_count": 1065, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred_drop.drop_duplicates('valid_bc')['valid_bc'].isin(c_df['valid_bc']).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 1080, + "id": 
"ce8c0d9b-3b6e-4766-9e25-403ee3bb3aed", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "supralibros_bc = pred[pred['prediction'] != 'N']['valid_bc']\n", + "neg_preds = pred[~pred['valid_bc'].isin(supralibros_bc)].drop_duplicates('valid_bc')" + ] + }, + { + "cell_type": "code", + "execution_count": 1095, + "id": "109706d0-4a93-4df4-8682-be91044c2446", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "all_predictions = pd.concat([final_color_preds, neg_preds]).drop(['years', 'man_prediction', 'man_color'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 1101, + "id": "5a608b2a-de63-4eb5-b2d3-48eaeccfb9a6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cat_pred_comb = c_df.merge(all_predictions, how='left', on='valid_bc')" + ] + }, + { + "cell_type": "code", + "execution_count": 1108, + "id": "b9ff69db-bea1-47c8-92ac-8f11176d4fca", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Signatur', 'Barcode', 'Titel', 'Autor', 'Mitwirkender',\n", + " 'Anfang Veröffentlichungsdatum', 'Ende Veröffentlichungsdatum',\n", + " 'Veröffentlichungsdatum', 'Veröffentlichungsort',\n", + " 'Veröffentlichungsort (normiert)', 'Schlagwörter',\n", + " 'Schlagwörter (mit GND)', 'Vorbesitzer', 'Typ', 'Bemerkungen',\n", + " 'valid_bc', 'filename', 'prediction', 'p_A', 'p_B', 'p_C', 'p_N',\n", + " 'color', 'p_blue', 'p_red', 'p_yellow'],\n", + " dtype='object')" + ] + }, + "execution_count": 1108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat_pred_comb.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 1111, + "id": "fdde2770-b2cb-402f-aa55-572239be15f0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cat_pred_comb.rename(columns={'valid_bc': 'Gültiger Barcode', 'filename': 'Dateiname', 'prediction': 'Wappenklassifizierung', 'color': 'Farbklassifizierung'}, inplace=True)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 1183, + "id": "3e2838ff-c5d9-49dc-817c-cc1ec9393f61", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SignaturBarcodeTitelAutorMitwirkenderAnfang VeröffentlichungsdatumEnde VeröffentlichungsdatumVeröffentlichungsdatumVeröffentlichungsortVeröffentlichungsort (normiert)...DateinameWappenklassifizierungp_Ap_Bp_Cp_NFarbklassifizierungp_bluep_redp_yellow
0BE.1.A.13537581-20Flora Napolitana, ossia descrizione delle pian...Tenore, MicheleNaN181118361811-36NapoliNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1BE.1.C.10B1711164Flora Napolitana, ossia descrizione delle pian...Tenore, MicheleNaN181118361811-36NapoliNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2BE.1.A.10B1214205Antiquites du Bosphore Cimmerien conservees au...NaNGosudarstvennyj ĖrmitažNaNNaNNaNSt. PetersburgNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3BE.1.A.11B1475984Musee des antiques dessine et grave par P. Bou...Bouillon, PierreSaint-Victor, Jacques Maximilien Renjamin Bins de181118271811-1827ParisNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4BE.1.A.12B1771764Divers works of early masters in christian dec...Weale, JohnNaN1846NaN1846LondonNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
..................................................................
22638Ink 7.E.11566517-10Biblia Mit Postilla litteralis von Nicolaus de...NaNKoberger, Anton1485NaN7. Mai 1485; [1485.05.07]NürnbergNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22639Ink 8.E.26+Z158726101La nef des fols du monde Aus dem Lat. des Jako...Brant, Sebastian 1458-1521Jean Lambert; Rivière, Pierre -14991497NaN[nicht vor Dez.] 1497ParisNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22640Ink 9.D.5+Z35095803Thesaurus Cornu copiae et Horti Adonidis <grie...NaNManuzio, Aldo Pio; Bolzanio, Urbano 1443-1524;...1496NaNAug. 1496; [1496.08]VenedigNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22641Ink 9.F.22+Z96101306Comoediae ; Francesco Petrarca: Vita TerentiiTerentius Afer, Publius v195-v159Zarotto, Antonio1476NaN23. Feb. 1476; [1476.02.23]MailandNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22642Ink 9.F.51460328-10Biblia ; Interpretationes Hebraicorum nominumNaNWild, Leonhard1481NaN1481VenedigNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

22643 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " Signatur Barcode \n", + "0 BE.1.A.1 3537581-20 \\\n", + "1 BE.1.C.10 B1711164 \n", + "2 BE.1.A.10 B1214205 \n", + "3 BE.1.A.11 B1475984 \n", + "4 BE.1.A.12 B1771764 \n", + "... ... ... \n", + "22638 Ink 7.E.1 1566517-10 \n", + "22639 Ink 8.E.26 +Z158726101 \n", + "22640 Ink 9.D.5 +Z35095803 \n", + "22641 Ink 9.F.22 +Z96101306 \n", + "22642 Ink 9.F.5 1460328-10 \n", + "\n", + " Titel \n", + "0 Flora Napolitana, ossia descrizione delle pian... \\\n", + "1 Flora Napolitana, ossia descrizione delle pian... \n", + "2 Antiquites du Bosphore Cimmerien conservees au... \n", + "3 Musee des antiques dessine et grave par P. Bou... \n", + "4 Divers works of early masters in christian dec... \n", + "... ... \n", + "22638 Biblia Mit Postilla litteralis von Nicolaus de... \n", + "22639 La nef des fols du monde Aus dem Lat. des Jako... \n", + "22640 Thesaurus Cornu copiae et Horti Adonidis \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SignaturBarcodeTitelAutorMitwirkenderAnfang VeröffentlichungsdatumEnde VeröffentlichungsdatumVeröffentlichungsdatumVeröffentlichungsortVeröffentlichungsort (normiert)...DateinameWappenklassifizierungp_Ap_Bp_Cp_NFarbklassifizierungp_bluep_redp_yellow
659BE.1.N.75*+Z167910202Postila to est, kratko istlmačenje vsih' nedel...NaNChristoph Württemberg, Herzog 1515-1568; Consu...1562NaN1562V Tubingi [Urach]Tübingen;Urach...Z167910202_00000001.jpgN0.0005090.0000980.0009440.998449NaNNaNNaNNaN
\n", + "

1 rows × 27 columns

\n", + "" + ], + "text/plain": [ + " Signatur Barcode \n", + "659 BE.1.N.75* +Z167910202 \\\n", + "\n", + " Titel Autor \n", + "659 Postila to est, kratko istlmačenje vsih' nedel... NaN \\\n", + "\n", + " Mitwirkender \n", + "659 Christoph Württemberg, Herzog 1515-1568; Consu... \\\n", + "\n", + " Anfang Veröffentlichungsdatum Ende Veröffentlichungsdatum \n", + "659 1562 NaN \\\n", + "\n", + " Veröffentlichungsdatum Veröffentlichungsort \n", + "659 1562 V Tubingi [Urach] \\\n", + "\n", + " Veröffentlichungsort (normiert) ... Dateiname \n", + "659 Tübingen;Urach ... Z167910202_00000001.jpg \\\n", + "\n", + " Wappenklassifizierung p_A p_B p_C p_N \n", + "659 N 0.000509 0.000098 0.000944 0.998449 \\\n", + "\n", + " Farbklassifizierung p_blue p_red p_yellow \n", + "659 NaN NaN NaN NaN \n", + "\n", + "[1 rows x 27 columns]" + ] + }, + "execution_count": 1242, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat_pred_comb[cat_pred_comb['Gültiger Barcode'] == 'Z167910202']" + ] + }, + { + "cell_type": "code", + "execution_count": 1200, + "id": "6423f5bb-d52a-4b49-b91e-1d085daf70fd", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SignaturBarcodeTitelAutorMitwirkenderAnfang VeröffentlichungsdatumEnde VeröffentlichungsdatumVeröffentlichungsdatumVeröffentlichungsortVeröffentlichungsort (normiert)...DateinameWappenklassifizierungp_Ap_Bp_Cp_NFarbklassifizierungp_bluep_redp_yellow
\n", + "

0 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [Signatur, Barcode, Titel, Autor, Mitwirkender, Anfang Veröffentlichungsdatum, Ende Veröffentlichungsdatum, Veröffentlichungsdatum, Veröffentlichungsort, Veröffentlichungsort (normiert), Schlagwörter, Schlagwörter (mit GND), Vorbesitzer, Typ, Bemerkungen, Gültiger Barcode, Dateiname, Wappenklassifizierung, p_A, p_B, p_C, p_N, Farbklassifizierung, p_blue, p_red, p_yellow]\n", + "Index: []\n", + "\n", + "[0 rows x 26 columns]" + ] + }, + "execution_count": 1200, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat_pred_comb.dropna(subset='Dateiname')[cat_pred_comb.dropna(subset='Dateiname').duplicated('Dateiname', keep=False)]" + ] + }, + { + "cell_type": "code", + "execution_count": 1244, + "id": "c6428a9b-2b1f-4bc8-967f-4314fbd06bee", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SignaturBarcodeTitelAutorMitwirkenderAnfang VeröffentlichungsdatumEnde VeröffentlichungsdatumVeröffentlichungsdatumVeröffentlichungsortVeröffentlichungsort (normiert)SpracheSchlagwörter
0BE.1.A.13537581-20Flora Napolitana, ossia descrizione delle pian...Tenore, MicheleNaN181118361811-36NapoliNaNItalianNeapel; Pflanzen
1BE.1.C.10B1711164Flora Napolitana, ossia descrizione delle pian...Tenore, MicheleNaN181118361811-36NapoliNaNItalianNeapel; Pflanzen
2BE.1.A.10B1214205Antiquites du Bosphore Cimmerien conservees au...NaNGosudarstvennyj ĖrmitažNaNNaNNaNSt. PetersburgNaNunknownSankt Petersburg; Museum; Eremitage; Straße vo...
3BE.1.A.11B1475984Musee des antiques dessine et grave par P. Bou...Bouillon, PierreSaint-Victor, Jacques Maximilien Renjamin Bins de181118271811-1827ParisNaNFrenchAltertümer
4BE.1.A.12B1771764Divers works of early masters in christian dec...Weale, JohnNaN1846NaN1846LondonNaNEnglishDekoration; Christentum
5BE.1.A.13B986740Colonna Traiana eretta dal senato, e popolo Ro...Bartoli, Pietro Santo 1635-1700Chacón, Alfonso 1540-15991751NaN1751RomaNaNItalianTrajanssäule--Rom; Trajan--Römisches Reich, Ka...
6BE.1.A.14B998501Columna Cochlis M. Aurelio Antonio Augusto dicataBellori, Giovanni PietroNaN1704NaN1704RomaNaNItalianMark Aurel--Römisches Reich, Kaiser--121-180; ...
7BE.1.A.15B1581441A new collection of chimney pieces. Ornamented...Richardson, GeorgeNaN1781NaN1781LondonNaNEnglishAquatinta; Schornstein
8BE.1.A.16B1417347Raccolta Di Statve Antiche E Moderne Data In L...NaNMaffei, Paolo Alessandro 1653-1716; Rossi, Dom...1704NaN1704RomaNaNItalianPlastik
9BE.1.A.17B992390Le Grand Porte-Feuille Politique A l'usage des...Beaufort, Louis <<de>> 1703-1795Maradan, Claude François; Selbstverl.; Beaufor...1789NaN1789A ParisParisFrenchFranzösische Revolution; Bild
\n", + "
" + ], + "text/plain": [ + " Signatur Barcode Titel \n", + "0 BE.1.A.1 3537581-20 Flora Napolitana, ossia descrizione delle pian... \\\n", + "1 BE.1.C.10 B1711164 Flora Napolitana, ossia descrizione delle pian... \n", + "2 BE.1.A.10 B1214205 Antiquites du Bosphore Cimmerien conservees au... \n", + "3 BE.1.A.11 B1475984 Musee des antiques dessine et grave par P. Bou... \n", + "4 BE.1.A.12 B1771764 Divers works of early masters in christian dec... \n", + "5 BE.1.A.13 B986740 Colonna Traiana eretta dal senato, e popolo Ro... \n", + "6 BE.1.A.14 B998501 Columna Cochlis M. Aurelio Antonio Augusto dicata \n", + "7 BE.1.A.15 B1581441 A new collection of chimney pieces. Ornamented... \n", + "8 BE.1.A.16 B1417347 Raccolta Di Statve Antiche E Moderne Data In L... \n", + "9 BE.1.A.17 B992390 Le Grand Porte-Feuille Politique A l'usage des... \n", + "\n", + " Autor \n", + "0 Tenore, Michele \\\n", + "1 Tenore, Michele \n", + "2 NaN \n", + "3 Bouillon, Pierre \n", + "4 Weale, John \n", + "5 Bartoli, Pietro Santo 1635-1700 \n", + "6 Bellori, Giovanni Pietro \n", + "7 Richardson, George \n", + "8 NaN \n", + "9 Beaufort, Louis <> 1703-1795 \n", + "\n", + " Mitwirkender \n", + "0 NaN \\\n", + "1 NaN \n", + "2 Gosudarstvennyj Ėrmitaž \n", + "3 Saint-Victor, Jacques Maximilien Renjamin Bins de \n", + "4 NaN \n", + "5 Chacón, Alfonso 1540-1599 \n", + "6 NaN \n", + "7 NaN \n", + "8 Maffei, Paolo Alessandro 1653-1716; Rossi, Dom... \n", + "9 Maradan, Claude François; Selbstverl.; Beaufor... \n", + "\n", + " Anfang Veröffentlichungsdatum Ende Veröffentlichungsdatum \n", + "0 1811 1836 \\\n", + "1 1811 1836 \n", + "2 NaN NaN \n", + "3 1811 1827 \n", + "4 1846 NaN \n", + "5 1751 NaN \n", + "6 1704 NaN \n", + "7 1781 NaN \n", + "8 1704 NaN \n", + "9 1789 NaN \n", + "\n", + " Veröffentlichungsdatum Veröffentlichungsort Veröffentlichungsort (normiert) \n", + "0 1811-36 Napoli NaN \\\n", + "1 1811-36 Napoli NaN \n", + "2 NaN St. 
Petersburg NaN \n", + "3 1811-1827 Paris NaN \n", + "4 1846 London NaN \n", + "5 1751 Roma NaN \n", + "6 1704 Roma NaN \n", + "7 1781 London NaN \n", + "8 1704 Roma NaN \n", + "9 1789 A Paris Paris \n", + "\n", + " Sprache Schlagwörter \n", + "0 Italian Neapel; Pflanzen \n", + "1 Italian Neapel; Pflanzen \n", + "2 unknown Sankt Petersburg; Museum; Eremitage; Straße vo... \n", + "3 French Altertümer \n", + "4 English Dekoration; Christentum \n", + "5 Italian Trajanssäule--Rom; Trajan--Römisches Reich, Ka... \n", + "6 Italian Mark Aurel--Römisches Reich, Kaiser--121-180; ... \n", + "7 English Aquatinta; Schornstein \n", + "8 Italian Plastik \n", + "9 French Französische Revolution; Bild " + ] + }, + "execution_count": 1244, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat_pred_comb.iloc[0:10,0:12]" + ] + }, + { + "cell_type": "code", + "execution_count": 1250, + "id": "e450dd3d-1d5f-4b52-a52c-b3d0ad48515a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SignaturBarcodeTitelAutorMitwirkenderAnfang VeröffentlichungsdatumEnde VeröffentlichungsdatumVeröffentlichungsdatumVeröffentlichungsortVeröffentlichungsort (normiert)...DateinameWappenklassifizierungp_Ap_Bp_Cp_NFarbklassifizierungp_bluep_redp_yellow
22620Ink 2.D.15+Z158717306Ars memorandi per figuras evangelistarumNaNNaN1470NaN[um 1470][Süddeutschland]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22621Ink 3.H.31+Z173262908De consuetudinibus et conditionibus orientaliu...Polo, Marco 1254-1324Leeu, Gerard1483NaN[zwischen 1483 und 11. Juni 1484][Gouda]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22622Ink 3.H.8+Z158731704Herbarium Apulei Mit Widmungsbrief des Drucker...NaNLignamine, Johannes Philippus de1482NaN[um 1481-82][Rom]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22623Ink 4.A.21638836-20Biblia Mit Glossa ordinaria und InterlinearglosseNaNRusch, Adolf1480NaN[nicht nach 1480][Straßburg]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22624Ink 4.C.31605764-10Biblia ; Menardus Monachus: Generalis et compe...NaNWinters, Konrad1479NaN20. Sept. 1479; [1479.09.20]KölnNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22625Ink 4.E.4+Z158736209Elegantiae linguae latinae Mit Brief des Autor...Valla, Lorenzo 1407-1457Jenson, Nicolas1471NaN[vor Juli] 1471VenedigNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22626Ink 4.E.9+Z158719005Hypnerotomachia Poliphili Mit lat. Widmungsbri...Colonna, Francesco 1433-1527Manuzio, Aldo Pio 1450-15151499NaNDezember 1499; [1499.12]VenedigNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22627Ink 4.G.19+Z158745004Biblia ; Interpretationes Hebraicorum nominumNaNRenner, Franz1480NaN1480VenedigNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22628Ink 4.H.44+Z15872590XConcordantia astronomie cum theologia. Concord...Peter von Ailly, Kardinal 1351-1420Ratdolt, Erhard1490NaN2. Jan. 1490; [1490.01.02]AugsburgNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22629Ink 4.H.451444073-10Biblia ; Interpretationes Hebraicorum nominumNaNNaN1480NaN31. Mai 1480; [1480.05.31]VenedigNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22630Ink 4.H.58+Z158747906Coronica del cid ruy diazNaNNaN1498NaNMai 1498SevillaNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22631Ink 5.B.10+Z158733907Biblia <dt.> Übers. aus dem Lateinischen. Mit ...NaNKoberger, Anton1483NaN17. Feb. 1483; [1483.02.17]NürnbergNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22632Ink 5.C.121571736-10BibliaNaNSchöffer, Peter (der Ältere)1472NaN23. Feb. 1472MainzNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22633Ink 5.C.9+Z173284400Historiae Romanae decades Mit den Periochae. M...Livius, Titus v59-17NaN1470NaN1470[Venedig]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22634Ink 5.E.4+Z158727300DecameroneBoccaccio, Giovanni 1313-1375Zarotto, Antonio1476NaN1476MailandNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22635Ink 6.B.21639891-10Biblia <ital.> Übers. aus dem LateinischenNaNAmmergau, Adam von1471NaN1. Okt. 1471[Venedig]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22636Ink 6.E.11+Z158731601De materia medica <lat.> Mit Glossen und Zusät...Dioscorides, Pedanius 40-90NaN1478NaNJuli 1478; [1478.07]ColleNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22637Ink 7.D.16+Z95540203Mathesis Mit Widmungsbrief an Kardinal Ippolit...Firmicus Maternus, Iulius ca. 335/350Manuzio, Aldo Pio 1450-1515; Guidobaldo I. Urb...1499NaN1499; Juni (P. 1), Okt. 1499 (P. 2)VenedigNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22638Ink 7.E.11566517-10Biblia Mit Postilla litteralis von Nicolaus de...NaNKoberger, Anton1485NaN7. Mai 1485; [1485.05.07]NürnbergNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22639Ink 8.E.26+Z158726101La nef des fols du monde Aus dem Lat. des Jako...Brant, Sebastian 1458-1521Jean Lambert; Rivière, Pierre -14991497NaN[nicht vor Dez.] 1497ParisNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22640Ink 9.D.5+Z35095803Thesaurus Cornu copiae et Horti Adonidis <grie...NaNManuzio, Aldo Pio; Bolzanio, Urbano 1443-1524;...1496NaNAug. 1496; [1496.08]VenedigNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22641Ink 9.F.22+Z96101306Comoediae ; Francesco Petrarca: Vita TerentiiTerentius Afer, Publius v195-v159Zarotto, Antonio1476NaN23. Feb. 1476; [1476.02.23]MailandNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
22642Ink 9.F.51460328-10Biblia ; Interpretationes Hebraicorum nominumNaNWild, Leonhard1481NaN1481VenedigNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

23 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " Signatur Barcode \n", + "22620 Ink 2.D.15 +Z158717306 \\\n", + "22621 Ink 3.H.31 +Z173262908 \n", + "22622 Ink 3.H.8 +Z158731704 \n", + "22623 Ink 4.A.2 1638836-20 \n", + "22624 Ink 4.C.3 1605764-10 \n", + "22625 Ink 4.E.4 +Z158736209 \n", + "22626 Ink 4.E.9 +Z158719005 \n", + "22627 Ink 4.G.19 +Z158745004 \n", + "22628 Ink 4.H.44 +Z15872590X \n", + "22629 Ink 4.H.45 1444073-10 \n", + "22630 Ink 4.H.58 +Z158747906 \n", + "22631 Ink 5.B.10 +Z158733907 \n", + "22632 Ink 5.C.12 1571736-10 \n", + "22633 Ink 5.C.9 +Z173284400 \n", + "22634 Ink 5.E.4 +Z158727300 \n", + "22635 Ink 6.B.2 1639891-10 \n", + "22636 Ink 6.E.11 +Z158731601 \n", + "22637 Ink 7.D.16 +Z95540203 \n", + "22638 Ink 7.E.1 1566517-10 \n", + "22639 Ink 8.E.26 +Z158726101 \n", + "22640 Ink 9.D.5 +Z35095803 \n", + "22641 Ink 9.F.22 +Z96101306 \n", + "22642 Ink 9.F.5 1460328-10 \n", + "\n", + " Titel \n", + "22620 Ars memorandi per figuras evangelistarum \\\n", + "22621 De consuetudinibus et conditionibus orientaliu... \n", + "22622 Herbarium Apulei Mit Widmungsbrief des Drucker... \n", + "22623 Biblia Mit Glossa ordinaria und Interlinearglosse \n", + "22624 Biblia ; Menardus Monachus: Generalis et compe... \n", + "22625 Elegantiae linguae latinae Mit Brief des Autor... \n", + "22626 Hypnerotomachia Poliphili Mit lat. Widmungsbri... \n", + "22627 Biblia ; Interpretationes Hebraicorum nominum \n", + "22628 Concordantia astronomie cum theologia. Concord... \n", + "22629 Biblia ; Interpretationes Hebraicorum nominum \n", + "22630 Coronica del cid ruy diaz \n", + "22631 Biblia Übers. aus dem Lateinischen. Mit ... \n", + "22632 Biblia \n", + "22633 Historiae Romanae decades Mit den Periochae. M... \n", + "22634 Decamerone \n", + "22635 Biblia Übers. aus dem Lateinischen \n", + "22636 De materia medica Mit Glossen und Zusät... \n", + "22637 Mathesis Mit Widmungsbrief an Kardinal Ippolit... \n", + "22638 Biblia Mit Postilla litteralis von Nicolaus de... 
\n", + "22639 La nef des fols du monde Aus dem Lat. des Jako... \n", + "22640 Thesaurus Cornu copiae et Horti Adonidis " ] @@ -4130,6 +4130,11 @@ } ], "source": [ + "from matplotlib import rc\n", + "rc('font',**{'family':'serif','serif':['Palatino'], 'size': 16})\n", + "rc('text', usetex=True)\n", + "\n", + "\n", "classes = ['A', 'B', 'C', 'N']\n", "fp_d = {\n", " 'A': A_fp,\n", @@ -4156,9 +4161,9 @@ " ax = plt.subplot(1, 4, i+1)\n", " ax.hist(hist_data[i], bins=60, log=True, stacked=True, label=labels[i])\n", " ax.set_title(f\"Type {classes[i]}: {(both_full['prediction'] == classes[i]).sum()} predictions\")\n", - " ax.set_xlabel(f'p_{classes[i]}')\n", - " if i > 0:\n", - " ax.set_ylabel('')\n", + " ax.set_xlabel(f'$p_\\mathrm {classes[i]}$')\n", + " if i == 0:\n", + " ax.set_ylabel('Frequency')\n", " if i < 3:\n", " ax.legend()\n", "\n", @@ -4468,8 +4473,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "c1f9d79e-979d-434c-bec0-33e45b40c822", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ABO_df['filename'].str.contains('Z166436806').sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28436237-75c6-45d5-8932-31ce37781da6", "metadata": {}, "outputs": [], "source": [] diff --git a/Notebooks/Color_classifier.ipynb b/Notebooks/Color_classifier.ipynb index 725a38d981e40455cdc314ea3bc551aac2480f84..14fca07196c55e2d50756cde9f54379d9db2e3a0 100644 --- a/Notebooks/Color_classifier.ipynb +++ b/Notebooks/Color_classifier.ipynb @@ -11,7 +11,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 2, @@ -1604,9 +1604,11 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 3, "id": "d12e2f21-50ba-402c-9ddd-672a3ec3e0d9", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "combined_df = 
pd.read_csv('data/predictions/old/best_color_combined_predictions.csv')" @@ -1896,7 +1898,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 4, "id": "4aa3fbc3-662d-4aaa-b90d-0324d4851f8e", "metadata": { "tags": [] @@ -2253,7 +2255,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 5, "id": "d5ca0f92-298e-4a54-a569-6861bcb0a3c8", "metadata": { "tags": [] @@ -2287,7 +2289,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 16, "id": "6aec4623-719e-49ef-8bd9-b38518d8a8fc", "metadata": { "tags": [] @@ -2295,7 +2297,7 @@ "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -2305,6 +2307,10 @@ } ], "source": [ + "from matplotlib import rc\n", + "rc('font',**{'family':'serif','serif':['Palatino'], 'size': 16})\n", + "rc('text', usetex=True)\n", + "\n", "colors = ('red', 'yellow', 'blue')\n", "\n", "nicer_colors = {\n", @@ -2319,7 +2325,7 @@ " ax = plt.subplot(1, 3, i+1)\n", " combined_df[combined_df['man_color'] == colors[i]][f'p_{colors[i]}'].plot.hist(bins=60, log=True, color=nicer_colors[colors[i]])\n", " ax.set_title(f\"{colors[i].capitalize()} color: {(combined_df['man_color'] == colors[i]).sum()} predictions\")\n", - " ax.set_xlabel(f'p_{colors[i]}')\n", + " ax.set_xlabel(\"$p_\\mathrm{\" + colors[i] + \"}$\")\n", " if i > 0:\n", " ax.set_ylabel('')\n", "\n", @@ -2329,7 +2335,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 14, "id": "faacafb5-d11d-4be8-9f83-54ec142d0db5", "metadata": { "tags": [] @@ -2409,7 +2415,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 18, "id": "80219328-80ac-4a54-979b-b67e85e3dacd", "metadata": { "tags": [] @@ -2417,7 +2423,7 @@ "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -2427,6 +2433,8 @@ } ], "source": [ + "rc('font',**{'family':'serif','serif':['Palatino'], 'size': 14})\n", + "\n", "types = ('Type A', 'Type B', 'Type C')\n", "colors = ('Red', 'Yellow', 'Blue')\n", "color_counts = {\n", diff --git a/Notebooks/TEI_Export.ipynb b/Notebooks/TEI_Export.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..be30e8c1176637764cafb76cf4724249e07bab4a --- /dev/null +++ b/Notebooks/TEI_Export.ipynb @@ -0,0 +1,1537 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "8a7c8849-b1a3-4f88-b534-cec8b4c13f09", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -r requirements.txt -q" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5b24e324-6659-482d-8d82-39c1d604f0d3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from bs4 import BeautifulSoup as bs\n", + "import cv2 as cv\n", + "import re\n", + "import pathlib\n", + "import numpy as np\n", + "import json\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.mixture import GaussianMixture\n", + "# from scipy.optimize import curve_fit\n", + "# import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "f7535107-5d1e-43d9-b0ed-077c306a73c1", + "metadata": {}, + "source": [ + "# Klassenstruktur für eine Seite und ihre Untereinheiten" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "97ce5633-934c-4fbb-a71f-7e4710ff4211", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def prepare_string(s):\n", + " new = s.lower()\n", + " new = re.sub(r'[àáâãäå]', 'a', new)\n", + " new = re.sub(r'[èéêë]', 'e', new)\n", + " new = re.sub(r'[ìíîï]', 'i', new)\n", + " new = re.sub(r'[òóôõö]', 'o', new)\n", + " new = re.sub(r'[ùúûü]', 'u', new)\n", + " new = re.sub('\\.|,|;|\\s', '', new)\n", + " 
return new\n", + "\n", + "class Page:\n", + " def __init__(self, page_xml_string, tei_xml_string):\n", + " self.XMLSources = XMLSources(page_xml_string, tei_xml_string)\n", + " self.Header = Header(self.XMLSources)\n", + " self.Entries = Entries(self.Header, self.XMLSources)\n", + " self.RemainingLines = RemainingLines(self.Header, self.Entries, self.XMLSources)\n", + " \n", + " def __str__(self):\n", + " return f'Filename: {self.XMLSources.PageXML.find_all(\"Page\")[0].attrs[\"imageFilename\"]}\\n' \\\n", + " f'{self.Header}'\n", + " \n", + " def get_filename(self):\n", + " return self.XMLSources.PageXML.find_all(\"Page\")[0].attrs[\"imageFilename\"]\n", + "\n", + "class Line:\n", + " def __init__(self, xml_line):\n", + " self.ID = xml_line.attrs['id']\n", + " polygon_string = xml_line.Coords.attrs['points']\n", + " polygon_tuples = polygon_string.split(' ')\n", + " polygon = [tup.split(',') for tup in polygon_tuples]\n", + " self.BoundingPolygon = np.array(polygon, dtype='int')\n", + " baseline_string = xml_line.Baseline.attrs['points']\n", + " baseline_tuples = baseline_string.split(' ')\n", + " baseline = [tup.split(',') for tup in baseline_tuples]\n", + " self.Baseline = np.array(baseline, dtype='int')\n", + " self.Text = xml_line.find_all('TextEquiv')[-1].Unicode.text\n", + " self.Tags = xml_line.attrs['custom']\n", + " \n", + " def __str__(self):\n", + " # return f'ID: {self.ID}, text: {self.Text}\\n'\n", + " return self.Text\n", + " \n", + " def get_polygon_bb(self):\n", + " pass\n", + "\n", + "class Header:\n", + " def __init__(self, XMLSources):\n", + " self.XMLSources = XMLSources\n", + " self.PageNumber = self.get_page_number()\n", + " self.Format = self.get_format()\n", + " self.Category = self.get_category()\n", + " self.Subcategory = self.get_subcategory()\n", + " self.isValid = self.check_validity()\n", + " \n", + " def __str__(self):\n", + " # header_string = f'Has valid header: {self.isValid}\\n'\n", + " header_string = ''\n", + " if 
self.isValid:\n", + " header_string += f'Page number(s): {\"\".join(self.PageNumber[0])}\\n' \\\n", + " f'Format: {\" \".join(self.Format[0])}\\n' \\\n", + " f'Category: {\" \".join(self.Category[0])}\\n' \\\n", + " f'Subcategory: {\" \".join(self.Subcategory[0])}'\n", + " return header_string\n", + "\n", + " def get_all_ids(self):\n", + " lines = []\n", + " lines += self.PageNumber[1] + self.Format[1] + self.Category[1] + self.Subcategory[1]\n", + " ids = [l.ID for l in lines]\n", + " return ids\n", + " \n", + " def check_validity(self):\n", + " hasPageNumber = self.PageNumber[0]\n", + " # hasFormat = self.Format[0]\n", + " hasCategory = self.Category[0]\n", + " # hasSubcategory = self.Subcategory[0]\n", + " # check if a header line has more than one tag\n", + " # ids = self.get_all_ids()\n", + " # id_set = set(ids)\n", + " # if len(ids) != len(id_set):\n", + " # print('more than one tag for header lines')\n", + " categories, category_lines = self.Category\n", + " subcategory_line_IDs = [subcat_l.ID for subcat_l in self.Subcategory[1]]\n", + " for cat, cat_l in zip(categories, category_lines):\n", + " if cat_l.ID in subcategory_line_IDs:\n", + " category_lines.remove(cat_l)\n", + " categories.remove(cat)\n", + " self.Category = (categories, category_lines)\n", + " return bool(hasPageNumber and hasCategory)\n", + " \n", + " def get_width_height(self):\n", + " page = self.XMLSources.PageXML.find_all('Page')[0]\n", + " w, h = int(page.attrs['imageWidth']), int(page.attrs['imageHeight'])\n", + " return w, h\n", + "\n", + " def get_page_number(self):\n", + " page_regex = '^[IVX]+\\Z|^\\d+\\**\\Z|^\\d+⁎*\\Z|^[a-h]\\Z|^\\*+\\Z'\n", + " textlines = self.XMLSources.PageXML.find_all('TextLine')\n", + " w, h = self.get_width_height()\n", + " w_cut, h_cut = 0.75, 0.1\n", + " page_numbers = []\n", + " page_number_lines = []\n", + " for line in textlines:\n", + " l = Line(line)\n", + " avg_x, avg_y = np.mean(l.Baseline, axis=0)\n", + " text = l.Text\n", + " prep_text = 
re.sub('\\.|\\s', '', text)\n", + " m = re.match(page_regex, prep_text)\n", + " if m and (avg_y < h_cut * h) and ((avg_x > w_cut * w) or (avg_x < (1 - w_cut) * w)):\n", + " page_numbers.append(m.group(0))\n", + " page_number_lines.append(l)\n", + " return (page_numbers, page_number_lines)\n", + " \n", + " def get_format(self):\n", + " true_formats = ['infolio', 'inquarto', 'inoctavo', '&minforma']\n", + " index_to_format = {\n", + " 0: 'Folio',\n", + " 1: 'Quarto',\n", + " 2: 'Octavo und kleiner',\n", + " 3: 'Octavo und kleiner'\n", + " }\n", + " textlines = self.XMLSources.PageXML.find_all('TextLine')\n", + " w, h = self.get_width_height()\n", + " w_cut, h_cut = 0.75, 0.2\n", + " formats = []\n", + " format_lines = []\n", + " for line in textlines:\n", + " l = Line(line)\n", + " avg_x, avg_y = np.mean(l.Baseline, axis=0)\n", + " text = l.Text\n", + " cand_format = prepare_string(text)\n", + " if (cand_format in true_formats) and (avg_y < h_cut * h) and ((avg_x > w_cut * w) or (avg_x < (1 - w_cut) * w)):\n", + " # formats.append(index_to_format[true_formats.index(cand_format)])\n", + " formats.append(text)\n", + " format_lines.append(l)\n", + " return (formats, format_lines)\n", + " \n", + " def get_category(self):\n", + " true_categories = ['Theologia', 'Iurisprudentia', 'Philosophia', 'Historia Naturalis', 'Medicina', 'Mathematica', 'Artes', 'Grammatica', 'Rhetorica',\n", + " 'Poëtica', 'Philologia', 'Geographia', 'Chronologia', 'Historia Ecclesiastica', 'Historia Antiqua',\n", + " 'Historia Nova Europæ', 'Hist. Nova Europæ', 'Historia Nova extra Europam', 'Hist. Nova extra Europam', 'Miscellanea Historica', \n", + " 'Paralipomena historica', 'Codices Manuscripti', 'Imagines', 'Imagines Incisæ', 'Imagines Variæ Incisæ', 'Effigierum Incisar. Collectio',\n", + " 'Imaginum Delineatar. 
Collectio'\n", + " ]\n", + " textlines = self.XMLSources.PageXML.find_all('TextLine')\n", + " # w, h = self.get_width_height()\n", + " # w_cut, h_cut = 0.1, 0.1\n", + " categories = []\n", + " category_lines = []\n", + " for line in textlines:\n", + " l = Line(line)\n", + " # avg_x, avg_y = np.mean(l.Baseline, axis=0)\n", + " text = l.Text\n", + " # TODO: implement similarity comparison and location check\n", + " prepared_string = prepare_string(text)\n", + " prep_text = re.sub('\\d', '', prepared_string)\n", + " # if a 4-digit number was removed from the text we're probably looking at a line from an entry\n", + " if len(prepared_string) - len(prep_text) > 3:\n", + " continue\n", + " prep_text_in_cats = [prep_text in prepare_string(cat) for cat in true_categories]\n", + " # code snippet for location check:\n", + " # ... and (avg_y < h_cut * h) and (abs(avg_x - w/2) < w_cut * w)\n", + " if sum(prep_text_in_cats) and len(prep_text) > 3:\n", + " categories.append(text)\n", + " category_lines.append(l)\n", + " return (categories, category_lines)\n", + " \n", + " def get_subcategory(self):\n", + " true_categories = ['Theologia', 'Iurisprudentia', 'Philosophia', 'Historia Naturalis', 'Medicina', 'Mathematica', 'Artes', 'Grammatica', 'Rhetorica',\n", + " 'Poëtica', 'Philologia', 'Geographia', 'Chronologia', 'Historia Ecclesiastica', 'Historia Antiqua',\n", + " 'Historia Nova Europæ', 'Hist. Nova Europæ', 'Historia Nova extra Europam', 'Hist. Nova extra Europam', 'Miscellanea Historica', \n", + " 'Paralipomena historica', 'Codices Manuscripti', 'Imagines', 'Imagines Incisæ', 'Imagines Variæ Incisæ', 'Effigierum Incisar. Collectio',\n", + " 'Imaginum Delineatar. Collectio'\n", + " ]\n", + " prep_cats = [prepare_string(cat) for cat in true_categories]\n", + " true_subcategories = ['Textus & Versiones Sacræ Scripturæ',\n", + " 'Textus & Versiones S. Scripturæ',\n", + " 'Interpretes, Commentatores, & Paraphrastæ in S. 
Scripturam',\n", + " 'Critici Sacri',\n", + " 'Liturgiæ Sacræ',\n", + " 'Concilia & ad Eadem pertinentia',\n", + " 'Concilia, & quæ ad eamdem Rem pertinent.',\n", + " 'Concilia, & quæ ad eamd. Rem pertinent.',\n", + " 'SS. Patres Græci',\n", + " 'SS. Patres Latini',\n", + " 'Collectiones & Excerpta SS. Patrum & Scriptorum Ecclesiasticorum',\n", + " 'Collectiones ac Excerpta SS. PP. & Scriptorum Ecclesiasticorum',\n", + " 'Collectiones & Excerpta SS. PP. & Scriptor. Ecclesiasticorum',\n", + " 'Collectiones ac Excerpta SS. PP. & Scriptor. Ecclesiasticorum',\n", + " 'Theologi Scholastici',\n", + " 'Theologi Morales, Ascetici, & Parænetici',\n", + " 'Theologi Polemici',\n", + " 'Theologi Heterodoxi',\n", + " 'Canonum collectores & Canonistæ; Epistolæ decretales; necnon de Hierarchiâ & Rebus Ecclesiasticis',\n", + " 'Canonum collectores, & Canonistæ. Epistolæ decretal. & Bullæ; necnon de Hierarchiâ & Reb. Ecclesiast.',\n", + " 'Canonum collectores, & Canonistæ. Epistolæ decretal. ac Bullæ; necnon de Hierarchiâ & Reb. Ecclesiast.',\n", + " 'Canonum collectores & Canonistæ; Epist. decretal. & Bullæ; necnon de Hierarchiâ & Reb. Ecclesiast.',\n", + " 'De Potestate Spirituali & Temporali Pontificis Max; ac de Potestate Regiâ in Regimine Ecclesiastico',\n", + " 'De Potestate Spirituali & Temporali Pontific. Max. ac de Regiâ Potestate in Regimine Ecclesiast.',\n", + " 'Ius Civile, Publicum, & Municipale',\n", + " 'Philosophi Veteres & novi cum suis Interpretibus, & Tractatus Philosophici generales',\n", + " 'Philosophi Veteres & novi cum suis Interpretibus necnon Tractatus Philosophici generales',\n", + " 'Philosophi veteres & novi cum suis Interpret. necnon Tractatus Philosophici general.',\n", + " 'Philosophi veteres ac novi cum suis Interpretib. 
nec non Tractatus Philosophici generales',\n", + " 'Philosophi veteres ac novi cum suis Interpretibus; necnon Tractatus Philosophici generales',\n", + " 'Logici, Morales, & Politici',\n", + " 'Metaphysici, Physici, & Philosophia Arcana',\n", + " 'Tractatus Vniversales Hist. Nat. cum miscellan.',\n", + " 'Tractatus Vniversales Historiæ Natural. cum Miscellaneis',\n", + " 'Tractatus Vniversales Historiæ Naturalis cum Miscellaneis',\n", + " 'Metallorum, Fossilium, Gemmarum, Lapidum, Aquarum, Conchiliorum, &c. Historia',\n", + " 'Metallorum, Fossilium, Gemmar. Lapidum, Aquarum, Conchilior. &c.',\n", + " 'De Agriculturâ & Re Rusticâ',\n", + " 'Plantarum Historia',\n", + " 'Plantarum, Arborum, Fruticum & Florum',\n", + " 'Animalium Historia',\n", + " 'Medicina; Chirurgia, Anatomia & Pharmac.',\n", + " 'Vbi etiam Chirurgia, Anatomia, Pharmacia & Chimia',\n", + " 'Vbi etiam Chirurgia, Anatomia, Pharmacia, necnon Chimia',\n", + " 'Tractatus generales; Arithmetica, Geometria & Musica',\n", + " 'Astronomia; Astrologia; Gnomonica; & optica',\n", + " 'Cosmographia; Astronomia; Astrologia; Gnomonica, Optica',\n", + " 'Tractatus Mechanici',\n", + " 'Ars Militaris, cum tractatibus de Tormentis Bellicis',\n", + " 'Ars Militaris, & de Bellicis Tormentis Tractat.',\n", + " 'Ars Militaris, & de Bellicis Machinis Tractat.',\n", + " 'Ars Hydraulica & Nautica',\n", + " 'Ars Delineatoria, Pictoria, & Sculptoria',\n", + " 'Ars Delineatoria, Pictoria, Sculptoria, & Chalcographica',\n", + " 'Architectura Civilis',\n", + " 'Architectura Militaris, & Arcium Icones',\n", + " 'Architectura Militar. vbi etiam Arcium Icones',\n", + " 'Architectura Militaris, vbi etiam Arcium Icones',\n", + " 'Ars Graphica, Typographica, Gymnastica, & Aliæ Artes',\n", + " 'Tractatus Grammatici general. cum Lexicograph.',\n", + " 'Tractatus Grammatici cum Lexicographis',\n", + " 'Tractat. 
Grammatici cum Lexicographis',\n", + " 'Rhetores, Seu Artis Rhetoricæ Scriptores',\n", + " 'Oratores Græci, & Latini Antiqui',\n", + " 'Oratores Latini recentiores, Gallici, & Italici',\n", + " 'De Arte Poëticâ, & Poëtis Scriptores',\n", + " 'Scriptores de Arte Poëticâ & Poëtis ',\n", + " 'Poëtæ Græci',\n", + " 'Poëtæ Latini Antiqui',\n", + " 'Poëtæ Latini Recentiores, cum Germanicis',\n", + " 'Poëtæ Gallici unà cum Dramaticis',\n", + " 'Poëtæ Gallici vnà cum Dramaticis',\n", + " 'Poëtæ Gallici cum Dramaticis',\n", + " 'Poëtæ Gallici unà cum Drammaticis',\n", + " 'Poëtæ Gallici unà cum Dramatic.',\n", + " 'Poëtæ Italici, & Hispanici, unà cum Dramat.',\n", + " 'Poëtæ Italici, Hispanici, & Lusitanici, unà cum Dramaticis',\n", + " 'Poëtæ Italici, Hispan. & Lusitanici cum Dramatic.',\n", + " 'Mythologi, & Fabularum Scriptores',\n", + " 'Poësis Prosaïca, Seu Facetiarum, Narrationum & Historiarum Eroticarum Scriptores',\n", + " 'Poësis Prosaïca, Sive Facetiarum, Narrationum, & Historiarum Eroticarum Scriptores',\n", + " 'Poësis Prosaïca, Sive Facetiarum, Narrationum, & Historiarum Eroticar. Scriptores.',\n", + " 'Operum Græcor. varij Argumenti Collectiones',\n", + " 'Operum Græcorum varij Argumenti Collectiones',\n", + " 'Operum Græcorum varij Argumenti Scriptores',\n", + " 'Operum Latinor. varij Argumenti Collectiones',\n", + " 'Operum Latinorum varij Argumenti Collectiones',\n", + " 'Operum Gallicorum & Italicorum Argumenti varij Collectiones',\n", + " 'Operum Gallicorum & Italicorum varij Argum. 
Collectiones',\n", + " 'Operum Gallicorum & Italicorum varij Argumenti Collectiones',\n", + " 'Critici',\n", + " 'Epistolographi Græci & Latini',\n", + " 'Epistolographi Gallici, & Italici',\n", + " 'Gnomici, Seu Sententiæ, Apophtegmata, Adagia, Dictaque moralia & critica',\n", + " 'Satyræ, Apologiæ, ac dissertationes variæ',\n", + " 'Hieroglyphica, Seu Symbola, Emblemata, &c.',\n", + " 'Dialogi, & Colloquia',\n", + " 'Geographi veteres & novi',\n", + " 'Geographi veteres ac Novi',\n", + " 'Descriptiones & Tabulæ Geographicæ, Chorographicæ, ac Topographicæ',\n", + " 'Descript. & Tabulæ Geographicæ, Chorograph., ac Topographicæ',\n", + " 'Descript. & Tabulæ Geographicæ, Chorographicæ, ac Topographicæ',\n", + " 'Descript. & Tabulæ Geographicæ, Chorograph; ac Topographicæ',\n", + " 'Peregrinationes, & Navigationes',\n", + " 'Peregrinationes, Navigationes, & Itinera',\n", + " 'Chronologia Technica & Historica',\n", + " 'Chronographi, Seu Chronica, & Historiæ Vniversal',\n", + " 'Chronographi, seu Chronica, & Historiæ Vniversales',\n", + " 'Historia generalis ante & post Christum natum',\n", + " 'Historiæ general. ante & post Christum natum',\n", + " 'Hist. general. ante & post Christum natum',\n", + " 'Historia Eccles. ante & post Christum natum',\n", + " 'Historia Eccl. ante & post Christum nat.',\n", + " 'Historia Ecclesiarum Europæ cum Episcopor. Vitis',\n", + " 'Histor. Ecclesiarum Europæ cum Episcopor. vitis',\n", + " 'Historia Ecclesiar. Europæ cum Episcopor. Vitis',\n", + " 'Historia Ecclesiarum Europæ, cum Episcop. Vitis',\n", + " 'Historia Ecclesiar. Europæ cum Episcoporum Vitis',\n", + " 'Historia Ecclesiarum extra Europam, & Missionum ad Fidei Propagationem',\n", + " 'Historia Ecclesiar. extra Europam, & Missionum ad Fidei Propagationem',\n", + " 'Historia Ecclesiarum extra Europ. & Missionum ad Fidei Propagationem',\n", + " 'Historia Ecclesiar. extra Europ. & Missionum ad Fidei Propagationem',\n", + " 'Vitæ Sanctorum, Martyrologia, & Hist. de Locis Sacr. 
SS. Reliquiis, Imaginibus miraculosis, &c.',\n", + " 'Vitæ SS, Martyrologia; & Hist. de Locis Sacris, Reliquiis, Imaginibus miraculosis, &c.',\n", + " 'Martyrologia, Vitæ SS. & Hist. de Locis Sacr., Reliquiis, Imaginibus miraculosis, &c',\n", + " 'Historia Summorum Pontificum & Cardinalium',\n", + " 'Historia Ordinum Religiosorum, & Coenobiorum cum Institutorum, Fundatorum, &c. Vitis',\n", + " 'Historia Ordinum Religiosor. & Monasterior. cum Institutorum, Fundator. &c Vitis',\n", + " 'Historia Ordinum Religiosorum, & Monaster. cum Institutorum, Fundatorum, &c. Vitis',\n", + " 'Historia Ordinum Religiosorum & Monasteriorum, cum Institutorum, Fundatorum, &c Vitis.',\n", + " 'Hist. Ordinum Religiosorum, & Monasterior. cum Institutorum, Fundatorum, &c Vitis.',\n", + " 'Historia Ordinum Religiosor. & Monasteriorum, cum Institutorum, Fundatorum, &c Vitis.',\n", + " 'Historia Ordinum Militarium & Equestrium',\n", + " 'Historia Hæresium & Hæreticorum, necnon de Inquisitione Tractatus',\n", + " 'Historia Judaïca, Assyria, Medorum, Persica vetus, Macedonica, Babylonica, Trojana, &c.',\n", + " 'Historia Judaïca, Assyria, Medorum, Persica vet., Macedonica, Babylonica, Trojana, &c.',\n", + " 'Historia Græca generalis & Singularis, cum Atticâ, & Insularum descriptionibus',\n", + " 'Historia Romana generalis, seu ab Vrbe conditâ. & Imperij Romani notitia',\n", + " 'Historia Romana Sæculorum aliq. præsertim Imperatorum temporibus',\n", + " 'Historia Romana Sæculorum aliquot, præsertim Imperatorum temporibus',\n", + " 'Historia Romana Sæculorum aliquot, præsertim Imperatorum tempore',\n", + " 'Historia Imperij Oriental. Seu Bysantina',\n", + " 'Historia Imperij Orientalis, Seu Bysantina',\n", + " 'Historia Imperii Orientalis, Seu Bysantina',\n", + " 'Historia Europæa Vniversalis',\n", + " 'Italiæ universæ Historia & Notitia',\n", + " 'Italiæ Vniversæ Historia & Notitia',\n", + " 'Hist. 
Latij, Romæ modernæ, ac Status Ecclesiastici, ubi etiam Parmensis, Vrbinensis & Ferrariensis',\n", + " 'Historia Latij, Romæ modernæ, ac Statûs Ecclesiastici',\n", + " 'Historia Neapolitana generalis & Singularis Historia Sicula generalis & Singularis',\n", + " 'Hist. Neapolitana & Sicula gener. & Singul.',\n", + " 'Hist. Neapolitana, & Sicula gener. ac Singul.',\n", + " 'Historia Neapolit. & Sicula gener. ac Sing.',\n", + " 'Historia Veneta generalis & Singularis',\n", + " 'Historia Veneta generalis & Singular.',\n", + " 'Historia Veneta generalis & Singul.',\n", + " 'Historia Florentina generalis & Singularis',\n", + " 'Historia Mediolanensis, Mantuana, & Monferrat',\n", + " 'Historia Mediolanens. Mantuana, & Monferrat',\n", + " 'Hist. Mediolanens, Mantuana, & Monferratens.',\n", + " 'Historia Pedemontana & Sabaudica',\n", + " 'Hist. Sabaudica & Pedemont. gener. ac Singul.',\n", + " 'Hist. Sabaudica & Pedemont. gener. & Singul',\n", + " 'Historia Genuensis & Ligustica; Corsica, Sardinica, Tremitensis, & Melitensis',\n", + " 'Hist. Genuensis & Ligustica; Corsica; Sardinica; Ragusina & Melitens.',\n", + " 'Hist. Genuensis ac Ligustica; Corsica; Sardinica; Ragusina & Melitens.',\n", + " 'Hist. Genuensis ac Ligustica; Corsica; Sardin; Ragusina & Melitens.',\n", + " 'Histor. Genuens. ac Ligustica, Corsica, Sardinic; Ragusina & Melitensis',\n", + " 'Franciæ Historia generalis & Notitia, ubi etiam Historia Gallica Vetus',\n", + " 'Franciæ Historia generalis & Notitia, necnon Historia Gallica vetus',\n", + " 'Historia Franciæ aliquot temporum, ubi etiam Regum Historiæ, Libelli memoriales, Vitae, aliaque ad eamdem Historiam pertinentia',\n", + " 'Hist. Franciæ Singul. vel aliquot temporum; Vitæ Regum, & aliæ; Libelli Memoriales; aliaque',\n", + " 'Hist. Franciæ Singul. vel aliquot temporum; Vitæ Regum, & aliæ; Libelli Memoriales; alteraque',\n", + " 'Hist. Franciæ Singul. vel aliquot tempor; Vitæ Regum, & aliæ; Libelli Memoriales; aliaque',\n", + " 'Hist. Franciæ Singul. 
vel aliquot tempor; Vitæ Regum & alior; Libelli Memoriales; aliaque',\n", + " 'Hist. Franciæ Singul. vel aliquot tempor; Vitæ Regum ac alior; Libelli Memoriales; & alia.',\n", + " 'Hist. Franciæ Singul. vel aliquot tempor; Vitæ Regum & aliorum; Libelli Memoriales; aliaque',\n", + " 'Hist. Franciæ Singul. vel aliquot temp; Vitæ Regum & alior; Libelli Memoriales; aliaque',\n", + " 'Historia Franciæ Sing. vel aliq. tempor; Vitæ Regum, & aliæ; Libelli Memoriales; aliaque',\n", + " 'Historia Provinciarum & Vrbium Franciæ',\n", + " 'Historia Provinciarum & Vrbium Galliæ',\n", + " 'Historia Germaniæ generalis & Singularis seu aliquot temporum, cum Imperatorum Vitis, &c',\n", + " 'Historia Germaniæ general. & aliquot tempor; cum Imperatorum Vitis; &c',\n", + " 'Historia Germaniæ generalis & aliquot tempor, cum Imperatorum Vitis; &c',\n", + " 'Historia & descriptio Regionum Germaniæ',\n", + " 'Descriptio & Historia Regionum Germaniæ',\n", + " 'Historia Belgij Catholici generalis & Singular.',\n", + " 'Historia general. & Singul. Belgij Catholici',\n", + " 'Historia general. & Singular. Belgij Regij',\n", + " 'Historia general. & Singularis Belgij Regij',\n", + " 'Historia Generalis & Singularis Belgij Regij',\n", + " 'Historia generalis & Singular. Belgij Regij',\n", + " 'Historia Belgij foederati generalis & Singular',\n", + " 'Historia generalis & Singular. Belgij Foederati',\n", + " 'Historia generalis & Singul. Belgij Foederati',\n", + " 'Historia Lotharingica, Helvetica, & Populorum consinium',\n", + " 'Historia Lotharingica, Helvetica, & Rhætica',\n", + " 'Historia Lotharingica, Helvetica & Rhoetica',\n", + " 'Historia Hispanica generalis & aliquot temporum cum Regum Vitis, &c',\n", + " 'Historia Hispanica general. & aliquot tempor; Vitæ Regum & alia',\n", + " 'Historia Hispanica general. 
& aliquot tempor; Vitæ Regum; aliaque',\n", + " 'Historia Regnorum, seu Provinciarum & Vrbium Hispaniæ',\n", + " 'Hispaniæ Regnorum Seu Provinciar; & Vrbium Historia',\n", + " 'Hispaniæ Regnorum Seu Provinciar; & Vrbium Historia',\n", + " 'Hispaniæ Regnorum vel Provinciarum & Vrbium Historia',\n", + " 'Hispaniæ Regnorum vel Provinciar. ac Vrbium Historia',\n", + " 'Historia Lusitanica generalis & Singularis',\n", + " 'Historia Lusitaniæ generalis & Singularis',\n", + " 'Historica Anglica, Scotica, & Hybernica, general. & Singularis',\n", + " 'Angliæ, Scotiæ, ac Hyberniæ Hist. generalis & Singularis',\n", + " 'Angliæ, Scotiæ, ac Hiberniæ Histor. generalis & Singularis',\n", + " 'Historia Septentrionalis general. vetus & nova',\n", + " 'Historia Septentrionalis generalis vetus & nova',\n", + " 'Regionum Septentrion. nempe Daniæ, Norweg. Lapponiæ, Islandiæ, Sueciæ, ac Livoniæ Historia',\n", + " 'Regionum Septentrion. nempe Daniæ, Norwegiæ, Lapponiæ, Islandiæ, Sueciæ, ac Livoniæ Histor.',\n", + " 'Historia Regionum Septentrionalium, Scilicet Daniæ, Norwegiæ, Lapponiæ, Islandiæ Groënlandiæ, Sueciæ, ac Livoniæ',\n", + " 'Regionum Septentrion. Sive Moscoviæ, Poloniæ, Lithuaniæ, ac Prussiæ Regiæ Hist.',\n", + " 'Regionum Septentrion. sive Moscoviæ, Poloniæ, Lithuaniæ, ac Prussiæ Regiæ Historia',\n", + " 'Regionum Septentrion. seu Moscoviæ, Poloniæ, Lithuaniæ ac Prussiæ Regiæ Historia',\n", + " 'Historiæ Regionum Septentrion. seu Moscoviæ, Poloniæ, Lithuaniæ, ac Prussiæ Regiæ',\n", + " 'Historia Regionum Septentrionalium, nempe Moscoviæ, Poloniæ, Lithuaniæ, ac Prussiæ Reg.',\n", + " 'Historia Regionum Septentrionalium, nempè Hungariæ, Sclavoniæ, Croatiæ, Dalmatiæ, Transylvaniæ, Moldaviæ, ac Valachiæ',\n", + " 'Regionum Septentrionalium Sive Hungariæ, Sclavoniæ, Croatiæ, Dalmatiæ, Transylvaniæ, Moldaviæ, ac Valachiæ Historia',\n", + " 'Regionum Septentrion. 
seu Hungariæ, Sclavoniæ, Croatiæ, Dalmatiæ, Transylvaniæ, Moldaviæ, ac Valachiæ Historia',\n", + " 'Regionum Septentrion. Seu Hungariæ, Sclavon; Croatiæ, Dalmatiæ, Transylvaniæ, Moldaviæ, ac Valachiæ Historia',\n", + " 'Regionum Septentr. Seu Hungariæ, Sclavoniæ, Croatiæ, Dalmatiæ, Transylvaniæ, Moldaviæ, ac Valachiæ Historia',\n", + " 'Historia Orientalis generalis, cum Arabicâ, Sarracenicâ, & Turcicâ',\n", + " 'Historia Orientalis general. & Singul. Sive Arabica, Sarracenica, & Turcica',\n", + " 'Historia Orientalis general. & Singular. sive Arabica, Sarracenica, & Turcica',\n", + " 'Historia Asiatica generalis & Singularis',\n", + " 'Historia Africana generalis & Singularis',\n", + " 'Historia Americana generalis & Singularis',\n", + " 'Historiæ variæ, Dictionaria historica, & acta Publica',\n", + " 'Historiæ diversæ; Dictionaria Historica, & Acta Publica',\n", + " 'Historia Heraldica, necnon de Nobilitate ac Nobilib.',\n", + " 'Tractatus Heroïci ac Heraldici, seu de Nobilitate ac Insignibus',\n", + " 'Tractatus Heroici & Heraldici; seu de Nobilitate ac Insignibus',\n", + " 'Historia Genealogica',\n", + " 'Genealogiæ',\n", + " 'Miscellanea Antiquaria, seu Antiquitatum collectiones, & Museorum descriptions',\n", + " 'Antiquitatum collectiones cum Museis',\n", + " 'Antiquitatis Monumenta varia',\n", + " 'Antiquitatis Monumenta, seu Ædificia, Amphith., Obelisci, Statuæ, Gemmæ, Lucernæ, Vasa, &c',\n", + " 'Antiquitatis Monumenta, seu Ædificia, Amphitheatr., Obelisci, Statuæ, Gemmæ, Lucernæ, Vasa, &c',\n", + " 'Ritus Veterum, Seu de Rebus eorum Sacris, Civilibus, Militaribus, ac Domesticis',\n", + " 'Ritus Veterum; seu de Reb. eorum Sacris, Civilib; Militaribus ac Domesticis',\n", + " 'Ritus Veterum, seu de Rebus eor. Sacris, Civilibus, Militaribus ac Domesticis',\n", + " 'Inscriptiones ex Lapidibus excerptæ',\n", + " 'Inscriptiones antiquæ ex Lapidibus excerptæ',\n", + " 'Historia de Numismatibus & Re Monetariâ',\n", + " 'Pomparum Triumphalium ac Ludicrar. 
Historia',\n", + " 'Pompæ, ac Ceremoniæ Triumphales & Ludicræ',\n", + " 'Pomparum Exequialium Historia',\n", + " 'Pompæ, ac Ceremoniæ Exequiales',\n", + " 'Historia Litteraria & Bibliographica cum Biblioth. Catalogis',\n", + " 'De Re Litterariâ & Bibliographicâ, cum Catalogis Bibliothecarum',\n", + " 'De Re Litteriâ & Bibliographicâ cum Catalogis Bibliothecarum',\n", + " 'Historia Academiarum, Scholarum, Vniversitatum, & Collegiorum',\n", + " 'De Academiis, Vniversitatibus, Scholis, Colleg. &c',\n", + " 'Icones, Vitæ, ac Elogia Illustrium Veterum',\n", + " 'Icones, Vitæ, ac Elogia Principum, & Illustrium Militiâ, Dignitatibusque Recentiorum',\n", + " 'Icones & Vitæ Recentiorum Principum, & Militiâ Dignitatibusque Illustr.',\n", + " 'Icones, Vitæ, ac Elogia Scientiis & Artibus Illustrium',\n", + " 'Icones ac Vitæ Recentiorum Principum, & Militiâ Dignitatibusque Illustr.',\n", + " 'Icones ac Vitæ Recentior. Principum, & Militiâ Dignitatibusque Illustrium',\n", + " 'Icones & Vitæ Veterum Ill. Græcor. ac Romanor.',\n", + " 'Icones & Vitæ Veterum Ill. Græcorum ac Roman.',\n", + " 'Icones & Vitæ Veterum Ill. Græcor. 
& Roman.',\n", + " 'Icones ac Vitæ Scientiis & Artibus Illustrium',\n", + " 'Opera integra Pictorum, Sculptorum, & Chalcographorum',\n", + " 'Opera Integra Pictorum, Sculptorum & Cælatorum Ecole de Florence',\n", + " 'Opera Integra Pictorum, Sculptorum, & Cælator',\n", + " 'Imaginum Romæ in æs incisarum Collectio',\n", + " 'Imaginum Romæ Incisarum Collectio',\n", + " 'Imaginum Romæ Collectio',\n", + " 'Imaginum Pynacothecæ Regiæ Collectio',\n", + " 'Imaginum variarum Collectio',\n", + " 'Imagines Variæ Incisæ',\n", + " 'Effigierum in æs incisarum Collectio',\n", + " 'Imaginum delineatarum Collectio'\n", + " ]\n", + " textlines = self.XMLSources.PageXML.find_all('TextLine')\n", + " subcategories = []\n", + " subcategory_lines = []\n", + " for line in textlines:\n", + " l = Line(line)\n", + " text = l.Text\n", + " prepared_string = prepare_string(text)\n", + " prep_text = re.sub('\\d', '', prepared_string)\n", + " # if a 4-digit number was removed from the text we're probably looking at a line from an entry\n", + " if len(prepared_string) - len(prep_text) > 3:\n", + " continue\n", + " # if there is no subcategory\n", + " # if prep_text == 'codicesmanuscripti' \\\n", + " # or prep_text == 'imaginesvariæincisæ' \\\n", + " # or prep_text =='effigierumincisarcollectio' \\\n", + " # or prep_text == 'imaginumdelineatarcollectio':\n", + " # subcategories.append(text)\n", + " # subcategory_lines.append(l)\n", + " # if category skip this line\n", + " if prep_text in prep_cats:\n", + " continue\n", + " prep_text_in_subcat = [prep_text in prepare_string(subcat) for subcat in true_subcategories]\n", + " # TODO: implement similarity comparison and location check\n", + " if sum(prep_text_in_subcat) and len(prep_text) > 3:\n", + " subcategories.append(text)\n", + " subcategory_lines.append(l)\n", + " return (subcategories, subcategory_lines)\n", + "\n", + "class Entries:\n", + " def __init__(self, Header, XMLSources):\n", + " self.Entries = self.group_entries(Header, 
XMLSources)\n", + " \n", + " def __str__(self):\n", + " entry_string = '\\n'\n", + " for entry in self.Entries:\n", + " # entry_string += f'Entry of length {len(entry)}:\\n'\n", + " for line in entry:\n", + " if entry.index(line) > 0:\n", + " entry_string += '\\t'\n", + " entry_string += f'{line}\\n'\n", + " entry_string += '\\n'\n", + " return entry_string\n", + " \n", + " def get_all_ids(self):\n", + " lines = []\n", + " for entry in self.Entries:\n", + " for line in entry:\n", + " lines.append(line)\n", + " IDs = [l.ID for l in lines]\n", + " return IDs\n", + " \n", + " def group_entries(self, Header, XMLSources):\n", + " entries = []\n", + " if Header.isValid:\n", + " all_lines = [Line(line) for line in XMLSources.PageXML.find_all('TextLine')]\n", + " header_ids = Header.get_all_ids()\n", + " no_header_lines = list(filter(lambda x: x.ID not in header_ids, all_lines))\n", + " sorted_lines = sorted(no_header_lines, key=lambda x: x.Baseline[0][1])\n", + " w, h = Header.get_width_height()\n", + " filter_lines = list(filter(lambda x: x.Baseline[0][0] < 0.4 * w, sorted_lines))\n", + " if not filter_lines:\n", + " return entries\n", + " bl_centers_x = [np.mean(l.Baseline, axis=0)[0] for l in filter_lines]\n", + " bl_left_x = [l.Baseline[0][0] for l in filter_lines]\n", + " # print(bl_centers_x)\n", + " bl_left_x_mean = np.mean(bl_left_x)\n", + " # print('bl_left_x_mean:', bl_left_x_mean)\n", + " left_outliers_removed = []\n", + " for line, center, left in zip(filter_lines, bl_centers_x, bl_left_x):\n", + " if (center > bl_left_x_mean + 50 and left < bl_left_x_mean + 700):\n", + " left_outliers_removed.append(line)\n", + " filter_coord = [line.Baseline[0][0] for line in left_outliers_removed]\n", + " if not filter_coord:\n", + " return entries\n", + " X = np.array(filter_coord)[:, np.newaxis]\n", + " # for line in left_outliers_removed:\n", + " # print(line)\n", + " # print(X)\n", + " gm = GaussianMixture(n_components=2, random_state=42).fit(X)\n", + " preds = 
gm.predict(X)\n", + " # print(gm.means_)\n", + " # print(preds)\n", + " beg_marker = int(not (gm.means_[0, 0] < gm.means_[1, 0]))\n", + " # print(beg_marker)\n", + " # print('Means:', gm.means_[0][0], gm.means_[1][0])\n", + " # print('Covariances:', gm.covariances_[0, 0, 0], gm.covariances_[1, 0, 0])\n", + " for line, pred in zip(left_outliers_removed, preds):\n", + " if pred == beg_marker or left_outliers_removed.index(line) == 0:\n", + " entries.append([line])\n", + " else:\n", + " entries[-1].append(line)\n", + " return entries\n", + "\n", + "class RemainingLines:\n", + " def __init__(self, Header, Entries, XMLSources):\n", + " self.Lines = self.get_remaining_lines(Header, Entries, XMLSources)\n", + " \n", + " def __str__(self):\n", + " rem_string = 'Remaining Lines:\\n'\n", + " for line in self.Lines:\n", + " rem_string += f'{line}\\n'\n", + " return rem_string\n", + " \n", + " def get_remaining_lines(self, Header, Entries, XMLSources):\n", + " all_lines = [Line(line) for line in XMLSources.PageXML.find_all('TextLine')]\n", + " header_ids = Header.get_all_ids()\n", + " no_header_lines = list(filter(lambda x: x.ID not in header_ids, all_lines))\n", + " entry_ids = Entries.get_all_ids()\n", + " no_entry_lines = list(filter(lambda x: x.ID not in entry_ids, no_header_lines))\n", + " return no_entry_lines\n", + "\n", + "class XMLSources:\n", + " def __init__(self, page_string, tei_string):\n", + " with open(page_string, 'r') as page_xml:\n", + " content = page_xml.readlines()\n", + " content = \"\".join(content)\n", + " bs_content = bs(content, \"lxml-xml\")\n", + " self.PageXML = bs_content\n", + " with open(tei_string, 'r') as tei_xml:\n", + " content = tei_xml.readlines()\n", + " content = \"\".join(content)\n", + " bs_content = bs(content, \"lxml-xml\")\n", + " self.TEIXML = bs_content\n" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "d0100237-c1ee-4771-9f78-fc42877e259a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "Filename: 14.376_00000025.jpeg\n", + "Page number(s): 1\n", + "Format: in folio\n", + "Category: Theologia\n", + "Subcategory: Textus & Versiones S. Scripturæ\n", + "\n", + "1 Biblia Sacra Polyglotta curis Cardinalis Ximenij¬\n", + "\tedita, cum Vocabulario Hebraïco & Chaldaïco veteris\n", + "\tTestamenti. 5 Vol. Compluti. 1515. & annis Seqq.\n", + "\tGuill. de Brocario. n. 84. Integra biblia sunt V. volumini=\n", + "\tbus comprehensa. deest Vocabularium, quod pro sexto numeratur.\n", + "\n", + "2 Biblia Sacra Polyglotta Philippi II. Regis Cathol. pietate,\n", + "\tac # Studio ad Sacro- Sanctæ Ecclesiæ usum edita, cum\n", + "\tApparatu & Lexico. 8 Vol. Antuerp. 1572. Plantin.\n", + "\tn. 9.\n", + "\n", + "3 Biblia Sacra Polyglotta Studio & curâ Briani Waltoni\n", + "\tedita. 6 Vol. chartâ Magnâ. Londini. 1637. Roycroft.\n", + "\taccedit n. 36.\n", + "\tLexicon Heptaglotton ad eadem Biblia Polyglotta\n", + "\tautore Edmundo Castello. 2 Vol. chartâ M. Londini.\n", + "\t1669. Roycroft. N. 36.\n", + "\n", + "4 Biblia Sacra Latina Moguntina dicta, prima omnium\n", + "\teditio in Membranis. 2 Vol. Moguntiæ. 1462.¬\n", + "\tIoan. Fust, & Petr. Schoiffer de Gerneshem. n. 2088. II.O.5.\n", + "\n", + "5 Biblia Sacra Latina Moguntina, editio altera 2 Vol.\n", + "\tchartâ Magnâ. Moguntiæ. 1472. Petr. Schoiffer. II.O.7.\n", + "\tde Gerneshem. n. 2089.\n", + "\n", + "6 Biblia Sacra Latina cum glossa & Comment. 4 Vol.\n", + "\tchartâ Magnâ. Editio perantiqua sine loco & II.O.1.\n", + "\tanno n. 2086.\n", + "\n", + "7 Biblia Sacra latina vulgatæ Editionis. 2 Vol. chartâ III.B.3.\n", + "\tMagnâ. 1474. absque loco Editionis n. 2090.\n", + "\n", + "8 Biblia Sacra Latina cum Evangelistarum Canonibus III.B.7.\n", + "\t& concordantiis. chartâ M. Coloniæ. 1479. de Homborch. n. 
2093.\n", + "\n", + "\n", + "Remaining Lines:\n", + "# Benedicti\n", + "Arriæ Montani\n", + "\n" + ] + } + ], + "source": [ + "# TODO: continue correcting headings here\n", + "page = Page('data/tei-xml/14.376/14.376_00000025.xml', 'data/tei-xml/old/codex_14_376_up_to_30_tei.xml')\n", + "print(page)\n", + "print(page.Entries)\n", + "print(page.RemainingLines)" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "e9741a33-f921-40f3-9bc7-6b8b2e28fc27", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Filename: 14.376_00000027.jpeg\n", + "Page number(s): 3\n", + "Format: in folio\n", + "Category: Theologia\n", + "Subcategory: Textus & Versiones S. Scripturæ\n", + "\n", + "16 La Sacra Biblia tradotta dall'Ebraico in Italiana\n", + "\tlingua con commento per Ant. Bruccioli. 6 Vol. in\n", + "\t4 Tom. in Venetia. 1544. Franc. Bruccioli. n. 164.\n", + "\n", + "17 La Sacra Biblia tradotta in Italiano e commentata\n", + "\tda Giov: Diodati, coll'aggiunta de Sacri Salmi\n", + "\tmessi in rime dal medesimo. in Geneva. 1641. n. 163.\n", + "\n", + "18 La S.e Bible historiée dite d'Escholatre avec des fig.\n", + "\tgravées en bois. 2 Vol. sans datte. Barthel. Verard. n. 165.\n", + "\n", + "19 La S.te Bible traduite en françois par Rob. Olivetan\n", + "\tgrand Pap. Neufchatel. 1535. Pierre de Wingle dit\n", + "\tPirot Picard. n. 86.\n", + "\n", + "20 Biblia Sacra germanicè Scripta cum figuris. 2 Vol.\n", + "\tchartâ M. Norimbergæ. 1483. ant. Koberger. n. 2097. VI.D.20.\n", + "\n", + "21 Biblia Sacra germanicè Scripta Martino Luthero\n", + "\teditore cum figuris coloribus adornatis. 2 Vol. Wittemb.\n", + "\t1545. hans Lufft. n. 87.\n", + "\n", + "22 Biblia Sacra germanicè Mss. in Membranis cum\n", + "\tfiguris pictis. n. II.\n", + "\n", + "23 Biblia Sacra Hollandicé Mss. in Membranis cum fig.\n", + "\tpictis. 2 Vol. n. III.\n", + "\n", + "24 Biblia Sacra, Linguâ Valachâ Scripta. 
chartâ M.\n", + "\tBukurestini. 1688. n. 11.\n", + "\n", + "25 Historia Veteris ac Novi Testamenti MS. in Membranis\n", + "\tcum innumeris ferè Imaginibus miniatè depictis. n. 1.\n", + "\n", + "\n", + "Remaining Lines:\n", + "\n" + ] + } + ], + "source": [ + "page = Page('data/tei-xml/14.376/14.376_00000027.xml', 'data/tei-xml/old/codex_14_376_up_to_30_tei.xml')\n", + "print(page)\n", + "print(page.Entries)\n", + "print(page.RemainingLines)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "25e8978d-c62f-4b4c-88e1-be9f2c2b80c2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "page_dir_14376 = pathlib.Path('data/tei-xml/14.376')\n", + "pages_14376 = []\n", + "\n", + "for xml in page_dir_14376.glob('*.xml'):\n", + " try:\n", + " pages_14376.append(Page(xml, 'data/tei-xml/old/codex_14_376_up_to_30_tei.xml'))\n", + " except Exception as e:\n", + " print(xml)\n", + " print(e)\n", + "# pages = sorted(pages, key=(lambda x: int(x.Header.PageNumber[0][0])))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "57168e21-394a-44ed-b8e1-632a7ad76863", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "page_dir_14377 = pathlib.Path('data/tei-xml/14.377')\n", + "pages_14377 = []\n", + "\n", + "for xml in page_dir_14377.glob('*.xml'):\n", + " try:\n", + " pages_14377.append(Page(xml, 'data/tei-xml/old/codex_14_376_up_to_30_tei.xml'))\n", + " except Exception as e:\n", + " print(xml)\n", + " print(e)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "edc2700e-7590-4f7f-a773-b6363552b473", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "page_dir_14378 = pathlib.Path('data/tei-xml/14.378')\n", + "pages_14378 = []\n", + "\n", + "for xml in page_dir_14378.glob('*.xml'):\n", + " try:\n", + " pages_14378.append(Page(xml, 'data/tei-xml/old/codex_14_376_up_to_30_tei.xml'))\n", + " except Exception as e:\n", + " print(xml)\n", + " print(e)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 9, + "id": "0f1f3087-81c9-451c-81bb-2f2e59d29d96", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total: 451 pages\n", + "Valid header: 434 pages\n", + "Invalid header: 17 pages\n" + ] + } + ], + "source": [ + "invalid_pages_14376 = list(filter(lambda x: not x.Header.isValid, pages_14376))\n", + "print('Total:', len(pages_14376), 'pages')\n", + "print('Valid header:', len(pages_14376) - len(invalid_pages_14376), 'pages')\n", + "print('Invalid header:', len(invalid_pages_14376), 'pages')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "46836c45-0926-456b-bf94-a1ac07462e21", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total: 450 pages\n", + "Valid header: 440 pages\n", + "Invalid header: 10 pages\n" + ] + } + ], + "source": [ + "invalid_pages_14377 = list(filter(lambda x: not x.Header.isValid, pages_14377))\n", + "print('Total:', len(pages_14377), 'pages')\n", + "print('Valid header:', len(pages_14377) - len(invalid_pages_14377), 'pages')\n", + "print('Invalid header:', len(invalid_pages_14377), 'pages')" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "5e3b7597-72b0-4e06-a6fc-be069dfa9b7a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total: 565 pages\n", + "Valid header: 556 pages\n", + "Invalid header: 9 pages\n" + ] + } + ], + "source": [ + "invalid_pages_14378 = list(filter(lambda x: not x.Header.isValid, pages_14378))\n", + "print('Total:', len(pages_14378), 'pages')\n", + "print('Valid header:', len(pages_14378) - len(invalid_pages_14378), 'pages')\n", + "print('Invalid header:', len(invalid_pages_14378), 'pages')\n", + "sorted_pages_14378 = sorted(pages_14378, key=lambda x: x.get_filename())\n", + "sorted_invalid_pages_14378 = sorted(invalid_pages_14378, key=lambda x: x.get_filename())" + ] + 
def has_empty_header(page):
    """Return True when every header field of *page* is empty.

    Looks at the first element of ``page.Header.PageNumber``,
    ``page.Header.Format``, ``page.Header.Category`` and
    ``page.Header.Subcategory``.

    NOTE(review): each of those first elements is presumably a list of
    strings (the previous implementation concatenated them and compared the
    result with ``[]``) -- confirm against the page class.
    """
    header = page.Header
    fields = (
        header.PageNumber[0],
        header.Format[0],
        header.Category[0],
        header.Subcategory[0],
    )
    # The header is empty exactly when none of the four lists has an item.
    return not any(fields)


# Runs of '*', '⁎' (U+204E) or 'X' are stripped from a page-number token.
# Raw string: '\*' inside a normal string literal is an invalid escape.
_PAGE_MARKER_RE = re.compile(r'\*+|⁎+|X+')


def extract_number(page_number_array):
    """Return the first page number found in ``page_number_array[0]``.

    Each token of the first sub-list has all runs of ``*``, ``⁎`` and ``X``
    removed; tokens that are empty afterwards are skipped, and the first
    remaining token is converted with ``int``.

    Args:
        page_number_array: Sequence whose first element is an iterable of
            string tokens. Only that first element is inspected.

    Returns:
        The parsed page number, or ``None`` when the outer sequence is
        empty or no token is left after stripping the marker characters.

    Raises:
        ValueError: If the first non-empty stripped token is not a valid
            integer literal.
    """
    if not page_number_array:
        return None
    for token in page_number_array[0]:
        stripped = _PAGE_MARKER_RE.sub('', token)
        if stripped:
            return int(stripped)
    return None
+ " '14.376_00000134.jpeg',\n", + " '14.376_00000182.jpeg',\n", + " '14.376_00000221.jpeg',\n", + " '14.376_00000226.jpeg',\n", + " '14.376_00000249.jpeg',\n", + " '14.376_00000270.jpeg',\n", + " '14.376_00000284.jpeg',\n", + " '14.376_00000297.jpeg',\n", + " '14.376_00000301.jpeg',\n", + " '14.376_00000324.jpeg',\n", + " '14.376_00000328.jpeg',\n", + " '14.376_00000338.jpeg',\n", + " '14.376_00000371.jpeg',\n", + " '14.376_00000381.jpeg',\n", + " '14.376_00000399.jpeg',\n", + " '14.376_00000414.jpeg',\n", + " '14.376_00000428.jpeg',\n", + " '14.376_00000438.jpeg',\n", + " '14.376_00000441.jpeg',\n", + " '14.376_00000443.jpeg'\n", + "]\n", + "\n", + "failed_names_14377 = [\n", + " '14.377_00000017.jpeg',\n", + " '14.377_00000025.jpeg',\n", + " '14.377_00000054.jpeg',\n", + " '14.377_00000064.jpeg',\n", + " '14.377_00000078.jpeg',\n", + " '14.377_00000079.jpeg',\n", + " '14.377_00000093.jpeg',\n", + " '14.377_00000096.jpeg',\n", + " '14.377_00000097.jpeg',\n", + " '14.377_00000098.jpeg',\n", + " '14.377_00000112.jpeg',\n", + " '14.377_00000122.jpeg',\n", + " '14.377_00000129.jpeg',\n", + " '14.377_00000130.jpeg',\n", + " '14.377_00000131.jpeg',\n", + " '14.377_00000132.jpeg',\n", + " '14.377_00000133.jpeg',\n", + " '14.377_00000134.jpeg',\n", + " '14.377_00000135.jpeg',\n", + " '14.377_00000136.jpeg',\n", + " '14.377_00000137.jpeg',\n", + " '14.377_00000138.jpeg',\n", + " '14.377_00000139.jpeg',\n", + " '14.377_00000153.jpeg',\n", + " '14.377_00000155.jpeg',\n", + " '14.377_00000172.jpeg',\n", + " '14.377_00000179.jpeg',\n", + " '14.377_00000180.jpeg',\n", + " '14.377_00000181.jpeg',\n", + " '14.377_00000186.jpeg',\n", + " '14.377_00000199.jpeg',\n", + " '14.377_00000231.jpeg',\n", + " '14.377_00000259.jpeg',\n", + " '14.377_00000273.jpeg',\n", + " '14.377_00000281.jpeg',\n", + " '14.377_00000291.jpeg',\n", + " '14.377_00000292.jpeg',\n", + " '14.377_00000294.jpeg',\n", + " '14.377_00000295.jpeg',\n", + " '14.377_00000329.jpeg',\n", + " '14.377_00000366.jpeg',\n", + 
" '14.377_00000386.jpeg',\n", + " '14.377_00000387.jpeg',\n", + " '14.377_00000392.jpeg',\n", + " '14.377_00000407.jpeg',\n", + " '14.377_00000433.jpeg',\n", + "]\n", + "\n", + "failed_names_14378 = [\n", + " '14.378_00000015.jpeg',\n", + " '14.378_00000059.jpeg',\n", + " '14.378_00000064.jpeg',\n", + " '14.378_00000076.jpeg',\n", + " '14.378_00000079.jpeg',\n", + " '14.378_00000091.jpeg',\n", + " '14.378_00000101.jpeg',\n", + " '14.378_00000111.jpeg',\n", + " '14.378_00000125.jpeg',\n", + " '14.378_00000139.jpeg',\n", + " '14.378_00000147.jpeg',\n", + " '14.378_00000151.jpeg',\n", + " '14.378_00000152.jpeg',\n", + " '14.378_00000153.jpeg',\n", + " '14.378_00000155.jpeg',\n", + " '14.378_00000169.jpeg',\n", + " '14.378_00000171.jpeg',\n", + " '14.378_00000173.jpeg',\n", + " '14.378_00000205.jpeg',\n", + " '14.378_00000210.jpeg',\n", + " '14.378_00000246.jpeg',\n", + " '14.378_00000251.jpeg',\n", + " '14.378_00000258.jpeg',\n", + " '14.378_00000261.jpeg',\n", + " '14.378_00000266.jpeg',\n", + " '14.378_00000274.jpeg',\n", + " '14.378_00000282.jpeg',\n", + " '14.378_00000283.jpeg',\n", + " '14.378_00000310.jpeg',\n", + " '14.378_00000322.jpeg',\n", + " '14.378_00000388.jpeg',\n", + " '14.378_00000392.jpeg',\n", + " '14.378_00000393.jpeg',\n", + " '14.378_00000399.jpeg',\n", + " '14.378_00000408.jpeg',\n", + " '14.378_00000410.jpeg',\n", + " '14.378_00000433.jpeg',\n", + " '14.378_00000452.jpeg',\n", + " '14.378_00000461.jpeg',\n", + " '14.378_00000462.jpeg',\n", + " '14.378_00000465.jpeg',\n", + " '14.378_00000466.jpeg',\n", + " '14.378_00000498.jpeg',\n", + " '14.378_00000513.jpeg',\n", + " '14.378_00000529.jpeg',\n", + " '14.378_00000579.jpeg',\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "313f429d-25af-4dd1-b2b2-ee48d515b619", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "119\n" + ] + } + ], + "source": [ + "print(len(failed_names_14376 + failed_names_14377 + 
failed_names_14378))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "85aacdce-6686-46f7-9200-c77a87ac2569", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Filename: 14.378_00000577.jpeg\n", + "Page number(s): 1397\n", + "Format: \n", + "Category: Imagines Variæ Incisæ\n", + "Subcategory: Imagines Variæ Incisæ\n", + "\n", + "CCLXXXIX Les Heros de la Ligue, ou la Procession Monacale conduite\n", + "\tpar Louis XIV. pour la conversion des Heretiques de\n", + "\tSon Royaume. in 4.° Paris. 1691. P. Peters n. 2249. in 42.\n", + "\n", + "CCXC Il Claustro di S. Michele nel boseo di Bologna dipinto\n", + "\tda Lodovico Carraccio ed Altri Maestri asciti dalla sua\n", + "\tSeuola, intagliato per Giac. Giovannini, e descritto\n", + "\tda Carlo Cesare Malvasia. in fol.° carta grande. in\n", + "\tBologna 1694. p-335- n. 881.\n", + "\n", + "CCXCI Vlisse all'Isola di Circe, figure Teatrali intagliate\n", + "\tper R. van Hoecke. n. 1366.\n", + "\n", + "CCXCII Palazzi antichi di Genova raccolti e disegnati da P.\n", + "\tPaolo Rubens, carta Maggiore. in Anversa. 1652.\n", + "\tGiac. Meursio. n. CCL.\n", + "\n", + "CCXCIII Le Fabriche e Vedute di Venezia disegnate ed intagliate\n", + "\tda Luca Carlevariis. in fol.° oblongo. in Venezia. 1703.\n", + "\tGiov. Batt. Finazzi. n. 1110.\n", + "\n", + "CCXCIV Il gran Teatro delle Pitture e Perspettive di Venezia.\n", + "\tdisegnate ed intagliate da'piu Eccellenti Professori.¬\n", + "\tin fol. carta grande 2 Vol. in 1. Tom in Venezia.\n", + "\t1720. Domenico Lovisa. n. CXCIII.\n", + "\n", + "CCXCV La Vita di S. Filippo di Neri espressa con Imagini da\n", + "\tGiac. Stella inventate, e da Christiano Sas intagliate.\n", + "\tin fol.° n. 1215.\n", + "\n", + "CCXCVI Feste fatte Sopra l'Arno per le Nozze del Principe di\n", + "\tToscana l'anno 1608. disegnatè da Giulio Parigi.\n", + "\tin fol.° in carta grande. n. 
CLXII.\n", + "\tLivre des Vales invené par Mr. Stella Caris 1667.\n", + "\tn. 1111.\n", + "\n", + "\n", + "Remaining Lines:\n", + "2\n", + "\n", + "Filename: 14.378_00000578.jpeg\n", + "Page number(s): 1398\n", + "Format: \n", + "Category: Imagines Variæ Incisæ.\n", + "Subcategory: Imagines Variæ Incisæ.\n", + "\n", + "CCXVII Gli Habiti di tutte le Parti del Mondo intagliati da\n", + "\tBartolomeo Grassi in fol.° oblongo. in Roma. 1585. n.1112\n", + "\n", + "CCXCVIII. Habiti delle Donne Venetiane intagliati in Rame da\n", + "\tGiac. Franco, con una descrizione breve. in 4.° oblong. n. 2020\n", + "\n", + "CCXCVIIII Plans des Maisons de plaisance de l’Etat de Milan, tres\n", + "\tgrand in fol.° Vol. non converte\n", + "\n", + "CCC Recueil de figures contenant differens Evenemens arrivés\n", + "\ten France pendant et depuis le Regne d’Henry II.¬\n", + "\ttres grand in fol.° oblong Sans datte. La Monpieles\n", + "\n", + "CCCI. Recueil d'Estampes gravées d'après les Tableaux des plus\n", + "\tbeaux Cabinets de France par les Soins de M.r Crozat.\n", + "\tin fol.° Tr Vol. 2\n", + "\tAmori sdegni et gielosie di Giunone Julio Bonatone\n", + "\tinventore. S. L. et A. n. 958.\n", + "\tScherzi d’amore espressi da Odoardo Fialetti, Venet.\n", + "\t1617. n. 959.\n", + "\tBakhuizen /:Ludolf/ Ob het Kunstig zeeschilderen,\n", + "\tAmsterd. 1701. n. 960.\n", + "\tLe Blond / La Gallerie de Ann. Carrache S. L et A. n. 957.\n", + "\tRecueil des Estampes S. L. et A. et sine Authore n. 956.\n", + "\tBerrettini (Pietro) Galleria, dipinta nel Palazzo del\n", + "\tPrencipe Pansilio. Roma S. A. n. 987.\n", + "\tvan Dyck (Ant:) Jcones Principum aliorumque in fol. C. M.\n", + "\tn. 986.\n", + "\tCarrache (Annibal) Diverse figure disegnate di pen=\n", + "\tna nell' hore di ricreatione. Roma 1646. n. 974.\n", + "\tComitium Gloriæ centum quâ Sanguine quâ virtute\n", + "\tillustrium Heroum per. El. Wideman. Aug. S. A. n. 
976.\n", + "\tHistoire de Psiche et cupidon gravee sur les desseins\n", + "\tde Raphael par Aug. Venitien S. L et A. n. 975.\n", + "\tStrasburger Trachten von Mann und Weib 1680. n. 2250. 4.to\n", + "\tHuret / Gregor / Theatrum dolorum Jesu Christi, Paris 1664. n. CCLXXII.\n", + "\tPicart /: Bern:/ Jmpostures innocentes, ou recueil\n", + "\td’Estampes, &c. Amsterd. 1734. n. 987⁎\n", + "\n", + "\n", + "Remaining Lines:\n", + "\n", + "Filename: 14.378_00000579.jpeg\n", + "Page number(s): 1399\n", + "Format: \n", + "Category: Effigierum Incisar. Collectio\n", + "Subcategory: Effigierum Incisar. Collectio\n", + "\n", + "Recueil de Portraits en 216 Portefeüilles, savoir\n", + "\n", + "CCCII Des Anciens. 6\n", + "\n", + "CCCIII d'Italie. 31.\n", + "\n", + "CCCIV De France. 48.\n", + "\n", + "CCCV D'Allemagne. 61.\n", + "\n", + "CCCVI Des Paysbas Catholiques. 9.\n", + "\n", + "CCCVII Des Provinces Vnies. 10\n", + "\n", + "CCCVIII De Lorraine. 2\n", + "\n", + "CCCIX D'Espagne. 5.\n", + "\n", + "CCCX De Portugal. 2.\n", + "\n", + "CCCXI D'Angleterre. 13.\n", + "\n", + "CCCXII De Pologne. 2\n", + "\n", + "CCCXIII De Suede. 2.\n", + "\n", + "CCCXIV De Dannemarck. 2. I\n", + "\n", + "CCCXV D'Hongrie, Transylvanie, & Valachie. 3.\n", + "\n", + "CCCXVI De Turquie, Grece, & de l'Archipel. 1.\n", + "\n", + "CCCXVII D'Asie, d'Afrique, & d'Amerique. 1.\n", + "\n", + "\n", + "Remaining Lines:\n", + "\n", + "Filename: 14.378_00000580.jpeg\n", + "Page number(s): 1400\n", + "Format: \n", + "Category: Effigierum Incisar. Collectio\n", + "Subcategory: Effigierum Incisar. Collectio\n", + "\n", + "CCCXVIII D'Anciens Evêques, & d'Evêques in Partibus. 1.\n", + "\n", + "CCCXIX Des Ordres Religieux. 14.\n", + "\n", + "CCCXX Des Ordres Militaires. a la fin des Ordres Religieux\n", + "\n", + "CCCXXI Anonymes. 2.\n", + "\tCCCXXII En Petites Suittes. 1.\n", + "\tCCCXXIII De Comediens & figures Extraordinaires. 
1.\n", + "\n", + "\n", + "Remaining Lines:\n", + "\n", + "Filename: 14.378_00000581.jpeg\n", + "Page number(s): 1401\n", + "Format: \n", + "Category: Imaginum Delineatar. Collectio\n", + "Subcategory: Imaginum Delineatar. Collectio\n", + "\n", + "CCCXXIV Vn Porte feuilles contenant des Desseins de differens\n", + "\tMaitres d'Italie, avec le Cataloque, savoir\n", + "\tde Raphaël & de Son Ecole.\n", + "\tde L'Ecole Florentine\n", + "\tde l'Ecole de Venise.\n", + "\tde L'Ecole de Lombardie\n", + "\tde L'Ecole de Bologne.\n", + "\tde L'Ecole de Sienne.\n", + "\n", + "CCCXXV Vn Portefeüilles contenant des Desseins de Maitres\n", + "\tde France, avec diverses Etudes faites d'apres des\n", + "\tTableaux de Raphael & de Michel-Ange par\n", + "\tun Anonyme, & le Cataloque. n. CCLXV. 2. Vol.\n", + "\n", + "CCCXXVI Vn Portefeüilles contenant des Desseins de Paÿsages\n", + "\tpar differens Maitres, avec le Cataloque.\n", + "\tIl y a dans ce Portefeüilles un grand Dessein de¬\n", + "\tIules Romain qui represente Thesée & Hercules¬\n", + "\tcombattans contre les Amazones, lequel n'a pu entrer\n", + "\tdans le Portefeüilles ou sont les autres Desseins de\n", + "\tce Maitre acause de sa grandeur. n. CCLIV.\n", + "\n", + "CCCXXVII Vn Portefeüilles contenant des Desseins de Paysages\n", + "\tpar differens Maitres, sans Cataloque. n. CCLXIX.\n", + "\n", + "CCCXXVIII Vn Portefeüilles contenant des dessems de plusieurs¬\n", + "\tgenres par differens Maitres, Sans Cataloque. n. 891\n", + "\n", + "CCCXXIX Deux Portefeüilles contenans des Desseins de Fleurs\n", + "\tpar Robert. n. 892.\n", + "\n", + "CCCXXX Vn Portefeüilles contenans des Desseins d'Oiseaux\n", + "\tpar Robert. n. 893. et 894\n", + "\n", + "\n", + "Remaining Lines:\n", + "\n", + "Filename: 14.378_00000582.jpeg\n", + "Page number(s): 1402\n", + "Format: \n", + "Category: Imaginum Delineatar. Collectio\n", + "Subcategory: Imaginum Delineatar. 
def _count_entries(pages):
    """Return the total number of entries over all *pages*.

    Sums ``len(page.Entries.Entries)`` for every page; an empty iterable
    yields 0.
    """
    return sum(len(page.Entries.Entries) for page in pages)


# One total per volume; the same helper replaces three copy-pasted loops.
count_14376 = _count_entries(sorted_valid_pages_14376)
count_14377 = _count_entries(sorted_valid_pages_14377)
count_14378 = _count_entries(sorted_valid_pages_14378)

# Per-volume counts followed by the grand total.
print(count_14376, count_14377, count_14378, count_14376 + count_14377 + count_14378)
Validierungs-Set') +# plt.xlabel('Modell-Id') +plt.xlabel('Model ID') +plt.ylabel('WER [\%]') +# plt.title('Manuell bestimmte Wortfehlerraten auf dem Validierungs-Set') +plt.title('Manually determined word error rates on the validation set') plt.legend(loc='upper right') - -# plt.savefig('../Bilder/WER_Validation.png', bbox_inches='tight', dpi=300) -plt.show() +plt.savefig('../img/documentation/WER_Validation_eng.png', bbox_inches='tight', dpi=300) +# plt.show() diff --git a/Skripte/plot_bounding_boxes.py b/Skripte/plot_bounding_boxes.py index 3da765d4f9d4614a93b248afdff16349fc0ab938..3da2c3d73f74627fec201f8510476b4d30dfb12e 100644 --- a/Skripte/plot_bounding_boxes.py +++ b/Skripte/plot_bounding_boxes.py @@ -2,8 +2,11 @@ import matplotlib.pyplot as plt from matplotlib.patches import Rectangle import pandas as pd import numpy as np +from matplotlib import rc +rc('font',**{'family':'serif','serif':['Palatino'], 'size': 14}) +rc('text', usetex=True) -bb_df = pd.read_csv('../Groundtruth/bounding_boxes.csv') +bb_df = pd.read_csv('../data/groundtruth/bounding_boxes.csv') bb_df['cent_x'] = bb_df.apply(lambda x: x['BB_x/w'] + x['BB_w/w']/2, axis=1) bb_df['cent_y'] = bb_df.apply(lambda x: x['BB_y/h'] + x['BB_h/h']/2, axis=1) @@ -36,11 +39,13 @@ ax.set(xlim=(0, 1), ylim=(0, 1)) ax.set_aspect(1.5) plt.gca().invert_yaxis() plt.title('Supralibros bounding boxes') -plt.xlabel('Normierte Breite') -plt.ylabel('Normierte Höhe') +# plt.xlabel('Normierte Breite') +plt.xlabel('Normalized width') +# plt.ylabel('Normierte Höhe') +plt.ylabel('Normalized height') -plt.show() -# plt.savefig('../Bilder/bounding_boxes.png', bbox_inches='tight', dpi=300) +# plt.show() +plt.savefig('../img/documentation/bounding_boxes_eng.png', bbox_inches='tight', dpi=300) print('Die Daten des 97% Quantil Rechtecks sind x,y,w,h =', average_x, average_y, hull_width, hull_height) print('Ausgedrueckt in Koordinaten fuer iiif ist das', (average_x - hull_width/2)*100, (average_y - hull_height/2)*100, 
hull_width*100, hull_height*100)