diff --git a/tokenization.ipynb b/tokenization.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..e3ecde65337b30b12455e1566e5118103289fcda
--- /dev/null
+++ b/tokenization.ipynb
@@ -0,0 +1,1074 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "1c9cc762-eb3a-4e4a-8392-b38b897df498",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import csv\n",
+ "import pathlib\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "import nltk\n",
+ "import re\n",
+ "import statistics\n",
+ "import string\n",
+ "from collections import Counter\n",
+ "from collections import defaultdict\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "import octis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "7ea8760d-97e5-44ae-a271-1478989fd97b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import gensim\n",
+ "import spacy\n",
+ "import sklearn\n",
+ "import torch\n",
+ "import libsvm\n",
+ "import flask\n",
+ "import sentence_transformers\n",
+ "import requests\n",
+ "import tomotopy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "32ea9351-4c82-48dc-9d85-9888243f0d3f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Working directory that holds the corpus CSV/TSV files read and written below.\n",
+ "# Set the TK_DATA_DIR environment variable to run the notebook on another machine;\n",
+ "# when it is unset or empty, the original local path is used, so existing runs are unchanged.\n",
+ "DATA_DIR = os.environ.get(\"TK_DATA_DIR\") or r\"C:\\Users\\onb1202\\OneDrive - Österreichische Nationalbibliothek\\Praktikum TK\\daten\"\n",
+ "os.chdir(DATA_DIR)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8f3e3f35-d073-47e8-9ca8-03d995120d3a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#small test corpus"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "471091f8-cee3-4bff-bfeb-f403b879c830",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#df = pd.read_csv('test_raw.tsv', sep = '\\t', encoding='utf-8')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "9091c711-de70-46b3-9e42-517c9ecf106d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#from somajo import SoMaJo\n",
+ "\n",
+ "#tokenizer = SoMaJo(\"de_CMC\", split_camel_case=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "669d3e3b-c043-4594-8e88-106020e3b945",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#sentence = tokenizer.tokenize_text_file(df['ocr'], paragraph_separator=\"single_newlines\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b5012333-27a2-4f1c-ac15-170fa4e5fcb8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#type(sentence)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "e83b6b9c-f515-4b18-9eeb-354ba8bb395d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#import time\n",
+ "\n",
+ "#df = pd.read_csv('test_raw.tsv', sep = '\\t', encoding='utf-8')\n",
+ "\n",
+ "#ts = time.time()\n",
+ "#for i, ocr in df['ocr'].items():\n",
+ "# sentences = [s for s in tokenizer.tokenize_text(ocr.split('\\n'), parallel=4)]\n",
+ "# all_tokens = []\n",
+ "# for sentence in sentences[:15]:\n",
+ "# for t in sentence:\n",
+ "# all_tokens.append(t.text)\n",
+ "# df.at[i, 'ocr'] = all_tokens\n",
+ "#print(time.time() - ts)\n",
+ "#df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9abcd566-9acd-45e1-88db-683b9a8859d9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#whole corpus WZ"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "be803222-005f-4a68-9a27-228e255b32ce",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz = pd.read_csv('wrz.csv', sep = ',', encoding='utf-8')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d594f243-c1e4-4d25-a133-8d7ca4c8e58b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Build the working frame from the WZ corpus without its 'year' column;\n",
+ "# drop() returns a new frame, so `wz` itself is left untouched.\n",
+ "wz_token = wz.drop(columns=['year'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4025980c-faf5-4d38-9043-edd155fc9eb6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz_token['split'] = pd.NA"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "45f97a6e-d8f2-447e-b036-093c3aa42742",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "split = ['train', 'valid', 'test']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "28694d81-e44f-4ace-8b19-9d09a5f05c59",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Label every document as train/valid/test with probabilities 0.6/0.2/0.2.\n",
+ "# The labels are drawn in one vectorised call from a seeded generator, so the\n",
+ "# assignment is identical after every kernel restart.\n",
+ "split_rng = np.random.default_rng(42)\n",
+ "wz_token['split'] = split_rng.choice(split, size=len(wz_token), p=[0.6, 0.2, 0.2])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c56609e5-51a6-41b5-8b91-3687b3f8ec09",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz_token['split'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f0f85ec8-8ba5-404e-8f63-198d83963e00",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save the labelled raw WZ corpus as a tab-separated file. The row index and the\n",
+ "# header row are written as well (these are the to_csv defaults).\n",
+ "wz_token.to_csv('wz_raw.tsv', sep='\\t')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "75a511a5-326a-472a-a103-6794e7dc4d18",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv('wz_raw.tsv', sep = '\\t', encoding='utf-8')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "4f5edd4f-e0f2-4e41-b04f-da490c659e0d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from somajo import SoMaJo\n",
+ "\n",
+ "tokenizer = SoMaJo(\"de_CMC\", split_camel_case=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "ee4959e5-e49c-4715-89f8-44d3e0c2ee9c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Intentionally no tokenisation in this cell: `tokenize_text_file` takes a file name\n",
+ "# or an open file rather than a pandas Series, and the `sentence` value that used to\n",
+ "# be assigned here was never read. The documents are tokenised one by one in the\n",
+ "# next cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "023a1e55-e291-4ab9-a7e5-4f6916b063a5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "4156.50820016861\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " manifest_id | \n",
+ " ocr | \n",
+ " split | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " wrz17850101 | \n",
+ " [itz, ., r, F-, Nro, ., Sonnabmd, den, r., ^an... | \n",
+ " valid | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " wrz17850105 | \n",
+ " [5i, ., 2Y, F, Mittwoch, den, 5., Iäner, ., 17... | \n",
+ " train | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2 | \n",
+ " wrz17850108 | \n",
+ " [57, ^, Sonnabend, den, 8., Janer, ., 1735, .,... | \n",
+ " valid | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3 | \n",
+ " wrz17850112 | \n",
+ " [5k, 8i, F, Mittwoch, den, Iäner, ., 173, ?, .... | \n",
+ " train | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 4 | \n",
+ " wrz17850115 | \n",
+ " [^, 109, ^, Sonnabend, den, 15., Ianer, ., 178... | \n",
+ " train | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unnamed: 0 manifest_id ocr \\\n",
+ "0 0 wrz17850101 [itz, ., r, F-, Nro, ., Sonnabmd, den, r., ^an... \n",
+ "1 1 wrz17850105 [5i, ., 2Y, F, Mittwoch, den, 5., Iäner, ., 17... \n",
+ "2 2 wrz17850108 [57, ^, Sonnabend, den, 8., Janer, ., 1735, .,... \n",
+ "3 3 wrz17850112 [5k, 8i, F, Mittwoch, den, Iäner, ., 173, ?, .... \n",
+ "4 4 wrz17850115 [^, 109, ^, Sonnabend, den, 15., Ianer, ., 178... \n",
+ "\n",
+ " split \n",
+ "0 valid \n",
+ "1 train \n",
+ "2 valid \n",
+ "3 train \n",
+ "4 train "
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import time\n",
+ "\n",
+ "# Reload the raw corpus so that re-running this cell always starts from untokenised text.\n",
+ "# (Swap in the commented line below for a quick run on the small test corpus.)\n",
+ "#df = pd.read_csv('test_raw.tsv', sep = '\\t', encoding='utf-8')\n",
+ "df = pd.read_csv('wz_raw.tsv', sep = '\\t', encoding='utf-8')\n",
+ "\n",
+ "ts = time.perf_counter()  # monotonic clock, unaffected by system clock adjustments\n",
+ "tokenized_docs = []\n",
+ "for ocr in df['ocr']:\n",
+ "    if not isinstance(ocr, str):\n",
+ "        # Empty OCR cells come back from read_csv as NaN (a float, which has no .split);\n",
+ "        # keep the row with an empty token list instead of failing partway through the run.\n",
+ "        tokenized_docs.append([])\n",
+ "        continue\n",
+ "    # Each OCR line is handed to SoMaJo as one paragraph; the per-sentence token lists\n",
+ "    # it yields are flattened into a single list of token strings for the document.\n",
+ "    doc_sentences = tokenizer.tokenize_text(ocr.split('\\n'), parallel=4)\n",
+ "    tokenized_docs.append([token.text for doc_sentence in doc_sentences for token in doc_sentence])\n",
+ "# Assign the whole column at once; dtype=object keeps every cell a Python list.\n",
+ "df['ocr'] = pd.Series(tokenized_docs, index=df.index, dtype=object)\n",
+ "print(time.perf_counter() - ts)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "eb2fcf60-6051-46b4-84b0-33b1b2bd1e41",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Remove the index column that the TSV round-trip added ('Unnamed: 0'). Dropping it\n",
+ "# by name keeps this cell safe to re-run: a second run no longer strips a real column.\n",
+ "df = df.drop(columns=['Unnamed: 0'], errors='ignore')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "5ec0718e-fbfe-40af-938f-5fc6d6ebc68a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Put the columns in the order ocr, split, manifest_id.\n",
+ "df = df.reindex(['ocr', 'split', 'manifest_id'], axis='columns')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "35981c7d-9d7e-48d4-976b-671f32973096",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ocr | \n",
+ " split | \n",
+ " manifest_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " [itz, ., r, F-, Nro, ., Sonnabmd, den, r., ^an... | \n",
+ " valid | \n",
+ " wrz17850101 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " [5i, ., 2Y, F, Mittwoch, den, 5., Iäner, ., 17... | \n",
+ " train | \n",
+ " wrz17850105 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " [57, ^, Sonnabend, den, 8., Janer, ., 1735, .,... | \n",
+ " valid | \n",
+ " wrz17850108 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " [5k, 8i, F, Mittwoch, den, Iäner, ., 173, ?, .... | \n",
+ " train | \n",
+ " wrz17850112 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " [^, 109, ^, Sonnabend, den, 15., Ianer, ., 178... | \n",
+ " train | \n",
+ " wrz17850115 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 1299 | \n",
+ " [Sonnabend, ,, den, 14, *, December, 1799, ..,... | \n",
+ " train | \n",
+ " wrz17991214 | \n",
+ "
\n",
+ " \n",
+ " | 1300 | \n",
+ " [I, m, 4, -, 77, i', jLiii, ., 1, J, Is, i0, n... | \n",
+ " valid | \n",
+ " wrz17991218 | \n",
+ "
\n",
+ " \n",
+ " | 1301 | \n",
+ " [L, Sonnabend, ,, den, 21., December, 1799, .,... | \n",
+ " valid | \n",
+ " wrz17991221 | \n",
+ "
\n",
+ " \n",
+ " | 1302 | \n",
+ " [WVr, 4, S73, ZMAr, if, >, Mittewoche, ,, den,... | \n",
+ " test | \n",
+ " wrz17991225 | \n",
+ "
\n",
+ " \n",
+ " | 1303 | \n",
+ " [Sonnabend, ,, den, rz, «, December, 1799, ., ... | \n",
+ " test | \n",
+ " wrz17991228 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1304 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ocr split manifest_id\n",
+ "0 [itz, ., r, F-, Nro, ., Sonnabmd, den, r., ^an... valid wrz17850101\n",
+ "1 [5i, ., 2Y, F, Mittwoch, den, 5., Iäner, ., 17... train wrz17850105\n",
+ "2 [57, ^, Sonnabend, den, 8., Janer, ., 1735, .,... valid wrz17850108\n",
+ "3 [5k, 8i, F, Mittwoch, den, Iäner, ., 173, ?, .... train wrz17850112\n",
+ "4 [^, 109, ^, Sonnabend, den, 15., Ianer, ., 178... train wrz17850115\n",
+ "... ... ... ...\n",
+ "1299 [Sonnabend, ,, den, 14, *, December, 1799, ..,... train wrz17991214\n",
+ "1300 [I, m, 4, -, 77, i', jLiii, ., 1, J, Is, i0, n... valid wrz17991218\n",
+ "1301 [L, Sonnabend, ,, den, 21., December, 1799, .,... valid wrz17991221\n",
+ "1302 [WVr, 4, S73, ZMAr, if, >, Mittewoche, ,, den,... test wrz17991225\n",
+ "1303 [Sonnabend, ,, den, rz, «, December, 1799, ., ... test wrz17991228\n",
+ "\n",
+ "[1304 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "b8e6b43c-624e-440e-bc1a-f10f88f4b428",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save the tokenised WZ corpus as a tab-separated file. The row index and the\n",
+ "# header row are written as well (these are the to_csv defaults).\n",
+ "df.to_csv('wz_tok.tsv', sep='\\t')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "80fa2ade-390b-4229-94a0-e99bdd646e0a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#df = pd.read_csv('wz_tok.tsv', sep='\\t', encoding='utf-8')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "641b1d29-fd70-4b18-b7be-78e6b8162cae",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "97f038f1-f618-4266-b660-4f66a3ab8bc9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz = pd.read_csv('sza.csv', sep=',', encoding='UTF-8')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "532fb694-ccde-4906-ad0d-fca376262192",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " manifest_id | \n",
+ " year | \n",
+ " ocr | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " sza1785bl01 | \n",
+ " 1785 | \n",
+ " Salzburger\\nI ntelligenzblatt.\\n\\nums-\\n\"\" H e... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " sza1785bl02 | \n",
+ " 1785 | \n",
+ " -\"-\\n\\nI Innländische, und auswärtige, besonde... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " sza17850105 | \n",
+ " 1785 | \n",
+ " \\n\\nSa|\\n\\ne mehr du Mensch bist, desto mehr g... | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " sza17850112 | \n",
+ " 1785 | \n",
+ " -----\\n\\nva ar nicht die ehrlichkeit\\n\\n\\n, so... | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " sza17850126 | \n",
+ " 1785 | \n",
+ " ---\\nT\\nT--\\n\\nSalzburger\\n\\n- I. Verordnungen... | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 625 | \n",
+ " sza17991207 | \n",
+ " 1799 | \n",
+ " 769\\n\\nSalzburger\\n\\n770\\n\\n---\\n\\nIntelligenz... | \n",
+ "
\n",
+ " \n",
+ " | 626 | \n",
+ " sza17991214 | \n",
+ " 1799 | \n",
+ " 785-\\n\\nSalzburger\\n\\nIntelligenzblatt.\\n\\n796... | \n",
+ "
\n",
+ " \n",
+ " | 627 | \n",
+ " sza17991221 | \n",
+ " 1799 | \n",
+ " -\\n\\n--\\n\\n-\\n\\n99 E- S urger-\\n\\nSZ--- 2-- p-... | \n",
+ "
\n",
+ " \n",
+ " | 628 | \n",
+ " sza17991228 | \n",
+ " 1799 | \n",
+ " Intelligenzblatt.\\n\\nLII. St. Sonnabend, den 2... | \n",
+ "
\n",
+ " \n",
+ " | 629 | \n",
+ " sza1799bl01 | \n",
+ " 1799 | \n",
+ " –-–\\n\\n-------------------------------------\\n... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
630 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " manifest_id year ocr\n",
+ "0 sza1785bl01 1785 Salzburger\\nI ntelligenzblatt.\\n\\nums-\\n\"\" H e...\n",
+ "1 sza1785bl02 1785 -\"-\\n\\nI Innländische, und auswärtige, besonde...\n",
+ "2 sza17850105 1785 \\n\\nSa|\\n\\ne mehr du Mensch bist, desto mehr g...\n",
+ "3 sza17850112 1785 -----\\n\\nva ar nicht die ehrlichkeit\\n\\n\\n, so...\n",
+ "4 sza17850126 1785 ---\\nT\\nT--\\n\\nSalzburger\\n\\n- I. Verordnungen...\n",
+ ".. ... ... ...\n",
+ "625 sza17991207 1799 769\\n\\nSalzburger\\n\\n770\\n\\n---\\n\\nIntelligenz...\n",
+ "626 sza17991214 1799 785-\\n\\nSalzburger\\n\\nIntelligenzblatt.\\n\\n796...\n",
+ "627 sza17991221 1799 -\\n\\n--\\n\\n-\\n\\n99 E- S urger-\\n\\nSZ--- 2-- p-...\n",
+ "628 sza17991228 1799 Intelligenzblatt.\\n\\nLII. St. Sonnabend, den 2...\n",
+ "629 sza1799bl01 1799 –-–\\n\\n-------------------------------------\\n...\n",
+ "\n",
+ "[630 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sz"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "7efdb2a9-bfc3-4b88-ae19-a249a4fd21b3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Build the working frame from the SZ corpus without its 'year' column;\n",
+ "# drop() returns a new frame, so `sz` itself is left untouched.\n",
+ "sz_token = sz.drop(columns=['year'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "3b33320e-924d-483c-82ca-428f4a9799c6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz_token['split'] = pd.NA"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "290dbc99-e3c2-4922-b20d-110e338b1b32",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "split = ['train', 'valid', 'test']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "3e0e95cf-82e0-473d-85bb-7a848cde595f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Label every document as train/valid/test with probabilities 0.6/0.2/0.2.\n",
+ "# The labels are drawn in one vectorised call from a seeded generator, so the\n",
+ "# assignment is identical after every kernel restart.\n",
+ "split_rng = np.random.default_rng(42)\n",
+ "sz_token['split'] = split_rng.choice(split, size=len(sz_token), p=[0.6, 0.2, 0.2])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "af6f0417-becb-4506-bb4d-31340052e51a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "train 367\n",
+ "valid 139\n",
+ "test 124\n",
+ "Name: split, dtype: int64"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sz_token['split'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "02a44c70-f3f0-4d76-9682-b7e0c543a665",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Put the columns in the order ocr, split, manifest_id.\n",
+ "sz_token = sz_token.reindex(['ocr', 'split', 'manifest_id'], axis='columns')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "1219234f-7ca7-4ec8-b1d1-fdb8008e7ad3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save the labelled raw SZ corpus as a tab-separated file. The row index and the\n",
+ "# header row are written as well (these are the to_csv defaults).\n",
+ "sz_token.to_csv('sz_raw.tsv', sep='\\t')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "e159229d-4156-4c39-af18-f06f32c5a1d2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv('sz_raw.tsv', sep = '\\t', encoding='utf-8')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "3b152ed7-2f4d-4244-9cc5-a3c573a9a52f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Intentionally no tokenisation in this cell: `tokenize_text_file` takes a file name\n",
+ "# or an open file rather than a pandas Series, and the `sentence` value that used to\n",
+ "# be assigned here was never read. The documents are tokenised one by one in the\n",
+ "# next cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "0d527d61-c6da-4562-9c8f-a94cf0a97931",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "558.7360711097717\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " ocr | \n",
+ " split | \n",
+ " manifest_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " [Salzburger, I, ntelligenzblatt, ., ums-, \", \"... | \n",
+ " train | \n",
+ " sza1785bl01 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " [-, \", -, I, Innländische, ,, und, auswärtige,... | \n",
+ " train | \n",
+ " sza1785bl02 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2 | \n",
+ " [Sa|, e, mehr, du, Mensch, bist, ,, desto, meh... | \n",
+ " valid | \n",
+ " sza17850105 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3 | \n",
+ " [-----, va, ar, nicht, die, ehrlichkeit, ,, so... | \n",
+ " train | \n",
+ " sza17850112 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 4 | \n",
+ " [---, T, T--, Salzburger, -, I., Verordnungen,... | \n",
+ " test | \n",
+ " sza17850126 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unnamed: 0 ocr split \\\n",
+ "0 0 [Salzburger, I, ntelligenzblatt, ., ums-, \", \"... train \n",
+ "1 1 [-, \", -, I, Innländische, ,, und, auswärtige,... train \n",
+ "2 2 [Sa|, e, mehr, du, Mensch, bist, ,, desto, meh... valid \n",
+ "3 3 [-----, va, ar, nicht, die, ehrlichkeit, ,, so... train \n",
+ "4 4 [---, T, T--, Salzburger, -, I., Verordnungen,... test \n",
+ "\n",
+ " manifest_id \n",
+ "0 sza1785bl01 \n",
+ "1 sza1785bl02 \n",
+ "2 sza17850105 \n",
+ "3 sza17850112 \n",
+ "4 sza17850126 "
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import time\n",
+ "\n",
+ "# Reload the raw corpus so that re-running this cell always starts from untokenised text.\n",
+ "# (Swap in the commented line below for a quick run on the small test corpus.)\n",
+ "#df = pd.read_csv('test_raw.tsv', sep = '\\t', encoding='utf-8')\n",
+ "df = pd.read_csv('sz_raw.tsv', sep = '\\t', encoding='utf-8')\n",
+ "\n",
+ "ts = time.perf_counter()  # monotonic clock, unaffected by system clock adjustments\n",
+ "tokenized_docs = []\n",
+ "for ocr in df['ocr']:\n",
+ "    if not isinstance(ocr, str):\n",
+ "        # Empty OCR cells come back from read_csv as NaN (a float, which has no .split);\n",
+ "        # keep the row with an empty token list instead of failing partway through the run.\n",
+ "        tokenized_docs.append([])\n",
+ "        continue\n",
+ "    # Each OCR line is handed to SoMaJo as one paragraph; the per-sentence token lists\n",
+ "    # it yields are flattened into a single list of token strings for the document.\n",
+ "    doc_sentences = tokenizer.tokenize_text(ocr.split('\\n'), parallel=4)\n",
+ "    tokenized_docs.append([token.text for doc_sentence in doc_sentences for token in doc_sentence])\n",
+ "# Assign the whole column at once; dtype=object keeps every cell a Python list.\n",
+ "df['ocr'] = pd.Series(tokenized_docs, index=df.index, dtype=object)\n",
+ "print(time.perf_counter() - ts)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "57bea547-6fba-42da-8e26-42c4f02380d5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Remove the index column that the TSV round-trip added ('Unnamed: 0'). Dropping it\n",
+ "# by name keeps this cell safe to re-run: a second run no longer strips a real column.\n",
+ "df = df.drop(columns=['Unnamed: 0'], errors='ignore')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "01cfbbf3-ee87-4614-b454-e48859b50504",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ocr | \n",
+ " split | \n",
+ " manifest_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " [Salzburger, I, ntelligenzblatt, ., ums-, \", \"... | \n",
+ " train | \n",
+ " sza1785bl01 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " [-, \", -, I, Innländische, ,, und, auswärtige,... | \n",
+ " train | \n",
+ " sza1785bl02 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " [Sa|, e, mehr, du, Mensch, bist, ,, desto, meh... | \n",
+ " valid | \n",
+ " sza17850105 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " [-----, va, ar, nicht, die, ehrlichkeit, ,, so... | \n",
+ " train | \n",
+ " sza17850112 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " [---, T, T--, Salzburger, -, I., Verordnungen,... | \n",
+ " test | \n",
+ " sza17850126 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 625 | \n",
+ " [769, Salzburger, 770, ---, Intelligenzblatt, ... | \n",
+ " valid | \n",
+ " sza17991207 | \n",
+ "
\n",
+ " \n",
+ " | 626 | \n",
+ " [785-, Salzburger, Intelligenzblatt, ., 796, -... | \n",
+ " train | \n",
+ " sza17991214 | \n",
+ "
\n",
+ " \n",
+ " | 627 | \n",
+ " [-, --, -, 99, E-, S, urger-, SZ---, 2-, -, p-... | \n",
+ " train | \n",
+ " sza17991221 | \n",
+ "
\n",
+ " \n",
+ " | 628 | \n",
+ " [Intelligenzblatt, ., LII, ., St., Sonnabend, ... | \n",
+ " train | \n",
+ " sza17991228 | \n",
+ "
\n",
+ " \n",
+ " | 629 | \n",
+ " [–, -, –, ------------------------------------... | \n",
+ " test | \n",
+ " sza1799bl01 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
630 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ocr split manifest_id\n",
+ "0 [Salzburger, I, ntelligenzblatt, ., ums-, \", \"... train sza1785bl01\n",
+ "1 [-, \", -, I, Innländische, ,, und, auswärtige,... train sza1785bl02\n",
+ "2 [Sa|, e, mehr, du, Mensch, bist, ,, desto, meh... valid sza17850105\n",
+ "3 [-----, va, ar, nicht, die, ehrlichkeit, ,, so... train sza17850112\n",
+ "4 [---, T, T--, Salzburger, -, I., Verordnungen,... test sza17850126\n",
+ ".. ... ... ...\n",
+ "625 [769, Salzburger, 770, ---, Intelligenzblatt, ... valid sza17991207\n",
+ "626 [785-, Salzburger, Intelligenzblatt, ., 796, -... train sza17991214\n",
+ "627 [-, --, -, 99, E-, S, urger-, SZ---, 2-, -, p-... train sza17991221\n",
+ "628 [Intelligenzblatt, ., LII, ., St., Sonnabend, ... train sza17991228\n",
+ "629 [–, -, –, ------------------------------------... test sza1799bl01\n",
+ "\n",
+ "[630 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "45f88e29-5854-4b29-b514-58215326bf9d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save the tokenised SZ corpus as a tab-separated file. The row index and the\n",
+ "# header row are written as well (these are the to_csv defaults).\n",
+ "df.to_csv('sz_tok.tsv', sep='\\t')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}