diff --git a/tokenization.ipynb b/tokenization.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..e3ecde65337b30b12455e1566e5118103289fcda --- /dev/null +++ b/tokenization.ipynb @@ -0,0 +1,1074 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "1c9cc762-eb3a-4e4a-8392-b38b897df498", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import csv\n", + "import pathlib\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import nltk\n", + "import re\n", + "import statistics\n", + "import string\n", + "from collections import Counter\n", + "from collections import defaultdict\n", + "from sklearn.model_selection import train_test_split\n", + "import octis" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7ea8760d-97e5-44ae-a271-1478989fd97b", + "metadata": {}, + "outputs": [], + "source": [ + "import gensim\n", + "import spacy\n", + "import sklearn\n", + "import torch\n", + "import libsvm\n", + "import flask\n", + "import sentence_transformers\n", + "import requests\n", + "import tomotopy" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "32ea9351-4c82-48dc-9d85-9888243f0d3f", + "metadata": {}, + "outputs": [], + "source": [ + "os.chdir(r\"C:\\Users\\onb1202\\OneDrive - Österreichische Nationalbibliothek\\Praktikum TK\\daten\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f3e3f35-d073-47e8-9ca8-03d995120d3a", + "metadata": {}, + "outputs": [], + "source": [ + "#small test corpus" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "471091f8-cee3-4bff-bfeb-f403b879c830", + "metadata": {}, + "outputs": [], + "source": [ + "#df = pd.read_csv('test_raw.tsv', sep = '\\t', encoding='utf-8')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9091c711-de70-46b3-9e42-517c9ecf106d", + "metadata": {}, + "outputs": [], + "source": [ + "#from somajo import SoMaJo\n", + "\n", + "#tokenizer = SoMaJo(\"de_CMC\", split_camel_case=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "669d3e3b-c043-4594-8e88-106020e3b945", + "metadata": {}, + "outputs": [], + "source": [ + "#sentence = tokenizer.tokenize_text_file(df['ocr'], paragraph_separator=\"single_newlines\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5012333-27a2-4f1c-ac15-170fa4e5fcb8", + "metadata": {}, + "outputs": [], + "source": [ + "#type(sentence)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e83b6b9c-f515-4b18-9eeb-354ba8bb395d", + "metadata": {}, + "outputs": [], + "source": [ + "#import time\n", + "\n", + "#df = pd.read_csv('test_raw.tsv', sep = '\\t', encoding='utf-8')\n", + "\n", + "#ts = time.time()\n", + "#for i, ocr in df['ocr'].items():\n", + "# sentences = [s for s in tokenizer.tokenize_text(ocr.split('\\n'), parallel=4)]\n", + "# all_tokens = []\n", + "# for sentence in sentences[:15]:\n", + "# for t in sentence:\n", + "# all_tokens.append(t.text)\n", + "# df.at[i, 'ocr'] = all_tokens\n", + "#print(time.time() - ts)\n", + "#df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9abcd566-9acd-45e1-88db-683b9a8859d9", + "metadata": {}, + "outputs": [], + "source": [ + "#whole corpus WZ" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "be803222-005f-4a68-9a27-228e255b32ce", + "metadata": {}, + "outputs": [], + "source": [ + "wz = pd.read_csv('wrz.csv', sep = ',', encoding='utf-8')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d594f243-c1e4-4d25-a133-8d7ca4c8e58b", + "metadata": {}, + "outputs": [], + "source": [ + "wz_token = wz.drop('year', axis=1, inplace=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4025980c-faf5-4d38-9043-edd155fc9eb6", + "metadata": {}, + "outputs": [], + "source": [ + "wz_token['split'] = pd.NA" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45f97a6e-d8f2-447e-b036-093c3aa42742", + "metadata": {}, + "outputs": [], + "source": [ + "split = ['train', 'valid', 'test']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28694d81-e44f-4ace-8b19-9d09a5f05c59", + "metadata": {}, + "outputs": [], + "source": [ + "wz_token['split'] = wz_token['split'].apply(lambda x: np.random.choice(split, p=[0.6, 0.2, 0.2]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c56609e5-51a6-41b5-8b91-3687b3f8ec09", + "metadata": {}, + "outputs": [], + "source": [ + "wz_token['split'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0f85ec8-8ba5-404e-8f63-198d83963e00", + "metadata": {}, + "outputs": [], + "source": [ + "wz_token.to_csv('wz_raw.tsv', sep='\\t', index=True, header=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "75a511a5-326a-472a-a103-6794e7dc4d18", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('wz_raw.tsv', sep = '\\t', encoding='utf-8')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4f5edd4f-e0f2-4e41-b04f-da490c659e0d", + "metadata": {}, + "outputs": [], + "source": [ + "from somajo import SoMaJo\n", + "\n", + "tokenizer = SoMaJo(\"de_CMC\", split_camel_case=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ee4959e5-e49c-4715-89f8-44d3e0c2ee9c", + "metadata": {}, + "outputs": [], + "source": [ + "sentence = tokenizer.tokenize_text_file(df['ocr'], paragraph_separator=\"single_newlines\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "023a1e55-e291-4ab9-a7e5-4f6916b063a5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4156.50820016861\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0manifest_idocrsplit
00wrz17850101[itz, ., r, F-, Nro, ., Sonnabmd, den, r., ^an...valid
11wrz17850105[5i, ., 2Y, F, Mittwoch, den, 5., Iäner, ., 17...train
22wrz17850108[57, ^, Sonnabend, den, 8., Janer, ., 1735, .,...valid
33wrz17850112[5k, 8i, F, Mittwoch, den, Iäner, ., 173, ?, ....train
44wrz17850115[^, 109, ^, Sonnabend, den, 15., Ianer, ., 178...train
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 manifest_id ocr \\\n", + "0 0 wrz17850101 [itz, ., r, F-, Nro, ., Sonnabmd, den, r., ^an... \n", + "1 1 wrz17850105 [5i, ., 2Y, F, Mittwoch, den, 5., Iäner, ., 17... \n", + "2 2 wrz17850108 [57, ^, Sonnabend, den, 8., Janer, ., 1735, .,... \n", + "3 3 wrz17850112 [5k, 8i, F, Mittwoch, den, Iäner, ., 173, ?, .... \n", + "4 4 wrz17850115 [^, 109, ^, Sonnabend, den, 15., Ianer, ., 178... \n", + "\n", + " split \n", + "0 valid \n", + "1 train \n", + "2 valid \n", + "3 train \n", + "4 train " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import time\n", + "\n", + "#df = pd.read_csv('test_raw.tsv', sep = '\\t', encoding='utf-8')\n", + "df = pd.read_csv('wz_raw.tsv', sep = '\\t', encoding='utf-8')\n", + "\n", + "ts = time.time()\n", + "for i, ocr in df['ocr'].items():\n", + " sentences = [s for s in tokenizer.tokenize_text(ocr.split('\\n'), parallel=4)]\n", + " all_tokens = []\n", + " for sentence in sentences:\n", + " for t in sentence:\n", + " all_tokens.append(t.text)\n", + " df.at[i, 'ocr'] = all_tokens\n", + "print(time.time() - ts)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "eb2fcf60-6051-46b4-84b0-33b1b2bd1e41", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.iloc[: , 1:]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "5ec0718e-fbfe-40af-938f-5fc6d6ebc68a", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.reindex(columns= ['ocr', 'split', 'manifest_id'])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "35981c7d-9d7e-48d4-976b-671f32973096", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ocrsplitmanifest_id
0[itz, ., r, F-, Nro, ., Sonnabmd, den, r., ^an...validwrz17850101
1[5i, ., 2Y, F, Mittwoch, den, 5., Iäner, ., 17...trainwrz17850105
2[57, ^, Sonnabend, den, 8., Janer, ., 1735, .,...validwrz17850108
3[5k, 8i, F, Mittwoch, den, Iäner, ., 173, ?, ....trainwrz17850112
4[^, 109, ^, Sonnabend, den, 15., Ianer, ., 178...trainwrz17850115
............
1299[Sonnabend, ,, den, 14, *, December, 1799, ..,...trainwrz17991214
1300[I, m, 4, -, 77, i', jLiii, ., 1, J, Is, i0, n...validwrz17991218
1301[L, Sonnabend, ,, den, 21., December, 1799, .,...validwrz17991221
1302[WVr, 4, S73, ZMAr, if, >, Mittewoche, ,, den,...testwrz17991225
1303[Sonnabend, ,, den, rz, «, December, 1799, ., ...testwrz17991228
\n", + "

1304 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " ocr split manifest_id\n", + "0 [itz, ., r, F-, Nro, ., Sonnabmd, den, r., ^an... valid wrz17850101\n", + "1 [5i, ., 2Y, F, Mittwoch, den, 5., Iäner, ., 17... train wrz17850105\n", + "2 [57, ^, Sonnabend, den, 8., Janer, ., 1735, .,... valid wrz17850108\n", + "3 [5k, 8i, F, Mittwoch, den, Iäner, ., 173, ?, .... train wrz17850112\n", + "4 [^, 109, ^, Sonnabend, den, 15., Ianer, ., 178... train wrz17850115\n", + "... ... ... ...\n", + "1299 [Sonnabend, ,, den, 14, *, December, 1799, ..,... train wrz17991214\n", + "1300 [I, m, 4, -, 77, i', jLiii, ., 1, J, Is, i0, n... valid wrz17991218\n", + "1301 [L, Sonnabend, ,, den, 21., December, 1799, .,... valid wrz17991221\n", + "1302 [WVr, 4, S73, ZMAr, if, >, Mittewoche, ,, den,... test wrz17991225\n", + "1303 [Sonnabend, ,, den, rz, «, December, 1799, ., ... test wrz17991228\n", + "\n", + "[1304 rows x 3 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "b8e6b43c-624e-440e-bc1a-f10f88f4b428", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv('wz_tok.tsv', sep='\\t', index=True, header=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "80fa2ade-390b-4229-94a0-e99bdd646e0a", + "metadata": {}, + "outputs": [], + "source": [ + "#df = pd.read_csv('wz_tok.tsv', sep='\\t', encoding='utf-8')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "641b1d29-fd70-4b18-b7be-78e6b8162cae", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "97f038f1-f618-4266-b660-4f66a3ab8bc9", + "metadata": {}, + "outputs": [], + "source": [ + "sz = pd.read_csv('sza.csv', sep=',', encoding='UTF-8')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "532fb694-ccde-4906-ad0d-fca376262192", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
manifest_idyearocr
0sza1785bl011785Salzburger\\nI ntelligenzblatt.\\n\\nums-\\n\"\" H e...
1sza1785bl021785-\"-\\n\\nI Innländische, und auswärtige, besonde...
2sza178501051785\\n\\nSa|\\n\\ne mehr du Mensch bist, desto mehr g...
3sza178501121785-----\\n\\nva ar nicht die ehrlichkeit\\n\\n\\n, so...
4sza178501261785---\\nT\\nT--\\n\\nSalzburger\\n\\n- I. Verordnungen...
............
625sza179912071799769\\n\\nSalzburger\\n\\n770\\n\\n---\\n\\nIntelligenz...
626sza179912141799785-\\n\\nSalzburger\\n\\nIntelligenzblatt.\\n\\n796...
627sza179912211799-\\n\\n--\\n\\n-\\n\\n99 E- S urger-\\n\\nSZ--- 2-- p-...
628sza179912281799Intelligenzblatt.\\n\\nLII. St. Sonnabend, den 2...
629sza1799bl011799–-–\\n\\n-------------------------------------\\n...
\n", + "

630 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " manifest_id year ocr\n", + "0 sza1785bl01 1785 Salzburger\\nI ntelligenzblatt.\\n\\nums-\\n\"\" H e...\n", + "1 sza1785bl02 1785 -\"-\\n\\nI Innländische, und auswärtige, besonde...\n", + "2 sza17850105 1785 \\n\\nSa|\\n\\ne mehr du Mensch bist, desto mehr g...\n", + "3 sza17850112 1785 -----\\n\\nva ar nicht die ehrlichkeit\\n\\n\\n, so...\n", + "4 sza17850126 1785 ---\\nT\\nT--\\n\\nSalzburger\\n\\n- I. Verordnungen...\n", + ".. ... ... ...\n", + "625 sza17991207 1799 769\\n\\nSalzburger\\n\\n770\\n\\n---\\n\\nIntelligenz...\n", + "626 sza17991214 1799 785-\\n\\nSalzburger\\n\\nIntelligenzblatt.\\n\\n796...\n", + "627 sza17991221 1799 -\\n\\n--\\n\\n-\\n\\n99 E- S urger-\\n\\nSZ--- 2-- p-...\n", + "628 sza17991228 1799 Intelligenzblatt.\\n\\nLII. St. Sonnabend, den 2...\n", + "629 sza1799bl01 1799 –-–\\n\\n-------------------------------------\\n...\n", + "\n", + "[630 rows x 3 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sz" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "7efdb2a9-bfc3-4b88-ae19-a249a4fd21b3", + "metadata": {}, + "outputs": [], + "source": [ + "sz_token = sz.drop('year', axis=1, inplace=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "3b33320e-924d-483c-82ca-428f4a9799c6", + "metadata": {}, + "outputs": [], + "source": [ + "sz_token['split'] = pd.NA" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "290dbc99-e3c2-4922-b20d-110e338b1b32", + "metadata": {}, + "outputs": [], + "source": [ + "split = ['train', 'valid', 'test']" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "3e0e95cf-82e0-473d-85bb-7a848cde595f", + "metadata": {}, + "outputs": [], + "source": [ + "sz_token['split'] = sz_token['split'].apply(lambda x: np.random.choice(split, p=[0.6, 0.2, 0.2]))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "af6f0417-becb-4506-bb4d-31340052e51a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "train 367\n", + "valid 139\n", + "test 124\n", + "Name: split, dtype: int64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sz_token['split'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "02a44c70-f3f0-4d76-9682-b7e0c543a665", + "metadata": {}, + "outputs": [], + "source": [ + "sz_token = sz_token.reindex(columns= ['ocr', 'split', 'manifest_id'])" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "1219234f-7ca7-4ec8-b1d1-fdb8008e7ad3", + "metadata": {}, + "outputs": [], + "source": [ + "sz_token.to_csv('sz_raw.tsv',sep='\\t',index=True,header=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "e159229d-4156-4c39-af18-f06f32c5a1d2", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('sz_raw.tsv', sep = '\\t', encoding='utf-8')" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "3b152ed7-2f4d-4244-9cc5-a3c573a9a52f", + "metadata": {}, + "outputs": [], + "source": [ + "sentence = tokenizer.tokenize_text_file(df['ocr'], paragraph_separator=\"single_newlines\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "0d527d61-c6da-4562-9c8f-a94cf0a97931", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "558.7360711097717\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0ocrsplitmanifest_id
00[Salzburger, I, ntelligenzblatt, ., ums-, \", \"...trainsza1785bl01
11[-, \", -, I, Innländische, ,, und, auswärtige,...trainsza1785bl02
22[Sa|, e, mehr, du, Mensch, bist, ,, desto, meh...validsza17850105
33[-----, va, ar, nicht, die, ehrlichkeit, ,, so...trainsza17850112
44[---, T, T--, Salzburger, -, I., Verordnungen,...testsza17850126
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 ocr split \\\n", + "0 0 [Salzburger, I, ntelligenzblatt, ., ums-, \", \"... train \n", + "1 1 [-, \", -, I, Innländische, ,, und, auswärtige,... train \n", + "2 2 [Sa|, e, mehr, du, Mensch, bist, ,, desto, meh... valid \n", + "3 3 [-----, va, ar, nicht, die, ehrlichkeit, ,, so... train \n", + "4 4 [---, T, T--, Salzburger, -, I., Verordnungen,... test \n", + "\n", + " manifest_id \n", + "0 sza1785bl01 \n", + "1 sza1785bl02 \n", + "2 sza17850105 \n", + "3 sza17850112 \n", + "4 sza17850126 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import time\n", + "\n", + "#df = pd.read_csv('test_raw.tsv', sep = '\\t', encoding='utf-8')\n", + "df = pd.read_csv('sz_raw.tsv', sep = '\\t', encoding='utf-8')\n", + "\n", + "ts = time.time()\n", + "for i, ocr in df['ocr'].items():\n", + " sentences = [s for s in tokenizer.tokenize_text(ocr.split('\\n'), parallel=4)]\n", + " all_tokens = []\n", + " for sentence in sentences:\n", + " for t in sentence:\n", + " all_tokens.append(t.text)\n", + " df.at[i, 'ocr'] = all_tokens\n", + "print(time.time() - ts)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "57bea547-6fba-42da-8e26-42c4f02380d5", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.iloc[: , 1:]" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "01cfbbf3-ee87-4614-b454-e48859b50504", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ocrsplitmanifest_id
0[Salzburger, I, ntelligenzblatt, ., ums-, \", \"...trainsza1785bl01
1[-, \", -, I, Innländische, ,, und, auswärtige,...trainsza1785bl02
2[Sa|, e, mehr, du, Mensch, bist, ,, desto, meh...validsza17850105
3[-----, va, ar, nicht, die, ehrlichkeit, ,, so...trainsza17850112
4[---, T, T--, Salzburger, -, I., Verordnungen,...testsza17850126
............
625[769, Salzburger, 770, ---, Intelligenzblatt, ...validsza17991207
626[785-, Salzburger, Intelligenzblatt, ., 796, -...trainsza17991214
627[-, --, -, 99, E-, S, urger-, SZ---, 2-, -, p-...trainsza17991221
628[Intelligenzblatt, ., LII, ., St., Sonnabend, ...trainsza17991228
629[–, -, –, ------------------------------------...testsza1799bl01
\n", + "

630 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " ocr split manifest_id\n", + "0 [Salzburger, I, ntelligenzblatt, ., ums-, \", \"... train sza1785bl01\n", + "1 [-, \", -, I, Innländische, ,, und, auswärtige,... train sza1785bl02\n", + "2 [Sa|, e, mehr, du, Mensch, bist, ,, desto, meh... valid sza17850105\n", + "3 [-----, va, ar, nicht, die, ehrlichkeit, ,, so... train sza17850112\n", + "4 [---, T, T--, Salzburger, -, I., Verordnungen,... test sza17850126\n", + ".. ... ... ...\n", + "625 [769, Salzburger, 770, ---, Intelligenzblatt, ... valid sza17991207\n", + "626 [785-, Salzburger, Intelligenzblatt, ., 796, -... train sza17991214\n", + "627 [-, --, -, 99, E-, S, urger-, SZ---, 2-, -, p-... train sza17991221\n", + "628 [Intelligenzblatt, ., LII, ., St., Sonnabend, ... train sza17991228\n", + "629 [–, -, –, ------------------------------------... test sza1799bl01\n", + "\n", + "[630 rows x 3 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "45f88e29-5854-4b29-b514-58215326bf9d", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv('sz_tok.tsv', sep='\\t', index=True, header=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}