Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
T
TopiCompare
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Thomas Kirchmair
TopiCompare
Commits
14f03ab6
Commit
14f03ab6
authored
3 years ago
by
Thomas Kirchmair
Browse files
Options
Downloads
Patches
Plain Diff
Upload New File
parent
49a991b8
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
tokenization.ipynb
+1074
-0
1074 additions, 0 deletions
tokenization.ipynb
with
1074 additions
and
0 deletions
tokenization.ipynb
0 → 100644
+
1074
−
0
View file @
14f03ab6
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "1c9cc762-eb3a-4e4a-8392-b38b897df498",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import csv\n",
"import pathlib\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import nltk\n",
"import re\n",
"import statistics\n",
"import string\n",
"from collections import Counter\n",
"from collections import defaultdict\n",
"from sklearn.model_selection import train_test_split\n",
"import octis"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "7ea8760d-97e5-44ae-a271-1478989fd97b",
"metadata": {},
"outputs": [],
"source": [
"import gensim\n",
"import spacy\n",
"import sklearn\n",
"import torch\n",
"import libsvm\n",
"import flask\n",
"import sentence_transformers\n",
"import requests\n",
"import tomotopy"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "32ea9351-4c82-48dc-9d85-9888243f0d3f",
"metadata": {},
"outputs": [],
"source": [
"os.chdir(r\"C:\\Users\\onb1202\\OneDrive - Österreichische Nationalbibliothek\\Praktikum TK\\daten\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8f3e3f35-d073-47e8-9ca8-03d995120d3a",
"metadata": {},
"outputs": [],
"source": [
"#small test corpus"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "471091f8-cee3-4bff-bfeb-f403b879c830",
"metadata": {},
"outputs": [],
"source": [
"#df = pd.read_csv('test_raw.tsv', sep = '\\t', encoding='utf-8')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9091c711-de70-46b3-9e42-517c9ecf106d",
"metadata": {},
"outputs": [],
"source": [
"#from somajo import SoMaJo\n",
"\n",
"#tokenizer = SoMaJo(\"de_CMC\", split_camel_case=True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "669d3e3b-c043-4594-8e88-106020e3b945",
"metadata": {},
"outputs": [],
"source": [
"#sentence = tokenizer.tokenize_text_file(df['ocr'], paragraph_separator=\"single_newlines\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5012333-27a2-4f1c-ac15-170fa4e5fcb8",
"metadata": {},
"outputs": [],
"source": [
"#type(sentence)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "e83b6b9c-f515-4b18-9eeb-354ba8bb395d",
"metadata": {},
"outputs": [],
"source": [
"#import time\n",
"\n",
"#df = pd.read_csv('test_raw.tsv', sep = '\\t', encoding='utf-8')\n",
"\n",
"#ts = time.time()\n",
"#for i, ocr in df['ocr'].items():\n",
"# sentences = [s for s in tokenizer.tokenize_text(ocr.split('\\n'), parallel=4)]\n",
"# all_tokens = []\n",
"# for sentence in sentences[:15]:\n",
"# for t in sentence:\n",
"# all_tokens.append(t.text)\n",
"# df.at[i, 'ocr'] = all_tokens\n",
"#print(time.time() - ts)\n",
"#df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9abcd566-9acd-45e1-88db-683b9a8859d9",
"metadata": {},
"outputs": [],
"source": [
"#whole corpus WZ"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "be803222-005f-4a68-9a27-228e255b32ce",
"metadata": {},
"outputs": [],
"source": [
"wz = pd.read_csv('wrz.csv', sep = ',', encoding='utf-8')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d594f243-c1e4-4d25-a133-8d7ca4c8e58b",
"metadata": {},
"outputs": [],
"source": [
"wz_token = wz.drop('year', axis=1, inplace=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4025980c-faf5-4d38-9043-edd155fc9eb6",
"metadata": {},
"outputs": [],
"source": [
"wz_token['split'] = pd.NA"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "45f97a6e-d8f2-447e-b036-093c3aa42742",
"metadata": {},
"outputs": [],
"source": [
"split = ['train', 'valid', 'test']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "28694d81-e44f-4ace-8b19-9d09a5f05c59",
"metadata": {},
"outputs": [],
"source": [
"wz_token['split'] = wz_token['split'].apply(lambda x: np.random.choice(split, p=[0.6, 0.2, 0.2]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c56609e5-51a6-41b5-8b91-3687b3f8ec09",
"metadata": {},
"outputs": [],
"source": [
"wz_token['split'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0f85ec8-8ba5-404e-8f63-198d83963e00",
"metadata": {},
"outputs": [],
"source": [
"wz_token.to_csv('wz_raw.tsv', sep='\\t', index=True, header=True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "75a511a5-326a-472a-a103-6794e7dc4d18",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('wz_raw.tsv', sep = '\\t', encoding='utf-8')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "4f5edd4f-e0f2-4e41-b04f-da490c659e0d",
"metadata": {},
"outputs": [],
"source": [
"from somajo import SoMaJo\n",
"\n",
"tokenizer = SoMaJo(\"de_CMC\", split_camel_case=True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "ee4959e5-e49c-4715-89f8-44d3e0c2ee9c",
"metadata": {},
"outputs": [],
"source": [
"sentence = tokenizer.tokenize_text_file(df['ocr'], paragraph_separator=\"single_newlines\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "023a1e55-e291-4ab9-a7e5-4f6916b063a5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4156.50820016861\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>manifest_id</th>\n",
" <th>ocr</th>\n",
" <th>split</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>wrz17850101</td>\n",
" <td>[itz, ., r, F-, Nro, ., Sonnabmd, den, r., ^an...</td>\n",
" <td>valid</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>wrz17850105</td>\n",
" <td>[5i, ., 2Y, F, Mittwoch, den, 5., Iäner, ., 17...</td>\n",
" <td>train</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>wrz17850108</td>\n",
" <td>[57, ^, Sonnabend, den, 8., Janer, ., 1735, .,...</td>\n",
" <td>valid</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>wrz17850112</td>\n",
" <td>[5k, 8i, F, Mittwoch, den, Iäner, ., 173, ?, ....</td>\n",
" <td>train</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>wrz17850115</td>\n",
" <td>[^, 109, ^, Sonnabend, den, 15., Ianer, ., 178...</td>\n",
" <td>train</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 manifest_id ocr \\\n",
"0 0 wrz17850101 [itz, ., r, F-, Nro, ., Sonnabmd, den, r., ^an... \n",
"1 1 wrz17850105 [5i, ., 2Y, F, Mittwoch, den, 5., Iäner, ., 17... \n",
"2 2 wrz17850108 [57, ^, Sonnabend, den, 8., Janer, ., 1735, .,... \n",
"3 3 wrz17850112 [5k, 8i, F, Mittwoch, den, Iäner, ., 173, ?, .... \n",
"4 4 wrz17850115 [^, 109, ^, Sonnabend, den, 15., Ianer, ., 178... \n",
"\n",
" split \n",
"0 valid \n",
"1 train \n",
"2 valid \n",
"3 train \n",
"4 train "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import time\n",
"\n",
"#df = pd.read_csv('test_raw.tsv', sep = '\\t', encoding='utf-8')\n",
"df = pd.read_csv('wz_raw.tsv', sep = '\\t', encoding='utf-8')\n",
"\n",
"ts = time.time()\n",
"for i, ocr in df['ocr'].items():\n",
" sentences = [s for s in tokenizer.tokenize_text(ocr.split('\\n'), parallel=4)]\n",
" all_tokens = []\n",
" for sentence in sentences:\n",
" for t in sentence:\n",
" all_tokens.append(t.text)\n",
" df.at[i, 'ocr'] = all_tokens\n",
"print(time.time() - ts)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "eb2fcf60-6051-46b4-84b0-33b1b2bd1e41",
"metadata": {},
"outputs": [],
"source": [
"df = df.iloc[: , 1:]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "5ec0718e-fbfe-40af-938f-5fc6d6ebc68a",
"metadata": {},
"outputs": [],
"source": [
"df = df.reindex(columns= ['ocr', 'split', 'manifest_id'])"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "35981c7d-9d7e-48d4-976b-671f32973096",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ocr</th>\n",
" <th>split</th>\n",
" <th>manifest_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>[itz, ., r, F-, Nro, ., Sonnabmd, den, r., ^an...</td>\n",
" <td>valid</td>\n",
" <td>wrz17850101</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>[5i, ., 2Y, F, Mittwoch, den, 5., Iäner, ., 17...</td>\n",
" <td>train</td>\n",
" <td>wrz17850105</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>[57, ^, Sonnabend, den, 8., Janer, ., 1735, .,...</td>\n",
" <td>valid</td>\n",
" <td>wrz17850108</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>[5k, 8i, F, Mittwoch, den, Iäner, ., 173, ?, ....</td>\n",
" <td>train</td>\n",
" <td>wrz17850112</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[^, 109, ^, Sonnabend, den, 15., Ianer, ., 178...</td>\n",
" <td>train</td>\n",
" <td>wrz17850115</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1299</th>\n",
" <td>[Sonnabend, ,, den, 14, *, December, 1799, ..,...</td>\n",
" <td>train</td>\n",
" <td>wrz17991214</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1300</th>\n",
" <td>[I, m, 4, -, 77, i', jLiii, ., 1, J, Is, i0, n...</td>\n",
" <td>valid</td>\n",
" <td>wrz17991218</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1301</th>\n",
" <td>[L, Sonnabend, ,, den, 21., December, 1799, .,...</td>\n",
" <td>valid</td>\n",
" <td>wrz17991221</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1302</th>\n",
" <td>[WVr, 4, S73, ZMAr, if, >, Mittewoche, ,, den,...</td>\n",
" <td>test</td>\n",
" <td>wrz17991225</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1303</th>\n",
" <td>[Sonnabend, ,, den, rz, «, December, 1799, ., ...</td>\n",
" <td>test</td>\n",
" <td>wrz17991228</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1304 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" ocr split manifest_id\n",
"0 [itz, ., r, F-, Nro, ., Sonnabmd, den, r., ^an... valid wrz17850101\n",
"1 [5i, ., 2Y, F, Mittwoch, den, 5., Iäner, ., 17... train wrz17850105\n",
"2 [57, ^, Sonnabend, den, 8., Janer, ., 1735, .,... valid wrz17850108\n",
"3 [5k, 8i, F, Mittwoch, den, Iäner, ., 173, ?, .... train wrz17850112\n",
"4 [^, 109, ^, Sonnabend, den, 15., Ianer, ., 178... train wrz17850115\n",
"... ... ... ...\n",
"1299 [Sonnabend, ,, den, 14, *, December, 1799, ..,... train wrz17991214\n",
"1300 [I, m, 4, -, 77, i', jLiii, ., 1, J, Is, i0, n... valid wrz17991218\n",
"1301 [L, Sonnabend, ,, den, 21., December, 1799, .,... valid wrz17991221\n",
"1302 [WVr, 4, S73, ZMAr, if, >, Mittewoche, ,, den,... test wrz17991225\n",
"1303 [Sonnabend, ,, den, rz, «, December, 1799, ., ... test wrz17991228\n",
"\n",
"[1304 rows x 3 columns]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "b8e6b43c-624e-440e-bc1a-f10f88f4b428",
"metadata": {},
"outputs": [],
"source": [
"df.to_csv('wz_tok.tsv', sep='\\t', index=True, header=True)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "80fa2ade-390b-4229-94a0-e99bdd646e0a",
"metadata": {},
"outputs": [],
"source": [
"#df = pd.read_csv('wz_tok.tsv', sep='\\t', encoding='utf-8')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "641b1d29-fd70-4b18-b7be-78e6b8162cae",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 20,
"id": "97f038f1-f618-4266-b660-4f66a3ab8bc9",
"metadata": {},
"outputs": [],
"source": [
"sz = pd.read_csv('sza.csv', sep=',', encoding='UTF-8')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "532fb694-ccde-4906-ad0d-fca376262192",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>manifest_id</th>\n",
" <th>year</th>\n",
" <th>ocr</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>sza1785bl01</td>\n",
" <td>1785</td>\n",
" <td>Salzburger\\nI ntelligenzblatt.\\n\\nums-\\n\"\" H e...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>sza1785bl02</td>\n",
" <td>1785</td>\n",
" <td>-\"-\\n\\nI Innländische, und auswärtige, besonde...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>sza17850105</td>\n",
" <td>1785</td>\n",
" <td>\\n\\nSa|\\n\\ne mehr du Mensch bist, desto mehr g...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>sza17850112</td>\n",
" <td>1785</td>\n",
" <td>-----\\n\\nva ar nicht die ehrlichkeit\\n\\n\\n, so...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>sza17850126</td>\n",
" <td>1785</td>\n",
" <td>---\\nT\\nT--\\n\\nSalzburger\\n\\n- I. Verordnungen...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>625</th>\n",
" <td>sza17991207</td>\n",
" <td>1799</td>\n",
" <td>769\\n\\nSalzburger\\n\\n770\\n\\n---\\n\\nIntelligenz...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>626</th>\n",
" <td>sza17991214</td>\n",
" <td>1799</td>\n",
" <td>785-\\n\\nSalzburger\\n\\nIntelligenzblatt.\\n\\n796...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>627</th>\n",
" <td>sza17991221</td>\n",
" <td>1799</td>\n",
" <td>-\\n\\n--\\n\\n-\\n\\n99 E- S urger-\\n\\nSZ--- 2-- p-...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>628</th>\n",
" <td>sza17991228</td>\n",
" <td>1799</td>\n",
" <td>Intelligenzblatt.\\n\\nLII. St. Sonnabend, den 2...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>629</th>\n",
" <td>sza1799bl01</td>\n",
" <td>1799</td>\n",
" <td>–-–\\n\\n-------------------------------------\\n...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>630 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" manifest_id year ocr\n",
"0 sza1785bl01 1785 Salzburger\\nI ntelligenzblatt.\\n\\nums-\\n\"\" H e...\n",
"1 sza1785bl02 1785 -\"-\\n\\nI Innländische, und auswärtige, besonde...\n",
"2 sza17850105 1785 \\n\\nSa|\\n\\ne mehr du Mensch bist, desto mehr g...\n",
"3 sza17850112 1785 -----\\n\\nva ar nicht die ehrlichkeit\\n\\n\\n, so...\n",
"4 sza17850126 1785 ---\\nT\\nT--\\n\\nSalzburger\\n\\n- I. Verordnungen...\n",
".. ... ... ...\n",
"625 sza17991207 1799 769\\n\\nSalzburger\\n\\n770\\n\\n---\\n\\nIntelligenz...\n",
"626 sza17991214 1799 785-\\n\\nSalzburger\\n\\nIntelligenzblatt.\\n\\n796...\n",
"627 sza17991221 1799 -\\n\\n--\\n\\n-\\n\\n99 E- S urger-\\n\\nSZ--- 2-- p-...\n",
"628 sza17991228 1799 Intelligenzblatt.\\n\\nLII. St. Sonnabend, den 2...\n",
"629 sza1799bl01 1799 –-–\\n\\n-------------------------------------\\n...\n",
"\n",
"[630 rows x 3 columns]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sz"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "7efdb2a9-bfc3-4b88-ae19-a249a4fd21b3",
"metadata": {},
"outputs": [],
"source": [
"sz_token = sz.drop('year', axis=1, inplace=False)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "3b33320e-924d-483c-82ca-428f4a9799c6",
"metadata": {},
"outputs": [],
"source": [
"sz_token['split'] = pd.NA"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "290dbc99-e3c2-4922-b20d-110e338b1b32",
"metadata": {},
"outputs": [],
"source": [
"split = ['train', 'valid', 'test']"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "3e0e95cf-82e0-473d-85bb-7a848cde595f",
"metadata": {},
"outputs": [],
"source": [
"sz_token['split'] = sz_token['split'].apply(lambda x: np.random.choice(split, p=[0.6, 0.2, 0.2]))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "af6f0417-becb-4506-bb4d-31340052e51a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"train 367\n",
"valid 139\n",
"test 124\n",
"Name: split, dtype: int64"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sz_token['split'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "02a44c70-f3f0-4d76-9682-b7e0c543a665",
"metadata": {},
"outputs": [],
"source": [
"sz_token = sz_token.reindex(columns= ['ocr', 'split', 'manifest_id'])"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "1219234f-7ca7-4ec8-b1d1-fdb8008e7ad3",
"metadata": {},
"outputs": [],
"source": [
"sz_token.to_csv('sz_raw.tsv',sep='\\t',index=True,header=True)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "e159229d-4156-4c39-af18-f06f32c5a1d2",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('sz_raw.tsv', sep = '\\t', encoding='utf-8')"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "3b152ed7-2f4d-4244-9cc5-a3c573a9a52f",
"metadata": {},
"outputs": [],
"source": [
"sentence = tokenizer.tokenize_text_file(df['ocr'], paragraph_separator=\"single_newlines\")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "0d527d61-c6da-4562-9c8f-a94cf0a97931",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"558.7360711097717\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>ocr</th>\n",
" <th>split</th>\n",
" <th>manifest_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>[Salzburger, I, ntelligenzblatt, ., ums-, \", \"...</td>\n",
" <td>train</td>\n",
" <td>sza1785bl01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>[-, \", -, I, Innländische, ,, und, auswärtige,...</td>\n",
" <td>train</td>\n",
" <td>sza1785bl02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>[Sa|, e, mehr, du, Mensch, bist, ,, desto, meh...</td>\n",
" <td>valid</td>\n",
" <td>sza17850105</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>[-----, va, ar, nicht, die, ehrlichkeit, ,, so...</td>\n",
" <td>train</td>\n",
" <td>sza17850112</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>[---, T, T--, Salzburger, -, I., Verordnungen,...</td>\n",
" <td>test</td>\n",
" <td>sza17850126</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 ocr split \\\n",
"0 0 [Salzburger, I, ntelligenzblatt, ., ums-, \", \"... train \n",
"1 1 [-, \", -, I, Innländische, ,, und, auswärtige,... train \n",
"2 2 [Sa|, e, mehr, du, Mensch, bist, ,, desto, meh... valid \n",
"3 3 [-----, va, ar, nicht, die, ehrlichkeit, ,, so... train \n",
"4 4 [---, T, T--, Salzburger, -, I., Verordnungen,... test \n",
"\n",
" manifest_id \n",
"0 sza1785bl01 \n",
"1 sza1785bl02 \n",
"2 sza17850105 \n",
"3 sza17850112 \n",
"4 sza17850126 "
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import time\n",
"\n",
"#df = pd.read_csv('test_raw.tsv', sep = '\\t', encoding='utf-8')\n",
"df = pd.read_csv('sz_raw.tsv', sep = '\\t', encoding='utf-8')\n",
"\n",
"ts = time.time()\n",
"for i, ocr in df['ocr'].items():\n",
" sentences = [s for s in tokenizer.tokenize_text(ocr.split('\\n'), parallel=4)]\n",
" all_tokens = []\n",
" for sentence in sentences:\n",
" for t in sentence:\n",
" all_tokens.append(t.text)\n",
" df.at[i, 'ocr'] = all_tokens\n",
"print(time.time() - ts)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "57bea547-6fba-42da-8e26-42c4f02380d5",
"metadata": {},
"outputs": [],
"source": [
"df = df.iloc[: , 1:]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "01cfbbf3-ee87-4614-b454-e48859b50504",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ocr</th>\n",
" <th>split</th>\n",
" <th>manifest_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>[Salzburger, I, ntelligenzblatt, ., ums-, \", \"...</td>\n",
" <td>train</td>\n",
" <td>sza1785bl01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>[-, \", -, I, Innländische, ,, und, auswärtige,...</td>\n",
" <td>train</td>\n",
" <td>sza1785bl02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>[Sa|, e, mehr, du, Mensch, bist, ,, desto, meh...</td>\n",
" <td>valid</td>\n",
" <td>sza17850105</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>[-----, va, ar, nicht, die, ehrlichkeit, ,, so...</td>\n",
" <td>train</td>\n",
" <td>sza17850112</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[---, T, T--, Salzburger, -, I., Verordnungen,...</td>\n",
" <td>test</td>\n",
" <td>sza17850126</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>625</th>\n",
" <td>[769, Salzburger, 770, ---, Intelligenzblatt, ...</td>\n",
" <td>valid</td>\n",
" <td>sza17991207</td>\n",
" </tr>\n",
" <tr>\n",
" <th>626</th>\n",
" <td>[785-, Salzburger, Intelligenzblatt, ., 796, -...</td>\n",
" <td>train</td>\n",
" <td>sza17991214</td>\n",
" </tr>\n",
" <tr>\n",
" <th>627</th>\n",
" <td>[-, --, -, 99, E-, S, urger-, SZ---, 2-, -, p-...</td>\n",
" <td>train</td>\n",
" <td>sza17991221</td>\n",
" </tr>\n",
" <tr>\n",
" <th>628</th>\n",
" <td>[Intelligenzblatt, ., LII, ., St., Sonnabend, ...</td>\n",
" <td>train</td>\n",
" <td>sza17991228</td>\n",
" </tr>\n",
" <tr>\n",
" <th>629</th>\n",
" <td>[–, -, –, ------------------------------------...</td>\n",
" <td>test</td>\n",
" <td>sza1799bl01</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>630 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" ocr split manifest_id\n",
"0 [Salzburger, I, ntelligenzblatt, ., ums-, \", \"... train sza1785bl01\n",
"1 [-, \", -, I, Innländische, ,, und, auswärtige,... train sza1785bl02\n",
"2 [Sa|, e, mehr, du, Mensch, bist, ,, desto, meh... valid sza17850105\n",
"3 [-----, va, ar, nicht, die, ehrlichkeit, ,, so... train sza17850112\n",
"4 [---, T, T--, Salzburger, -, I., Verordnungen,... test sza17850126\n",
".. ... ... ...\n",
"625 [769, Salzburger, 770, ---, Intelligenzblatt, ... valid sza17991207\n",
"626 [785-, Salzburger, Intelligenzblatt, ., 796, -... train sza17991214\n",
"627 [-, --, -, 99, E-, S, urger-, SZ---, 2-, -, p-... train sza17991221\n",
"628 [Intelligenzblatt, ., LII, ., St., Sonnabend, ... train sza17991228\n",
"629 [–, -, –, ------------------------------------... test sza1799bl01\n",
"\n",
"[630 rows x 3 columns]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "45f88e29-5854-4b29-b514-58215326bf9d",
"metadata": {},
"outputs": [],
"source": [
"df.to_csv('sz_tok.tsv', sep='\\t', index=True, header=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
%% Cell type:code id:1c9cc762-eb3a-4e4a-8392-b38b897df498 tags:
```
python
import os
import csv
import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import statistics
import string
from collections import Counter
from collections import defaultdict
from sklearn.model_selection import train_test_split
import octis
```
%% Cell type:code id:7ea8760d-97e5-44ae-a271-1478989fd97b tags:
```
python
import gensim
import spacy
import sklearn
import torch
import libsvm
import flask
import sentence_transformers
import requests
import tomotopy
```
%% Cell type:code id:32ea9351-4c82-48dc-9d85-9888243f0d3f tags:
```
python
# Switch the working directory to the local data folder so that the relative
# CSV/TSV file names used by the following cells resolve.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
os.chdir(r"C:\Users\onb1202\OneDrive - Österreichische Nationalbibliothek\Praktikum TK\daten")
```
%% Cell type:code id:8f3e3f35-d073-47e8-9ca8-03d995120d3a tags:
```
python
#small test corpus
```
%% Cell type:code id:471091f8-cee3-4bff-bfeb-f403b879c830 tags:
```
python
#df = pd.read_csv('test_raw.tsv', sep = '\t', encoding='utf-8')
```
%% Cell type:code id:9091c711-de70-46b3-9e42-517c9ecf106d tags:
```
python
#from somajo import SoMaJo
#tokenizer = SoMaJo("de_CMC", split_camel_case=True)
```
%% Cell type:code id:669d3e3b-c043-4594-8e88-106020e3b945 tags:
```
python
#sentence = tokenizer.tokenize_text_file(df['ocr'], paragraph_separator="single_newlines")
```
%% Cell type:code id:b5012333-27a2-4f1c-ac15-170fa4e5fcb8 tags:
```
python
#type(sentence)
```
%% Cell type:code id:e83b6b9c-f515-4b18-9eeb-354ba8bb395d tags:
```
python
#import time
#df = pd.read_csv('test_raw.tsv', sep = '\t', encoding='utf-8')
#ts = time.time()
#for i, ocr in df['ocr'].items():
# sentences = [s for s in tokenizer.tokenize_text(ocr.split('\n'), parallel=4)]
# all_tokens = []
# for sentence in sentences[:15]:
# for t in sentence:
# all_tokens.append(t.text)
# df.at[i, 'ocr'] = all_tokens
#print(time.time() - ts)
#df.head()
```
%% Cell type:code id:9abcd566-9acd-45e1-88db-683b9a8859d9 tags:
```
python
#whole corpus WZ
```
%% Cell type:code id:be803222-005f-4a68-9a27-228e255b32ce tags:
```
python
# Load the raw WZ corpus table (comma-separated, UTF-8) from the working directory.
wz = pd.read_csv('wrz.csv', sep=',', encoding='utf-8')
```
%% Cell type:code id:d594f243-c1e4-4d25-a133-8d7ca4c8e58b tags:
```
python
# Work on a copy without the 'year' column; `wz` itself is left untouched.
wz_token = wz.drop(columns='year')
```
%% Cell type:code id:4025980c-faf5-4d38-9043-edd155fc9eb6 tags:
```
python
# Add an empty 'split' column; it is overwritten with the
# train/valid/test labels a few cells further down.
wz_token['split'] = pd.NA
```
%% Cell type:code id:45f97a6e-d8f2-447e-b036-093c3aa42742 tags:
```
python
# Labels a document can be assigned to, in the order matching the
# probabilities passed to np.random.choice below.
split = ['train', 'valid', 'test']
```
%% Cell type:code id:28694d81-e44f-4ace-8b19-9d09a5f05c59 tags:
```
python
# Assign every document to train / valid / test with probabilities
# 0.6 / 0.2 / 0.2. One vectorised draw of len(wz_token) labels replaces the
# per-row `apply` over the placeholder column, which was only used to get
# one call per row.
# NOTE(review): the draw is unseeded, so the split differs between runs —
# call np.random.seed(...) beforehand if it has to be reproducible.
wz_token['split'] = np.random.choice(split, size=len(wz_token), p=[0.6, 0.2, 0.2])
```
%% Cell type:code id:c56609e5-51a6-41b5-8b91-3687b3f8ec09 tags:
```
python
# Show how many documents landed in each split.
wz_token['split'].value_counts()
```
%% Cell type:code id:f0f85ec8-8ba5-404e-8f63-198d83963e00 tags:
```
python
# Persist the untokenised table with split labels as tab-separated text.
# index=True also writes the row index as an unnamed first column; it comes
# back as 'Unnamed: 0' when the file is read again.
wz_token.to_csv('wz_raw.tsv', sep='\t', index=True, header=True)
```
%% Cell type:code id:75a511a5-326a-472a-a103-6794e7dc4d18 tags:
```
python
# Read the table written above back in as the working frame `df`.
df = pd.read_csv('wz_raw.tsv', sep='\t', encoding='utf-8')
```
%% Cell type:code id:4f5edd4f-e0f2-4e41-b04f-da490c659e0d tags:
```
python
from somajo import SoMaJo

# One shared SoMaJo tokenizer with the "de_CMC" language setting;
# camel-case tokens are split as well.
tokenizer = SoMaJo("de_CMC", split_camel_case=True)
```
%% Cell type:code id:ee4959e5-e49c-4715-89f8-44d3e0c2ee9c tags:
```
python
# NOTE(review): `tokenize_text_file` presumably expects a file name or file
# object, but it receives the 'ocr' column here — TODO confirm the intent.
# The result is not consumed in the cells shown: the name `sentence` is
# rebound by the loop variable of the tokenisation cell that follows.
sentence = tokenizer.tokenize_text_file(df['ocr'], paragraph_separator="single_newlines")
```
%% Cell type:code id:023a1e55-e291-4ab9-a7e5-4f6916b063a5 tags:
```
python
import time

# Start from the untokenised table again. For a quick run on the small test
# corpus, load 'test_raw.tsv' instead:
#df = pd.read_csv('test_raw.tsv', sep = '\t', encoding='utf-8')
df = pd.read_csv('wz_raw.tsv', sep='\t', encoding='utf-8')

ts = time.time()
for i, ocr in df['ocr'].items():
    # Each line of the OCR text is handed to SoMaJo as one paragraph.
    # NOTE(review): assumes every 'ocr' value is a string; a missing value
    # read back as NaN would raise on .split — TODO confirm.
    sentences = list(tokenizer.tokenize_text(ocr.split('\n'), parallel=4))
    # Drop the sentence structure: keep one flat list of token strings.
    all_tokens = []
    for sentence in sentences:
        all_tokens.extend(t.text for t in sentence)
    # Replace the raw text of this row with its token list.
    df.at[i, 'ocr'] = all_tokens
# Wall-clock seconds spent tokenising the whole table.
print(time.time() - ts)
df.head()
```
%% Output
4156.50820016861
Unnamed: 0 manifest_id ocr \
0 0 wrz17850101 [itz, ., r, F-, Nro, ., Sonnabmd, den, r., ^an...
1 1 wrz17850105 [5i, ., 2Y, F, Mittwoch, den, 5., Iäner, ., 17...
2 2 wrz17850108 [57, ^, Sonnabend, den, 8., Janer, ., 1735, .,...
3 3 wrz17850112 [5k, 8i, F, Mittwoch, den, Iäner, ., 173, ?, ....
4 4 wrz17850115 [^, 109, ^, Sonnabend, den, 15., Ianer, ., 178...
split
0 valid
1 train
2 valid
3 train
4 train
%% Cell type:code id:eb2fcf60-6051-46b4-84b0-33b1b2bd1e41 tags:
```
python
# Drop the first column, i.e. the 'Unnamed: 0' index column that came in
# with the TSV; all remaining columns are kept.
df = df.iloc[:, 1:]
```
%% Cell type:code id:5ec0718e-fbfe-40af-938f-5fc6d6ebc68a tags:
```
python
# Put the columns into the order ocr, split, manifest_id.
df = df.reindex(columns=['ocr', 'split', 'manifest_id'])
```
%% Cell type:code id:35981c7d-9d7e-48d4-976b-671f32973096 tags:
```
python
# Display the tokenised, reordered frame.
df
```
%% Output
ocr split manifest_id
0 [itz, ., r, F-, Nro, ., Sonnabmd, den, r., ^an... valid wrz17850101
1 [5i, ., 2Y, F, Mittwoch, den, 5., Iäner, ., 17... train wrz17850105
2 [57, ^, Sonnabend, den, 8., Janer, ., 1735, .,... valid wrz17850108
3 [5k, 8i, F, Mittwoch, den, Iäner, ., 173, ?, .... train wrz17850112
4 [^, 109, ^, Sonnabend, den, 15., Ianer, ., 178... train wrz17850115
... ... ... ...
1299 [Sonnabend, ,, den, 14, *, December, 1799, ..,... train wrz17991214
1300 [I, m, 4, -, 77, i', jLiii, ., 1, J, Is, i0, n... valid wrz17991218
1301 [L, Sonnabend, ,, den, 21., December, 1799, .,... valid wrz17991221
1302 [WVr, 4, S73, ZMAr, if, >, Mittewoche, ,, den,... test wrz17991225
1303 [Sonnabend, ,, den, rz, «, December, 1799, ., ... test wrz17991228
[1304 rows x 3 columns]
%% Cell type:code id:b8e6b43c-624e-440e-bc1a-f10f88f4b428 tags:
```
python
# Write the tokenised WZ table as tab-separated text; index=True stores the
# row index as an additional first column.
df.to_csv('wz_tok.tsv', sep='\t', index=True, header=True)
```
%% Cell type:code id:80fa2ade-390b-4229-94a0-e99bdd646e0a tags:
```
python
#df = pd.read_csv('wz_tok.tsv', sep='\t', encoding='utf-8')
```
%% Cell type:code id:641b1d29-fd70-4b18-b7be-78e6b8162cae tags:
```
python
```
%% Cell type:code id:97f038f1-f618-4266-b660-4f66a3ab8bc9 tags:
```
python
# Load the comma-separated file 'sza.csv' into `sz`, decoding it as UTF-8.
sz = pd.read_csv('sza.csv', sep=',', encoding='UTF-8')
```
%% Cell type:code id:532fb694-ccde-4906-ad0d-fca376262192 tags:
```
python
# Show the loaded frame as the cell output.
sz
```
%% Output
manifest_id year ocr
0 sza1785bl01 1785 Salzburger\nI ntelligenzblatt.\n\nums-\n"" H e...
1 sza1785bl02 1785 -"-\n\nI Innländische, und auswärtige, besonde...
2 sza17850105 1785 \n\nSa|\n\ne mehr du Mensch bist, desto mehr g...
3 sza17850112 1785 -----\n\nva ar nicht die ehrlichkeit\n\n\n, so...
4 sza17850126 1785 ---\nT\nT--\n\nSalzburger\n\n- I. Verordnungen...
.. ... ... ...
625 sza17991207 1799 769\n\nSalzburger\n\n770\n\n---\n\nIntelligenz...
626 sza17991214 1799 785-\n\nSalzburger\n\nIntelligenzblatt.\n\n796...
627 sza17991221 1799 -\n\n--\n\n-\n\n99 E- S urger-\n\nSZ--- 2-- p-...
628 sza17991228 1799 Intelligenzblatt.\n\nLII. St. Sonnabend, den 2...
629 sza1799bl01 1799 –-–\n\n-------------------------------------\n...
[630 rows x 3 columns]
%% Cell type:code id:7efdb2a9-bfc3-4b88-ae19-a249a4fd21b3 tags:
```
python
# Work on a copy of `sz` without the 'year' column; `sz` itself is unchanged.
sz_token = sz.drop(columns='year')
```
%% Cell type:code id:3b33320e-924d-483c-82ca-428f4a9799c6 tags:
```
python
# Create the 'split' column, filled with the missing-value marker for now.
sz_token['split'] = pd.NA
```
%% Cell type:code id:290dbc99-e3c2-4922-b20d-110e338b1b32 tags:
```
python
# Labels a row can be assigned to.
split = ['train', 'valid', 'test']
```
%% Cell type:code id:3e0e95cf-82e0-473d-85bb-7a848cde595f tags:
```
python
# Assign every row a label drawn from `split` with probabilities 0.6 / 0.2 /
# 0.2 (in the order of the list). All labels are drawn in one vectorized call
# instead of one Python-level call per row, and the draw no longer depends on
# a pre-existing placeholder column. No seed is set in this cell, so the
# assignment changes between runs.
sz_token['split'] = np.random.choice(split, size=len(sz_token), p=[0.6, 0.2, 0.2])
```
%% Cell type:code id:af6f0417-becb-4506-bb4d-31340052e51a tags:
```
python
# Count how many rows received each label.
sz_token['split'].value_counts()
```
%% Output
train 367
valid 139
test 124
Name: split, dtype: int64
%% Cell type:code id:02a44c70-f3f0-4d76-9682-b7e0c543a665 tags:
```
python
# Reorder the columns: OCR text first, then the split label, then the id.
sz_token = sz_token.reindex(columns= ['ocr', 'split', 'manifest_id'])
```
%% Cell type:code id:1219234f-7ca7-4ec8-b1d1-fdb8008e7ad3 tags:
```
python
# Write the untokenized frame to a tab-separated file, keeping both the row
# index and the header row.
sz_token.to_csv('sz_raw.tsv', sep='\t', index=True, header=True)
```
%% Cell type:code id:e159229d-4156-4c39-af18-f06f32c5a1d2 tags:
```
python
# Read the tab-separated file back into `df`, decoding it as UTF-8.
df = pd.read_csv('sz_raw.tsv', sep='\t', encoding='utf-8')
```
%% Cell type:code id:3b152ed7-2f4d-4244-9cc5-a3c573a9a52f tags:
```
python
# Calls the tokenizer's file-based entry point on the whole 'ocr' column.
# NOTE(review): `tokenizer` is created outside this excerpt. If it is a SoMaJo
# tokenizer, `tokenize_text_file` is documented to take a filename or file
# object rather than a pandas Series - confirm this call does what is intended.
# The value bound to `sentence` here is not read by any cell shown after this
# one, so this cell may be removable.
sentence = tokenizer.tokenize_text_file(df['ocr'], paragraph_separator="single_newlines")
```
%% Cell type:code id:0d527d61-c6da-4562-9c8f-a94cf0a97931 tags:
```
python
import time

#df = pd.read_csv('test_raw.tsv', sep = '\t', encoding='utf-8')
df = pd.read_csv('sz_raw.tsv', sep='\t', encoding='utf-8')

# Replace each document's raw OCR text with a flat list of its token strings
# and report the elapsed wall-clock time in seconds.
ts = time.time()
for i, ocr in df['ocr'].items():
    # The OCR text is split on newlines and the pieces are handed to the
    # tokenizer as a list.
    sentences = list(tokenizer.tokenize_text(ocr.split('\n'), parallel=4))
    # Flatten sentence -> token into a single list of token texts.
    all_tokens = [t.text for sentence in sentences for t in sentence]
    df.at[i, 'ocr'] = all_tokens
print(time.time() - ts)
df.head()
```
%% Output
558.7360711097717
Unnamed: 0 ocr split \
0 0 [Salzburger, I, ntelligenzblatt, ., ums-, ", "... train
1 1 [-, ", -, I, Innländische, ,, und, auswärtige,... train
2 2 [Sa|, e, mehr, du, Mensch, bist, ,, desto, meh... valid
3 3 [-----, va, ar, nicht, die, ehrlichkeit, ,, so... train
4 4 [---, T, T--, Salzburger, -, I., Verordnungen,... test
manifest_id
0 sza1785bl01
1 sza1785bl02
2 sza17850105
3 sza17850112
4 sza17850126
%% Cell type:code id:57bea547-6fba-42da-8e26-42c4f02380d5 tags:
```
python
# Remove the 'Unnamed: 0' column (the first column in the preceding output;
# presumably the row index that was saved with the TSV and read back as data).
# Dropping it by name rather than by position keeps this cell safe to re-run:
# a second execution is a no-op instead of silently removing the 'ocr' column.
df = df.drop(columns='Unnamed: 0', errors='ignore')
```
%% Cell type:code id:01cfbbf3-ee87-4614-b454-e48859b50504 tags:
```
python
# Show the tokenized frame as the cell output.
df
```
%% Output
ocr split manifest_id
0 [Salzburger, I, ntelligenzblatt, ., ums-, ", "... train sza1785bl01
1 [-, ", -, I, Innländische, ,, und, auswärtige,... train sza1785bl02
2 [Sa|, e, mehr, du, Mensch, bist, ,, desto, meh... valid sza17850105
3 [-----, va, ar, nicht, die, ehrlichkeit, ,, so... train sza17850112
4 [---, T, T--, Salzburger, -, I., Verordnungen,... test sza17850126
.. ... ... ...
625 [769, Salzburger, 770, ---, Intelligenzblatt, ... valid sza17991207
626 [785-, Salzburger, Intelligenzblatt, ., 796, -... train sza17991214
627 [-, --, -, 99, E-, S, urger-, SZ---, 2-, -, p-... train sza17991221
628 [Intelligenzblatt, ., LII, ., St., Sonnabend, ... train sza17991228
629 [–, -, –, ------------------------------------... test sza1799bl01
[630 rows x 3 columns]
%% Cell type:code id:45f88e29-5854-4b29-b514-58215326bf9d tags:
```
python
# Write the tokenized frame to a tab-separated file, keeping both the row
# index and the header row.
df.to_csv('sz_tok.tsv', sep='\t', index=True, header=True)
```
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment