diff --git a/cleaning.ipynb b/cleaning.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..43f9cac714469a8ba2ff27bd75de4c13aa86588f
--- /dev/null
+++ b/cleaning.ipynb
@@ -0,0 +1,2731 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "e81f9560-1a1b-4d93-93c3-84e1263b04f0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import csv\n",
+ "import pathlib\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import nltk\n",
+ "import re\n",
+ "import string\n",
+ "from collections import Counter\n",
+ "from collections import defaultdict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "85c59a91-3778-4973-9491-c99528fe39a3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "89bc3c22-b2f3-4389-b843-ebe2bed4ea35",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Data folder. Set the TK_DATA_DIR environment variable to run the notebook on another machine;\n",
+ "# when it is unset, the original local path is used.\n",
+ "DATA_DIR = pathlib.Path(os.environ.get(\"TK_DATA_DIR\", r\"C:\\Users\\onb1202\\OneDrive - Österreichische Nationalbibliothek\\Praktikum TK\\daten\"))\n",
+ "os.chdir(DATA_DIR)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "346bdb53-f234-408a-b666-16ddcc10a8e1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz = pd.read_csv('wz_tok.tsv', sep='\\t', encoding='utf-8')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "ca7bb501-ebc6-4b00-bf7b-cc6d7bd97927",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " ocr | \n",
+ " split | \n",
+ " manifest_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ['itz', '.', 'r', 'F-', 'Nro', '.', 'Sonnabmd'... | \n",
+ " valid | \n",
+ " wrz17850101 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " ['5i', '.', '2Y', 'F', 'Mittwoch', 'den', '5.'... | \n",
+ " train | \n",
+ " wrz17850105 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " ['57', '^', 'Sonnabend', 'den', '8.', 'Janer',... | \n",
+ " valid | \n",
+ " wrz17850108 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " ['5k', '8i', 'F', 'Mittwoch', 'den', 'Iäner', ... | \n",
+ " train | \n",
+ " wrz17850112 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4 | \n",
+ " ['^', '109', '^', 'Sonnabend', 'den', '15.', '... | \n",
+ " train | \n",
+ " wrz17850115 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1299 | \n",
+ " 1299 | \n",
+ " ['Sonnabend', ',', 'den', '14', '*', 'December... | \n",
+ " train | \n",
+ " wrz17991214 | \n",
+ "
\n",
+ " \n",
+ " 1300 | \n",
+ " 1300 | \n",
+ " ['I', 'm', '4', '-', '77', \"i'\", 'jLiii', '.',... | \n",
+ " valid | \n",
+ " wrz17991218 | \n",
+ "
\n",
+ " \n",
+ " 1301 | \n",
+ " 1301 | \n",
+ " ['L', 'Sonnabend', ',', 'den', '21.', 'Decembe... | \n",
+ " valid | \n",
+ " wrz17991221 | \n",
+ "
\n",
+ " \n",
+ " 1302 | \n",
+ " 1302 | \n",
+ " ['WVr', '4', 'S73', 'ZMAr', 'if', '>', 'Mittew... | \n",
+ " test | \n",
+ " wrz17991225 | \n",
+ "
\n",
+ " \n",
+ " 1303 | \n",
+ " 1303 | \n",
+ " ['Sonnabend', ',', 'den', 'rz', '«', 'December... | \n",
+ " test | \n",
+ " wrz17991228 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1304 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unnamed: 0 ocr split \\\n",
+ "0 0 ['itz', '.', 'r', 'F-', 'Nro', '.', 'Sonnabmd'... valid \n",
+ "1 1 ['5i', '.', '2Y', 'F', 'Mittwoch', 'den', '5.'... train \n",
+ "2 2 ['57', '^', 'Sonnabend', 'den', '8.', 'Janer',... valid \n",
+ "3 3 ['5k', '8i', 'F', 'Mittwoch', 'den', 'Iäner', ... train \n",
+ "4 4 ['^', '109', '^', 'Sonnabend', 'den', '15.', '... train \n",
+ "... ... ... ... \n",
+ "1299 1299 ['Sonnabend', ',', 'den', '14', '*', 'December... train \n",
+ "1300 1300 ['I', 'm', '4', '-', '77', \"i'\", 'jLiii', '.',... valid \n",
+ "1301 1301 ['L', 'Sonnabend', ',', 'den', '21.', 'Decembe... valid \n",
+ "1302 1302 ['WVr', '4', 'S73', 'ZMAr', 'if', '>', 'Mittew... test \n",
+ "1303 1303 ['Sonnabend', ',', 'den', 'rz', '«', 'December... test \n",
+ "\n",
+ " manifest_id \n",
+ "0 wrz17850101 \n",
+ "1 wrz17850105 \n",
+ "2 wrz17850108 \n",
+ "3 wrz17850112 \n",
+ "4 wrz17850115 \n",
+ "... ... \n",
+ "1299 wrz17991214 \n",
+ "1300 wrz17991218 \n",
+ "1301 wrz17991221 \n",
+ "1302 wrz17991225 \n",
+ "1303 wrz17991228 \n",
+ "\n",
+ "[1304 rows x 4 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wz"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "e52205c3-1070-42f3-b84c-9647be21025f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop the first column (shown as 'Unnamed: 0' in the display above).\n",
+ "# .copy() makes wz an independent frame so later column assignments do not write into a slice.\n",
+ "wz = wz.iloc[:, 1:].copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "4a63eddd-0bab-48ba-b4b6-428913702f69",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz['year'] = wz['manifest_id'].str.findall(r\"(?<=\\D)\\d{4}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "142dc825-4002-4e82-81be-e52bf10ecfc4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz['year'] =wz['year'].str[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "6d5c7b47-5a09-4887-b1b5-7e1acb7c882d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz['year'] = wz['year'].astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "65beb413-379b-490a-9dfa-10c55434bbec",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Keep only rows with year > 1788.\n",
+ "# .copy() avoids SettingWithCopy problems when wz['ocr'] is reassigned in the cells below.\n",
+ "wz = wz.loc[wz['year'] > 1788].copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9da52b1d-4075-4a7d-80ff-d51b8205d00f",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "cd67ebd7-256f-4a97-bae2-a1632f33730c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz = pd.read_csv('sz_tok.tsv', sep='\\t', encoding='utf-8')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "4cc7d7a8-8dd6-445f-84d9-ea49833579e0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "68ac11f8-9086-4918-a98b-1c70007f410c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop the first column (presumably the same exported index column as in wz - verify against sz above).\n",
+ "# .copy() makes sz an independent frame so later column assignments do not write into a slice.\n",
+ "sz = sz.iloc[:, 1:].copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "633734fb-a550-4e33-a5f0-c3dab670bed3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz['year'] = sz['manifest_id'].str.findall(r\"(?<=\\D)\\d{4}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "9b088ef5-2ed5-40e6-861b-dc288e28e53a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz['year'] =sz['year'].str[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "04b30aa1-0271-41ea-9f98-576f35aac76b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz['year'] = sz['year'].astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "990cbf1e-5095-4120-a25f-eb3ca4ba3c1d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Keep only rows with year > 1788.\n",
+ "# .copy() avoids SettingWithCopy problems when sz['ocr'] is reassigned in the cells below.\n",
+ "sz = sz.loc[sz['year'] > 1788].copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e25fc89-f7c4-44f0-98a8-a226d19deb62",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "82867c03-2673-496b-a8c8-ff835f269b69",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz['ocr'] = wz['ocr'].str.strip('[]').astype(str)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "7bfa4688-7130-4c84-bc73-b4a25af45e7f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz['ocr'] = sz['ocr'].str.strip('[]').astype(str)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b9101a01-1835-4240-8099-2ed785bd6bc4",
+ "metadata": {},
+ "source": [
+ "check token number
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "1405137c-5925-4f76-b113-d130cb724535",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "34126677"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Total number of whitespace-separated tokens; vectorised .sum() instead of the Python built-in sum().\n",
+ "int(wz['ocr'].str.split().str.len().sum())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "89637233-f308-4e07-8b9a-e97beb1ef723",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Total number of whitespace-separated tokens; vectorised .sum() instead of the Python built-in sum().\n",
+ "int(sz['ocr'].str.split().str.len().sum())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2aa172ca-9690-475d-96b4-e9011c35d40f",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "73d929a5-c41b-48c2-894a-22470cfc6b52",
+ "metadata": {},
+ "source": [
+ "lowercase data
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "581f9264-c3a6-467f-abbc-d6c2cc4a0c03",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz['ocr'] = wz['ocr'].str.lower()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "9018a020-1f81-48a7-977f-df60db36bd5d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz['ocr'] = sz['ocr'].str.lower()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2281771e-c941-4069-8edf-1aed8fee1a48",
+ "metadata": {},
+ "source": [
+ "remove stopwords
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "35ca035b-808d-40ff-b1f8-2bee8be06219",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package stopwords to\n",
+ "[nltk_data] C:\\Users\\onb1202\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n"
+ ]
+ }
+ ],
+ "source": [
+ "from nltk.corpus import stopwords\n",
+ "nltk.download('stopwords')\n",
+ "# NOTE: from here on `stopwords` is a plain list of German stopwords (it no longer refers to the\n",
+ "# NLTK corpus reader); later cells use both `stopwords` and its alias `stop`.\n",
+ "stopwords = nltk.corpus.stopwords.words('german')\n",
+ "# Manually chosen additions to NLTK's German list (duplicate entries removed).\n",
+ "stopwords_manual = ( 'worden', 'daher', 'seyn', 'hiemit', 'immer', 'ganz', 'nebst', 'wider', 'schon', 'weder', 'sicher', 'sowohl', 'binnen', 'deto', 'chen', 'eben', 'hiezu', 'samt', 'darüber', 'wäre' )\n",
+ "stopwords.extend(stopwords_manual)\n",
+ "stop = stopwords"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "8314c831-ab58-415c-bd62-56a1306d83f6",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\onb1202\\AppData\\Local\\Temp\\ipykernel_6532\\3231380372.py:2: FutureWarning: The default value of regex will change from True to False in a future version.\n",
+ " wz['ocr'] = wz['ocr'].str.replace(pat, '')\n",
+ "C:\\Users\\onb1202\\AppData\\Local\\Temp\\ipykernel_6532\\3231380372.py:3: FutureWarning: The default value of regex will change from True to False in a future version.\n",
+ " wz['ocr'] = wz['ocr'].str.replace(r'\\s+', ' ')\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Whole-word alternation of all stopwords; re.escape guards against regex metacharacters in the list.\n",
+ "pat = r'\\b(?:{})\\b'.format('|'.join(map(re.escape, stop)))\n",
+ "# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal string by default,\n",
+ "# which would silently turn both replacements into no-ops.\n",
+ "wz['ocr'] = wz['ocr'].str.replace(pat, '', regex=True)\n",
+ "# Collapse every whitespace run to a single space.\n",
+ "wz['ocr'] = wz['ocr'].str.replace(r'\\s+', ' ', regex=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "1b32c84b-e63c-4665-9af2-5b095901bf6e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "209 '', 'e', 'n', 'e', \"'r-\", 'ze', 'sonnabend', '...\n",
+ "210 '29', '^', 'mittewoche', '', '7.', 'ianer', '1...\n",
+ "211 'hi', '.', 'l3', \"izvrenei'\", 'zeilunn', '.', ...\n",
+ "212 '8r', '«', 'v', 'mittewoche', '', '14.', 'ickn...\n",
+ "213 'sonnabend', '', '17.', \"ia'ner\", '1789', '.',...\n",
+ " ... \n",
+ "1299 'sonnabend', ',', '', '14', '*', 'december', '...\n",
+ "1300 'i', 'm', '4', '-', '77', \"i'\", 'jliii', '.', ...\n",
+ "1301 'l', 'sonnabend', ',', '', '21.', 'december', ...\n",
+ "1302 'wvr', '4', 's73', 'zmar', 'if', '>', 'mittewo...\n",
+ "1303 'sonnabend', ',', '', 'rz', '«', 'december', '...\n",
+ "Name: ocr, Length: 1095, dtype: object"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wz['ocr']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "2eda7364-acfa-41ca-b56a-32c4d9cbe7fe",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Whole-word alternation of all stopwords; re.escape guards against regex metacharacters in the list.\n",
+ "pat = r'\\b(?:{})\\b'.format('|'.join(map(re.escape, stop)))\n",
+ "# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal string by default,\n",
+ "# which would silently turn both replacements into no-ops.\n",
+ "sz['ocr'] = sz['ocr'].str.replace(pat, '', regex=True)\n",
+ "# Collapse every whitespace run to a single space.\n",
+ "sz['ocr'] = sz['ocr'].str.replace(r'\\s+', ' ', regex=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0534849d-7374-4f82-9ca7-9ec9adede7e3",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "14c691d7-5eb5-4f46-96c8-d3fbb0c81c9b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Set lookup instead of scanning the stopword list once per token (the corpus has tens of millions of tokens).\n",
+ "# NOTE(review): at this stage tokens still appear to carry their quotes/commas (see the wz['ocr'] display above),\n",
+ "# so this filter probably matches nothing and only normalises whitespace - confirm.\n",
+ "stop_set = set(stopwords)\n",
+ "wz['ocr'] = wz['ocr'].apply(lambda x: ' '.join(item for item in x.split() if item not in stop_set))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "aa5bfa8a-c394-491e-ba00-56af0e88f3fc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz['ocr']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "2d8c7bd1-6077-4f47-a9de-eae1ee1d83c9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Set lookup instead of scanning the stopword list once per token.\n",
+ "# NOTE(review): at this stage tokens presumably still carry their quotes/commas,\n",
+ "# so this filter probably matches nothing and only normalises whitespace - confirm.\n",
+ "stop_set = set(stopwords)\n",
+ "sz['ocr'] = sz['ocr'].apply(lambda x: ' '.join(item for item in x.split() if item not in stop_set))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "5c339a69-a320-4054-8743-e86f85adf92f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz['ocr']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6fd1ff62-d603-4ab3-abdc-a9a5e84c3610",
+ "metadata": {},
+ "source": [
+ " remove quotation marks
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "caac81b8-4e6e-4017-926f-ddb5bad809e2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz['ocr'] = wz['ocr'].str.replace(\"'\", \"\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "c49d9432-79d7-49b7-bd45-a54c3beacff0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ocr | \n",
+ " split | \n",
+ " manifest_id | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 209 | \n",
+ " , e, n, e, \"r-\", ze, sonnabend, , z., iäner, ,... | \n",
+ " test | \n",
+ " wrz17890103 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 210 | \n",
+ " 29, ^, mittewoche, , 7., ianer, 1789, ., inlän... | \n",
+ " train | \n",
+ " wrz17890107 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 211 | \n",
+ " hi, ., l3, \"izvrenei\", zeilunn, ., scnnabend, ... | \n",
+ " test | \n",
+ " wrz17890110 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 212 | \n",
+ " 8r, «, v, mittewoche, , 14., ickner, 1739, ., ... | \n",
+ " train | \n",
+ " wrz17890114 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 213 | \n",
+ " sonnabend, , 17., \"ianer\", 1789, ., inländisch... | \n",
+ " train | \n",
+ " wrz17890117 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ocr split manifest_id \\\n",
+ "209 , e, n, e, \"r-\", ze, sonnabend, , z., iäner, ,... test wrz17890103 \n",
+ "210 29, ^, mittewoche, , 7., ianer, 1789, ., inlän... train wrz17890107 \n",
+ "211 hi, ., l3, \"izvrenei\", zeilunn, ., scnnabend, ... test wrz17890110 \n",
+ "212 8r, «, v, mittewoche, , 14., ickner, 1739, ., ... train wrz17890114 \n",
+ "213 sonnabend, , 17., \"ianer\", 1789, ., inländisch... train wrz17890117 \n",
+ "\n",
+ " year \n",
+ "209 1789 \n",
+ "210 1789 \n",
+ "211 1789 \n",
+ "212 1789 \n",
+ "213 1789 "
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wz.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "9c2e0a02-df0b-42b0-b95d-674d6bdd64bb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz['ocr'] = sz['ocr'].str.replace(\"'\", \"\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "e668bdbd-fe33-47db-b299-f32b38c3bbca",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d382622b-0ec1-4327-a34e-23c296b991e6",
+ "metadata": {},
+ "source": [
+ "remove commas
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "a2cd53fb-da51-4fc5-8ad3-b1871d3d6387",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz['ocr'] = wz['ocr'].str.replace(\",\", \"\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "89811f64-5939-44df-972b-df803a82da24",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ocr | \n",
+ " split | \n",
+ " manifest_id | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 209 | \n",
+ " e n e \"r-\" ze sonnabend z. iäner 7 ^ 9. inlä... | \n",
+ " test | \n",
+ " wrz17890103 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 210 | \n",
+ " 29 ^ mittewoche 7. ianer 1789 . inländische b... | \n",
+ " train | \n",
+ " wrz17890107 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 211 | \n",
+ " hi . l3 \"izvrenei\" zeilunn . scnnabend dtii iv... | \n",
+ " test | \n",
+ " wrz17890110 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 212 | \n",
+ " 8r « v mittewoche 14. ickner 1739 . mändische... | \n",
+ " train | \n",
+ " wrz17890114 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 213 | \n",
+ " sonnabend 17. \"ianer\" 1789 . inländische bege... | \n",
+ " train | \n",
+ " wrz17890117 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ocr split manifest_id \\\n",
+ "209 e n e \"r-\" ze sonnabend z. iäner 7 ^ 9. inlä... test wrz17890103 \n",
+ "210 29 ^ mittewoche 7. ianer 1789 . inländische b... train wrz17890107 \n",
+ "211 hi . l3 \"izvrenei\" zeilunn . scnnabend dtii iv... test wrz17890110 \n",
+ "212 8r « v mittewoche 14. ickner 1739 . mändische... train wrz17890114 \n",
+ "213 sonnabend 17. \"ianer\" 1789 . inländische bege... train wrz17890117 \n",
+ "\n",
+ " year \n",
+ "209 1789 \n",
+ "210 1789 \n",
+ "211 1789 \n",
+ "212 1789 \n",
+ "213 1789 "
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wz.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "61aeaeb6-e776-4a5b-a9db-ec1612a85643",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz['ocr'] = sz['ocr'].str.replace(\",\", \"\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "3d5b2d54-6857-4917-92f3-0233b584ad69",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ced70950-9f74-42d5-8c4a-f3020fbb016b",
+ "metadata": {},
+ "source": [
+ "remove white space before punctuation
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "e523592c-d7f4-4fb9-9a77-abca1765975b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Remove the whole whitespace run in front of . , : ; (the quantifier belongs on \\s, not on the lookahead).\n",
+ "wz['ocr'] = wz['ocr'].replace( { r\"\\s+(?=[.,:;])\" : '' }, regex = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "d84bf43d-9aa4-4732-a67e-1e27e1179cb2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ocr | \n",
+ " split | \n",
+ " manifest_id | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 209 | \n",
+ " e n e \"r-\" ze sonnabend z. iäner 7 ^ 9. inlä... | \n",
+ " test | \n",
+ " wrz17890103 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 210 | \n",
+ " 29 ^ mittewoche 7. ianer 1789. inländische be... | \n",
+ " train | \n",
+ " wrz17890107 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 211 | \n",
+ " hi. l3 \"izvrenei\" zeilunn. scnnabend dtii iv. ... | \n",
+ " test | \n",
+ " wrz17890110 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 212 | \n",
+ " 8r « v mittewoche 14. ickner 1739. mändische ... | \n",
+ " train | \n",
+ " wrz17890114 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 213 | \n",
+ " sonnabend 17. \"ianer\" 1789. inländische begeb... | \n",
+ " train | \n",
+ " wrz17890117 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ocr split manifest_id \\\n",
+ "209 e n e \"r-\" ze sonnabend z. iäner 7 ^ 9. inlä... test wrz17890103 \n",
+ "210 29 ^ mittewoche 7. ianer 1789. inländische be... train wrz17890107 \n",
+ "211 hi. l3 \"izvrenei\" zeilunn. scnnabend dtii iv. ... test wrz17890110 \n",
+ "212 8r « v mittewoche 14. ickner 1739. mändische ... train wrz17890114 \n",
+ "213 sonnabend 17. \"ianer\" 1789. inländische begeb... train wrz17890117 \n",
+ "\n",
+ " year \n",
+ "209 1789 \n",
+ "210 1789 \n",
+ "211 1789 \n",
+ "212 1789 \n",
+ "213 1789 "
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wz.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "f0e31e67-0c3c-44db-860c-c6f56e904010",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Remove the whole whitespace run in front of . , : ; (the quantifier belongs on \\s, not on the lookahead).\n",
+ "sz['ocr'] = sz['ocr'].replace( { r\"\\s+(?=[.,:;])\" : '' }, regex = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "e8e48a17-872b-4918-aa66-563b26c958ca",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3e63a61f-e62a-4a21-9221-0e3b2baf280f",
+ "metadata": {},
+ "source": [
+ "remove punctuation at end of string
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "b312714a-118f-462f-850a-9f3a15f86a80",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Replace a . ? ! or * that is followed by whitespace with a single space.\n",
+ "# The previous pattern started with `w*`, i.e. literal 'w' characters, and so also deleted a trailing 'w' of the word.\n",
+ "wz['ocr'] = wz['ocr'].replace( { r\"[.?!*]\\s+\" : ' ' }, regex = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "611d5373-1286-427c-b97e-bab769aa96f6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ocr | \n",
+ " split | \n",
+ " manifest_id | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 209 | \n",
+ " e n e \"r-\" ze sonnabend z iäner 7 ^ 9 inländ... | \n",
+ " test | \n",
+ " wrz17890103 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 210 | \n",
+ " 29 ^ mittewoche 7 ianer 1789 inländische bege... | \n",
+ " train | \n",
+ " wrz17890107 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 211 | \n",
+ " hi l3 \"izvrenei\" zeilunn scnnabend dtii iv ^än... | \n",
+ " test | \n",
+ " wrz17890110 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 212 | \n",
+ " 8r « v mittewoche 14 ickner 1739 mändische be... | \n",
+ " train | \n",
+ " wrz17890114 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 213 | \n",
+ " sonnabend 17 \"ianer\" 1789 inländische begeben... | \n",
+ " train | \n",
+ " wrz17890117 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ocr split manifest_id \\\n",
+ "209 e n e \"r-\" ze sonnabend z iäner 7 ^ 9 inländ... test wrz17890103 \n",
+ "210 29 ^ mittewoche 7 ianer 1789 inländische bege... train wrz17890107 \n",
+ "211 hi l3 \"izvrenei\" zeilunn scnnabend dtii iv ^än... test wrz17890110 \n",
+ "212 8r « v mittewoche 14 ickner 1739 mändische be... train wrz17890114 \n",
+ "213 sonnabend 17 \"ianer\" 1789 inländische begeben... train wrz17890117 \n",
+ "\n",
+ " year \n",
+ "209 1789 \n",
+ "210 1789 \n",
+ "211 1789 \n",
+ "212 1789 \n",
+ "213 1789 "
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wz.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "8e2a3351-7568-4b0a-8702-2a64aa6dc185",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Replace a . ? ! or * that is followed by whitespace with a single space.\n",
+ "# The previous pattern started with `w*`, i.e. literal 'w' characters, and so also deleted a trailing 'w' of the word.\n",
+ "sz['ocr'] = sz['ocr'].replace( { r\"[.?!*]\\s+\" : ' ' }, regex = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "6e1255c8-9331-4bb3-970e-e34da5bdc988",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b9af4de4-0cd7-4b83-8815-6a669f26a4e8",
+ "metadata": {},
+ "source": [
+ "remove words containing numbers
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "60ddf3c1-dbc9-4a69-bbe6-4437484131df",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz['ocr'] = wz['ocr'].replace( { r\"\\w*\\d\\w*\" : '' }, regex = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "5e97a82e-8418-4e37-a34a-6ab1be80bb4c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ocr | \n",
+ " split | \n",
+ " manifest_id | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 209 | \n",
+ " e n e \"r-\" ze sonnabend z iäner ^ inländis... | \n",
+ " test | \n",
+ " wrz17890103 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 210 | \n",
+ " ^ mittewoche ianer inländische begebenheit... | \n",
+ " train | \n",
+ " wrz17890107 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 211 | \n",
+ " hi \"izvrenei\" zeilunn scnnabend dtii iv ^äner... | \n",
+ " test | \n",
+ " wrz17890110 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 212 | \n",
+ " « v mittewoche ickner mändische begebenhei... | \n",
+ " train | \n",
+ " wrz17890114 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 213 | \n",
+ " sonnabend \"ianer\" inländische begebenheit «... | \n",
+ " train | \n",
+ " wrz17890117 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ocr split manifest_id \\\n",
+ "209 e n e \"r-\" ze sonnabend z iäner ^ inländis... test wrz17890103 \n",
+ "210 ^ mittewoche ianer inländische begebenheit... train wrz17890107 \n",
+ "211 hi \"izvrenei\" zeilunn scnnabend dtii iv ^äner... test wrz17890110 \n",
+ "212 « v mittewoche ickner mändische begebenhei... train wrz17890114 \n",
+ "213 sonnabend \"ianer\" inländische begebenheit «... train wrz17890117 \n",
+ "\n",
+ " year \n",
+ "209 1789 \n",
+ "210 1789 \n",
+ "211 1789 \n",
+ "212 1789 \n",
+ "213 1789 "
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wz.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "0006ce39-f7c1-4554-855f-036c878c35e6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz['ocr'] = sz['ocr'].replace( { r\"\\w*\\d\\w*\" : '' }, regex = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "791c92b5-7b44-47ec-9619-96fd6c6b3e35",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0cc26679-7405-4baa-99bd-7c63ba7945ab",
+ "metadata": {},
+ "source": [
+ "remove words containing punctuation/special characters
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "6d80ee6a-6e7a-4377-b4b6-c2f918178269",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz['ocr'] = wz['ocr'].replace( { r\"\\S*[\\!\\,\\.\\+\\$\\^\\&\\(\\)\\'\\?\\*\\<\\>\\:\\\"\\@\\\\\\/]+\\S+\" : '' }, regex = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "id": "bcdb14ed-d068-49ea-b3c7-f25a2c5eff5d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ocr | \n",
+ " split | \n",
+ " manifest_id | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 209 | \n",
+ " e n e ze sonnabend z iäner ^ inländische ... | \n",
+ " test | \n",
+ " wrz17890103 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 210 | \n",
+ " ^ mittewoche ianer inländische begebenheit... | \n",
+ " train | \n",
+ " wrz17890107 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 211 | \n",
+ " hi zeilunn scnnabend dtii iv inländische b... | \n",
+ " test | \n",
+ " wrz17890110 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 212 | \n",
+ " « v mittewoche ickner mändische begebenhei... | \n",
+ " train | \n",
+ " wrz17890114 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 213 | \n",
+ " sonnabend inländische begebenheit « « « ... | \n",
+ " train | \n",
+ " wrz17890117 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ocr split manifest_id \\\n",
+ "209 e n e ze sonnabend z iäner ^ inländische ... test wrz17890103 \n",
+ "210 ^ mittewoche ianer inländische begebenheit... train wrz17890107 \n",
+ "211 hi zeilunn scnnabend dtii iv inländische b... test wrz17890110 \n",
+ "212 « v mittewoche ickner mändische begebenhei... train wrz17890114 \n",
+ "213 sonnabend inländische begebenheit « « « ... train wrz17890117 \n",
+ "\n",
+ " year \n",
+ "209 1789 \n",
+ "210 1789 \n",
+ "211 1789 \n",
+ "212 1789 \n",
+ "213 1789 "
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wz.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "id": "b6a507b8-e626-44c8-ace1-3fa4aa1690f1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz['ocr'] = sz['ocr'].replace( { r\"\\S*[\\!\\,\\.\\+\\$\\^\\&\\(\\)\\'\\?\\*\\<\\>\\:\\\"\\@\\\\\\/]+\\S+\" : '' }, regex = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "2353ec0a-6530-4e36-8142-6f178ee31937",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d9244eea-ed90-47aa-8239-4bc5c3a03dac",
+ "metadata": {},
+ "source": [
+ "remove hyphen and equal signs between words
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "id": "d2ca9b6e-e284-4cd3-95ec-a39ba4bd0f78",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Delete hyphens and equals signs.\n",
+ "# The previous pattern `w*\\-|=` also deleted literal 'w' letters directly in front of a hyphen.\n",
+ "wz['ocr'] = wz['ocr'].replace( { r\"[-=]\" : '' }, regex = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "5cc2b794-39e8-45c9-a028-50c3d86bf73f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ocr | \n",
+ " split | \n",
+ " manifest_id | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 209 | \n",
+ " e n e ze sonnabend z iäner ^ inländische ... | \n",
+ " test | \n",
+ " wrz17890103 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 210 | \n",
+ " ^ mittewoche ianer inländische begebenheit... | \n",
+ " train | \n",
+ " wrz17890107 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 211 | \n",
+ " hi zeilunn scnnabend dtii iv inländische b... | \n",
+ " test | \n",
+ " wrz17890110 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 212 | \n",
+ " « v mittewoche ickner mändische begebenhei... | \n",
+ " train | \n",
+ " wrz17890114 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 213 | \n",
+ " sonnabend inländische begebenheit « « « ... | \n",
+ " train | \n",
+ " wrz17890117 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ocr split manifest_id \\\n",
+ "209 e n e ze sonnabend z iäner ^ inländische ... test wrz17890103 \n",
+ "210 ^ mittewoche ianer inländische begebenheit... train wrz17890107 \n",
+ "211 hi zeilunn scnnabend dtii iv inländische b... test wrz17890110 \n",
+ "212 « v mittewoche ickner mändische begebenhei... train wrz17890114 \n",
+ "213 sonnabend inländische begebenheit « « « ... train wrz17890117 \n",
+ "\n",
+ " year \n",
+ "209 1789 \n",
+ "210 1789 \n",
+ "211 1789 \n",
+ "212 1789 \n",
+ "213 1789 "
+ ]
+ },
+ "execution_count": 55,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wz.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "e8dad840-dc08-416d-8aec-09fb6675dcd0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Delete hyphens and equals signs.\n",
+ "# The previous pattern `w*\\-|=` also deleted literal 'w' letters directly in front of a hyphen.\n",
+ "sz['ocr'] = sz['ocr'].replace( { r\"[-=]\" : '' }, regex = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "id": "3f09ac51-5f23-4b0d-b185-98c4acd118de",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1e9dea85-2fe8-4394-8e3a-45e4594a7c6e",
+ "metadata": {},
+ "source": [
+ "remove all punctuation
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "id": "4462f01f-8f7e-4d2e-8fe7-83db114d9df3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz['ocr'] = wz['ocr'].replace( { r\"[^\\w\\s]\" : '' }, regex = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "id": "39756f59-5b3d-4056-96a7-09e13e4850f1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ocr | \n",
+ " split | \n",
+ " manifest_id | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 209 | \n",
+ " e n e ze sonnabend z iäner inländische b... | \n",
+ " test | \n",
+ " wrz17890103 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 210 | \n",
+ " mittewoche ianer inländische begebenheit ... | \n",
+ " train | \n",
+ " wrz17890107 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 211 | \n",
+ " hi zeilunn scnnabend dtii iv inländische b... | \n",
+ " test | \n",
+ " wrz17890110 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 212 | \n",
+ " v mittewoche ickner mändische begebenheit... | \n",
+ " train | \n",
+ " wrz17890114 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 213 | \n",
+ " sonnabend inländische begebenheit mam... | \n",
+ " train | \n",
+ " wrz17890117 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ocr split manifest_id \\\n",
+ "209 e n e ze sonnabend z iäner inländische b... test wrz17890103 \n",
+ "210 mittewoche ianer inländische begebenheit ... train wrz17890107 \n",
+ "211 hi zeilunn scnnabend dtii iv inländische b... test wrz17890110 \n",
+ "212 v mittewoche ickner mändische begebenheit... train wrz17890114 \n",
+ "213 sonnabend inländische begebenheit mam... train wrz17890117 \n",
+ "\n",
+ " year \n",
+ "209 1789 \n",
+ "210 1789 \n",
+ "211 1789 \n",
+ "212 1789 \n",
+ "213 1789 "
+ ]
+ },
+ "execution_count": 59,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wz.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "id": "40771a60-00de-4488-9eab-bb6b5d26cd8e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz['ocr'] = sz['ocr'].replace( { r\"[^\\w\\s]\" : '' }, regex = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "id": "6617f24a-2d5e-4046-a419-6ce4cf1c2f83",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fa46b891-dffb-4433-9d12-1dda1a01febf",
+ "metadata": {},
+ "source": [
+ "remove words consisting of three or fewer characters
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "id": "e5447cba-5fc0-40c7-99c1-df90df5aae6b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz['ocr'] = wz['ocr'].replace( { r\"\\b\\w{1,3}\\b\" : '' }, regex = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "id": "53f8d4b5-cc51-4ecd-8006-f79748b36edf",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ocr | \n",
+ " split | \n",
+ " manifest_id | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 209 | \n",
+ " sonnabend iäner inländische begeben... | \n",
+ " test | \n",
+ " wrz17890103 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 210 | \n",
+ " mittewoche ianer inländische begebenheit ... | \n",
+ " train | \n",
+ " wrz17890107 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 211 | \n",
+ " zeilunn scnnabend dtii inländische begeb... | \n",
+ " test | \n",
+ " wrz17890110 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 212 | \n",
+ " mittewoche ickner mändische begebenheite... | \n",
+ " train | \n",
+ " wrz17890114 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 213 | \n",
+ " sonnabend inländische begebenheit mam... | \n",
+ " train | \n",
+ " wrz17890117 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ocr split manifest_id \\\n",
+ "209 sonnabend iäner inländische begeben... test wrz17890103 \n",
+ "210 mittewoche ianer inländische begebenheit ... train wrz17890107 \n",
+ "211 zeilunn scnnabend dtii inländische begeb... test wrz17890110 \n",
+ "212 mittewoche ickner mändische begebenheite... train wrz17890114 \n",
+ "213 sonnabend inländische begebenheit mam... train wrz17890117 \n",
+ "\n",
+ " year \n",
+ "209 1789 \n",
+ "210 1789 \n",
+ "211 1789 \n",
+ "212 1789 \n",
+ "213 1789 "
+ ]
+ },
+ "execution_count": 63,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wz.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "id": "5a0c0461-4c23-4679-bc62-f7689ea49353",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz['ocr'] = sz['ocr'].replace( { r\"\\b\\w{1,3}\\b\" : '' }, regex = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "id": "2cc24462-5c73-4802-b738-014c089181a2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "20c32401-64c5-4166-88e2-ba57a5f804d1",
+ "metadata": {},
+ "source": [
+ "remove multiple white spaces etc
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "id": "cfb65a70-786b-4da0-9fde-b6a783e087e4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz['ocr'] = wz['ocr'].replace( { r\"\\s\\s+\" : ' ' }, regex = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "id": "2e2356cf-2d59-4e84-8752-c080efd62434",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ocr | \n",
+ " split | \n",
+ " manifest_id | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 209 | \n",
+ " sonnabend iäner inländische begebenheiten pln... | \n",
+ " test | \n",
+ " wrz17890103 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 210 | \n",
+ " mittewoche ianer inländische begebenheit wien... | \n",
+ " train | \n",
+ " wrz17890107 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 211 | \n",
+ " zeilunn scnnabend dtii inländische begebenhei... | \n",
+ " test | \n",
+ " wrz17890110 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 212 | \n",
+ " mittewoche ickner mändische begebenheiten krw... | \n",
+ " train | \n",
+ " wrz17890114 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ " 213 | \n",
+ " sonnabend inländische begebenheit mamst horch ... | \n",
+ " train | \n",
+ " wrz17890117 | \n",
+ " 1789 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ocr split manifest_id \\\n",
+ "209 sonnabend iäner inländische begebenheiten pln... test wrz17890103 \n",
+ "210 mittewoche ianer inländische begebenheit wien... train wrz17890107 \n",
+ "211 zeilunn scnnabend dtii inländische begebenhei... test wrz17890110 \n",
+ "212 mittewoche ickner mändische begebenheiten krw... train wrz17890114 \n",
+ "213 sonnabend inländische begebenheit mamst horch ... train wrz17890117 \n",
+ "\n",
+ " year \n",
+ "209 1789 \n",
+ "210 1789 \n",
+ "211 1789 \n",
+ "212 1789 \n",
+ "213 1789 "
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wz.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7d441b37-27be-4688-b927-b9ff2e68b56b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "id": "12b46594-b85c-4f78-aa8f-dcc5bdbbae9d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz.to_csv('wz_initial_cleaned.tsv', sep='\\t', index=True, header=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "id": "7a9da207-240a-47ee-9ca6-fc262c48946c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz['ocr'] = sz['ocr'].replace( { r\"\\s\\s+\" : ' ' }, regex = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "id": "a16fef5a-7242-41da-ba2d-8ffd9c6ce2a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bb556189-b8f3-4441-acd1-59940a3c47dd",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "remove rare words (wz: words occurring five times or fewer; sz: words occurring only once)
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "id": "f01dcaeb-1901-462e-8630-e417d2869470",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Join the documents with an explicit space: Series.sum() concatenates the rows with no\n",
+ "# separator, which fuses the last word of each document with the first word of the next.\n",
+ "texts = ' '.join(wz['ocr'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "id": "07ab63ef-5b1d-4be4-b526-cfbfb0655a2b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "texts = texts.split()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "id": "7b989ecb-f647-4cbd-896b-dbfe7c4cebb5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "11258539"
+ ]
+ },
+ "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(texts)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "id": "4cfa31d7-1ffb-42c5-8ebf-71c503f7111c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[('gemacht', 40810),\n",
+ " ('bekannt', 39128),\n",
+ " ('wien', 37478),\n",
+ " ('ersten', 27831),\n",
+ " ('joseph', 27448),\n",
+ " ('johann', 25810),\n",
+ " ('früh', 24408),\n",
+ " ('erscheinen', 24250),\n",
+ " ('herrschaft', 22730),\n",
+ " ('stock', 22341),\n",
+ " ('franz', 22142),\n",
+ " ('haus', 21875),\n",
+ " ('gläubiger', 20226),\n",
+ " ('mehr', 19099),\n",
+ " ('stadt', 18789),\n",
+ " ('mann', 18313),\n",
+ " ('wegen', 17831),\n",
+ " ('forderung', 16504),\n",
+ " ('verkaufen', 16463),\n",
+ " ('sammt', 15851)]"
+ ]
+ },
+ "execution_count": 74,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "freqdist = Counter(texts)\n",
+ "freqdist.most_common(20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "id": "ed5c7471-a46a-4c21-802b-e6c66ee7e4aa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_freq_wz = pd.DataFrame.from_dict(freqdist, orient='index').reset_index()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "id": "b9ddfd77-0dc8-4270-8912-aa7d02a7c234",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_freq_wz = df_freq_wz.rename(columns={'index':'word', 0:'count'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "id": "4ad203d1-8560-433d-9da3-25d49375c706",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " word | \n",
+ " count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " sonnabend | \n",
+ " 536 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " iäner | \n",
+ " 1503 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " inländische | \n",
+ " 870 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " begebenheiten | \n",
+ " 1767 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " plngsthta | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 2491253 | \n",
+ " ftttb | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2491254 | \n",
+ " bmvir | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2491255 | \n",
+ " spünchrkn | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2491256 | \n",
+ " eräuye | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2491257 | \n",
+ " otivtl | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2491258 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " word count\n",
+ "0 sonnabend 536\n",
+ "1 iäner 1503\n",
+ "2 inländische 870\n",
+ "3 begebenheiten 1767\n",
+ "4 plngsthta 1\n",
+ "... ... ...\n",
+ "2491253 ftttb 1\n",
+ "2491254 bmvir 1\n",
+ "2491255 spünchrkn 1\n",
+ "2491256 eräuye 1\n",
+ "2491257 otivtl 1\n",
+ "\n",
+ "[2491258 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 77,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_freq_wz"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "id": "34c0b1b7-69fb-4b6f-b94f-f204c7ea849d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "freq_wz = df_freq_wz.loc[df_freq_wz['count'] <= 5]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "id": "132fd305-59ff-468d-97af-8cd01b748c83",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " word | \n",
+ " count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 4 | \n",
+ " plngsthta | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " krfslgtt | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " bble | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " heinri | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " preasst | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 2491253 | \n",
+ " ftttb | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2491254 | \n",
+ " bmvir | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2491255 | \n",
+ " spünchrkn | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2491256 | \n",
+ " eräuye | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2491257 | \n",
+ " otivtl | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2373250 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " word count\n",
+ "4 plngsthta 1\n",
+ "5 krfslgtt 1\n",
+ "6 bble 1\n",
+ "9 heinri 3\n",
+ "10 preasst 1\n",
+ "... ... ...\n",
+ "2491253 ftttb 1\n",
+ "2491254 bmvir 1\n",
+ "2491255 spünchrkn 1\n",
+ "2491256 eräuye 1\n",
+ "2491257 otivtl 1\n",
+ "\n",
+ "[2373250 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 84,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "freq_wz"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "id": "7cd0dba8-0ab9-4c5f-a35e-f36ad8e2a5a0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "freq_token_wz = freq_wz['word'].tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "id": "7f0bf4d6-e740-4eb6-8f7a-4c25692f293f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "set_token1 = set(freq_token_wz)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9f715423-25cf-4927-87e4-ad7a7738d78f",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Test membership against the set (set_token1), not the list: a list lookup scans every\n",
+ "# rare token for each word, which is infeasible with millions of entries.\n",
+ "wz['ocr'] = wz['ocr'].apply(lambda x: ' '.join([item for item in x.split() if item not in set_token1]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "id": "f262dd53-3a14-4005-a6bd-3ba099b30415",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz['ocr'] = wz['ocr'].apply(lambda x: ' '.join([item for item in x.split() if item not in set_token1]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "087cb3e2-3cb7-4a63-af4c-b70afedca0a2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# re.compile() needs a pattern string, not a list (a list raises TypeError):\n",
+ "# build one whole-word alternation from the escaped rare tokens.\n",
+ "reg = re.compile(r'\\b(?:' + '|'.join(map(re.escape, freq_token_wz)) + r')\\b')\n",
+ "rep = ''\n",
+ "text = wz['ocr']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7415d84c-21aa-4795-8f9b-afffa3e6ebbb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def sub_replace(reg, rep, text):\n",
+ " output = re.sub(reg, rep, text)\n",
+ " return output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ae158b9c-484a-48d6-8aa8-0cbfd8f33eba",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Anchor the alternation with \\b so only whole tokens are removed; without the anchors a\n",
+ "# rare token would also be cut out of every longer word that merely contains it.\n",
+ "p = re.compile(r'\\b(?:' + '|'.join(map(re.escape, freq_token_wz)) + r')\\b')\n",
+ "wz['ocr'] = [p.sub('', text) for text in wz['ocr']] "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e852760f-6b80-4eb5-a52c-06b0c05fd0ce",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Escape each token so it is matched literally inside the whole-word alternation.\n",
+ "pat = r'\\b(?:{})\\b'.format('|'.join(map(re.escape, freq_token_wz)))\n",
+ "# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal string by default,\n",
+ "# in which case neither replacement below would match anything.\n",
+ "wz['ocr'] = wz['ocr'].str.replace(pat, '', regex=True)\n",
+ "wz['ocr'] = wz['ocr'].str.replace(r'\\s+', ' ', regex=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ddb580d5-fe86-4915-8698-30d1fbe4253a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz['ocr'] = wz['ocr'].replace(freq_token_wz,'')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "id": "7ae47841-0769-42c1-9cf9-ab1a7adc757e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8337421"
+ ]
+ },
+ "execution_count": 88,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Join with a space before splitting: Series.sum() glues adjacent documents together and\n",
+ "# undercounts by one token per document boundary.\n",
+ "len(' '.join(wz['ocr']).split())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "id": "b46347ed-8252-4379-8371-bed9ea690ff5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "209 sonnabend iäner inländische begebenheiten bens...\n",
+ "210 mittewoche ianer inländische begebenheit wien ...\n",
+ "211 dtii inländische begebenheiten vorthei zugleic...\n",
+ "212 mittewoche begebenheiten sonntag hofe gottesdi...\n",
+ "213 sonnabend inländische begebenheit horch kret r...\n",
+ " ... \n",
+ "1299 sonnabend december inländische majestät habew ...\n",
+ "1300 sranj unft ober unftre gehn tlgen stieg turd m...\n",
+ "1301 sonnabend december inländische begebenheiten w...\n",
+ "1302 zmar mittewoche december inländische vegebenhe...\n",
+ "1303 sonnabend december inländische bege hell chris...\n",
+ "Name: ocr, Length: 1095, dtype: object"
+ ]
+ },
+ "execution_count": 89,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wz['ocr']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "85617731-7397-4913-9b19-37586ad6eee4",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f156a21e-bf1a-4f52-8594-24caab5e7132",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Join the documents with an explicit space: Series.sum() concatenates the rows with no\n",
+ "# separator, which fuses the last word of each document with the first word of the next.\n",
+ "texts_sz = ' '.join(sz['ocr'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7fbd1c5c-202b-47c6-abeb-8905a2ffab27",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "texts_sz = texts_sz.split()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4ccf175d-6e1b-4e44-8c27-0169b907372f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "len(texts_sz)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ec56b4c7-cf35-4fde-8e7f-0f8714b31d54",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "freqdist_sz = Counter(texts_sz)\n",
+ "freqdist_sz.most_common(20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0b624c21-627e-4e5e-92d6-4f42b473f090",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_freq_sz = pd.DataFrame.from_dict(freqdist_sz, orient='index').reset_index()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "29db12a0-d868-4dbc-83a9-38a30709c053",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_freq_sz = df_freq_sz.rename(columns={'index':'word', 0:'count'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c29a88e6-9271-4be3-b580-81a9bf17a4a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_freq_sz"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "985868ab-b0f1-41cc-9db0-86cfaed526bd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "freq_sz = df_freq_sz.loc[df_freq_sz['count'] == 1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d133a443-cdff-4603-8de5-ab0e80992eb4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "freq_sz"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c85ac1e5-27cf-4424-a223-eb281fb18708",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "freq_token_sz = freq_sz['word'].tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ca6a9d6c-cd55-4203-bc86-cf5fbd3fedca",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Build a set once so each membership test is O(1); scanning the list for every token\n",
+ "# makes this cell quadratic in practice.\n",
+ "set_token_sz = set(freq_token_sz)\n",
+ "sz['ocr'] = sz['ocr'].apply(lambda x: ' '.join([item for item in x.split() if item not in set_token_sz]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2006a836-24b6-4f10-97e0-0a7f4dc29ade",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz['ocr'] = sz['ocr'].replace(freq_token_sz,'')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5d524f41-3845-4790-9f18-5f3b71c14357",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Join with a space before splitting: Series.sum() glues adjacent documents together and\n",
+ "# undercounts by one token per document boundary.\n",
+ "len(' '.join(sz['ocr']).split())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3ec870ac-4118-4edf-b7eb-e4fc6a2e209f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz['ocr']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f6b8e213-3ce0-4cd6-901c-1aa588f548be",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "id": "e63ed713-e77c-4fd2-8dad-6638bb834d27",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8338515"
+ ]
+ },
+ "execution_count": 90,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sum(wz['ocr'].str.split().str.len())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7c58eec9-697f-4ad1-bbe2-1ac697958fc1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sum(sz['ocr'].str.split().str.len())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a1f6f45e-a1cd-46d7-bd10-fb45915d8bc7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Join documents with a space (Series.sum() would fuse words across document boundaries)\n",
+ "# and split on any whitespace so no empty-string token ends up in the vocabulary.\n",
+ "texts_wz_3 = ' '.join(wz['ocr'])\n",
+ "texts_wz_3 = texts_wz_3.split()\n",
+ "vocabulary_wz_3 = set(texts_wz_3)\n",
+ "len(vocabulary_wz_3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "809aa6ad-371e-406c-bc2a-6aa16a194c23",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "id": "ee3586e5-7cc7-472a-b636-002cb1db373c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wz.to_csv('wz_clean5.tsv', sep='\\t', index=True, header=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a96ba6dc-06bb-4014-a35c-cd79000c56df",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sz.to_csv('sz_clean.tsv', sep='\\t', index=True, header=True)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}