From 93d8d33d19dde703671c72214b293c47e0499e9f Mon Sep 17 00:00:00 2001 From: Thomas Kirchmair Date: Wed, 24 Aug 2022 10:53:50 +0000 Subject: [PATCH] Upload New File --- cleaning.ipynb | 2731 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2731 insertions(+) create mode 100644 cleaning.ipynb diff --git a/cleaning.ipynb b/cleaning.ipynb new file mode 100644 index 0000000..43f9cac --- /dev/null +++ b/cleaning.ipynb @@ -0,0 +1,2731 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "e81f9560-1a1b-4d93-93c3-84e1263b04f0", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import csv\n", + "import pathlib\n", + "import pandas as pd\n", + "import numpy as np\n", + "import nltk\n", + "import re\n", + "import string\n", + "from collections import Counter\n", + "from collections import defaultdict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85c59a91-3778-4973-9491-c99528fe39a3", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89bc3c22-b2f3-4389-b843-ebe2bed4ea35", + "metadata": {}, + "outputs": [], + "source": [ + "os.chdir(r\"C:\\Users\\onb1202\\OneDrive - Österreichische Nationalbibliothek\\Praktikum TK\\daten\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "346bdb53-f234-408a-b666-16ddcc10a8e1", + "metadata": {}, + "outputs": [], + "source": [ + "wz = pd.read_csv('wz_tok.tsv', sep='\\t', encoding='utf-8')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ca7bb501-ebc6-4b00-bf7b-cc6d7bd97927", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0ocrsplitmanifest_id
00['itz', '.', 'r', 'F-', 'Nro', '.', 'Sonnabmd'...validwrz17850101
11['5i', '.', '2Y', 'F', 'Mittwoch', 'den', '5.'...trainwrz17850105
22['57', '^', 'Sonnabend', 'den', '8.', 'Janer',...validwrz17850108
33['5k', '8i', 'F', 'Mittwoch', 'den', 'Iäner', ...trainwrz17850112
44['^', '109', '^', 'Sonnabend', 'den', '15.', '...trainwrz17850115
...............
12991299['Sonnabend', ',', 'den', '14', '*', 'December...trainwrz17991214
13001300['I', 'm', '4', '-', '77', \"i'\", 'jLiii', '.',...validwrz17991218
13011301['L', 'Sonnabend', ',', 'den', '21.', 'Decembe...validwrz17991221
13021302['WVr', '4', 'S73', 'ZMAr', 'if', '>', 'Mittew...testwrz17991225
13031303['Sonnabend', ',', 'den', 'rz', '«', 'December...testwrz17991228
\n", + "

1304 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 ocr split \\\n", + "0 0 ['itz', '.', 'r', 'F-', 'Nro', '.', 'Sonnabmd'... valid \n", + "1 1 ['5i', '.', '2Y', 'F', 'Mittwoch', 'den', '5.'... train \n", + "2 2 ['57', '^', 'Sonnabend', 'den', '8.', 'Janer',... valid \n", + "3 3 ['5k', '8i', 'F', 'Mittwoch', 'den', 'Iäner', ... train \n", + "4 4 ['^', '109', '^', 'Sonnabend', 'den', '15.', '... train \n", + "... ... ... ... \n", + "1299 1299 ['Sonnabend', ',', 'den', '14', '*', 'December... train \n", + "1300 1300 ['I', 'm', '4', '-', '77', \"i'\", 'jLiii', '.',... valid \n", + "1301 1301 ['L', 'Sonnabend', ',', 'den', '21.', 'Decembe... valid \n", + "1302 1302 ['WVr', '4', 'S73', 'ZMAr', 'if', '>', 'Mittew... test \n", + "1303 1303 ['Sonnabend', ',', 'den', 'rz', '«', 'December... test \n", + "\n", + " manifest_id \n", + "0 wrz17850101 \n", + "1 wrz17850105 \n", + "2 wrz17850108 \n", + "3 wrz17850112 \n", + "4 wrz17850115 \n", + "... ... \n", + "1299 wrz17991214 \n", + "1300 wrz17991218 \n", + "1301 wrz17991221 \n", + "1302 wrz17991225 \n", + "1303 wrz17991228 \n", + "\n", + "[1304 rows x 4 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wz" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e52205c3-1070-42f3-b84c-9647be21025f", + "metadata": {}, + "outputs": [], + "source": [ + "wz = wz.iloc[: , 1:]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4a63eddd-0bab-48ba-b4b6-428913702f69", + "metadata": {}, + "outputs": [], + "source": [ + "wz['year'] = wz['manifest_id'].str.findall(r\"(?<=\\D)\\d{4}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "142dc825-4002-4e82-81be-e52bf10ecfc4", + "metadata": {}, + "outputs": [], + "source": [ + "wz['year'] =wz['year'].str[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6d5c7b47-5a09-4887-b1b5-7e1acb7c882d", + "metadata": {}, + "outputs": [], + "source": [ + "wz['year'] = 
wz['year'].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "65beb413-379b-490a-9dfa-10c55434bbec", + "metadata": {}, + "outputs": [], + "source": [ + "wz = wz.loc[wz['year'] > 1788]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9da52b1d-4075-4a7d-80ff-d51b8205d00f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "cd67ebd7-256f-4a97-bae2-a1632f33730c", + "metadata": {}, + "outputs": [], + "source": [ + "sz = pd.read_csv('sz_tok.tsv', sep='\\t', encoding='utf-8')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4cc7d7a8-8dd6-445f-84d9-ea49833579e0", + "metadata": {}, + "outputs": [], + "source": [ + "sz" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "68ac11f8-9086-4918-a98b-1c70007f410c", + "metadata": {}, + "outputs": [], + "source": [ + "sz = sz.iloc[: , 1:]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "633734fb-a550-4e33-a5f0-c3dab670bed3", + "metadata": {}, + "outputs": [], + "source": [ + "sz['year'] = sz['manifest_id'].str.findall(r\"(?<=\\D)\\d{4}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9b088ef5-2ed5-40e6-861b-dc288e28e53a", + "metadata": {}, + "outputs": [], + "source": [ + "sz['year'] =sz['year'].str[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "04b30aa1-0271-41ea-9f98-576f35aac76b", + "metadata": {}, + "outputs": [], + "source": [ + "sz['year'] = sz['year'].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "990cbf1e-5095-4120-a25f-eb3ca4ba3c1d", + "metadata": {}, + "outputs": [], + "source": [ + "sz = sz.loc[sz['year'] > 1788]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e25fc89-f7c4-44f0-98a8-a226d19deb62", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": 
"82867c03-2673-496b-a8c8-ff835f269b69", + "metadata": {}, + "outputs": [], + "source": [ + "wz['ocr'] = wz['ocr'].str.strip('[]').astype(str)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "7bfa4688-7130-4c84-bc73-b4a25af45e7f", + "metadata": {}, + "outputs": [], + "source": [ + "sz['ocr'] = sz['ocr'].str.strip('[]').astype(str)" + ] + }, + { + "cell_type": "markdown", + "id": "b9101a01-1835-4240-8099-2ed785bd6bc4", + "metadata": {}, + "source": [ + "

check number of tokens

" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "1405137c-5925-4f76-b113-d130cb724535", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "34126677" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(wz['ocr'].str.split().str.len())" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "89637233-f308-4e07-8b9a-e97beb1ef723", + "metadata": {}, + "outputs": [], + "source": [ + "sum(sz['ocr'].str.split().str.len())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aa172ca-9690-475d-96b4-e9011c35d40f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "73d929a5-c41b-48c2-894a-22470cfc6b52", + "metadata": {}, + "source": [ + "

lowercase data

" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "581f9264-c3a6-467f-abbc-d6c2cc4a0c03", + "metadata": {}, + "outputs": [], + "source": [ + "wz['ocr'] = wz['ocr'].str.lower()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "9018a020-1f81-48a7-977f-df60db36bd5d", + "metadata": {}, + "outputs": [], + "source": [ + "sz['ocr'] = sz['ocr'].str.lower()" + ] + }, + { + "cell_type": "markdown", + "id": "2281771e-c941-4069-8edf-1aed8fee1a48", + "metadata": {}, + "source": [ + "

remove stopwords

" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "35ca035b-808d-40ff-b1f8-2bee8be06219", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to\n", + "[nltk_data] C:\\Users\\onb1202\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + } + ], + "source": [ + "from nltk.corpus import stopwords\n", + "nltk.download('stopwords')\n", + "from nltk.corpus import stopwords\n", + "stopwords = nltk.corpus.stopwords.words('german')\n", + "stopwords_manual = ( 'worden', 'daher', 'seyn', 'hiemit', 'immer', 'ganz', 'nebst', 'wider', 'schon', 'weder', 'sicher', 'sowohl', 'binnen', 'deto', 'chen', 'sicher', 'sowohl', 'eben', 'hiezu', 'samt', 'darüber', 'wäre', 'nebst' )\n", + "stopwords.extend(stopwords_manual)\n", + "stop = stopwords" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "8314c831-ab58-415c-bd62-56a1306d83f6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\onb1202\\AppData\\Local\\Temp\\ipykernel_6532\\3231380372.py:2: FutureWarning: The default value of regex will change from True to False in a future version.\n", + " wz['ocr'] = wz['ocr'].str.replace(pat, '')\n", + "C:\\Users\\onb1202\\AppData\\Local\\Temp\\ipykernel_6532\\3231380372.py:3: FutureWarning: The default value of regex will change from True to False in a future version.\n", + " wz['ocr'] = wz['ocr'].str.replace(r'\\s+', ' ')\n" + ] + } + ], + "source": [ + "pat = r'\\b(?:{})\\b'.format('|'.join(stop))\n", + "wz['ocr'] = wz['ocr'].str.replace(pat, '')\n", + "wz['ocr'] = wz['ocr'].str.replace(r'\\s+', ' ')" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "1b32c84b-e63c-4665-9af2-5b095901bf6e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "209 '', 'e', 'n', 'e', \"'r-\", 'ze', 'sonnabend', '...\n", + "210 '29', '^', 
'mittewoche', '', '7.', 'ianer', '1...\n", + "211 'hi', '.', 'l3', \"izvrenei'\", 'zeilunn', '.', ...\n", + "212 '8r', '«', 'v', 'mittewoche', '', '14.', 'ickn...\n", + "213 'sonnabend', '', '17.', \"ia'ner\", '1789', '.',...\n", + " ... \n", + "1299 'sonnabend', ',', '', '14', '*', 'december', '...\n", + "1300 'i', 'm', '4', '-', '77', \"i'\", 'jliii', '.', ...\n", + "1301 'l', 'sonnabend', ',', '', '21.', 'december', ...\n", + "1302 'wvr', '4', 's73', 'zmar', 'if', '>', 'mittewo...\n", + "1303 'sonnabend', ',', '', 'rz', '«', 'december', '...\n", + "Name: ocr, Length: 1095, dtype: object" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wz['ocr']" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "2eda7364-acfa-41ca-b56a-32c4d9cbe7fe", + "metadata": {}, + "outputs": [], + "source": [ + "pat = r'\\b(?:{})\\b'.format('|'.join(stop))\n", + "sz['ocr'] = sz['ocr'].str.replace(pat, '')\n", + "sz['ocr'] = sz['ocr'].str.replace(r'\\s+', ' ')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0534849d-7374-4f82-9ca7-9ec9adede7e3", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "14c691d7-5eb5-4f46-96c8-d3fbb0c81c9b", + "metadata": {}, + "outputs": [], + "source": [ + "wz['ocr'] = wz['ocr'].apply(lambda x: ' '.join([item for item in x.split() if item not in stopwords]))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "aa5bfa8a-c394-491e-ba00-56af0e88f3fc", + "metadata": {}, + "outputs": [], + "source": [ + "wz['ocr']" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "2d8c7bd1-6077-4f47-a9de-eae1ee1d83c9", + "metadata": {}, + "outputs": [], + "source": [ + "sz['ocr'] = sz['ocr'].apply(lambda x: ' '.join([item for item in x.split() if item not in stopwords]))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": 
"5c339a69-a320-4054-8743-e86f85adf92f", + "metadata": {}, + "outputs": [], + "source": [ + "sz['ocr']" + ] + }, + { + "cell_type": "markdown", + "id": "6fd1ff62-d603-4ab3-abdc-a9a5e84c3610", + "metadata": {}, + "source": [ + "

remove quotation marks

" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "caac81b8-4e6e-4017-926f-ddb5bad809e2", + "metadata": {}, + "outputs": [], + "source": [ + "wz['ocr'] = wz['ocr'].str.replace(\"'\", \"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "c49d9432-79d7-49b7-bd45-a54c3beacff0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ocrsplitmanifest_idyear
209, e, n, e, \"r-\", ze, sonnabend, , z., iäner, ,...testwrz178901031789
21029, ^, mittewoche, , 7., ianer, 1789, ., inlän...trainwrz178901071789
211hi, ., l3, \"izvrenei\", zeilunn, ., scnnabend, ...testwrz178901101789
2128r, «, v, mittewoche, , 14., ickner, 1739, ., ...trainwrz178901141789
213sonnabend, , 17., \"ianer\", 1789, ., inländisch...trainwrz178901171789
\n", + "
" + ], + "text/plain": [ + " ocr split manifest_id \\\n", + "209 , e, n, e, \"r-\", ze, sonnabend, , z., iäner, ,... test wrz17890103 \n", + "210 29, ^, mittewoche, , 7., ianer, 1789, ., inlän... train wrz17890107 \n", + "211 hi, ., l3, \"izvrenei\", zeilunn, ., scnnabend, ... test wrz17890110 \n", + "212 8r, «, v, mittewoche, , 14., ickner, 1739, ., ... train wrz17890114 \n", + "213 sonnabend, , 17., \"ianer\", 1789, ., inländisch... train wrz17890117 \n", + "\n", + " year \n", + "209 1789 \n", + "210 1789 \n", + "211 1789 \n", + "212 1789 \n", + "213 1789 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wz.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "9c2e0a02-df0b-42b0-b95d-674d6bdd64bb", + "metadata": {}, + "outputs": [], + "source": [ + "sz['ocr'] = sz['ocr'].str.replace(\"'\", \"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "e668bdbd-fe33-47db-b299-f32b38c3bbca", + "metadata": {}, + "outputs": [], + "source": [ + "sz.head()" + ] + }, + { + "cell_type": "markdown", + "id": "d382622b-0ec1-4327-a34e-23c296b991e6", + "metadata": {}, + "source": [ + "

remove commas

" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "a2cd53fb-da51-4fc5-8ad3-b1871d3d6387", + "metadata": {}, + "outputs": [], + "source": [ + "wz['ocr'] = wz['ocr'].str.replace(\",\", \"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "89811f64-5939-44df-972b-df803a82da24", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ocrsplitmanifest_idyear
209e n e \"r-\" ze sonnabend z. iäner 7 ^ 9. inlä...testwrz178901031789
21029 ^ mittewoche 7. ianer 1789 . inländische b...trainwrz178901071789
211hi . l3 \"izvrenei\" zeilunn . scnnabend dtii iv...testwrz178901101789
2128r « v mittewoche 14. ickner 1739 . mändische...trainwrz178901141789
213sonnabend 17. \"ianer\" 1789 . inländische bege...trainwrz178901171789
\n", + "
" + ], + "text/plain": [ + " ocr split manifest_id \\\n", + "209 e n e \"r-\" ze sonnabend z. iäner 7 ^ 9. inlä... test wrz17890103 \n", + "210 29 ^ mittewoche 7. ianer 1789 . inländische b... train wrz17890107 \n", + "211 hi . l3 \"izvrenei\" zeilunn . scnnabend dtii iv... test wrz17890110 \n", + "212 8r « v mittewoche 14. ickner 1739 . mändische... train wrz17890114 \n", + "213 sonnabend 17. \"ianer\" 1789 . inländische bege... train wrz17890117 \n", + "\n", + " year \n", + "209 1789 \n", + "210 1789 \n", + "211 1789 \n", + "212 1789 \n", + "213 1789 " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wz.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "61aeaeb6-e776-4a5b-a9db-ec1612a85643", + "metadata": {}, + "outputs": [], + "source": [ + "sz['ocr'] = sz['ocr'].str.replace(\",\", \"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "3d5b2d54-6857-4917-92f3-0233b584ad69", + "metadata": {}, + "outputs": [], + "source": [ + "sz.head()" + ] + }, + { + "cell_type": "markdown", + "id": "ced70950-9f74-42d5-8c4a-f3020fbb016b", + "metadata": {}, + "source": [ + "

remove whitespace before punctuation

" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "e523592c-d7f4-4fb9-9a77-abca1765975b", + "metadata": {}, + "outputs": [], + "source": [ + "wz['ocr'] = wz['ocr'].replace( { r\"\\s(?=[.,:;])+\" : '' }, regex = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "d84bf43d-9aa4-4732-a67e-1e27e1179cb2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ocrsplitmanifest_idyear
209e n e \"r-\" ze sonnabend z. iäner 7 ^ 9. inlä...testwrz178901031789
21029 ^ mittewoche 7. ianer 1789. inländische be...trainwrz178901071789
211hi. l3 \"izvrenei\" zeilunn. scnnabend dtii iv. ...testwrz178901101789
2128r « v mittewoche 14. ickner 1739. mändische ...trainwrz178901141789
213sonnabend 17. \"ianer\" 1789. inländische begeb...trainwrz178901171789
\n", + "
" + ], + "text/plain": [ + " ocr split manifest_id \\\n", + "209 e n e \"r-\" ze sonnabend z. iäner 7 ^ 9. inlä... test wrz17890103 \n", + "210 29 ^ mittewoche 7. ianer 1789. inländische be... train wrz17890107 \n", + "211 hi. l3 \"izvrenei\" zeilunn. scnnabend dtii iv. ... test wrz17890110 \n", + "212 8r « v mittewoche 14. ickner 1739. mändische ... train wrz17890114 \n", + "213 sonnabend 17. \"ianer\" 1789. inländische begeb... train wrz17890117 \n", + "\n", + " year \n", + "209 1789 \n", + "210 1789 \n", + "211 1789 \n", + "212 1789 \n", + "213 1789 " + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wz.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "f0e31e67-0c3c-44db-860c-c6f56e904010", + "metadata": {}, + "outputs": [], + "source": [ + "sz['ocr'] = sz['ocr'].replace( { r\"\\s(?=[.,:;])+\" : '' }, regex = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "e8e48a17-872b-4918-aa66-563b26c958ca", + "metadata": {}, + "outputs": [], + "source": [ + "sz.head()" + ] + }, + { + "cell_type": "markdown", + "id": "3e63a61f-e62a-4a21-9221-0e3b2baf280f", + "metadata": {}, + "source": [ + "

remove punctuation at end of words

" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "b312714a-118f-462f-850a-9f3a15f86a80", + "metadata": {}, + "outputs": [], + "source": [ + "wz['ocr'] = wz['ocr'].replace( { r\"w*[.?\\!*]\\s+\" : ' ' }, regex = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "611d5373-1286-427c-b97e-bab769aa96f6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ocrsplitmanifest_idyear
209e n e \"r-\" ze sonnabend z iäner 7 ^ 9 inländ...testwrz178901031789
21029 ^ mittewoche 7 ianer 1789 inländische bege...trainwrz178901071789
211hi l3 \"izvrenei\" zeilunn scnnabend dtii iv ^än...testwrz178901101789
2128r « v mittewoche 14 ickner 1739 mändische be...trainwrz178901141789
213sonnabend 17 \"ianer\" 1789 inländische begeben...trainwrz178901171789
\n", + "
" + ], + "text/plain": [ + " ocr split manifest_id \\\n", + "209 e n e \"r-\" ze sonnabend z iäner 7 ^ 9 inländ... test wrz17890103 \n", + "210 29 ^ mittewoche 7 ianer 1789 inländische bege... train wrz17890107 \n", + "211 hi l3 \"izvrenei\" zeilunn scnnabend dtii iv ^än... test wrz17890110 \n", + "212 8r « v mittewoche 14 ickner 1739 mändische be... train wrz17890114 \n", + "213 sonnabend 17 \"ianer\" 1789 inländische begeben... train wrz17890117 \n", + "\n", + " year \n", + "209 1789 \n", + "210 1789 \n", + "211 1789 \n", + "212 1789 \n", + "213 1789 " + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wz.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "8e2a3351-7568-4b0a-8702-2a64aa6dc185", + "metadata": {}, + "outputs": [], + "source": [ + "sz['ocr'] = sz['ocr'].replace( { r\"w*[.?\\!*]\\s+\" : ' ' }, regex = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "6e1255c8-9331-4bb3-970e-e34da5bdc988", + "metadata": {}, + "outputs": [], + "source": [ + "sz.head()" + ] + }, + { + "cell_type": "markdown", + "id": "b9af4de4-0cd7-4b83-8815-6a669f26a4e8", + "metadata": {}, + "source": [ + "

remove words containing numbers

" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "60ddf3c1-dbc9-4a69-bbe6-4437484131df", + "metadata": {}, + "outputs": [], + "source": [ + "wz['ocr'] = wz['ocr'].replace( { r\"\\w*\\d\\w*\" : '' }, regex = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "5e97a82e-8418-4e37-a34a-6ab1be80bb4c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ocrsplitmanifest_idyear
209e n e \"r-\" ze sonnabend z iäner ^ inländis...testwrz178901031789
210^ mittewoche ianer inländische begebenheit...trainwrz178901071789
211hi \"izvrenei\" zeilunn scnnabend dtii iv ^äner...testwrz178901101789
212« v mittewoche ickner mändische begebenhei...trainwrz178901141789
213sonnabend \"ianer\" inländische begebenheit «...trainwrz178901171789
\n", + "
" + ], + "text/plain": [ + " ocr split manifest_id \\\n", + "209 e n e \"r-\" ze sonnabend z iäner ^ inländis... test wrz17890103 \n", + "210 ^ mittewoche ianer inländische begebenheit... train wrz17890107 \n", + "211 hi \"izvrenei\" zeilunn scnnabend dtii iv ^äner... test wrz17890110 \n", + "212 « v mittewoche ickner mändische begebenhei... train wrz17890114 \n", + "213 sonnabend \"ianer\" inländische begebenheit «... train wrz17890117 \n", + "\n", + " year \n", + "209 1789 \n", + "210 1789 \n", + "211 1789 \n", + "212 1789 \n", + "213 1789 " + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wz.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "0006ce39-f7c1-4554-855f-036c878c35e6", + "metadata": {}, + "outputs": [], + "source": [ + "sz['ocr'] = sz['ocr'].replace( { r\"\\w*\\d\\w*\" : '' }, regex = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "791c92b5-7b44-47ec-9619-96fd6c6b3e35", + "metadata": {}, + "outputs": [], + "source": [ + "sz.head()" + ] + }, + { + "cell_type": "markdown", + "id": "0cc26679-7405-4baa-99bd-7c63ba7945ab", + "metadata": {}, + "source": [ + "

remove words containing punctuation/special characters

" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "6d80ee6a-6e7a-4377-b4b6-c2f918178269", + "metadata": {}, + "outputs": [], + "source": [ + "wz['ocr'] = wz['ocr'].replace( { r\"\\S*[\\!\\,\\.\\+\\$\\^\\&\\(\\)\\'\\?\\*\\<\\>\\:\\\"\\@\\\\\\/]+\\S+\" : '' }, regex = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "bcdb14ed-d068-49ea-b3c7-f25a2c5eff5d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ocrsplitmanifest_idyear
209e n e ze sonnabend z iäner ^ inländische ...testwrz178901031789
210^ mittewoche ianer inländische begebenheit...trainwrz178901071789
211hi zeilunn scnnabend dtii iv inländische b...testwrz178901101789
212« v mittewoche ickner mändische begebenhei...trainwrz178901141789
213sonnabend inländische begebenheit « « « ...trainwrz178901171789
\n", + "
" + ], + "text/plain": [ + " ocr split manifest_id \\\n", + "209 e n e ze sonnabend z iäner ^ inländische ... test wrz17890103 \n", + "210 ^ mittewoche ianer inländische begebenheit... train wrz17890107 \n", + "211 hi zeilunn scnnabend dtii iv inländische b... test wrz17890110 \n", + "212 « v mittewoche ickner mändische begebenhei... train wrz17890114 \n", + "213 sonnabend inländische begebenheit « « « ... train wrz17890117 \n", + "\n", + " year \n", + "209 1789 \n", + "210 1789 \n", + "211 1789 \n", + "212 1789 \n", + "213 1789 " + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wz.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "b6a507b8-e626-44c8-ace1-3fa4aa1690f1", + "metadata": {}, + "outputs": [], + "source": [ + "sz['ocr'] = sz['ocr'].replace( { r\"\\S*[\\!\\,\\.\\+\\$\\^\\&\\(\\)\\'\\?\\*\\<\\>\\:\\\"\\@\\\\\\/]+\\S+\" : '' }, regex = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "2353ec0a-6530-4e36-8142-6f178ee31937", + "metadata": {}, + "outputs": [], + "source": [ + "sz.head()" + ] + }, + { + "cell_type": "markdown", + "id": "d9244eea-ed90-47aa-8239-4bc5c3a03dac", + "metadata": {}, + "source": [ + "

remove hyphens and equals signs between words

" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "d2ca9b6e-e284-4cd3-95ec-a39ba4bd0f78", + "metadata": {}, + "outputs": [], + "source": [ + "wz['ocr'] = wz['ocr'].replace( { r\"w*\\-|=\" : '' }, regex = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "5cc2b794-39e8-45c9-a028-50c3d86bf73f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ocrsplitmanifest_idyear
209e n e ze sonnabend z iäner ^ inländische ...testwrz178901031789
210^ mittewoche ianer inländische begebenheit...trainwrz178901071789
211hi zeilunn scnnabend dtii iv inländische b...testwrz178901101789
212« v mittewoche ickner mändische begebenhei...trainwrz178901141789
213sonnabend inländische begebenheit « « « ...trainwrz178901171789
\n", + "
" + ], + "text/plain": [ + " ocr split manifest_id \\\n", + "209 e n e ze sonnabend z iäner ^ inländische ... test wrz17890103 \n", + "210 ^ mittewoche ianer inländische begebenheit... train wrz17890107 \n", + "211 hi zeilunn scnnabend dtii iv inländische b... test wrz17890110 \n", + "212 « v mittewoche ickner mändische begebenhei... train wrz17890114 \n", + "213 sonnabend inländische begebenheit « « « ... train wrz17890117 \n", + "\n", + " year \n", + "209 1789 \n", + "210 1789 \n", + "211 1789 \n", + "212 1789 \n", + "213 1789 " + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wz.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "e8dad840-dc08-416d-8aec-09fb6675dcd0", + "metadata": {}, + "outputs": [], + "source": [ + "sz['ocr'] = sz['ocr'].replace( { r\"w*\\-|=\" : '' }, regex = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "3f09ac51-5f23-4b0d-b185-98c4acd118de", + "metadata": {}, + "outputs": [], + "source": [ + "sz.head()" + ] + }, + { + "cell_type": "markdown", + "id": "1e9dea85-2fe8-4394-8e3a-45e4594a7c6e", + "metadata": {}, + "source": [ + "

remove all punctuation

" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "4462f01f-8f7e-4d2e-8fe7-83db114d9df3", + "metadata": {}, + "outputs": [], + "source": [ + "wz['ocr'] = wz['ocr'].replace( { r\"[^\\w\\s]\" : '' }, regex = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "39756f59-5b3d-4056-96a7-09e13e4850f1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ocrsplitmanifest_idyear
209e n e ze sonnabend z iäner inländische b...testwrz178901031789
210mittewoche ianer inländische begebenheit ...trainwrz178901071789
211hi zeilunn scnnabend dtii iv inländische b...testwrz178901101789
212v mittewoche ickner mändische begebenheit...trainwrz178901141789
213sonnabend inländische begebenheit mam...trainwrz178901171789
\n", + "
" + ], + "text/plain": [ + " ocr split manifest_id \\\n", + "209 e n e ze sonnabend z iäner inländische b... test wrz17890103 \n", + "210 mittewoche ianer inländische begebenheit ... train wrz17890107 \n", + "211 hi zeilunn scnnabend dtii iv inländische b... test wrz17890110 \n", + "212 v mittewoche ickner mändische begebenheit... train wrz17890114 \n", + "213 sonnabend inländische begebenheit mam... train wrz17890117 \n", + "\n", + " year \n", + "209 1789 \n", + "210 1789 \n", + "211 1789 \n", + "212 1789 \n", + "213 1789 " + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wz.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "40771a60-00de-4488-9eab-bb6b5d26cd8e", + "metadata": {}, + "outputs": [], + "source": [ + "sz['ocr'] = sz['ocr'].replace( { r\"[^\\w\\s]\" : '' }, regex = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "6617f24a-2d5e-4046-a419-6ce4cf1c2f83", + "metadata": {}, + "outputs": [], + "source": [ + "sz.head()" + ] + }, + { + "cell_type": "markdown", + "id": "fa46b891-dffb-4433-9d12-1dda1a01febf", + "metadata": {}, + "source": [ + "

remove words consisting of three or fewer characters

" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "e5447cba-5fc0-40c7-99c1-df90df5aae6b", + "metadata": {}, + "outputs": [], + "source": [ + "wz['ocr'] = wz['ocr'].replace( { r\"\\b\\w{1,3}\\b\" : '' }, regex = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "53f8d4b5-cc51-4ecd-8006-f79748b36edf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ocrsplitmanifest_idyear
209sonnabend iäner inländische begeben...testwrz178901031789
210mittewoche ianer inländische begebenheit ...trainwrz178901071789
211zeilunn scnnabend dtii inländische begeb...testwrz178901101789
212mittewoche ickner mändische begebenheite...trainwrz178901141789
213sonnabend inländische begebenheit mam...trainwrz178901171789
\n", + "
" + ], + "text/plain": [ + " ocr split manifest_id \\\n", + "209 sonnabend iäner inländische begeben... test wrz17890103 \n", + "210 mittewoche ianer inländische begebenheit ... train wrz17890107 \n", + "211 zeilunn scnnabend dtii inländische begeb... test wrz17890110 \n", + "212 mittewoche ickner mändische begebenheite... train wrz17890114 \n", + "213 sonnabend inländische begebenheit mam... train wrz17890117 \n", + "\n", + " year \n", + "209 1789 \n", + "210 1789 \n", + "211 1789 \n", + "212 1789 \n", + "213 1789 " + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wz.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "5a0c0461-4c23-4679-bc62-f7689ea49353", + "metadata": {}, + "outputs": [], + "source": [ + "sz['ocr'] = sz['ocr'].replace( { r\"\\b\\w{1,3}\\b\" : '' }, regex = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "2cc24462-5c73-4802-b738-014c089181a2", + "metadata": {}, + "outputs": [], + "source": [ + "sz.head()" + ] + }, + { + "cell_type": "markdown", + "id": "20c32401-64c5-4166-88e2-ba57a5f804d1", + "metadata": {}, + "source": [ + "

collapse runs of multiple whitespace characters into a single space

" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "cfb65a70-786b-4da0-9fde-b6a783e087e4", + "metadata": {}, + "outputs": [], + "source": [ + "wz['ocr'] = wz['ocr'].replace( { r\"\\s\\s+\" : ' ' }, regex = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "2e2356cf-2d59-4e84-8752-c080efd62434", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ocrsplitmanifest_idyear
209sonnabend iäner inländische begebenheiten pln...testwrz178901031789
210mittewoche ianer inländische begebenheit wien...trainwrz178901071789
211zeilunn scnnabend dtii inländische begebenhei...testwrz178901101789
212mittewoche ickner mändische begebenheiten krw...trainwrz178901141789
213sonnabend inländische begebenheit mamst horch ...trainwrz178901171789
\n", + "
" + ], + "text/plain": [ + " ocr split manifest_id \\\n", + "209 sonnabend iäner inländische begebenheiten pln... test wrz17890103 \n", + "210 mittewoche ianer inländische begebenheit wien... train wrz17890107 \n", + "211 zeilunn scnnabend dtii inländische begebenhei... test wrz17890110 \n", + "212 mittewoche ickner mändische begebenheiten krw... train wrz17890114 \n", + "213 sonnabend inländische begebenheit mamst horch ... train wrz17890117 \n", + "\n", + " year \n", + "209 1789 \n", + "210 1789 \n", + "211 1789 \n", + "212 1789 \n", + "213 1789 " + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wz.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d441b37-27be-4688-b927-b9ff2e68b56b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "12b46594-b85c-4f78-aa8f-dcc5bdbbae9d", + "metadata": {}, + "outputs": [], + "source": [ + "wz.to_csv('wz_initial_cleaned.tsv', sep='\\t', index=True, header=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "7a9da207-240a-47ee-9ca6-fc262c48946c", + "metadata": {}, + "outputs": [], + "source": [ + "sz['ocr'] = sz['ocr'].replace( { r\"\\s\\s+\" : ' ' }, regex = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "a16fef5a-7242-41da-ba2d-8ffd9c6ce2a5", + "metadata": {}, + "outputs": [], + "source": [ + "sz.head()" + ] + }, + { + "cell_type": "markdown", + "id": "bb556189-b8f3-4441-acd1-59940a3c47dd", + "metadata": { + "tags": [] + }, + "source": [ + "

remove rare words (wz: words occurring five times or fewer; sz: words occurring only one time)

" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "f01dcaeb-1901-462e-8630-e417d2869470", + "metadata": {}, + "outputs": [], + "source": [ + "texts = wz['ocr'].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "07ab63ef-5b1d-4be4-b526-cfbfb0655a2b", + "metadata": {}, + "outputs": [], + "source": [ + "texts = texts.split()" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "7b989ecb-f647-4cbd-896b-dbfe7c4cebb5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11258539" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(texts)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "4cfa31d7-1ffb-42c5-8ebf-71c503f7111c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('gemacht', 40810),\n", + " ('bekannt', 39128),\n", + " ('wien', 37478),\n", + " ('ersten', 27831),\n", + " ('joseph', 27448),\n", + " ('johann', 25810),\n", + " ('früh', 24408),\n", + " ('erscheinen', 24250),\n", + " ('herrschaft', 22730),\n", + " ('stock', 22341),\n", + " ('franz', 22142),\n", + " ('haus', 21875),\n", + " ('gläubiger', 20226),\n", + " ('mehr', 19099),\n", + " ('stadt', 18789),\n", + " ('mann', 18313),\n", + " ('wegen', 17831),\n", + " ('forderung', 16504),\n", + " ('verkaufen', 16463),\n", + " ('sammt', 15851)]" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "freqdist = Counter(texts)\n", + "freqdist.most_common(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "ed5c7471-a46a-4c21-802b-e6c66ee7e4aa", + "metadata": {}, + "outputs": [], + "source": [ + "df_freq_wz = pd.DataFrame.from_dict(freqdist, orient='index').reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "b9ddfd77-0dc8-4270-8912-aa7d02a7c234", + "metadata": {}, + "outputs": [], + "source": [ + "df_freq_wz = 
df_freq_wz.rename(columns={'index':'word', 0:'count'})" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "4ad203d1-8560-433d-9da3-25d49375c706", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
wordcount
0sonnabend536
1iäner1503
2inländische870
3begebenheiten1767
4plngsthta1
.........
2491253ftttb1
2491254bmvir1
2491255spünchrkn1
2491256eräuye1
2491257otivtl1
\n", + "

2491258 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " word count\n", + "0 sonnabend 536\n", + "1 iäner 1503\n", + "2 inländische 870\n", + "3 begebenheiten 1767\n", + "4 plngsthta 1\n", + "... ... ...\n", + "2491253 ftttb 1\n", + "2491254 bmvir 1\n", + "2491255 spünchrkn 1\n", + "2491256 eräuye 1\n", + "2491257 otivtl 1\n", + "\n", + "[2491258 rows x 2 columns]" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_freq_wz" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "34c0b1b7-69fb-4b6f-b94f-f204c7ea849d", + "metadata": {}, + "outputs": [], + "source": [ + "freq_wz = df_freq_wz.loc[df_freq_wz['count'] <= 5]" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "132fd305-59ff-468d-97af-8cd01b748c83", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
wordcount
4plngsthta1
5krfslgtt1
6bble1
9heinri3
10preasst1
.........
2491253ftttb1
2491254bmvir1
2491255spünchrkn1
2491256eräuye1
2491257otivtl1
\n", + "

2373250 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " word count\n", + "4 plngsthta 1\n", + "5 krfslgtt 1\n", + "6 bble 1\n", + "9 heinri 3\n", + "10 preasst 1\n", + "... ... ...\n", + "2491253 ftttb 1\n", + "2491254 bmvir 1\n", + "2491255 spünchrkn 1\n", + "2491256 eräuye 1\n", + "2491257 otivtl 1\n", + "\n", + "[2373250 rows x 2 columns]" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "freq_wz" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "7cd0dba8-0ab9-4c5f-a35e-f36ad8e2a5a0", + "metadata": {}, + "outputs": [], + "source": [ + "freq_token_wz = freq_wz['word'].tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "7f0bf4d6-e740-4eb6-8f7a-4c25692f293f", + "metadata": {}, + "outputs": [], + "source": [ + "set_token1 = set(freq_token_wz)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f715423-25cf-4927-87e4-ad7a7738d78f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "wz['ocr'] = wz['ocr'].apply(lambda x: ' '.join([item for item in x.split() if item not in freq_token_wz]))" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "f262dd53-3a14-4005-a6bd-3ba099b30415", + "metadata": {}, + "outputs": [], + "source": [ + "wz['ocr'] = wz['ocr'].apply(lambda x: ' '.join([item for item in x.split() if item not in set_token1]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "087cb3e2-3cb7-4a63-af4c-b70afedca0a2", + "metadata": {}, + "outputs": [], + "source": [ + "reg = re.compile(freq_token_wz)\n", + "rep = ''\n", + "text = wz['ocr']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7415d84c-21aa-4795-8f9b-afffa3e6ebbb", + "metadata": {}, + "outputs": [], + "source": [ + "def sub_replace(reg, rep, text):\n", + " output = re.sub(reg, rep, text)\n", + " return output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae158b9c-484a-48d6-8aa8-0cbfd8f33eba", + 
"metadata": {}, + "outputs": [], + "source": [ + "p = re.compile('|'.join(map(re.escape, freq_token_wz)))\n", + "wz['ocr'] = [p.sub('', text) for text in wz['ocr']] " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e852760f-6b80-4eb5-a52c-06b0c05fd0ce", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pat = r'\\b(?:{})\\b'.format('|'.join(freq_token_wz))\n", + "wz['ocr'] = wz['ocr'].str.replace(pat, '')\n", + "wz['ocr'] = wz['ocr'].str.replace(r'\\s+', ' ')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddb580d5-fe86-4915-8698-30d1fbe4253a", + "metadata": {}, + "outputs": [], + "source": [ + "wz['ocr'] = wz['ocr'].replace(freq_token_wz,'')" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "7ae47841-0769-42c1-9cf9-ab1a7adc757e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8337421" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(wz['ocr'].sum().split())" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "b46347ed-8252-4379-8371-bed9ea690ff5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "209 sonnabend iäner inländische begebenheiten bens...\n", + "210 mittewoche ianer inländische begebenheit wien ...\n", + "211 dtii inländische begebenheiten vorthei zugleic...\n", + "212 mittewoche begebenheiten sonntag hofe gottesdi...\n", + "213 sonnabend inländische begebenheit horch kret r...\n", + " ... 
\n", + "1299 sonnabend december inländische majestät habew ...\n", + "1300 sranj unft ober unftre gehn tlgen stieg turd m...\n", + "1301 sonnabend december inländische begebenheiten w...\n", + "1302 zmar mittewoche december inländische vegebenhe...\n", + "1303 sonnabend december inländische bege hell chris...\n", + "Name: ocr, Length: 1095, dtype: object" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wz['ocr']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85617731-7397-4913-9b19-37586ad6eee4", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f156a21e-bf1a-4f52-8594-24caab5e7132", + "metadata": {}, + "outputs": [], + "source": [ + "texts_sz = sz['ocr'].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fbd1c5c-202b-47c6-abeb-8905a2ffab27", + "metadata": {}, + "outputs": [], + "source": [ + "texts_sz = texts_sz.split()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ccf175d-6e1b-4e44-8c27-0169b907372f", + "metadata": {}, + "outputs": [], + "source": [ + "len(texts_sz)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec56b4c7-cf35-4fde-8e7f-0f8714b31d54", + "metadata": {}, + "outputs": [], + "source": [ + "freqdist_sz = Counter(texts_sz)\n", + "freqdist_sz.most_common(20)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b624c21-627e-4e5e-92d6-4f42b473f090", + "metadata": {}, + "outputs": [], + "source": [ + "df_freq_sz = pd.DataFrame.from_dict(freqdist_sz, orient='index').reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29db12a0-d868-4dbc-83a9-38a30709c053", + "metadata": {}, + "outputs": [], + "source": [ + "df_freq_sz = df_freq_sz.rename(columns={'index':'word', 0:'count'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"c29a88e6-9271-4be3-b580-81a9bf17a4a5", + "metadata": {}, + "outputs": [], + "source": [ + "df_freq_sz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "985868ab-b0f1-41cc-9db0-86cfaed526bd", + "metadata": {}, + "outputs": [], + "source": [ + "freq_sz = df_freq_sz.loc[df_freq_sz['count'] == 1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d133a443-cdff-4603-8de5-ab0e80992eb4", + "metadata": {}, + "outputs": [], + "source": [ + "freq_sz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c85ac1e5-27cf-4424-a223-eb281fb18708", + "metadata": {}, + "outputs": [], + "source": [ + "freq_token_sz = freq_sz['word'].tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca6a9d6c-cd55-4203-bc86-cf5fbd3fedca", + "metadata": {}, + "outputs": [], + "source": [ + "sz['ocr'] = sz['ocr'].apply(lambda x: ' '.join([item for item in x.split() if item not in freq_token_sz]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2006a836-24b6-4f10-97e0-0a7f4dc29ade", + "metadata": {}, + "outputs": [], + "source": [ + "sz['ocr'] = sz['ocr'].replace(freq_token_sz,'')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d524f41-3845-4790-9f18-5f3b71c14357", + "metadata": {}, + "outputs": [], + "source": [ + "len(sz['ocr'].sum().split())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ec870ac-4118-4edf-b7eb-e4fc6a2e209f", + "metadata": {}, + "outputs": [], + "source": [ + "sz['ocr']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6b8e213-3ce0-4cd6-901c-1aa588f548be", + "metadata": {}, + "outputs": [], + "source": [ + "sz" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "e63ed713-e77c-4fd2-8dad-6638bb834d27", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8338515" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "sum(wz['ocr'].str.split().str.len())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c58eec9-697f-4ad1-bbe2-1ac697958fc1", + "metadata": {}, + "outputs": [], + "source": [ + "sum(sz['ocr'].str.split().str.len())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1f6f45e-a1cd-46d7-bd10-fb45915d8bc7", + "metadata": {}, + "outputs": [], + "source": [ + "texts_wz_3 = wz['ocr'].sum()\n", + "texts_wz_3 = texts_wz_3.split(\" \")\n", + "vocabulary_wz_3 = set(texts_wz_3)\n", + "len(vocabulary_wz_3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "809aa6ad-371e-406c-bc2a-6aa16a194c23", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "ee3586e5-7cc7-472a-b636-002cb1db373c", + "metadata": {}, + "outputs": [], + "source": [ + "wz.to_csv('wz_clean5.tsv', sep='\\t', index=True, header=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a96ba6dc-06bb-4014-a35c-cd79000c56df", + "metadata": {}, + "outputs": [], + "source": [ + "sz.to_csv('sz_clean.tsv', sep='\\t', index=True, header=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- GitLab