Skip to content
cleaning.ipynb 73.1 KiB
Newer Older
Thomas Kirchmair's avatar
Thomas Kirchmair committed
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e81f9560-1a1b-4d93-93c3-84e1263b04f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import csv\n",
    "import pathlib\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import nltk\n",
    "import re\n",
    "import string\n",
    "from collections import Counter\n",
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85c59a91-3778-4973-9491-c99528fe39a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "\n",
    "# Install a global filter that ignores every warning raised from this point on.\n",
    "warnings.filterwarnings(action=\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89bc3c22-b2f3-4389-b843-ebe2bed4ea35",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Working directory for the notebook; wz_tok.tsv is read relative to it.\n",
    "# Set the WZ_DATA_DIR environment variable to run the notebook on another machine.\n",
    "# The original local path stays as the fallback so the existing setup keeps working.\n",
    "DATA_DIR = pathlib.Path(\n",
    "    os.environ.get(\n",
    "        \"WZ_DATA_DIR\",\n",
    "        r\"C:\\Users\\onb1202\\OneDrive - Österreichische Nationalbibliothek\\Praktikum TK\\daten\",\n",
    "    )\n",
    ")\n",
    "# os.chdir raises FileNotFoundError if the directory does not exist.\n",
    "os.chdir(DATA_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "346bdb53-f234-408a-b666-16ddcc10a8e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the tab-separated, UTF-8 encoded file wz_tok.tsv into the DataFrame `wz`.\n",
    "# NOTE(review): the displayed frame has an 'Unnamed: 0' column that mirrors the index;\n",
    "# index_col=0 would drop it - confirm no later cell depends on that column first.\n",
    "wz = pd.read_csv(\n",
    "    'wz_tok.tsv',\n",
    "    sep='\\t',\n",
    "    encoding='utf-8',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ca7bb501-ebc6-4b00-bf7b-cc6d7bd97927",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>ocr</th>\n",
       "      <th>split</th>\n",
       "      <th>manifest_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>['itz', '.', 'r', 'F-', 'Nro', '.', 'Sonnabmd'...</td>\n",
       "      <td>valid</td>\n",
       "      <td>wrz17850101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>['5i', '.', '2Y', 'F', 'Mittwoch', 'den', '5.'...</td>\n",
       "      <td>train</td>\n",
       "      <td>wrz17850105</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>['57', '^', 'Sonnabend', 'den', '8.', 'Janer',...</td>\n",
       "      <td>valid</td>\n",
       "      <td>wrz17850108</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>['5k', '8i', 'F', 'Mittwoch', 'den', 'Iäner', ...</td>\n",
       "      <td>train</td>\n",
       "      <td>wrz17850112</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>['^', '109', '^', 'Sonnabend', 'den', '15.', '...</td>\n",
       "      <td>train</td>\n",
       "      <td>wrz17850115</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1299</th>\n",
       "      <td>1299</td>\n",
       "      <td>['Sonnabend', ',', 'den', '14', '*', 'December...</td>\n",
       "      <td>train</td>\n",
       "      <td>wrz17991214</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1300</th>\n",
       "      <td>1300</td>\n",
       "      <td>['I', 'm', '4', '-', '77', \"i'\", 'jLiii', '.',...</td>\n",
       "      <td>valid</td>\n",
       "      <td>wrz17991218</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1301</th>\n",
       "      <td>1301</td>\n",
       "      <td>['L', 'Sonnabend', ',', 'den', '21.', 'Decembe...</td>\n",
       "      <td>valid</td>\n",
       "      <td>wrz17991221</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1302</th>\n",
       "      <td>1302</td>\n",
       "      <td>['WVr', '4', 'S73', 'ZMAr', 'if', '&gt;', 'Mittew...</td>\n",
       "      <td>test</td>\n",
       "      <td>wrz17991225</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1303</th>\n",
       "      <td>1303</td>\n",
       "      <td>['Sonnabend', ',', 'den', 'rz', '«', 'December...</td>\n",
       "      <td>test</td>\n",
       "      <td>wrz17991228</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1304 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      Unnamed: 0                                                ocr  split  \\\n",
       "0              0  ['itz', '.', 'r', 'F-', 'Nro', '.', 'Sonnabmd'...  valid   \n",
       "1              1  ['5i', '.', '2Y', 'F', 'Mittwoch', 'den', '5.'...  train   \n",
       "2              2  ['57', '^', 'Sonnabend', 'den', '8.', 'Janer',...  valid   \n",
       "3              3  ['5k', '8i', 'F', 'Mittwoch', 'den', 'Iäner', ...  train   \n",
       "4              4  ['^', '109', '^', 'Sonnabend', 'den', '15.', '...  train   \n",
       "...          ...                                                ...    ...   \n",
       "1299        1299  ['Sonnabend', ',', 'den', '14', '*', 'December...  train   \n",
       "1300        1300  ['I', 'm', '4', '-', '77', \"i'\", 'jLiii', '.',...  valid   \n",
       "1301        1301  ['L', 'Sonnabend', ',', 'den', '21.', 'Decembe...  valid   \n",
       "1302        1302  ['WVr', '4', 'S73', 'ZMAr', 'if', '>', 'Mittew...   test   \n",
       "1303        1303  ['Sonnabend', ',', 'den', 'rz', '«', 'December...   test   \n",
       "\n",
       "      manifest_id  \n",
       "0     wrz17850101  \n",
       "1     wrz17850105  \n",
       "2     wrz17850108  \n",
       "3     wrz17850112  \n",
       "4     wrz17850115  \n",
       "...           ...  \n",
       "1299  wrz17991214  \n",
       "1300  wrz17991218  \n",
       "1301  wrz17991221  \n",
       "1302  wrz17991225  \n",
       "1303  wrz17991228  \n",
       "\n",
       "[1304 rows x 4 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
Loading
Loading full blame...