Skip to content
Snippets Groups Projects
Commit 14f03ab6 authored by Thomas Kirchmair's avatar Thomas Kirchmair
Browse files

Upload New File

parent 49a991b8
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id:1c9cc762-eb3a-4e4a-8392-b38b897df498 tags:
``` python
import os
import csv
import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import statistics
import string
from collections import Counter
from collections import defaultdict
from sklearn.model_selection import train_test_split
import octis
```
%% Cell type:code id:7ea8760d-97e5-44ae-a271-1478989fd97b tags:
``` python
import gensim
import spacy
import sklearn
import torch
import libsvm
import flask
import sentence_transformers
import requests
import tomotopy
```
%% Cell type:code id:32ea9351-4c82-48dc-9d85-9888243f0d3f tags:
``` python
# Make the data folder the working directory so that the relative file names
# used by the read_csv / to_csv calls in this notebook resolve inside it.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
os.chdir(r"C:\Users\onb1202\OneDrive - Österreichische Nationalbibliothek\Praktikum TK\daten")
```
%% Cell type:code id:8f3e3f35-d073-47e8-9ca8-03d995120d3a tags:
``` python
# Small test corpus (the cells below are commented out and kept for reference only)
```
%% Cell type:code id:471091f8-cee3-4bff-bfeb-f403b879c830 tags:
``` python
#df = pd.read_csv('test_raw.tsv', sep = '\t', encoding='utf-8')
```
%% Cell type:code id:9091c711-de70-46b3-9e42-517c9ecf106d tags:
``` python
#from somajo import SoMaJo
#tokenizer = SoMaJo("de_CMC", split_camel_case=True)
```
%% Cell type:code id:669d3e3b-c043-4594-8e88-106020e3b945 tags:
``` python
#sentence = tokenizer.tokenize_text_file(df['ocr'], paragraph_separator="single_newlines")
```
%% Cell type:code id:b5012333-27a2-4f1c-ac15-170fa4e5fcb8 tags:
``` python
#type(sentence)
```
%% Cell type:code id:e83b6b9c-f515-4b18-9eeb-354ba8bb395d tags:
``` python
#import time
#df = pd.read_csv('test_raw.tsv', sep = '\t', encoding='utf-8')
#ts = time.time()
#for i, ocr in df['ocr'].items():
# sentences = [s for s in tokenizer.tokenize_text(ocr.split('\n'), parallel=4)]
# all_tokens = []
# for sentence in sentences[:15]:
# for t in sentence:
# all_tokens.append(t.text)
# df.at[i, 'ocr'] = all_tokens
#print(time.time() - ts)
#df.head()
```
%% Cell type:code id:9abcd566-9acd-45e1-88db-683b9a8859d9 tags:
``` python
# Whole WZ corpus (wrz.csv)
```
%% Cell type:code id:be803222-005f-4a68-9a27-228e255b32ce tags:
``` python
# Load the full WZ corpus. Later cells rely on its 'year', 'ocr' and
# 'manifest_id' columns.
wz = pd.read_csv('wrz.csv', sep = ',', encoding='utf-8')
```
%% Cell type:code id:d594f243-c1e4-4d25-a133-8d7ca4c8e58b tags:
``` python
# Work on a copy without the 'year' column; `wz` itself is left unchanged.
wz_token = wz.drop(columns='year')
```
%% Cell type:code id:4025980c-faf5-4d38-9043-edd155fc9eb6 tags:
``` python
# Add an empty 'split' column; it is filled with random partition labels below.
wz_token['split'] = pd.NA
```
%% Cell type:code id:45f97a6e-d8f2-447e-b036-093c3aa42742 tags:
``` python
# Partition labels. Their order lines up with the probabilities
# [0.6, 0.2, 0.2] used for the random draw below.
split = ['train', 'valid', 'test']
```
%% Cell type:code id:28694d81-e44f-4ace-8b19-9d09a5f05c59 tags:
``` python
# Draw one partition label per document in a single vectorised call.
# The fixed seed makes the 60/20/20 assignment reproducible, so re-running the
# notebook no longer moves documents between train/valid/test (the unseeded
# per-row np.random.choice produced a different split on every run).
rng = np.random.default_rng(42)
wz_token['split'] = rng.choice(split, size=len(wz_token), p=[0.6, 0.2, 0.2])
```
%% Cell type:code id:c56609e5-51a6-41b5-8b91-3687b3f8ec09 tags:
``` python
# Sanity check: number of documents that ended up in each partition.
wz_token['split'].value_counts()
```
%% Cell type:code id:f0f85ec8-8ba5-404e-8f63-198d83963e00 tags:
``` python
# Persist the untokenised corpus with its split labels as TSV.
# index=True also writes the row index as an unnamed first column; it comes
# back as 'Unnamed: 0' when the file is re-read and is dropped again later.
wz_token.to_csv('wz_raw.tsv', sep='\t', index=True, header=True)
```
%% Cell type:code id:75a511a5-326a-472a-a103-6794e7dc4d18 tags:
``` python
# Re-load the TSV written above.
# NOTE(review): the timing cell further down reads the same file again, so
# this load only matters for the tokenize_text_file call in between.
df = pd.read_csv('wz_raw.tsv', sep = '\t', encoding='utf-8')
```
%% Cell type:code id:4f5edd4f-e0f2-4e41-b04f-da490c659e0d tags:
``` python
# Build the SoMaJo tokenizer that both corpora (WZ and SZ) are run through.
# It uses the "de_CMC" language setting with camel-case splitting enabled.
from somajo import SoMaJo
tokenizer = SoMaJo("de_CMC", split_camel_case=True)
```
%% Cell type:code id:ee4959e5-e49c-4715-89f8-44d3e0c2ee9c tags:
``` python
# NOTE(review): SoMaJo documents tokenize_text_file as taking a file name or
# file object, but it is handed the 'ocr' Series here; this looks unintended,
# please confirm. The result is not consumed by any visible cell; the next
# cell tokenises each document via tokenize_text instead.
sentence = tokenizer.tokenize_text_file(df['ocr'], paragraph_separator="single_newlines")
```
%% Cell type:code id:023a1e55-e291-4ab9-a7e5-4f6916b063a5 tags:
``` python
import time

# Tokenise every WZ document and replace its raw OCR text with a flat list of
# token strings. The elapsed wall time in seconds is printed at the end.

# For a quick dry run, load the small test corpus instead:
#df = pd.read_csv('test_raw.tsv', sep = '\t', encoding='utf-8')
df = pd.read_csv('wz_raw.tsv', sep='\t', encoding='utf-8')

# perf_counter is a monotonic clock intended for measuring elapsed time.
ts = time.perf_counter()
for i, ocr in df['ocr'].items():
    all_tokens = []
    # Empty OCR cells are read back as NaN (a float), which has no .split();
    # such rows get an empty token list instead of aborting the whole run.
    if isinstance(ocr, str):
        # Each line of the OCR text is handed to SoMaJo as one paragraph.
        # NOTE(review): parallel=4 is passed per document, so worker start-up
        # is presumably paid once per row; measure before changing.
        sentences = tokenizer.tokenize_text(ocr.split('\n'), parallel=4)
        # Flatten sentences -> tokens and keep only the surface form.
        all_tokens = [t.text for sentence in sentences for t in sentence]
    df.at[i, 'ocr'] = all_tokens
print(time.perf_counter() - ts)
df.head()
```
%% Output
4156.50820016861
Unnamed: 0 manifest_id ocr \
0 0 wrz17850101 [itz, ., r, F-, Nro, ., Sonnabmd, den, r., ^an...
1 1 wrz17850105 [5i, ., 2Y, F, Mittwoch, den, 5., Iäner, ., 17...
2 2 wrz17850108 [57, ^, Sonnabend, den, 8., Janer, ., 1735, .,...
3 3 wrz17850112 [5k, 8i, F, Mittwoch, den, Iäner, ., 173, ?, ....
4 4 wrz17850115 [^, 109, ^, Sonnabend, den, 15., Ianer, ., 178...
split
0 valid
1 train
2 valid
3 train
4 train
%% Cell type:code id:eb2fcf60-6051-46b4-84b0-33b1b2bd1e41 tags:
``` python
# Drop the first column by position: 'Unnamed: 0', i.e. the row index that
# to_csv(index=True) wrote out and read_csv read back as ordinary data.
df = df.iloc[: , 1:]
```
%% Cell type:code id:5ec0718e-fbfe-40af-938f-5fc6d6ebc68a tags:
``` python
# Put the columns into the order ocr / split / manifest_id.
df = df.reindex(columns= ['ocr', 'split', 'manifest_id'])
```
%% Cell type:code id:35981c7d-9d7e-48d4-976b-671f32973096 tags:
``` python
# Notebook display of the tokenised WZ frame.
df
```
%% Output
ocr split manifest_id
0 [itz, ., r, F-, Nro, ., Sonnabmd, den, r., ^an... valid wrz17850101
1 [5i, ., 2Y, F, Mittwoch, den, 5., Iäner, ., 17... train wrz17850105
2 [57, ^, Sonnabend, den, 8., Janer, ., 1735, .,... valid wrz17850108
3 [5k, 8i, F, Mittwoch, den, Iäner, ., 173, ?, .... train wrz17850112
4 [^, 109, ^, Sonnabend, den, 15., Ianer, ., 178... train wrz17850115
... ... ... ...
1299 [Sonnabend, ,, den, 14, *, December, 1799, ..,... train wrz17991214
1300 [I, m, 4, -, 77, i', jLiii, ., 1, J, Is, i0, n... valid wrz17991218
1301 [L, Sonnabend, ,, den, 21., December, 1799, .,... valid wrz17991221
1302 [WVr, 4, S73, ZMAr, if, >, Mittewoche, ,, den,... test wrz17991225
1303 [Sonnabend, ,, den, rz, «, December, 1799, ., ... test wrz17991228
[1304 rows x 3 columns]
%% Cell type:code id:b8e6b43c-624e-440e-bc1a-f10f88f4b428 tags:
``` python
# Save the tokenised WZ corpus.
# NOTE(review): the 'ocr' cells hold Python lists at this point; they are
# presumably written as their string representation and would need parsing
# (e.g. ast.literal_eval) when the file is read back. Please confirm.
df.to_csv('wz_tok.tsv', sep='\t', index=True, header=True)
```
%% Cell type:code id:80fa2ade-390b-4229-94a0-e99bdd646e0a tags:
``` python
#df = pd.read_csv('wz_tok.tsv', sep='\t', encoding='utf-8')
```
%% Cell type:code id:641b1d29-fd70-4b18-b7be-78e6b8162cae tags:
``` python
```
%% Cell type:code id:97f038f1-f618-4266-b660-4f66a3ab8bc9 tags:
``` python
# Load the full SZ corpus (columns: manifest_id, year, ocr).
sz = pd.read_csv('sza.csv', sep=',', encoding='UTF-8')
```
%% Cell type:code id:532fb694-ccde-4906-ad0d-fca376262192 tags:
``` python
# Notebook display of the raw SZ frame.
sz
```
%% Output
manifest_id year ocr
0 sza1785bl01 1785 Salzburger\nI ntelligenzblatt.\n\nums-\n"" H e...
1 sza1785bl02 1785 -"-\n\nI Innländische, und auswärtige, besonde...
2 sza17850105 1785 \n\nSa|\n\ne mehr du Mensch bist, desto mehr g...
3 sza17850112 1785 -----\n\nva ar nicht die ehrlichkeit\n\n\n, so...
4 sza17850126 1785 ---\nT\nT--\n\nSalzburger\n\n- I. Verordnungen...
.. ... ... ...
625 sza17991207 1799 769\n\nSalzburger\n\n770\n\n---\n\nIntelligenz...
626 sza17991214 1799 785-\n\nSalzburger\n\nIntelligenzblatt.\n\n796...
627 sza17991221 1799 -\n\n--\n\n-\n\n99 E- S urger-\n\nSZ--- 2-- p-...
628 sza17991228 1799 Intelligenzblatt.\n\nLII. St. Sonnabend, den 2...
629 sza1799bl01 1799 –-–\n\n-------------------------------------\n...
[630 rows x 3 columns]
%% Cell type:code id:7efdb2a9-bfc3-4b88-ae19-a249a4fd21b3 tags:
``` python
# Work on a copy without the 'year' column; `sz` itself is left unchanged.
sz_token = sz.drop(columns='year')
```
%% Cell type:code id:3b33320e-924d-483c-82ca-428f4a9799c6 tags:
``` python
# Add an empty 'split' column; it is filled with random partition labels below.
sz_token['split'] = pd.NA
```
%% Cell type:code id:290dbc99-e3c2-4922-b20d-110e338b1b32 tags:
``` python
# Partition labels (same list as for the WZ corpus). Their order lines up
# with the probabilities [0.6, 0.2, 0.2] used for the random draw below.
split = ['train', 'valid', 'test']
```
%% Cell type:code id:3e0e95cf-82e0-473d-85bb-7a848cde595f tags:
``` python
# Draw one partition label per document in a single vectorised call.
# The fixed seed makes the 60/20/20 assignment reproducible, so re-running the
# notebook no longer moves documents between train/valid/test (the unseeded
# per-row np.random.choice produced a different split on every run).
rng = np.random.default_rng(42)
sz_token['split'] = rng.choice(split, size=len(sz_token), p=[0.6, 0.2, 0.2])
```
%% Cell type:code id:af6f0417-becb-4506-bb4d-31340052e51a tags:
``` python
# Sanity check: number of documents that ended up in each partition.
sz_token['split'].value_counts()
```
%% Output
train 367
valid 139
test 124
Name: split, dtype: int64
%% Cell type:code id:02a44c70-f3f0-4d76-9682-b7e0c543a665 tags:
``` python
# Put the columns into the order ocr / split / manifest_id before saving.
sz_token = sz_token.reindex(columns= ['ocr', 'split', 'manifest_id'])
```
%% Cell type:code id:1219234f-7ca7-4ec8-b1d1-fdb8008e7ad3 tags:
``` python
# Persist the untokenised SZ corpus with its split labels as TSV.
# index=True also writes the row index as an unnamed first column; it comes
# back as 'Unnamed: 0' when the file is re-read and is dropped again later.
sz_token.to_csv('sz_raw.tsv',sep='\t',index=True,header=True)
```
%% Cell type:code id:e159229d-4156-4c39-af18-f06f32c5a1d2 tags:
``` python
# Re-load the TSV written above; from here on `df` refers to the SZ corpus.
# NOTE(review): the timing cell further down reads the same file again, so
# this load only matters for the tokenize_text_file call in between.
df = pd.read_csv('sz_raw.tsv', sep = '\t', encoding='utf-8')
```
%% Cell type:code id:3b152ed7-2f4d-4244-9cc5-a3c573a9a52f tags:
``` python
# NOTE(review): SoMaJo documents tokenize_text_file as taking a file name or
# file object, but it is handed the 'ocr' Series here; this looks unintended,
# please confirm. The result is not consumed by any visible cell; the next
# cell tokenises each document via tokenize_text instead.
sentence = tokenizer.tokenize_text_file(df['ocr'], paragraph_separator="single_newlines")
```
%% Cell type:code id:0d527d61-c6da-4562-9c8f-a94cf0a97931 tags:
``` python
import time

# Tokenise every SZ document and replace its raw OCR text with a flat list of
# token strings. The elapsed wall time in seconds is printed at the end.

# For a quick dry run, load the small test corpus instead:
#df = pd.read_csv('test_raw.tsv', sep = '\t', encoding='utf-8')
df = pd.read_csv('sz_raw.tsv', sep='\t', encoding='utf-8')

# perf_counter is a monotonic clock intended for measuring elapsed time.
ts = time.perf_counter()
for i, ocr in df['ocr'].items():
    all_tokens = []
    # Empty OCR cells are read back as NaN (a float), which has no .split();
    # such rows get an empty token list instead of aborting the whole run.
    if isinstance(ocr, str):
        # Each line of the OCR text is handed to SoMaJo as one paragraph.
        # NOTE(review): parallel=4 is passed per document, so worker start-up
        # is presumably paid once per row; measure before changing.
        sentences = tokenizer.tokenize_text(ocr.split('\n'), parallel=4)
        # Flatten sentences -> tokens and keep only the surface form.
        all_tokens = [t.text for sentence in sentences for t in sentence]
    df.at[i, 'ocr'] = all_tokens
print(time.perf_counter() - ts)
df.head()
```
%% Output
558.7360711097717
Unnamed: 0 ocr split \
0 0 [Salzburger, I, ntelligenzblatt, ., ums-, ", "... train
1 1 [-, ", -, I, Innländische, ,, und, auswärtige,... train
2 2 [Sa|, e, mehr, du, Mensch, bist, ,, desto, meh... valid
3 3 [-----, va, ar, nicht, die, ehrlichkeit, ,, so... train
4 4 [---, T, T--, Salzburger, -, I., Verordnungen,... test
manifest_id
0 sza1785bl01
1 sza1785bl02
2 sza17850105
3 sza17850112
4 sza17850126
%% Cell type:code id:57bea547-6fba-42da-8e26-42c4f02380d5 tags:
``` python
# Drop the first column by position: 'Unnamed: 0', i.e. the row index that
# to_csv(index=True) wrote out and read_csv read back as ordinary data.
df = df.iloc[: , 1:]
```
%% Cell type:code id:01cfbbf3-ee87-4614-b454-e48859b50504 tags:
``` python
# Notebook display of the tokenised SZ frame.
df
```
%% Output
ocr split manifest_id
0 [Salzburger, I, ntelligenzblatt, ., ums-, ", "... train sza1785bl01
1 [-, ", -, I, Innländische, ,, und, auswärtige,... train sza1785bl02
2 [Sa|, e, mehr, du, Mensch, bist, ,, desto, meh... valid sza17850105
3 [-----, va, ar, nicht, die, ehrlichkeit, ,, so... train sza17850112
4 [---, T, T--, Salzburger, -, I., Verordnungen,... test sza17850126
.. ... ... ...
625 [769, Salzburger, 770, ---, Intelligenzblatt, ... valid sza17991207
626 [785-, Salzburger, Intelligenzblatt, ., 796, -... train sza17991214
627 [-, --, -, 99, E-, S, urger-, SZ---, 2-, -, p-... train sza17991221
628 [Intelligenzblatt, ., LII, ., St., Sonnabend, ... train sza17991228
629 [–, -, –, ------------------------------------... test sza1799bl01
[630 rows x 3 columns]
%% Cell type:code id:45f88e29-5854-4b29-b514-58215326bf9d tags:
``` python
# Save the tokenised SZ corpus.
# NOTE(review): the 'ocr' cells hold Python lists at this point; they are
# presumably written as their string representation and would need parsing
# (e.g. ast.literal_eval) when the file is read back. Please confirm.
df.to_csv('sz_tok.tsv', sep='\t', index=True, header=True)
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment