Newer
Older
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"df = pd.read_csv('https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/anno_labs_issues.csv.bz2', compression='bz2')"
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>manifest_id</th>\n",
" <th>aid</th>\n",
" <th>year</th>\n",
" <th>day</th>\n",
" <th>dc_title</th>\n",
" <th>dc_title_additional</th>\n",
" <th>subjects</th>\n",
" <th>place_of_publications</th>\n",
" <th>languages</th>\n",
" <th>dc_type</th>\n",
" <th>...</th>\n",
" <th>modification_datetime</th>\n",
" <th>longer_page_id</th>\n",
" <th>dc_date</th>\n",
" <th>link_pdf</th>\n",
" <th>link_old</th>\n",
" <th>has_ocr</th>\n",
" <th>meta_id</th>\n",
" <th>page_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>fug15050701</td>\n",
" <td>fug</td>\n",
" <td>1505</td>\n",
" <td>15050701</td>\n",
" <td>Fugger - Zeitungen</td>\n",
" <td>Tageszeitung</td>\n",
" <td>o.O.</td>\n",
" <td>...</td>\n",
" <td>2013-06-27 13:28:35</td>\n",
" <td>1</td>\n",
" <td>1505-07-01</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
" <td>1122198</td>\n",
" <td>2</td>\n",
" <td>fug15680120</td>\n",
" <td>fug</td>\n",
" <td>1568</td>\n",
" <td>15680120</td>\n",
" <td>Fugger - Zeitungen</td>\n",
" <td>Tageszeitung</td>\n",
" <td>o.O.</td>\n",
" <td>...</td>\n",
" <td>2013-06-27 13:27:59</td>\n",
" <td>1</td>\n",
" <td>1568-01-20</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
" <td>1122201</td>\n",
" <td>1</td>\n",
" <td>fug15680124</td>\n",
" <td>fug</td>\n",
" <td>1568</td>\n",
" <td>15680124</td>\n",
" <td>Fugger - Zeitungen</td>\n",
" <td>Tageszeitung</td>\n",
" <td>o.O.</td>\n",
" <td>...</td>\n",
" <td>2013-06-27 13:27:58</td>\n",
" <td>1</td>\n",
" <td>1568-01-24</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
" <td>1122202</td>\n",
" <td>2</td>\n",
" <td>fug15680228</td>\n",
" <td>fug</td>\n",
" <td>1568</td>\n",
" <td>15680228</td>\n",
" <td>Fugger - Zeitungen</td>\n",
" <td>Tageszeitung</td>\n",
" <td>o.O.</td>\n",
" <td>...</td>\n",
" <td>2013-06-27 13:27:59</td>\n",
" <td>1</td>\n",
" <td>1568-02-28</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
" <td>1122203</td>\n",
" <td>1</td>\n",
" <td>fug15680304</td>\n",
" <td>fug</td>\n",
" <td>1568</td>\n",
" <td>15680304</td>\n",
" <td>Fugger - Zeitungen</td>\n",
" <td>Tageszeitung</td>\n",
" <td>o.O.</td>\n",
" <td>...</td>\n",
" <td>2013-06-27 13:27:58</td>\n",
" <td>1</td>\n",
" <td>1568-03-04</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
" <td>1122204</td>\n",
" <td>4</td>\n",
"<p>5 rows × 21 columns</p>\n",
" manifest_id aid year day dc_title dc_title_additional \\\n",
"0 fug15050701 fug 1505 15050701 Fugger - Zeitungen NaN \n",
"1 fug15680120 fug 1568 15680120 Fugger - Zeitungen NaN \n",
"2 fug15680124 fug 1568 15680124 Fugger - Zeitungen NaN \n",
"3 fug15680228 fug 1568 15680228 Fugger - Zeitungen NaN \n",
"4 fug15680304 fug 1568 15680304 Fugger - Zeitungen NaN \n",
" subjects place_of_publications languages dc_type ... meta_type \\\n",
"0 Tageszeitung o.O. de newspaper ... brz \n",
"1 Tageszeitung o.O. de newspaper ... brz \n",
"2 Tageszeitung o.O. de newspaper ... brz \n",
"3 Tageszeitung o.O. de newspaper ... brz \n",
"4 Tageszeitung o.O. de newspaper ... brz \n",
" ini_type modification_datetime longer_page_id dc_date \\\n",
"0 anno 2013-06-27 13:28:35 1 1505-07-01 \n",
"1 anno 2013-06-27 13:27:59 1 1568-01-20 \n",
"2 anno 2013-06-27 13:27:58 1 1568-01-24 \n",
"3 anno 2013-06-27 13:27:59 1 1568-02-28 \n",
"4 anno 2013-06-27 13:27:58 1 1568-03-04 \n",
"\n",
" link_pdf \\\n",
"0 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n",
"1 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n",
"2 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n",
"3 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n",
"4 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n",
"\n",
" link_old has_ocr meta_id \\\n",
"0 http://anno.onb.ac.at/cgi-content/anno?aid=fug... 0 1122198 \n",
"1 http://anno.onb.ac.at/cgi-content/anno?aid=fug... 0 1122201 \n",
"2 http://anno.onb.ac.at/cgi-content/anno?aid=fug... 0 1122202 \n",
"3 http://anno.onb.ac.at/cgi-content/anno?aid=fug... 0 1122203 \n",
"4 http://anno.onb.ac.at/cgi-content/anno?aid=fug... 0 1122204 \n",
"\n",
" page_count \n",
"0 2 \n",
"1 1 \n",
"2 2 \n",
"3 1 \n",
"4 4 \n",
"\n",
"[5 rows x 21 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['(Linzer) Tages-Post',\n",
" '(Neue) Wiener Schachzeitung',\n",
" '(Neuigkeits) Welt Blatt',\n",
" 'Agramer Zeitung',\n",
" 'Allgemeine Bauzeitung',\n",
" 'Allgemeine Zeitschrift für Lehrerinnen',\n",
" 'Allgemeine land- und forstwirthschaftliche Zeitung',\n",
" 'Allgemeine musikalische Zeitung',\n",
" 'Allgemeine Österreichische Gerichtszeitung',\n",
" 'Amtliches Cursblatt der Wiener Börse',\n",
" 'Bade- und Reise-Journal',\n",
" 'Bericht über die Wirksamkeit des (...) Frauenvereines für Arbeitsschulen',\n",
" 'Blätter für Musik, Theater und Kunst',\n",
" 'Bregenzer Wochenblatt',\n",
" 'Brixner Diözesanblatt',\n",
" 'Buchdrucker-Zeitung',\n",
" 'Bukowina',\n",
" 'Carinthia. Zeitschrift für Vaterlandskunde, Belehrung und Unterhaltung',\n",
" 'Christliche Kunstblätter',\n",
" 'Cur- und Fremden-Liste des Curortes Baden bei Wien',\n",
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
" 'Das Vaterland',\n",
" 'Der Floh',\n",
" 'Der Humorist',\n",
" 'Der Militärarzt',\n",
" 'Der Tresor',\n",
" 'Der Zwischen-Akt',\n",
" 'Der Österreichische Schulbote',\n",
" 'Deutsche Musik-Zeitung',\n",
" 'Deutsche Zeitung',\n",
" 'Die Bombe',\n",
" 'Die Debatte',\n",
" 'Die Emancipation. Zeitschrift für Frauen',\n",
" 'Die Feuerwehr',\n",
" 'Die Gartenlaube für Österreich',\n",
" 'Die Hausfrau: Blätter für Haus und Wirthschaft',\n",
" 'Die Neuzeit',\n",
" 'Die Presse',\n",
" 'Die Vedette',\n",
" 'Eideseis dia ta anatolika mere',\n",
" 'Ephemeris',\n",
" 'Extract-Schreiben (Europaeische Zeitung)',\n",
" 'Feldkircher Anzeiger',\n",
" 'Feldkircher Wochenblatt',\n",
" 'Feldkircher Zeitung',\n",
" 'Figaro',\n",
" 'Frauenblätter',\n",
" 'Freie Pädagogische Blätter',\n",
" 'Fremden-Blatt',\n",
" \"Fromme's Österreichischer Feuerwehr-Kalender\",\n",
" 'Fugger - Zeitungen',\n",
" 'Gerichtshalle',\n",
" 'Grazer Volksblatt',\n",
" 'Halleiner Bothe',\n",
" 'Hellenikos telegraphos',\n",
" 'Hermes ho logios',\n",
" 'Illustriertes Österreichisches Journal',\n",
" 'Illustrirtes Wiener Extrablatt',\n",
" 'Innsbrucker Nachrichten',\n",
" 'Internationale Ausstellungs-Zeitung',\n",
" 'Ischler Bade-Liste',\n",
" 'Ischler Fremden-Salon',\n",
" 'Jahrbuch des Voralberger Landesmuseumsvereins',\n",
" 'Jahresbericht Akademisches Gymnasium Wien',\n",
" 'Jahresbericht Josefstädter Obergymnasium',\n",
" 'Jahresbericht Schottengymnasium Wien',\n",
" 'Jahresbericht Staats-Unterrealschule Margareten',\n",
" 'Jahresbericht der städtischen Volksschule für Mädchen',\n",
" 'Jahresbericht des Frauen-Wohlthätigkeits-Vereines für Wien und Umgebung',\n",
" 'Jahresbericht des Männergesangsvereines in Wien',\n",
" 'Jahresbericht des k.k. Maximiliangymnasium in Wien',\n",
" 'Jahresbericht über die israelitische Kinderbewahr-Anstalt zu Wien, Leopoldstadt, Schiffamtsgasse No 773',\n",
" 'Journal des Österreichischen Lloyd',\n",
" 'Journal für Freymaurer: Als Manuskript gedruckt für Brüder und Meister des Ordens',\n",
" 'Juristische Blätter',\n",
" 'Jörgel Briefe',\n",
" 'Kaufmännische Zeitschrift',\n",
" 'Kikeriki',\n",
" 'Klagenfurter Zeitung',\n",
" 'Kunst und Volk',\n",
" 'Laibacher Diöcesanblatt',\n",
" 'Leitmeritzer Zeitung',\n",
" 'Linzer Diözesanblatt',\n",
" 'Linzer Volksblatt',\n",
" 'Marburger Zeitung',\n",
" 'Mitteilungen der Gesellschaft für Salzburger Landeskunde',\n",
" 'Mittheilungen der kaiserl. königl. Central-Commission zur Erforschung und Erhaltung der Baudenkmale',\n",
" 'Mittheilungen der kaiserlich-königlichen Geographischen Gesellschaft',\n",
" 'Monatsschrift für den Orient',\n",
" 'Morgen-Post',\n",
" 'Musikalisch-literarischer Monatsbericht über neue Musikalien, musikalische Schriften und Abbildungen',\n",
" 'Musikalisches Wochenblatt',\n",
" 'Nasa Sloga',\n",
" 'Neue Freie Presse',\n",
" 'Neue Illustrirte Zeitung',\n",
" 'Neue Wiener Musik-Zeitung',\n",
" 'Neue Zeitschrift für Musik',\n",
" 'Neues Fremden-Blatt',\n",
" 'Nordböhmisches Volksblatt',\n",
" 'Oesterreichische Buchhändler-Correspondenz',\n",
" 'Oesterreichischer Soldatenfreund',\n",
" 'Oesterreichisches Journal',\n",
" 'Ordinariats-Blatt der Budweiser Diöcese',\n",
" 'Ost-Deutsche Post',\n",
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
" 'Philologikos telegraphos',\n",
" 'Photographische Correspondenz',\n",
" 'Pilsner Abendpost',\n",
" 'Pilsner Fremdenblatt',\n",
" 'Politische Frauen-Zeitung',\n",
" 'Populäre österreichische Gesundheits-Zeitung',\n",
" 'Prager Abendblatt',\n",
" 'Prager Tagblatt',\n",
" 'Salzburger Bote',\n",
" 'Salzburger Chronik für Stadt und Land',\n",
" 'Salzburger Volksblatt: unabh. Tageszeitung f. Stadt u. Land Salzburg',\n",
" 'Siebenbürgisch-Deutsches Wochenblatt',\n",
" 'Signale für die musikalische Welt',\n",
" 'Social-politische Frauen-Zeitung. Organ für die Gesammt-Interessen des Frauenlebens. ',\n",
" 'Sonntagsblätter',\n",
" 'Statistische Monatsschrift',\n",
" 'Steyrer Zeitung',\n",
" 'Teplitz-Schönauer Anzeiger',\n",
" 'Theater an der Wien - Theaterzettel',\n",
" 'Theaterzettel (Oper und Burgtheater in Wien)',\n",
" 'Union. Zeitschrift für Versicherungswesen und Volkswirtschaft',\n",
" 'Unterhaltungs-Blatt zum Alzeyer Anzeigeblatt',\n",
" 'Vaterländische Blätter',\n",
" 'Volksblatt für Stadt und Land',\n",
" 'Volksblätter aus Salzburg',\n",
" 'Vorarlberger Landes-Zeitung',\n",
" 'Vorarlberger Volksblatt',\n",
" 'Vorarlberger Zeitung',\n",
" 'Wiener Abendzeitung',\n",
" 'Wiener Diözesanblatt',\n",
" 'Wiener Feuerwehrzeitung',\n",
" 'Wiener Kommunal-Kalender und städtisches Jahrbuch',\n",
" 'Wiener Landwirtschaftliche Zeitung',\n",
" 'Wiener Medizinische Wochenschrift',\n",
" 'Wiener Moden Zeitung',\n",
" 'Wiener Salonblatt',\n",
" 'Wiener Sonn- und Montags-Zeitung',\n",
" 'Wiener Theater-Zeitung (Bäuerles Theaterzeitung)',\n",
" 'Wiener Vororte-Zeitung',\n",
" 'Wiener Zeitung',\n",
" 'Wienerische Kirchenzeitung',\n",
" 'Wochenzeitschrift des ö. Ingenieur- und Architektenvereins',\n",
" 'Wr. Weltaustellungs-Zeitung / Int. Austellungs-Zeitung',\n",
" 'Zeitschrift des deutschen und österreichischen Alpenvereins',\n",
" 'Zeitschrift für Notariat und freiwillige Gerichtsbarkeit in Österreich',\n",
" 'Zeitschrift für das Privat- und Öffentliche Recht der Gegenwart',\n",
" 'Znaimer Wochenblatt',\n",
" 'Österreichisch-kaiserlicher Hofkalender',\n",
" 'Österreichische Badezeitung',\n",
" 'Österreichische Feuerwehrzeitung',\n",
" 'Österreichische Revue',\n",
" 'Österreichische Schachzeitung',\n",
" 'Österreichische Verbands-Feuerwehr-Zeitung',\n",
" 'Österreichische Wochenschrift für Wissenschaft und Kunst',\n",
" 'Österreichische Zeitschrift für Gesetzgebung und Rechtsprechung',\n",
" 'Österreichische Zeitschrift für Verwaltung']"
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted(df.dc_title.unique())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(df)"
]
},
{
"cell_type": "code",
"execution_count": 6,
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>from_abo</th>\n",
" <th>longer_page_id</th>\n",
" <th>has_ocr</th>\n",
" <th>meta_id</th>\n",
" <th>page_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>228449.000000</td>\n",
" <td>228449.0</td>\n",
" <td>228449.000000</td>\n",
" <td>228449.000000</td>\n",
" <td>2.284490e+05</td>\n",
" <td>228449.000000</td>\n",
" <td>1841.248821</td>\n",
" <td>0.063121</td>\n",
" <td>0.782687</td>\n",
" <td>6.373957e+05</td>\n",
" <td>9.373917</td>\n",
" <td>58.808124</td>\n",
" <td>0.243182</td>\n",
" <td>0.412418</td>\n",
" <td>3.691163e+05</td>\n",
" <td>21.592944</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>3.506000e+03</td>\n",
" <td>1.000000</td>\n",
" <td>1834.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>2.696050e+05</td>\n",
" <td>4.000000</td>\n",
" <td>1858.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>7.375320e+05</td>\n",
" <td>6.000000</td>\n",
" <td>1870.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>9.717960e+05</td>\n",
" <td>12.000000</td>\n",
" <td>1877.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.139472e+06</td>\n",
" <td>1676.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" year from_abo longer_page_id has_ocr meta_id \\\n",
"count 228449.000000 228449.0 228449.000000 228449.000000 2.284490e+05 \n",
"mean 1841.248821 0.0 0.063121 0.782687 6.373957e+05 \n",
"std 58.808124 0.0 0.243182 0.412418 3.691163e+05 \n",
"min 1.000000 0.0 0.000000 0.000000 3.506000e+03 \n",
"25% 1834.000000 0.0 0.000000 1.000000 2.696050e+05 \n",
"50% 1858.000000 0.0 0.000000 1.000000 7.375320e+05 \n",
"75% 1870.000000 0.0 0.000000 1.000000 9.717960e+05 \n",
"max 1877.000000 0.0 1.000000 1.000000 1.139472e+06 \n",
" page_count \n",
"count 228449.000000 \n",
"mean 9.373917 \n",
"std 21.592944 \n",
"min 1.000000 \n",
"25% 4.000000 \n",
"50% 6.000000 \n",
"75% 12.000000 \n",
"max 1676.000000 "
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"older = df[(df['year'] < 1878) & (df['year'] > 1500)]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(older)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['from_abo'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/kst/tmp/dingsdi/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3049: DtypeWarning: Columns (1,2) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler, result=result)\n"
]
}
],
"pages = pd.read_csv('https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/anno_labs_ocr_pages.tsv.bz2', sep='\\t', compression='bz2')"
"execution_count": 12,
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>manifest_id</th>\n",
" <th>page_id</th>\n",
" <th>structure_info</th>\n",
" <th>dateipfad</th>\n",
" <th>dateiname</th>\n",
" <th>has_ocr</th>\n",
" <th>alto_path</th>\n",
" <th>width</th>\n",
" <th>height</th>\n",
" <th>resolution</th>\n",
" <th>color_depth</th>\n",
" <th>is_not_for_publication</th>\n",
" <th>order_index</th>\n",
" <th>from_abo</th>\n",
" <th>year</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>abz1860ag0001</td>\n",
" <td>0007_00000363</td>\n",
" <td>Notizblatt</td>\n",
" <td>/cont01/periodika/abz/1860/18600007/00000363.tif</td>\n",
" <td>00000363.tif</td>\n",
" <td>1</td>\n",
" <td>/conttxt/periodika/abz/1860/18600007/alto/0000...</td>\n",
" <td>2368</td>\n",
" <td>3066</td>\n",
" <td>300</td>\n",
" <td>8</td>\n",
" <td>12325951</td>\n",
" <td>0</td>\n",
" <td>1860</td>\n",
" <td>abz1860ag0001</td>\n",
" <td>0007_00000364</td>\n",
" <td>Notizblatt</td>\n",
" <td>/cont01/periodika/abz/1860/18600007/00000364.tif</td>\n",
" <td>00000364.tif</td>\n",
" <td>/conttxt/periodika/abz/1860/18600007/alto/0000...</td>\n",
" <td>2368</td>\n",
" <td>3066</td>\n",
" <td>300</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>12325952</td>\n",
" <td>0</td>\n",
" <td>1860</td>\n",
" <td>abz1860ag0001</td>\n",
" <td>0007_00000365</td>\n",
" <td>Notizblatt</td>\n",
" <td>/cont01/periodika/abz/1860/18600007/00000365.tif</td>\n",
" <td>00000365.tif</td>\n",
" <td>1</td>\n",
" <td>/conttxt/periodika/abz/1860/18600007/alto/0000...</td>\n",
" <td>2368</td>\n",
" <td>3063</td>\n",
" <td>300</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>12325953</td>\n",
" <td>0</td>\n",
" <td>1860</td>\n",
" <td>abz1860ag0001</td>\n",
" <td>0007_00000366</td>\n",
" <td>Notizblatt</td>\n",
" <td>/cont01/periodika/abz/1860/18600007/00000366.tif</td>\n",
" <td>00000366.tif</td>\n",
" <td>1</td>\n",
" <td>/conttxt/periodika/abz/1860/18600007/alto/0000...</td>\n",
" <td>2368</td>\n",
" <td>3063</td>\n",
" <td>300</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>12325954</td>\n",
" <td>0</td>\n",
" <td>1860</td>\n",
" <td>abz1860ag0001</td>\n",
" <td>0007_00000367</td>\n",
" <td>Notizblatt</td>\n",
" <td>/cont01/periodika/abz/1860/18600007/00000367.tif</td>\n",
" <td>00000367.tif</td>\n",
" <td>1</td>\n",
" <td>/conttxt/periodika/abz/1860/18600007/alto/0000...</td>\n",
" <td>2368</td>\n",
" <td>3059</td>\n",
" <td>300</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>12325955</td>\n",
" <td>0</td>\n",
" <td>1860</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" manifest_id page_id structure_info \\\n",
"0 abz1860ag0001 0007_00000363 Notizblatt \n",
"1 abz1860ag0001 0007_00000364 Notizblatt \n",
"2 abz1860ag0001 0007_00000365 Notizblatt \n",
"3 abz1860ag0001 0007_00000366 Notizblatt \n",
"4 abz1860ag0001 0007_00000367 Notizblatt \n",
"\n",
" dateipfad dateiname has_ocr \\\n",
"0 /cont01/periodika/abz/1860/18600007/00000363.tif 00000363.tif 1 \n",
"1 /cont01/periodika/abz/1860/18600007/00000364.tif 00000364.tif 1 \n",
"2 /cont01/periodika/abz/1860/18600007/00000365.tif 00000365.tif 1 \n",
"3 /cont01/periodika/abz/1860/18600007/00000366.tif 00000366.tif 1 \n",
"4 /cont01/periodika/abz/1860/18600007/00000367.tif 00000367.tif 1 \n",
" alto_path width height \\\n",
"0 /conttxt/periodika/abz/1860/18600007/alto/0000... 2368 3066 \n",
"1 /conttxt/periodika/abz/1860/18600007/alto/0000... 2368 3066 \n",
"2 /conttxt/periodika/abz/1860/18600007/alto/0000... 2368 3063 \n",
"3 /conttxt/periodika/abz/1860/18600007/alto/0000... 2368 3063 \n",
"4 /conttxt/periodika/abz/1860/18600007/alto/0000... 2368 3059 \n",
"\n",
" resolution color_depth is_not_for_publication order_index from_abo \\\n",
"0 300 8 0 12325951 0 \n",
"1 300 8 0 12325952 0 \n",
"2 300 8 0 12325953 0 \n",
"3 300 8 0 12325954 0 \n",
"4 300 8 0 12325955 0 \n",
"\n",
" year \n",
"0 1860 \n",
"1 1860 \n",
"2 1860 \n",
"3 1860 \n",
"4 1860 "
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pages.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>manifest_id</th>\n",
" <th>aid</th>\n",
" <th>year</th>\n",
" <th>day</th>\n",
" <th>dc_title</th>\n",
" <th>dc_title_additional</th>\n",
" <th>subjects</th>\n",
" <th>place_of_publications</th>\n",
" <th>languages</th>\n",
" <th>dc_type</th>\n",
" <th>...</th>\n",
" <th>modification_datetime</th>\n",
" <th>longer_page_id</th>\n",
" <th>dc_date</th>\n",
" <th>link_pdf</th>\n",
" <th>link_old</th>\n",
" <th>has_ocr</th>\n",
" <th>meta_id</th>\n",
" <th>page_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>fug15050701</td>\n",
" <td>fug</td>\n",
" <td>1505</td>\n",
" <td>15050701</td>\n",
" <td>Fugger - Zeitungen</td>\n",
" <td>NaN</td>\n",
" <td>Tageszeitung</td>\n",
" <td>o.O.</td>\n",
" <td>de</td>\n",
" <td>newspaper</td>\n",
" <td>...</td>\n",
" <td>2013-06-27 13:28:35</td>\n",
" <td>1</td>\n",
" <td>1505-07-01</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
" <td>0</td>\n",
" <td>1122198</td>\n",
" <td>2</td>\n",
" <th>1</th>\n",
" <td>fug15680120</td>\n",
" <td>1568</td>\n",
" <td>15680120</td>\n",
" <td>Fugger - Zeitungen</td>\n",
" <td>NaN</td>\n",
" <td>Tageszeitung</td>\n",
" <td>o.O.</td>\n",
" <td>de</td>\n",
" <td>newspaper</td>\n",
" <td>...</td>\n",
" <td>2013-06-27 13:27:59</td>\n",
" <td>1</td>\n",
" <td>1568-01-20</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
" <td>0</td>\n",
" <td>1122201</td>\n",
" <td>1</td>\n",
" <th>2</th>\n",
" <td>fug15680124</td>\n",
" <td>1568</td>\n",
" <td>15680124</td>\n",
" <td>Fugger - Zeitungen</td>\n",
" <td>NaN</td>\n",
" <td>Tageszeitung</td>\n",
" <td>o.O.</td>\n",
" <td>de</td>\n",
" <td>newspaper</td>\n",
" <td>...</td>\n",
" <td>2013-06-27 13:27:58</td>\n",
" <td>1</td>\n",
" <td>1568-01-24</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
" <td>0</td>\n",
" <td>1122202</td>\n",
" <td>2</td>\n",
" <th>3</th>\n",
" <td>fug15680228</td>\n",
" <td>15680228</td>\n",
" <td>Fugger - Zeitungen</td>\n",
" <td>NaN</td>\n",
" <td>Tageszeitung</td>\n",
" <td>o.O.</td>\n",
" <td>de</td>\n",
" <td>newspaper</td>\n",
" <td>...</td>\n",
" <td>2013-06-27 13:27:59</td>\n",
" <td>1</td>\n",
" <td>1568-02-28</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
" <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
" <td>0</td>\n",
" <td>1122203</td>\n",
" <td>1</td>\n",
" <th>4</th>\n",
" <td>fug15680304</td>\n",
" <td>15680304</td>\n",
" <td>Fugger - Zeitungen</td>\n",
" <td>NaN</td>\n",
" <td>Tageszeitung</td>\n",
" <td>o.O.</td>\n",
" <td>de</td>\n",
" <td>newspaper</td>\n",