{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Load the anno_labs_issues table (bz2-compressed CSV) from the ONB Labs GitLab.\n", "df = pd.read_csv(\n", "    'https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/anno_labs_issues.csv.bz2',\n", "    compression='bz2',\n", ")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
manifest_idaidyeardaydc_titledc_title_additionalsubjectsplace_of_publicationslanguagesdc_type...meta_typeini_typemodification_datetimelonger_page_iddc_datelink_pdflink_oldhas_ocrmeta_idpage_count
0fug15050701fug150515050701Fugger - ZeitungenNaNTageszeitungo.O.denewspaper...brzanno2013-06-27 13:28:3511505-07-01http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=fug...011221982
1fug15680120fug156815680120Fugger - ZeitungenNaNTageszeitungo.O.denewspaper...brzanno2013-06-27 13:27:5911568-01-20http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=fug...011222011
2fug15680124fug156815680124Fugger - ZeitungenNaNTageszeitungo.O.denewspaper...brzanno2013-06-27 13:27:5811568-01-24http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=fug...011222022
3fug15680228fug156815680228Fugger - ZeitungenNaNTageszeitungo.O.denewspaper...brzanno2013-06-27 13:27:5911568-02-28http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=fug...011222031
4fug15680304fug156815680304Fugger - ZeitungenNaNTageszeitungo.O.denewspaper...brzanno2013-06-27 13:27:5811568-03-04http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=fug...011222044
\n", "

5 rows × 21 columns

\n", "
" ], "text/plain": [ " manifest_id aid year day dc_title dc_title_additional \\\n", "0 fug15050701 fug 1505 15050701 Fugger - Zeitungen NaN \n", "1 fug15680120 fug 1568 15680120 Fugger - Zeitungen NaN \n", "2 fug15680124 fug 1568 15680124 Fugger - Zeitungen NaN \n", "3 fug15680228 fug 1568 15680228 Fugger - Zeitungen NaN \n", "4 fug15680304 fug 1568 15680304 Fugger - Zeitungen NaN \n", "\n", " subjects place_of_publications languages dc_type ... meta_type \\\n", "0 Tageszeitung o.O. de newspaper ... brz \n", "1 Tageszeitung o.O. de newspaper ... brz \n", "2 Tageszeitung o.O. de newspaper ... brz \n", "3 Tageszeitung o.O. de newspaper ... brz \n", "4 Tageszeitung o.O. de newspaper ... brz \n", "\n", " ini_type modification_datetime longer_page_id dc_date \\\n", "0 anno 2013-06-27 13:28:35 1 1505-07-01 \n", "1 anno 2013-06-27 13:27:59 1 1568-01-20 \n", "2 anno 2013-06-27 13:27:58 1 1568-01-24 \n", "3 anno 2013-06-27 13:27:59 1 1568-02-28 \n", "4 anno 2013-06-27 13:27:58 1 1568-03-04 \n", "\n", " link_pdf \\\n", "0 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", "1 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", "2 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", "3 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", "4 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", "\n", " link_old has_ocr meta_id \\\n", "0 http://anno.onb.ac.at/cgi-content/anno?aid=fug... 0 1122198 \n", "1 http://anno.onb.ac.at/cgi-content/anno?aid=fug... 0 1122201 \n", "2 http://anno.onb.ac.at/cgi-content/anno?aid=fug... 0 1122202 \n", "3 http://anno.onb.ac.at/cgi-content/anno?aid=fug... 0 1122203 \n", "4 http://anno.onb.ac.at/cgi-content/anno?aid=fug... 
0 1122204 \n", "\n", " page_count \n", "0 2 \n", "1 1 \n", "2 2 \n", "3 1 \n", "4 4 \n", "\n", "[5 rows x 21 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['(Linzer) Tages-Post',\n", " '(Neue) Wiener Schachzeitung',\n", " '(Neuigkeits) Welt Blatt',\n", " 'Agramer Zeitung',\n", " 'Allgemeine Bauzeitung',\n", " 'Allgemeine Zeitschrift für Lehrerinnen',\n", " 'Allgemeine land- und forstwirthschaftliche Zeitung',\n", " 'Allgemeine musikalische Zeitung',\n", " 'Allgemeine Österreichische Gerichtszeitung',\n", " 'Amtliches Cursblatt der Wiener Börse',\n", " 'Bade- und Reise-Journal',\n", " 'Bericht über die Wirksamkeit des (...) Frauenvereines für Arbeitsschulen',\n", " 'Blätter für Musik, Theater und Kunst',\n", " 'Bregenzer Wochenblatt',\n", " 'Brixner Diözesanblatt',\n", " 'Buchdrucker-Zeitung',\n", " 'Bukowina',\n", " 'Carinthia. Zeitschrift für Vaterlandskunde, Belehrung und Unterhaltung',\n", " 'Christliche Kunstblätter',\n", " 'Cur- und Fremden-Liste des Curortes Baden bei Wien',\n", " 'Das Vaterland',\n", " 'Der Floh',\n", " 'Der Humorist',\n", " 'Der Militärarzt',\n", " 'Der Tresor',\n", " 'Der Zwischen-Akt',\n", " 'Der Österreichische Schulbote',\n", " 'Deutsche Musik-Zeitung',\n", " 'Deutsche Zeitung',\n", " 'Die Bombe',\n", " 'Die Debatte',\n", " 'Die Emancipation. 
Zeitschrift für Frauen',\n", " 'Die Feuerwehr',\n", " 'Die Gartenlaube für Österreich',\n", " 'Die Hausfrau: Blätter für Haus und Wirthschaft',\n", " 'Die Neuzeit',\n", " 'Die Presse',\n", " 'Die Vedette',\n", " 'Eideseis dia ta anatolika mere',\n", " 'Ephemeris',\n", " 'Extract-Schreiben (Europaeische Zeitung)',\n", " 'Feldkircher Anzeiger',\n", " 'Feldkircher Wochenblatt',\n", " 'Feldkircher Zeitung',\n", " 'Figaro',\n", " 'Frauenblätter',\n", " 'Freie Pädagogische Blätter',\n", " 'Fremden-Blatt',\n", " \"Fromme's Österreichischer Feuerwehr-Kalender\",\n", " 'Fugger - Zeitungen',\n", " 'Gerichtshalle',\n", " 'Grazer Volksblatt',\n", " 'Halleiner Bothe',\n", " 'Hellenikos telegraphos',\n", " 'Hermes ho logios',\n", " 'Illustriertes Österreichisches Journal',\n", " 'Illustrirtes Wiener Extrablatt',\n", " 'Innsbrucker Nachrichten',\n", " 'Internationale Ausstellungs-Zeitung',\n", " 'Ischler Bade-Liste',\n", " 'Ischler Fremden-Salon',\n", " 'Jahrbuch des Voralberger Landesmuseumsvereins',\n", " 'Jahresbericht Akademisches Gymnasium Wien',\n", " 'Jahresbericht Josefstädter Obergymnasium',\n", " 'Jahresbericht Schottengymnasium Wien',\n", " 'Jahresbericht Staats-Unterrealschule Margareten',\n", " 'Jahresbericht der städtischen Volksschule für Mädchen',\n", " 'Jahresbericht des Frauen-Wohlthätigkeits-Vereines für Wien und Umgebung',\n", " 'Jahresbericht des Männergesangsvereines in Wien',\n", " 'Jahresbericht des k.k. 
Maximiliangymnasium in Wien',\n", " 'Jahresbericht über die israelitische Kinderbewahr-Anstalt zu Wien, Leopoldstadt, Schiffamtsgasse No 773',\n", " 'Janus',\n", " 'Journal des Österreichischen Lloyd',\n", " 'Journal für Freymaurer: Als Manuskript gedruckt für Brüder und Meister des Ordens',\n", " 'Juristische Blätter',\n", " 'Jörgel Briefe',\n", " 'Kaufmännische Zeitschrift',\n", " 'Kikeriki',\n", " 'Klagenfurter Zeitung',\n", " 'Kunst und Volk',\n", " 'Laibacher Diöcesanblatt',\n", " 'Leitmeritzer Zeitung',\n", " 'Linzer Diözesanblatt',\n", " 'Linzer Volksblatt',\n", " 'Marburger Zeitung',\n", " 'Mitteilungen der Gesellschaft für Salzburger Landeskunde',\n", " 'Mittheilungen der kaiserl. königl. Central-Commission zur Erforschung und Erhaltung der Baudenkmale',\n", " 'Mittheilungen der kaiserlich-königlichen Geographischen Gesellschaft',\n", " 'Monatsschrift für den Orient',\n", " 'Morgen-Post',\n", " 'Musikalisch-literarischer Monatsbericht über neue Musikalien, musikalische Schriften und Abbildungen',\n", " 'Musikalisches Wochenblatt',\n", " 'Nasa Sloga',\n", " 'Neue Freie Presse',\n", " 'Neue Illustrirte Zeitung',\n", " 'Neue Wiener Musik-Zeitung',\n", " 'Neue Zeitschrift für Musik',\n", " 'Neues Fremden-Blatt',\n", " 'Nordböhmisches Volksblatt',\n", " 'Oesterreichische Buchhändler-Correspondenz',\n", " 'Oesterreichischer Soldatenfreund',\n", " 'Oesterreichisches Journal',\n", " 'Ordinariats-Blatt der Budweiser Diöcese',\n", " 'Ost-Deutsche Post',\n", " 'Philologikos telegraphos',\n", " 'Photographische Correspondenz',\n", " 'Pilsner Abendpost',\n", " 'Pilsner Fremdenblatt',\n", " 'Politische Frauen-Zeitung',\n", " 'Populäre österreichische Gesundheits-Zeitung',\n", " 'Prager Abendblatt',\n", " 'Prager Tagblatt',\n", " 'Salzburger Bote',\n", " 'Salzburger Chronik für Stadt und Land',\n", " 'Salzburger Volksblatt: unabh. Tageszeitung f. Stadt u. 
Land Salzburg',\n", " 'Siebenbürgisch-Deutsches Wochenblatt',\n", " 'Signale für die musikalische Welt',\n", " 'Social-politische Frauen-Zeitung. Organ für die Gesammt-Interessen des Frauenlebens. ',\n", " 'Sonntagsblätter',\n", " 'Statistische Monatsschrift',\n", " 'Steyrer Zeitung',\n", " 'Teplitz-Schönauer Anzeiger',\n", " 'Theater an der Wien - Theaterzettel',\n", " 'Theaterzettel (Oper und Burgtheater in Wien)',\n", " 'Union. Zeitschrift für Versicherungswesen und Volkswirtschaft',\n", " 'Unterhaltungs-Blatt zum Alzeyer Anzeigeblatt',\n", " 'Vaterländische Blätter',\n", " 'Volksblatt für Stadt und Land',\n", " 'Volksblätter aus Salzburg',\n", " 'Vorarlberger Landes-Zeitung',\n", " 'Vorarlberger Volksblatt',\n", " 'Vorarlberger Zeitung',\n", " 'Wiener Abendzeitung',\n", " 'Wiener Diözesanblatt',\n", " 'Wiener Feuerwehrzeitung',\n", " 'Wiener Kommunal-Kalender und städtisches Jahrbuch',\n", " 'Wiener Landwirtschaftliche Zeitung',\n", " 'Wiener Medizinische Wochenschrift',\n", " 'Wiener Moden Zeitung',\n", " 'Wiener Salonblatt',\n", " 'Wiener Sonn- und Montags-Zeitung',\n", " 'Wiener Theater-Zeitung (Bäuerles Theaterzeitung)',\n", " 'Wiener Vororte-Zeitung',\n", " 'Wiener Zeitung',\n", " 'Wienerische Kirchenzeitung',\n", " 'Wochenzeitschrift des ö. Ingenieur- und Architektenvereins',\n", " 'Wr. Weltaustellungs-Zeitung / Int. 
Austellungs-Zeitung',\n", " 'Zeitschrift des deutschen und österreichischen Alpenvereins',\n", " 'Zeitschrift für Notariat und freiwillige Gerichtsbarkeit in Österreich',\n", " 'Zeitschrift für das Privat- und Öffentliche Recht der Gegenwart',\n", " 'Znaimer Wochenblatt',\n", " 'Österreichisch-kaiserlicher Hofkalender',\n", " 'Österreichische Badezeitung',\n", " 'Österreichische Feuerwehrzeitung',\n", " 'Österreichische Revue',\n", " 'Österreichische Schachzeitung',\n", " 'Österreichische Verbands-Feuerwehr-Zeitung',\n", " 'Österreichische Wochenschrift für Wissenschaft und Kunst',\n", " 'Österreichische Zeitschrift für Gesetzgebung und Rechtsprechung',\n", " 'Österreichische Zeitschrift für Verwaltung']" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Distinct values of dc_title, in sorted order.\n", "sorted(df['dc_title'].unique())" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "228449" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of rows in the issues table.\n", "df.shape[0]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearfrom_abolonger_page_idhas_ocrmeta_idpage_count
count228449.000000228449.0228449.000000228449.0000002.284490e+05228449.000000
mean1841.2488210.00.0631210.7826876.373957e+059.373917
std58.8081240.00.2431820.4124183.691163e+0521.592944
min1.0000000.00.0000000.0000003.506000e+031.000000
25%1834.0000000.00.0000001.0000002.696050e+054.000000
50%1858.0000000.00.0000001.0000007.375320e+056.000000
75%1870.0000000.00.0000001.0000009.717960e+0512.000000
max1877.0000000.01.0000001.0000001.139472e+061676.000000
\n", "
" ], "text/plain": [ " year from_abo longer_page_id has_ocr meta_id \\\n", "count 228449.000000 228449.0 228449.000000 228449.000000 2.284490e+05 \n", "mean 1841.248821 0.0 0.063121 0.782687 6.373957e+05 \n", "std 58.808124 0.0 0.243182 0.412418 3.691163e+05 \n", "min 1.000000 0.0 0.000000 0.000000 3.506000e+03 \n", "25% 1834.000000 0.0 0.000000 1.000000 2.696050e+05 \n", "50% 1858.000000 0.0 0.000000 1.000000 7.375320e+05 \n", "75% 1870.000000 0.0 0.000000 1.000000 9.717960e+05 \n", "max 1877.000000 0.0 1.000000 1.000000 1.139472e+06 \n", "\n", " page_count \n", "count 228449.000000 \n", "mean 9.373917 \n", "std 21.592944 \n", "min 1.000000 \n", "25% 4.000000 \n", "50% 6.000000 \n", "75% 12.000000 \n", "max 1676.000000 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Keep only rows whose year lies strictly between 1500 and 1878.\n", "# The describe() output above reports a minimum year of 1, so the lower bound\n", "# removes such rows (presumably bad metadata - TODO confirm).\n", "older = df.loc[(df['year'] > 1500) & (df['year'] < 1878)]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "228397" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rows remaining after the year filter.\n", "older.shape[0]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Distinct values present in the from_abo column.\n", "pd.unique(df['from_abo'])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/kst/tmp/dingsdi/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3049: DtypeWarning: Columns (1,2) have mixed types.
Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] } ], "source": [ "# Load the anno_labs_ocr_pages table (bz2-compressed, tab-separated).\n", "# Columns 1 and 2 (page_id, structure_info) contain mixed types, which makes\n", "# pandas emit a DtypeWarning; reading both as strings keeps their values\n", "# consistent across the whole file.\n", "pages = pd.read_csv(\n", "    'https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/anno_labs_ocr_pages.tsv.bz2',\n", "    sep='\\t',\n", "    compression='bz2',\n", "    dtype={'page_id': str, 'structure_info': str},\n", ")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
manifest_idpage_idstructure_infodateipfaddateinamehas_ocralto_pathwidthheightresolutioncolor_depthis_not_for_publicationorder_indexfrom_aboyear
0abz1860ag00010007_00000363Notizblatt/cont01/periodika/abz/1860/18600007/00000363.tif00000363.tif1/conttxt/periodika/abz/1860/18600007/alto/0000...23683066300801232595101860
1abz1860ag00010007_00000364Notizblatt/cont01/periodika/abz/1860/18600007/00000364.tif00000364.tif1/conttxt/periodika/abz/1860/18600007/alto/0000...23683066300801232595201860
2abz1860ag00010007_00000365Notizblatt/cont01/periodika/abz/1860/18600007/00000365.tif00000365.tif1/conttxt/periodika/abz/1860/18600007/alto/0000...23683063300801232595301860
3abz1860ag00010007_00000366Notizblatt/cont01/periodika/abz/1860/18600007/00000366.tif00000366.tif1/conttxt/periodika/abz/1860/18600007/alto/0000...23683063300801232595401860
4abz1860ag00010007_00000367Notizblatt/cont01/periodika/abz/1860/18600007/00000367.tif00000367.tif1/conttxt/periodika/abz/1860/18600007/alto/0000...23683059300801232595501860
\n", "
" ], "text/plain": [ " manifest_id page_id structure_info \\\n", "0 abz1860ag0001 0007_00000363 Notizblatt \n", "1 abz1860ag0001 0007_00000364 Notizblatt \n", "2 abz1860ag0001 0007_00000365 Notizblatt \n", "3 abz1860ag0001 0007_00000366 Notizblatt \n", "4 abz1860ag0001 0007_00000367 Notizblatt \n", "\n", " dateipfad dateiname has_ocr \\\n", "0 /cont01/periodika/abz/1860/18600007/00000363.tif 00000363.tif 1 \n", "1 /cont01/periodika/abz/1860/18600007/00000364.tif 00000364.tif 1 \n", "2 /cont01/periodika/abz/1860/18600007/00000365.tif 00000365.tif 1 \n", "3 /cont01/periodika/abz/1860/18600007/00000366.tif 00000366.tif 1 \n", "4 /cont01/periodika/abz/1860/18600007/00000367.tif 00000367.tif 1 \n", "\n", " alto_path width height \\\n", "0 /conttxt/periodika/abz/1860/18600007/alto/0000... 2368 3066 \n", "1 /conttxt/periodika/abz/1860/18600007/alto/0000... 2368 3066 \n", "2 /conttxt/periodika/abz/1860/18600007/alto/0000... 2368 3063 \n", "3 /conttxt/periodika/abz/1860/18600007/alto/0000... 2368 3063 \n", "4 /conttxt/periodika/abz/1860/18600007/alto/0000... 2368 3059 \n", "\n", " resolution color_depth is_not_for_publication order_index from_abo \\\n", "0 300 8 0 12325951 0 \n", "1 300 8 0 12325952 0 \n", "2 300 8 0 12325953 0 \n", "3 300 8 0 12325954 0 \n", "4 300 8 0 12325955 0 \n", "\n", " year \n", "0 1860 \n", "1 1860 \n", "2 1860 \n", "3 1860 \n", "4 1860 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pages.head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
manifest_idaidyeardaydc_titledc_title_additionalsubjectsplace_of_publicationslanguagesdc_type...meta_typeini_typemodification_datetimelonger_page_iddc_datelink_pdflink_oldhas_ocrmeta_idpage_count
0fug15050701fug150515050701Fugger - ZeitungenNaNTageszeitungo.O.denewspaper...brzanno2013-06-27 13:28:3511505-07-01http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=fug...011221982
1fug15680120fug156815680120Fugger - ZeitungenNaNTageszeitungo.O.denewspaper...brzanno2013-06-27 13:27:5911568-01-20http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=fug...011222011
2fug15680124fug156815680124Fugger - ZeitungenNaNTageszeitungo.O.denewspaper...brzanno2013-06-27 13:27:5811568-01-24http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=fug...011222022
3fug15680228fug156815680228Fugger - ZeitungenNaNTageszeitungo.O.denewspaper...brzanno2013-06-27 13:27:5911568-02-28http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=fug...011222031
4fug15680304fug156815680304Fugger - ZeitungenNaNTageszeitungo.O.denewspaper...brzanno2013-06-27 13:27:5811568-03-04http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=fug...011222044
\n", "

5 rows × 21 columns

\n", "
" ], "text/plain": [ " manifest_id aid year day dc_title dc_title_additional \\\n", "0 fug15050701 fug 1505 15050701 Fugger - Zeitungen NaN \n", "1 fug15680120 fug 1568 15680120 Fugger - Zeitungen NaN \n", "2 fug15680124 fug 1568 15680124 Fugger - Zeitungen NaN \n", "3 fug15680228 fug 1568 15680228 Fugger - Zeitungen NaN \n", "4 fug15680304 fug 1568 15680304 Fugger - Zeitungen NaN \n", "\n", " subjects place_of_publications languages dc_type ... meta_type \\\n", "0 Tageszeitung o.O. de newspaper ... brz \n", "1 Tageszeitung o.O. de newspaper ... brz \n", "2 Tageszeitung o.O. de newspaper ... brz \n", "3 Tageszeitung o.O. de newspaper ... brz \n", "4 Tageszeitung o.O. de newspaper ... brz \n", "\n", " ini_type modification_datetime longer_page_id dc_date \\\n", "0 anno 2013-06-27 13:28:35 1 1505-07-01 \n", "1 anno 2013-06-27 13:27:59 1 1568-01-20 \n", "2 anno 2013-06-27 13:27:58 1 1568-01-24 \n", "3 anno 2013-06-27 13:27:59 1 1568-02-28 \n", "4 anno 2013-06-27 13:27:58 1 1568-03-04 \n", "\n", " link_pdf \\\n", "0 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", "1 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", "2 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", "3 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", "4 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", "\n", " link_old has_ocr meta_id \\\n", "0 http://anno.onb.ac.at/cgi-content/anno?aid=fug... 0 1122198 \n", "1 http://anno.onb.ac.at/cgi-content/anno?aid=fug... 0 1122201 \n", "2 http://anno.onb.ac.at/cgi-content/anno?aid=fug... 0 1122202 \n", "3 http://anno.onb.ac.at/cgi-content/anno?aid=fug... 0 1122203 \n", "4 http://anno.onb.ac.at/cgi-content/anno?aid=fug... 
0 1122204 \n", "\n", " page_count \n", "0 2 \n", "1 1 \n", "2 2 \n", "3 1 \n", "4 4 \n", "\n", "[5 rows x 21 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "older.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1505" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Earliest year left after filtering. Series.min() is vectorised (the builtin\n", "# min() walks the Series element by element); int() keeps the displayed value\n", "# a plain Python integer.\n", "int(older['year'].min())" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1972536" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of page rows whose manifest_id occurs in `older`. Summing the boolean\n", "# mask gives the count without building a filtered copy of `pages`.\n", "int(pages['manifest_id'].isin(older['manifest_id']).sum())" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1973709" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total number of page rows, for comparison with the count above.\n", "len(pages)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }