Skip to content
Metadata Overview.ipynb 40.3 KiB
Newer Older
Stefan Karner's avatar
Stefan Karner committed
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/anno_labs_issues.csv.bz2', compression='bz2')"
Stefan Karner's avatar
Stefan Karner committed
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>manifest_id</th>\n",
       "      <th>aid</th>\n",
       "      <th>year</th>\n",
       "      <th>day</th>\n",
       "      <th>dc_title</th>\n",
       "      <th>dc_title_additional</th>\n",
       "      <th>subjects</th>\n",
       "      <th>place_of_publications</th>\n",
       "      <th>languages</th>\n",
       "      <th>dc_type</th>\n",
       "      <th>...</th>\n",
       "      <th>meta_type</th>\n",
       "      <th>ini_type</th>\n",
       "      <th>modification_datetime</th>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <th>longer_page_id</th>\n",
       "      <th>dc_date</th>\n",
       "      <th>link_pdf</th>\n",
       "      <th>link_old</th>\n",
       "      <th>has_ocr</th>\n",
       "      <th>meta_id</th>\n",
       "      <th>page_count</th>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>fug15050701</td>\n",
       "      <td>fug</td>\n",
       "      <td>1505</td>\n",
       "      <td>15050701</td>\n",
       "      <td>Fugger - Zeitungen</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>NaN</td>\n",
       "      <td>Tageszeitung</td>\n",
       "      <td>o.O.</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>de</td>\n",
       "      <td>newspaper</td>\n",
       "      <td>...</td>\n",
       "      <td>brz</td>\n",
       "      <td>anno</td>\n",
       "      <td>2013-06-27 13:28:35</td>\n",
       "      <td>1</td>\n",
       "      <td>1505-07-01</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
       "      <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>0</td>\n",
       "      <td>1122198</td>\n",
       "      <td>2</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>fug15680120</td>\n",
       "      <td>fug</td>\n",
       "      <td>1568</td>\n",
       "      <td>15680120</td>\n",
       "      <td>Fugger - Zeitungen</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>NaN</td>\n",
       "      <td>Tageszeitung</td>\n",
       "      <td>o.O.</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>de</td>\n",
       "      <td>newspaper</td>\n",
       "      <td>...</td>\n",
       "      <td>brz</td>\n",
       "      <td>anno</td>\n",
       "      <td>2013-06-27 13:27:59</td>\n",
       "      <td>1</td>\n",
       "      <td>1568-01-20</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
       "      <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>0</td>\n",
       "      <td>1122201</td>\n",
       "      <td>1</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>fug15680124</td>\n",
       "      <td>fug</td>\n",
       "      <td>1568</td>\n",
       "      <td>15680124</td>\n",
       "      <td>Fugger - Zeitungen</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>NaN</td>\n",
       "      <td>Tageszeitung</td>\n",
       "      <td>o.O.</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>de</td>\n",
       "      <td>newspaper</td>\n",
       "      <td>...</td>\n",
       "      <td>brz</td>\n",
       "      <td>anno</td>\n",
       "      <td>2013-06-27 13:27:58</td>\n",
       "      <td>1</td>\n",
       "      <td>1568-01-24</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
       "      <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>0</td>\n",
       "      <td>1122202</td>\n",
       "      <td>2</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>fug15680228</td>\n",
       "      <td>fug</td>\n",
       "      <td>1568</td>\n",
       "      <td>15680228</td>\n",
       "      <td>Fugger - Zeitungen</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>NaN</td>\n",
       "      <td>Tageszeitung</td>\n",
       "      <td>o.O.</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>de</td>\n",
       "      <td>newspaper</td>\n",
       "      <td>...</td>\n",
       "      <td>brz</td>\n",
       "      <td>anno</td>\n",
       "      <td>2013-06-27 13:27:59</td>\n",
       "      <td>1</td>\n",
       "      <td>1568-02-28</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
       "      <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>0</td>\n",
       "      <td>1122203</td>\n",
       "      <td>1</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>fug15680304</td>\n",
       "      <td>fug</td>\n",
       "      <td>1568</td>\n",
       "      <td>15680304</td>\n",
       "      <td>Fugger - Zeitungen</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>NaN</td>\n",
       "      <td>Tageszeitung</td>\n",
       "      <td>o.O.</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>de</td>\n",
       "      <td>newspaper</td>\n",
       "      <td>...</td>\n",
       "      <td>brz</td>\n",
       "      <td>anno</td>\n",
       "      <td>2013-06-27 13:27:58</td>\n",
       "      <td>1</td>\n",
       "      <td>1568-03-04</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
       "      <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>0</td>\n",
       "      <td>1122204</td>\n",
       "      <td>4</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 21 columns</p>\n",
Stefan Karner's avatar
Stefan Karner committed
       "</div>"
      ],
      "text/plain": [
       "   manifest_id  aid  year       day            dc_title dc_title_additional  \\\n",
       "0  fug15050701  fug  1505  15050701  Fugger - Zeitungen                 NaN   \n",
       "1  fug15680120  fug  1568  15680120  Fugger - Zeitungen                 NaN   \n",
       "2  fug15680124  fug  1568  15680124  Fugger - Zeitungen                 NaN   \n",
       "3  fug15680228  fug  1568  15680228  Fugger - Zeitungen                 NaN   \n",
       "4  fug15680304  fug  1568  15680304  Fugger - Zeitungen                 NaN   \n",
Stefan Karner's avatar
Stefan Karner committed
       "\n",
       "       subjects place_of_publications languages    dc_type  ...  meta_type  \\\n",
       "0  Tageszeitung                  o.O.        de  newspaper  ...        brz   \n",
       "1  Tageszeitung                  o.O.        de  newspaper  ...        brz   \n",
       "2  Tageszeitung                  o.O.        de  newspaper  ...        brz   \n",
       "3  Tageszeitung                  o.O.        de  newspaper  ...        brz   \n",
       "4  Tageszeitung                  o.O.        de  newspaper  ...        brz   \n",
Stefan Karner's avatar
Stefan Karner committed
       "\n",
       "  ini_type modification_datetime longer_page_id     dc_date  \\\n",
       "0     anno   2013-06-27 13:28:35              1  1505-07-01   \n",
       "1     anno   2013-06-27 13:27:59              1  1568-01-20   \n",
       "2     anno   2013-06-27 13:27:58              1  1568-01-24   \n",
       "3     anno   2013-06-27 13:27:59              1  1568-02-28   \n",
       "4     anno   2013-06-27 13:27:58              1  1568-03-04   \n",
Stefan Karner's avatar
Stefan Karner committed
       "\n",
       "                                            link_pdf  \\\n",
       "0  http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...   \n",
       "1  http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...   \n",
       "2  http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...   \n",
       "3  http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...   \n",
       "4  http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...   \n",
       "\n",
       "                                            link_old has_ocr  meta_id  \\\n",
       "0  http://anno.onb.ac.at/cgi-content/anno?aid=fug...       0  1122198   \n",
       "1  http://anno.onb.ac.at/cgi-content/anno?aid=fug...       0  1122201   \n",
       "2  http://anno.onb.ac.at/cgi-content/anno?aid=fug...       0  1122202   \n",
       "3  http://anno.onb.ac.at/cgi-content/anno?aid=fug...       0  1122203   \n",
       "4  http://anno.onb.ac.at/cgi-content/anno?aid=fug...       0  1122204   \n",
       "\n",
       "   page_count  \n",
       "0           2  \n",
       "1           1  \n",
       "2           2  \n",
       "3           1  \n",
       "4           4  \n",
       "\n",
       "[5 rows x 21 columns]"
Stefan Karner's avatar
Stefan Karner committed
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
Stefan Karner's avatar
Stefan Karner committed
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['(Linzer) Tages-Post',\n",
       " '(Neue) Wiener Schachzeitung',\n",
       " '(Neuigkeits) Welt Blatt',\n",
       " 'Agramer Zeitung',\n",
       " 'Allgemeine Bauzeitung',\n",
       " 'Allgemeine Zeitschrift für Lehrerinnen',\n",
       " 'Allgemeine land- und forstwirthschaftliche Zeitung',\n",
       " 'Allgemeine musikalische Zeitung',\n",
       " 'Allgemeine Österreichische Gerichtszeitung',\n",
       " 'Amtliches Cursblatt der Wiener Börse',\n",
       " 'Bade- und Reise-Journal',\n",
       " 'Bericht über die Wirksamkeit des (...) Frauenvereines für Arbeitsschulen',\n",
       " 'Blätter für Musik, Theater und Kunst',\n",
       " 'Bregenzer Wochenblatt',\n",
       " 'Brixner Diözesanblatt',\n",
       " 'Buchdrucker-Zeitung',\n",
       " 'Bukowina',\n",
       " 'Carinthia. Zeitschrift für Vaterlandskunde, Belehrung und Unterhaltung',\n",
       " 'Christliche Kunstblätter',\n",
       " 'Cur- und Fremden-Liste des Curortes Baden bei Wien',\n",
Stefan Karner's avatar
Stefan Karner committed
       " 'Das Vaterland',\n",
       " 'Der Floh',\n",
       " 'Der Humorist',\n",
       " 'Der Militärarzt',\n",
       " 'Der Tresor',\n",
       " 'Der Zwischen-Akt',\n",
       " 'Der Österreichische Schulbote',\n",
       " 'Deutsche Musik-Zeitung',\n",
       " 'Deutsche Zeitung',\n",
       " 'Die Bombe',\n",
       " 'Die Debatte',\n",
       " 'Die Emancipation. Zeitschrift für Frauen',\n",
       " 'Die Feuerwehr',\n",
       " 'Die Gartenlaube für Österreich',\n",
       " 'Die Hausfrau: Blätter für Haus und Wirthschaft',\n",
       " 'Die Neuzeit',\n",
       " 'Die Presse',\n",
       " 'Die Vedette',\n",
       " 'Eideseis dia ta anatolika mere',\n",
       " 'Ephemeris',\n",
       " 'Extract-Schreiben (Europaeische Zeitung)',\n",
       " 'Feldkircher Anzeiger',\n",
       " 'Feldkircher Wochenblatt',\n",
       " 'Feldkircher Zeitung',\n",
       " 'Figaro',\n",
       " 'Frauenblätter',\n",
       " 'Freie Pädagogische Blätter',\n",
       " 'Fremden-Blatt',\n",
       " \"Fromme's Österreichischer Feuerwehr-Kalender\",\n",
       " 'Fugger - Zeitungen',\n",
       " 'Gerichtshalle',\n",
       " 'Grazer Volksblatt',\n",
       " 'Halleiner Bothe',\n",
       " 'Hellenikos telegraphos',\n",
       " 'Hermes ho logios',\n",
       " 'Illustriertes Österreichisches Journal',\n",
       " 'Illustrirtes Wiener Extrablatt',\n",
       " 'Innsbrucker Nachrichten',\n",
       " 'Internationale Ausstellungs-Zeitung',\n",
       " 'Ischler Bade-Liste',\n",
Stefan Karner's avatar
Stefan Karner committed
       " 'Ischler Fremden-Salon',\n",
       " 'Jahrbuch des Voralberger Landesmuseumsvereins',\n",
       " 'Jahresbericht Akademisches Gymnasium Wien',\n",
       " 'Jahresbericht Josefstädter Obergymnasium',\n",
       " 'Jahresbericht Schottengymnasium Wien',\n",
       " 'Jahresbericht Staats-Unterrealschule Margareten',\n",
       " 'Jahresbericht der städtischen Volksschule für Mädchen',\n",
       " 'Jahresbericht des Frauen-Wohlthätigkeits-Vereines für Wien und Umgebung',\n",
       " 'Jahresbericht des Männergesangsvereines in Wien',\n",
       " 'Jahresbericht des k.k. Maximiliangymnasium in Wien',\n",
       " 'Jahresbericht über die israelitische Kinderbewahr-Anstalt zu Wien, Leopoldstadt, Schiffamtsgasse No 773',\n",
Stefan Karner's avatar
Stefan Karner committed
       " 'Janus',\n",
       " 'Journal des Österreichischen Lloyd',\n",
Stefan Karner's avatar
Stefan Karner committed
       " 'Journal für Freymaurer: Als Manuskript gedruckt für Brüder und Meister des Ordens',\n",
       " 'Juristische Blätter',\n",
       " 'Jörgel Briefe',\n",
       " 'Kaufmännische Zeitschrift',\n",
       " 'Kikeriki',\n",
       " 'Klagenfurter Zeitung',\n",
       " 'Kunst und Volk',\n",
Stefan Karner's avatar
Stefan Karner committed
       " 'Laibacher Diöcesanblatt',\n",
       " 'Leitmeritzer Zeitung',\n",
       " 'Linzer Diözesanblatt',\n",
       " 'Linzer Volksblatt',\n",
       " 'Marburger Zeitung',\n",
       " 'Mitteilungen der Gesellschaft für Salzburger Landeskunde',\n",
       " 'Mittheilungen der kaiserl. königl. Central-Commission zur Erforschung und Erhaltung der Baudenkmale',\n",
       " 'Mittheilungen der kaiserlich-königlichen Geographischen Gesellschaft',\n",
       " 'Monatsschrift für den Orient',\n",
       " 'Morgen-Post',\n",
       " 'Musikalisch-literarischer Monatsbericht über neue Musikalien, musikalische Schriften und Abbildungen',\n",
       " 'Musikalisches Wochenblatt',\n",
       " 'Nasa Sloga',\n",
       " 'Neue Freie Presse',\n",
       " 'Neue Illustrirte Zeitung',\n",
       " 'Neue Wiener Musik-Zeitung',\n",
       " 'Neue Zeitschrift für Musik',\n",
       " 'Neues Fremden-Blatt',\n",
       " 'Nordböhmisches Volksblatt',\n",
       " 'Oesterreichische Buchhändler-Correspondenz',\n",
       " 'Oesterreichischer Soldatenfreund',\n",
       " 'Oesterreichisches Journal',\n",
       " 'Ordinariats-Blatt der Budweiser Diöcese',\n",
       " 'Ost-Deutsche Post',\n",
Stefan Karner's avatar
Stefan Karner committed
       " 'Philologikos telegraphos',\n",
       " 'Photographische Correspondenz',\n",
       " 'Pilsner Abendpost',\n",
       " 'Pilsner Fremdenblatt',\n",
       " 'Politische Frauen-Zeitung',\n",
       " 'Populäre österreichische Gesundheits-Zeitung',\n",
       " 'Prager Abendblatt',\n",
       " 'Prager Tagblatt',\n",
       " 'Salzburger Bote',\n",
       " 'Salzburger Chronik für Stadt und Land',\n",
       " 'Salzburger Volksblatt: unabh. Tageszeitung f. Stadt u. Land Salzburg',\n",
       " 'Siebenbürgisch-Deutsches Wochenblatt',\n",
       " 'Signale für die musikalische Welt',\n",
       " 'Social-politische Frauen-Zeitung. Organ für die Gesammt-Interessen des Frauenlebens. ',\n",
       " 'Sonntagsblätter',\n",
       " 'Statistische Monatsschrift',\n",
       " 'Steyrer Zeitung',\n",
       " 'Teplitz-Schönauer Anzeiger',\n",
       " 'Theater an der Wien - Theaterzettel',\n",
       " 'Theaterzettel (Oper und Burgtheater in Wien)',\n",
       " 'Union. Zeitschrift für Versicherungswesen und Volkswirtschaft',\n",
       " 'Unterhaltungs-Blatt zum Alzeyer Anzeigeblatt',\n",
       " 'Vaterländische Blätter',\n",
       " 'Volksblatt für Stadt und Land',\n",
       " 'Volksblätter aus Salzburg',\n",
       " 'Vorarlberger Landes-Zeitung',\n",
       " 'Vorarlberger Volksblatt',\n",
       " 'Vorarlberger Zeitung',\n",
       " 'Wiener Abendzeitung',\n",
       " 'Wiener Diözesanblatt',\n",
       " 'Wiener Feuerwehrzeitung',\n",
       " 'Wiener Kommunal-Kalender und städtisches Jahrbuch',\n",
       " 'Wiener Landwirtschaftliche Zeitung',\n",
       " 'Wiener Medizinische Wochenschrift',\n",
       " 'Wiener Moden Zeitung',\n",
       " 'Wiener Salonblatt',\n",
       " 'Wiener Sonn- und Montags-Zeitung',\n",
       " 'Wiener Theater-Zeitung (Bäuerles Theaterzeitung)',\n",
       " 'Wiener Vororte-Zeitung',\n",
       " 'Wiener Zeitung',\n",
       " 'Wienerische Kirchenzeitung',\n",
       " 'Wochenzeitschrift des ö. Ingenieur- und Architektenvereins',\n",
       " 'Wr. Weltaustellungs-Zeitung / Int. Austellungs-Zeitung',\n",
       " 'Zeitschrift des deutschen und österreichischen Alpenvereins',\n",
       " 'Zeitschrift für Notariat und freiwillige Gerichtsbarkeit in Österreich',\n",
       " 'Zeitschrift für das Privat- und Öffentliche Recht der Gegenwart',\n",
       " 'Znaimer Wochenblatt',\n",
       " 'Österreichisch-kaiserlicher Hofkalender',\n",
       " 'Österreichische Badezeitung',\n",
       " 'Österreichische Feuerwehrzeitung',\n",
       " 'Österreichische Revue',\n",
       " 'Österreichische Schachzeitung',\n",
       " 'Österreichische Verbands-Feuerwehr-Zeitung',\n",
       " 'Österreichische Wochenschrift für Wissenschaft und Kunst',\n",
       " 'Österreichische Zeitschrift für Gesetzgebung und Rechtsprechung',\n",
       " 'Österreichische Zeitschrift für Verwaltung']"
Stefan Karner's avatar
Stefan Karner committed
      ]
     },
Stefan Karner's avatar
Stefan Karner committed
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sorted(df.dc_title.unique())"
   ]
  },
  {
   "cell_type": "code",
Stefan Karner's avatar
Stefan Karner committed
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "228449"
      ]
     },
Stefan Karner's avatar
Stefan Karner committed
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df)"
   ]
  },
  {
   "cell_type": "code",
Stefan Karner's avatar
Stefan Karner committed
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>year</th>\n",
       "      <th>from_abo</th>\n",
       "      <th>longer_page_id</th>\n",
       "      <th>has_ocr</th>\n",
       "      <th>meta_id</th>\n",
       "      <th>page_count</th>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>228449.000000</td>\n",
       "      <td>228449.0</td>\n",
       "      <td>228449.000000</td>\n",
       "      <td>228449.000000</td>\n",
       "      <td>2.284490e+05</td>\n",
       "      <td>228449.000000</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>1841.248821</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>0.0</td>\n",
       "      <td>0.063121</td>\n",
       "      <td>0.782687</td>\n",
       "      <td>6.373957e+05</td>\n",
       "      <td>9.373917</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>58.808124</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>0.0</td>\n",
       "      <td>0.243182</td>\n",
       "      <td>0.412418</td>\n",
       "      <td>3.691163e+05</td>\n",
       "      <td>21.592944</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3.506000e+03</td>\n",
       "      <td>1.000000</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1834.000000</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>2.696050e+05</td>\n",
       "      <td>4.000000</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>1858.000000</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>7.375320e+05</td>\n",
       "      <td>6.000000</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>1870.000000</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>9.717960e+05</td>\n",
       "      <td>12.000000</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1877.000000</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>0.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.139472e+06</td>\n",
       "      <td>1676.000000</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                year  from_abo  longer_page_id        has_ocr       meta_id  \\\n",
       "count  228449.000000  228449.0   228449.000000  228449.000000  2.284490e+05   \n",
       "mean     1841.248821       0.0        0.063121       0.782687  6.373957e+05   \n",
       "std        58.808124       0.0        0.243182       0.412418  3.691163e+05   \n",
       "min         1.000000       0.0        0.000000       0.000000  3.506000e+03   \n",
       "25%      1834.000000       0.0        0.000000       1.000000  2.696050e+05   \n",
       "50%      1858.000000       0.0        0.000000       1.000000  7.375320e+05   \n",
       "75%      1870.000000       0.0        0.000000       1.000000  9.717960e+05   \n",
       "max      1877.000000       0.0        1.000000       1.000000  1.139472e+06   \n",
Stefan Karner's avatar
Stefan Karner committed
       "\n",
       "          page_count  \n",
       "count  228449.000000  \n",
       "mean        9.373917  \n",
       "std        21.592944  \n",
       "min         1.000000  \n",
       "25%         4.000000  \n",
       "50%         6.000000  \n",
       "75%        12.000000  \n",
       "max      1676.000000  "
Stefan Karner's avatar
Stefan Karner committed
      ]
     },
Stefan Karner's avatar
Stefan Karner committed
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
Stefan Karner's avatar
Stefan Karner committed
   "metadata": {},
   "outputs": [],
   "source": [
    "older = df[(df['year'] < 1878) & (df['year'] > 1500)]"
   ]
  },
  {
   "cell_type": "code",
Stefan Karner's avatar
Stefan Karner committed
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
Stefan Karner's avatar
Stefan Karner committed
      ]
     },
Stefan Karner's avatar
Stefan Karner committed
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(older)"
   ]
  },
  {
   "cell_type": "code",
Stefan Karner's avatar
Stefan Karner committed
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0])"
      ]
     },
Stefan Karner's avatar
Stefan Karner committed
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['from_abo'].unique()"
   ]
  },
  {
   "cell_type": "code",
Stefan Karner's avatar
Stefan Karner committed
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/kst/tmp/dingsdi/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3049: DtypeWarning: Columns (1,2) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  interactivity=interactivity, compiler=compiler, result=result)\n"
     ]
    }
   ],
Stefan Karner's avatar
Stefan Karner committed
   "source": [
    "pages = pd.read_csv('https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/anno_labs_ocr_pages.tsv.bz2', sep='\\t', compression='bz2')"
Stefan Karner's avatar
Stefan Karner committed
   ]
  },
  {
   "cell_type": "code",
Stefan Karner's avatar
Stefan Karner committed
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>manifest_id</th>\n",
       "      <th>page_id</th>\n",
       "      <th>structure_info</th>\n",
       "      <th>dateipfad</th>\n",
       "      <th>dateiname</th>\n",
       "      <th>has_ocr</th>\n",
       "      <th>alto_path</th>\n",
       "      <th>width</th>\n",
       "      <th>height</th>\n",
       "      <th>resolution</th>\n",
       "      <th>color_depth</th>\n",
       "      <th>is_not_for_publication</th>\n",
       "      <th>order_index</th>\n",
       "      <th>from_abo</th>\n",
       "      <th>year</th>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>abz1860ag0001</td>\n",
       "      <td>0007_00000363</td>\n",
       "      <td>Notizblatt</td>\n",
       "      <td>/cont01/periodika/abz/1860/18600007/00000363.tif</td>\n",
       "      <td>00000363.tif</td>\n",
       "      <td>1</td>\n",
       "      <td>/conttxt/periodika/abz/1860/18600007/alto/0000...</td>\n",
       "      <td>2368</td>\n",
       "      <td>3066</td>\n",
       "      <td>300</td>\n",
       "      <td>8</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>0</td>\n",
       "      <td>12325951</td>\n",
       "      <td>0</td>\n",
       "      <td>1860</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>abz1860ag0001</td>\n",
       "      <td>0007_00000364</td>\n",
       "      <td>Notizblatt</td>\n",
       "      <td>/cont01/periodika/abz/1860/18600007/00000364.tif</td>\n",
       "      <td>00000364.tif</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>1</td>\n",
       "      <td>/conttxt/periodika/abz/1860/18600007/alto/0000...</td>\n",
       "      <td>2368</td>\n",
       "      <td>3066</td>\n",
       "      <td>300</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>12325952</td>\n",
       "      <td>0</td>\n",
       "      <td>1860</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>abz1860ag0001</td>\n",
       "      <td>0007_00000365</td>\n",
       "      <td>Notizblatt</td>\n",
       "      <td>/cont01/periodika/abz/1860/18600007/00000365.tif</td>\n",
       "      <td>00000365.tif</td>\n",
       "      <td>1</td>\n",
       "      <td>/conttxt/periodika/abz/1860/18600007/alto/0000...</td>\n",
       "      <td>2368</td>\n",
       "      <td>3063</td>\n",
       "      <td>300</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>12325953</td>\n",
       "      <td>0</td>\n",
       "      <td>1860</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>abz1860ag0001</td>\n",
       "      <td>0007_00000366</td>\n",
       "      <td>Notizblatt</td>\n",
       "      <td>/cont01/periodika/abz/1860/18600007/00000366.tif</td>\n",
       "      <td>00000366.tif</td>\n",
       "      <td>1</td>\n",
       "      <td>/conttxt/periodika/abz/1860/18600007/alto/0000...</td>\n",
       "      <td>2368</td>\n",
       "      <td>3063</td>\n",
       "      <td>300</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>12325954</td>\n",
       "      <td>0</td>\n",
       "      <td>1860</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>abz1860ag0001</td>\n",
       "      <td>0007_00000367</td>\n",
       "      <td>Notizblatt</td>\n",
       "      <td>/cont01/periodika/abz/1860/18600007/00000367.tif</td>\n",
       "      <td>00000367.tif</td>\n",
       "      <td>1</td>\n",
       "      <td>/conttxt/periodika/abz/1860/18600007/alto/0000...</td>\n",
       "      <td>2368</td>\n",
       "      <td>3059</td>\n",
       "      <td>300</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>12325955</td>\n",
       "      <td>0</td>\n",
       "      <td>1860</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     manifest_id        page_id structure_info  \\\n",
       "0  abz1860ag0001  0007_00000363     Notizblatt   \n",
       "1  abz1860ag0001  0007_00000364     Notizblatt   \n",
       "2  abz1860ag0001  0007_00000365     Notizblatt   \n",
       "3  abz1860ag0001  0007_00000366     Notizblatt   \n",
       "4  abz1860ag0001  0007_00000367     Notizblatt   \n",
       "\n",
       "                                          dateipfad     dateiname  has_ocr  \\\n",
       "0  /cont01/periodika/abz/1860/18600007/00000363.tif  00000363.tif        1   \n",
       "1  /cont01/periodika/abz/1860/18600007/00000364.tif  00000364.tif        1   \n",
       "2  /cont01/periodika/abz/1860/18600007/00000365.tif  00000365.tif        1   \n",
       "3  /cont01/periodika/abz/1860/18600007/00000366.tif  00000366.tif        1   \n",
       "4  /cont01/periodika/abz/1860/18600007/00000367.tif  00000367.tif        1   \n",
Stefan Karner's avatar
Stefan Karner committed
       "\n",
       "                                           alto_path  width  height  \\\n",
       "0  /conttxt/periodika/abz/1860/18600007/alto/0000...   2368    3066   \n",
       "1  /conttxt/periodika/abz/1860/18600007/alto/0000...   2368    3066   \n",
       "2  /conttxt/periodika/abz/1860/18600007/alto/0000...   2368    3063   \n",
       "3  /conttxt/periodika/abz/1860/18600007/alto/0000...   2368    3063   \n",
       "4  /conttxt/periodika/abz/1860/18600007/alto/0000...   2368    3059   \n",
       "\n",
       "   resolution  color_depth  is_not_for_publication  order_index  from_abo  \\\n",
       "0         300            8                       0     12325951         0   \n",
       "1         300            8                       0     12325952         0   \n",
       "2         300            8                       0     12325953         0   \n",
       "3         300            8                       0     12325954         0   \n",
       "4         300            8                       0     12325955         0   \n",
       "\n",
       "   year  \n",
       "0  1860  \n",
       "1  1860  \n",
       "2  1860  \n",
       "3  1860  \n",
       "4  1860  "
Stefan Karner's avatar
Stefan Karner committed
      ]
     },
Stefan Karner's avatar
Stefan Karner committed
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pages.head()"
   ]
  },
  {
   "cell_type": "code",
Stefan Karner's avatar
Stefan Karner committed
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>manifest_id</th>\n",
       "      <th>aid</th>\n",
       "      <th>year</th>\n",
       "      <th>day</th>\n",
       "      <th>dc_title</th>\n",
       "      <th>dc_title_additional</th>\n",
       "      <th>subjects</th>\n",
       "      <th>place_of_publications</th>\n",
       "      <th>languages</th>\n",
       "      <th>dc_type</th>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <th>meta_type</th>\n",
       "      <th>ini_type</th>\n",
       "      <th>modification_datetime</th>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <th>longer_page_id</th>\n",
       "      <th>dc_date</th>\n",
       "      <th>link_pdf</th>\n",
       "      <th>link_old</th>\n",
       "      <th>has_ocr</th>\n",
       "      <th>meta_id</th>\n",
       "      <th>page_count</th>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>fug15050701</td>\n",
       "      <td>fug</td>\n",
       "      <td>1505</td>\n",
       "      <td>15050701</td>\n",
       "      <td>Fugger - Zeitungen</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Tageszeitung</td>\n",
       "      <td>o.O.</td>\n",
       "      <td>de</td>\n",
       "      <td>newspaper</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>brz</td>\n",
       "      <td>anno</td>\n",
       "      <td>2013-06-27 13:28:35</td>\n",
       "      <td>1</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>1505-07-01</td>\n",
       "      <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
       "      <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
       "      <td>0</td>\n",
       "      <td>1122198</td>\n",
       "      <td>2</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>fug15680120</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>fug</td>\n",
       "      <td>1568</td>\n",
       "      <td>15680120</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>Fugger - Zeitungen</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Tageszeitung</td>\n",
       "      <td>o.O.</td>\n",
       "      <td>de</td>\n",
       "      <td>newspaper</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>brz</td>\n",
       "      <td>anno</td>\n",
       "      <td>2013-06-27 13:27:59</td>\n",
       "      <td>1</td>\n",
       "      <td>1568-01-20</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
       "      <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
       "      <td>0</td>\n",
       "      <td>1122201</td>\n",
       "      <td>1</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>fug15680124</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>fug</td>\n",
       "      <td>1568</td>\n",
       "      <td>15680124</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>Fugger - Zeitungen</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Tageszeitung</td>\n",
       "      <td>o.O.</td>\n",
       "      <td>de</td>\n",
       "      <td>newspaper</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>brz</td>\n",
       "      <td>anno</td>\n",
       "      <td>2013-06-27 13:27:58</td>\n",
       "      <td>1</td>\n",
       "      <td>1568-01-24</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
       "      <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
       "      <td>0</td>\n",
       "      <td>1122202</td>\n",
       "      <td>2</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>fug15680228</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>fug</td>\n",
       "      <td>1568</td>\n",
       "      <td>15680228</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>Fugger - Zeitungen</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Tageszeitung</td>\n",
       "      <td>o.O.</td>\n",
       "      <td>de</td>\n",
       "      <td>newspaper</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>brz</td>\n",
       "      <td>anno</td>\n",
       "      <td>2013-06-27 13:27:59</td>\n",
       "      <td>1</td>\n",
       "      <td>1568-02-28</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...</td>\n",
       "      <td>http://anno.onb.ac.at/cgi-content/anno?aid=fug...</td>\n",
       "      <td>0</td>\n",
       "      <td>1122203</td>\n",
       "      <td>1</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>fug15680304</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>fug</td>\n",
       "      <td>1568</td>\n",
       "      <td>15680304</td>\n",
Stefan Karner's avatar
Stefan Karner committed
       "      <td>Fugger - Zeitungen</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Tageszeitung</td>\n",
       "      <td>o.O.</td>\n",
       "      <td>de</td>\n",
       "      <td>newspaper</td>\n",