From 6416392bc3b196408e642b715f17a29a8b5ef63b Mon Sep 17 00:00:00 2001 From: Stefan Karner Date: Fri, 3 May 2019 08:50:22 +0200 Subject: [PATCH] Add html versions for block 3; change 3.3 --- 3.3 - Text - Download OCR Text.ipynb | 1125 +- html-versions/3 - Images and Text.html | 13415 ++++++++++++++ html-versions/3 - Images and Text.slides.html | 13603 ++++++++++++++ .../3.1 - IIIF Collection from SPARQL.html | 13359 ++++++++++++++ ...downsized images for machine learning.html | 15023 ++++++++++++++++ .../3.3 - Text - Download OCR Text.html | 14210 +++++++++++++++ 6 files changed, 70057 insertions(+), 678 deletions(-) create mode 100644 html-versions/3 - Images and Text.html create mode 100644 html-versions/3 - Images and Text.slides.html create mode 100644 html-versions/3.1 - IIIF Collection from SPARQL.html create mode 100644 html-versions/3.2 - Images - Download pre-downsized images for machine learning.html create mode 100644 html-versions/3.3 - Text - Download OCR Text.html diff --git a/3.3 - Text - Download OCR Text.ipynb b/3.3 - Text - Download OCR Text.ipynb index ccc08aa..4f24bb4 100644 --- a/3.3 - Text - Download OCR Text.ipynb +++ b/3.3 - Text - Download OCR Text.ipynb @@ -2,7 +2,11 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, "source": [ "# 3.3 - Text - Download OCR Text\n", "\n", @@ -15,7 +19,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, "source": [ "In order to get to this text, we have to\n", "\n", @@ -27,14 +35,22 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, "source": [ "### Find a Newspaper Issue" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "" + } + }, "source": [ "Let's take a look at the [ONB Labs' historic newspapers](https://labs.onb.ac.at/en/dataset/anno/)" ] @@ 
-42,7 +58,11 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, "outputs": [], "source": [ "import pandas as pd\n", @@ -53,7 +73,11 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, "outputs": [ { "data": { @@ -101,180 +125,180 @@ " \n", " \n", " \n", - " 140468\n", - " kfz18410324\n", - " kfz\n", - " 1841\n", - " 18410324\n", - " Klagenfurter Zeitung\n", + " 47898\n", + " bor18220712\n", + " bor\n", + " 1822\n", + " 18220712\n", + " Amtliches Cursblatt der Wiener Börse\n", " NaN\n", - " Tageszeitung\n", - " Klagenfurt\n", + " Wirtschaft\n", + " Wien\n", " de\n", " newspaper\n", " ...\n", " zeitungen\n", " anno\n", - " 2003-12-02 19:06:09\n", + " 2013-04-22 14:12:28\n", " 0\n", - " 1841-03-24\n", + " 1822-07-12\n", " http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...\n", - " http://anno.onb.ac.at/cgi-content/anno?aid=kfz...\n", - " 1\n", - " 766277\n", - " 20\n", + " http://anno.onb.ac.at/cgi-content/anno?aid=bor...\n", + " 0\n", + " 81750\n", + " 2\n", " \n", " \n", - " 181233\n", - " wtz18281211\n", - " wtz\n", - " 1828\n", - " 18281211\n", - " Theaterzettel (Oper und Burgtheater in Wien)\n", + " 149247\n", + " lvb18740301\n", + " lvb\n", + " 1874\n", + " 18740301\n", + " Linzer Volksblatt\n", " NaN\n", - " Kultur, Kunst, Theater, Musik\n", - " Wien\n", + " Tageszeitung\n", + " Linz\n", " de\n", " newspaper\n", " ...\n", " zeitungen\n", " anno\n", - " 2010-12-21 02:37:44\n", + " 2010-11-29 09:39:04\n", " 0\n", - " 1828-12-11\n", + " 1874-03-01\n", " http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...\n", - " http://anno.onb.ac.at/cgi-content/anno?aid=wtz...\n", - " 0\n", - " 961683\n", + " http://anno.onb.ac.at/cgi-content/anno?aid=lvb...\n", " 1\n", + " 776681\n", + " 6\n", " \n", " \n", - " 28465\n", - " apr18640419\n", - " apr\n", - " 1864\n", - " 18640419\n", - " Die Presse\n", + " 48126\n", + " 
bor18230417\n", + " bor\n", + " 1823\n", + " 18230417\n", + " Amtliches Cursblatt der Wiener Börse\n", " NaN\n", - " Tageszeitung\n", - " Wien, Brno (Brünn)\n", + " Wirtschaft\n", + " Wien\n", " de\n", " newspaper\n", " ...\n", " zeitungen\n", " anno\n", - " 2010-12-07 15:20:30\n", + " 2013-04-22 14:13:30\n", " 0\n", - " 1864-04-19\n", + " 1823-04-17\n", " http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...\n", - " http://anno.onb.ac.at/cgi-content/anno?aid=apr...\n", - " 1\n", - " 14304\n", - " 12\n", + " http://anno.onb.ac.at/cgi-content/anno?aid=bor...\n", + " 0\n", + " 81978\n", + " 2\n", " \n", " \n", - " 181260\n", - " wtz18290111\n", - " wtz\n", - " 1829\n", - " 18290111\n", - " Theaterzettel (Oper und Burgtheater in Wien)\n", + " 80491\n", + " neu18670531\n", + " neu\n", + " 1867\n", + " 18670531\n", + " Die Neuzeit\n", " NaN\n", - " Kultur, Kunst, Theater, Musik\n", + " Tageszeitung\n", " Wien\n", " de\n", " newspaper\n", " ...\n", " zeitungen\n", " anno\n", - " 2010-12-21 02:37:45\n", + " 2012-11-19 11:38:11\n", " 0\n", - " 1829-01-11\n", + " 1867-05-31\n", " http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...\n", - " http://anno.onb.ac.at/cgi-content/anno?aid=wtz...\n", - " 0\n", - " 961710\n", + " http://anno.onb.ac.at/cgi-content/anno?aid=neu...\n", " 1\n", + " 309977\n", + " 12\n", " \n", " \n", - " 14201\n", - " bdc1868ag0123\n", - " bdc\n", - " 1868\n", - " 18680123\n", - " Ordinariats-Blatt der Budweiser Diöcese\n", + " 138634\n", + " joe18710415\n", + " joe\n", + " 1871\n", + " 18710415\n", + " Jörgel Briefe\n", " NaN\n", - " Religion\n", - " Budweis (BudÄ›jovice, Budovicium, ÄŒeské BudÄ...\n", + " Wochenzeitung\n", + " Wien\n", " de\n", - " periodical\n", + " newspaper\n", " ...\n", - " periodika\n", - " annoplus\n", - " 2015-06-12 07:57:34\n", + " zeitungen\n", + " anno\n", + " 2009-04-02 10:54:27\n", " 0\n", - " 1868\n", + " 1871-04-15\n", " http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...\n", - " 
http://anno.onb.ac.at/cgi-content/anno-plus?ai...\n", + " http://anno.onb.ac.at/cgi-content/anno?aid=joe...\n", " 1\n", - " 1048842\n", - " 8\n", + " 762724\n", + " 16\n", " \n", " \n", - " 143594\n", - " kfz18610212\n", - " kfz\n", - " 1861\n", - " 18610212\n", - " Klagenfurter Zeitung\n", + " 75189\n", + " iwe18730304\n", + " iwe\n", + " 1873\n", + " 18730304\n", + " Illustrirtes Wiener Extrablatt\n", " NaN\n", " Tageszeitung\n", - " Klagenfurt\n", + " Wien\n", " de\n", " newspaper\n", " ...\n", " zeitungen\n", " anno\n", - " 2013-09-20 15:24:13\n", + " 2014-07-25 11:26:13\n", " 0\n", - " 1861-02-12\n", + " 1873-03-04\n", " http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...\n", - " http://anno.onb.ac.at/cgi-content/anno?aid=kfz...\n", + " http://anno.onb.ac.at/cgi-content/anno?aid=iwe...\n", " 1\n", - " 769403\n", - " 6\n", + " 220835\n", + " 8\n", " \n", " \n", - " 47299\n", - " bor18200703\n", - " bor\n", - " 1820\n", - " 18200703\n", - " Amtliches Cursblatt der Wiener Börse\n", + " 193395\n", + " wtz18680917\n", + " wtz\n", + " 1868\n", + " 18680917\n", + " Theaterzettel (Oper und Burgtheater in Wien)\n", " NaN\n", - " Wirtschaft\n", + " Kultur, Kunst, Theater, Musik\n", " Wien\n", " de\n", " newspaper\n", " ...\n", " zeitungen\n", " anno\n", - " 2013-04-22 14:09:40\n", + " 2014-03-21 10:28:59\n", " 0\n", - " 1820-07-03\n", + " 1868-09-17\n", " http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...\n", - " http://anno.onb.ac.at/cgi-content/anno?aid=bor...\n", + " http://anno.onb.ac.at/cgi-content/anno?aid=wtz...\n", " 0\n", - " 81151\n", - " 2\n", + " 973845\n", + " 1\n", " \n", " \n", - " 218786\n", - " wrz18510418\n", - " wrz\n", - " 1851\n", - " 18510418\n", - " Wiener Zeitung\n", + " 69095\n", + " ode18630730\n", + " ode\n", + " 1863\n", + " 18630730\n", + " Ost-Deutsche Post\n", " NaN\n", " Tageszeitung\n", " Wien\n", @@ -283,45 +307,45 @@ " ...\n", " zeitungen\n", " anno\n", - " 2010-12-28 15:27:30\n", + " 2018-08-29 09:15:11\n", " 0\n", - " 1851-04-18\n", + " 
1863-07-30\n", " http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...\n", - " http://anno.onb.ac.at/cgi-content/anno?aid=wrz...\n", + " http://anno.onb.ac.at/cgi-content/anno?aid=ode...\n", " 1\n", - " 1012484\n", - " 28\n", + " 183458\n", + " 4\n", " \n", " \n", - " 77174\n", - " lmz18710902\n", - " lmz\n", - " 1871\n", - " 18710902\n", - " Leitmeritzer Zeitung\n", + " 125850\n", + " hum18450125\n", + " hum\n", + " 1845\n", + " 18450125\n", + " Der Humorist\n", " NaN\n", - " Tageszeitung\n", - " Litoměřice (Leitmeritz)\n", + " Humor, Satire, Geschichte\n", + " Wien\n", " de\n", " newspaper\n", " ...\n", " zeitungen\n", " anno\n", - " 2011-01-26 09:31:40\n", + " 2003-11-20 12:06:01\n", " 0\n", - " 1871-09-02\n", + " 1845-01-25\n", " http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...\n", - " http://anno.onb.ac.at/cgi-content/anno?aid=lmz...\n", + " http://anno.onb.ac.at/cgi-content/anno?aid=hum...\n", " 1\n", - " 269657\n", + " 729148\n", " 12\n", " \n", " \n", - " 153225\n", - " mop18550426\n", + " 152963\n", + " mop18540720\n", " mop\n", - " 1855\n", - " 18550426\n", + " 1854\n", + " 18540720\n", " Morgen-Post\n", " NaN\n", " Tageszeitung\n", @@ -331,13 +355,13 @@ " ...\n", " zeitungen\n", " anno\n", - " 2012-12-11 13:46:38\n", + " 2012-12-11 13:45:49\n", " 0\n", - " 1855-04-26\n", + " 1854-07-20\n", " http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...\n", " http://anno.onb.ac.at/cgi-content/anno?aid=mop...\n", " 1\n", - " 801457\n", + " 801195\n", " 4\n", " \n", " \n", @@ -346,113 +370,101 @@ "" ], "text/plain": [ - " manifest_id aid year day \\\n", - "140468 kfz18410324 kfz 1841 18410324 \n", - "181233 wtz18281211 wtz 1828 18281211 \n", - "28465 apr18640419 apr 1864 18640419 \n", - "181260 wtz18290111 wtz 1829 18290111 \n", - "14201 bdc1868ag0123 bdc 1868 18680123 \n", - "143594 kfz18610212 kfz 1861 18610212 \n", - "47299 bor18200703 bor 1820 18200703 \n", - "218786 wrz18510418 wrz 1851 18510418 \n", - "77174 lmz18710902 lmz 1871 18710902 \n", - "153225 mop18550426 
mop 1855 18550426 \n", + " manifest_id aid year day \\\n", + "47898 bor18220712 bor 1822 18220712 \n", + "149247 lvb18740301 lvb 1874 18740301 \n", + "48126 bor18230417 bor 1823 18230417 \n", + "80491 neu18670531 neu 1867 18670531 \n", + "138634 joe18710415 joe 1871 18710415 \n", + "75189 iwe18730304 iwe 1873 18730304 \n", + "193395 wtz18680917 wtz 1868 18680917 \n", + "69095 ode18630730 ode 1863 18630730 \n", + "125850 hum18450125 hum 1845 18450125 \n", + "152963 mop18540720 mop 1854 18540720 \n", "\n", " dc_title dc_title_additional \\\n", - "140468 Klagenfurter Zeitung NaN \n", - "181233 Theaterzettel (Oper und Burgtheater in Wien) NaN \n", - "28465 Die Presse NaN \n", - "181260 Theaterzettel (Oper und Burgtheater in Wien) NaN \n", - "14201 Ordinariats-Blatt der Budweiser Diöcese NaN \n", - "143594 Klagenfurter Zeitung NaN \n", - "47299 Amtliches Cursblatt der Wiener Börse NaN \n", - "218786 Wiener Zeitung NaN \n", - "77174 Leitmeritzer Zeitung NaN \n", - "153225 Morgen-Post NaN \n", + "47898 Amtliches Cursblatt der Wiener Börse NaN \n", + "149247 Linzer Volksblatt NaN \n", + "48126 Amtliches Cursblatt der Wiener Börse NaN \n", + "80491 Die Neuzeit NaN \n", + "138634 Jörgel Briefe NaN \n", + "75189 Illustrirtes Wiener Extrablatt NaN \n", + "193395 Theaterzettel (Oper und Burgtheater in Wien) NaN \n", + "69095 Ost-Deutsche Post NaN \n", + "125850 Der Humorist NaN \n", + "152963 Morgen-Post NaN \n", "\n", - " subjects \\\n", - "140468 Tageszeitung \n", - "181233 Kultur, Kunst, Theater, Musik \n", - "28465 Tageszeitung \n", - "181260 Kultur, Kunst, Theater, Musik \n", - "14201 Religion \n", - "143594 Tageszeitung \n", - "47299 Wirtschaft \n", - "218786 Tageszeitung \n", - "77174 Tageszeitung \n", - "153225 Tageszeitung \n", + " subjects place_of_publications languages \\\n", + "47898 Wirtschaft Wien de \n", + "149247 Tageszeitung Linz de \n", + "48126 Wirtschaft Wien de \n", + "80491 Tageszeitung Wien de \n", + "138634 Wochenzeitung Wien de \n", + "75189 
Tageszeitung Wien de \n", + "193395 Kultur, Kunst, Theater, Musik Wien de \n", + "69095 Tageszeitung Wien de \n", + "125850 Humor, Satire, Geschichte Wien de \n", + "152963 Tageszeitung Wien de \n", "\n", - " place_of_publications languages \\\n", - "140468 Klagenfurt de \n", - "181233 Wien de \n", - "28465 Wien, Brno (Brünn) de \n", - "181260 Wien de \n", - "14201 Budweis (BudÄ›jovice, Budovicium, ÄŒeské BudÄ... de \n", - "143594 Klagenfurt de \n", - "47299 Wien de \n", - "218786 Wien de \n", - "77174 Litoměřice (Leitmeritz) de \n", - "153225 Wien de \n", - "\n", - " dc_type ... meta_type ini_type modification_datetime \\\n", - "140468 newspaper ... zeitungen anno 2003-12-02 19:06:09 \n", - "181233 newspaper ... zeitungen anno 2010-12-21 02:37:44 \n", - "28465 newspaper ... zeitungen anno 2010-12-07 15:20:30 \n", - "181260 newspaper ... zeitungen anno 2010-12-21 02:37:45 \n", - "14201 periodical ... periodika annoplus 2015-06-12 07:57:34 \n", - "143594 newspaper ... zeitungen anno 2013-09-20 15:24:13 \n", - "47299 newspaper ... zeitungen anno 2013-04-22 14:09:40 \n", - "218786 newspaper ... zeitungen anno 2010-12-28 15:27:30 \n", - "77174 newspaper ... zeitungen anno 2011-01-26 09:31:40 \n", - "153225 newspaper ... zeitungen anno 2012-12-11 13:46:38 \n", + " dc_type ... meta_type ini_type modification_datetime \\\n", + "47898 newspaper ... zeitungen anno 2013-04-22 14:12:28 \n", + "149247 newspaper ... zeitungen anno 2010-11-29 09:39:04 \n", + "48126 newspaper ... zeitungen anno 2013-04-22 14:13:30 \n", + "80491 newspaper ... zeitungen anno 2012-11-19 11:38:11 \n", + "138634 newspaper ... zeitungen anno 2009-04-02 10:54:27 \n", + "75189 newspaper ... zeitungen anno 2014-07-25 11:26:13 \n", + "193395 newspaper ... zeitungen anno 2014-03-21 10:28:59 \n", + "69095 newspaper ... zeitungen anno 2018-08-29 09:15:11 \n", + "125850 newspaper ... zeitungen anno 2003-11-20 12:06:01 \n", + "152963 newspaper ... 
zeitungen anno 2012-12-11 13:45:49 \n", "\n", " longer_page_id dc_date \\\n", - "140468 0 1841-03-24 \n", - "181233 0 1828-12-11 \n", - "28465 0 1864-04-19 \n", - "181260 0 1829-01-11 \n", - "14201 0 1868 \n", - "143594 0 1861-02-12 \n", - "47299 0 1820-07-03 \n", - "218786 0 1851-04-18 \n", - "77174 0 1871-09-02 \n", - "153225 0 1855-04-26 \n", + "47898 0 1822-07-12 \n", + "149247 0 1874-03-01 \n", + "48126 0 1823-04-17 \n", + "80491 0 1867-05-31 \n", + "138634 0 1871-04-15 \n", + "75189 0 1873-03-04 \n", + "193395 0 1868-09-17 \n", + "69095 0 1863-07-30 \n", + "125850 0 1845-01-25 \n", + "152963 0 1854-07-20 \n", "\n", " link_pdf \\\n", - "140468 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", - "181233 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", - "28465 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", - "181260 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", - "14201 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", - "143594 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", - "47299 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", - "218786 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", - "77174 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", - "153225 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", + "47898 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", + "149247 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", + "48126 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", + "80491 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", + "138634 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", + "75189 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", + "193395 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", + "69095 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", + "125850 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... \n", + "152963 http://anno.onb.ac.at/cgi-content/anno_pdf.pl?... 
\n", "\n", " link_old has_ocr meta_id \\\n", - "140468 http://anno.onb.ac.at/cgi-content/anno?aid=kfz... 1 766277 \n", - "181233 http://anno.onb.ac.at/cgi-content/anno?aid=wtz... 0 961683 \n", - "28465 http://anno.onb.ac.at/cgi-content/anno?aid=apr... 1 14304 \n", - "181260 http://anno.onb.ac.at/cgi-content/anno?aid=wtz... 0 961710 \n", - "14201 http://anno.onb.ac.at/cgi-content/anno-plus?ai... 1 1048842 \n", - "143594 http://anno.onb.ac.at/cgi-content/anno?aid=kfz... 1 769403 \n", - "47299 http://anno.onb.ac.at/cgi-content/anno?aid=bor... 0 81151 \n", - "218786 http://anno.onb.ac.at/cgi-content/anno?aid=wrz... 1 1012484 \n", - "77174 http://anno.onb.ac.at/cgi-content/anno?aid=lmz... 1 269657 \n", - "153225 http://anno.onb.ac.at/cgi-content/anno?aid=mop... 1 801457 \n", + "47898 http://anno.onb.ac.at/cgi-content/anno?aid=bor... 0 81750 \n", + "149247 http://anno.onb.ac.at/cgi-content/anno?aid=lvb... 1 776681 \n", + "48126 http://anno.onb.ac.at/cgi-content/anno?aid=bor... 0 81978 \n", + "80491 http://anno.onb.ac.at/cgi-content/anno?aid=neu... 1 309977 \n", + "138634 http://anno.onb.ac.at/cgi-content/anno?aid=joe... 1 762724 \n", + "75189 http://anno.onb.ac.at/cgi-content/anno?aid=iwe... 1 220835 \n", + "193395 http://anno.onb.ac.at/cgi-content/anno?aid=wtz... 0 973845 \n", + "69095 http://anno.onb.ac.at/cgi-content/anno?aid=ode... 1 183458 \n", + "125850 http://anno.onb.ac.at/cgi-content/anno?aid=hum... 1 729148 \n", + "152963 http://anno.onb.ac.at/cgi-content/anno?aid=mop... 
1 801195 \n", "\n", " page_count \n", - "140468 20 \n", - "181233 1 \n", - "28465 12 \n", - "181260 1 \n", - "14201 8 \n", - "143594 6 \n", - "47299 2 \n", - "218786 28 \n", - "77174 12 \n", - "153225 4 \n", + "47898 2 \n", + "149247 6 \n", + "48126 2 \n", + "80491 12 \n", + "138634 16 \n", + "75189 8 \n", + "193395 1 \n", + "69095 4 \n", + "125850 12 \n", + "152963 4 \n", "\n", "[10 rows x 21 columns]" ] @@ -468,9 +480,13 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, "source": [ - "Let's go with the *Leitmeritzer Zeitung* issue from the 2nd of September 1871" + "Let's go with the *Ost-Deutsche Post* issue from the 30th of July 1863" ] }, { @@ -479,12 +495,16 @@ "metadata": {}, "outputs": [], "source": [ - "manifest_id = 'lmz18710902'" + "manifest_id = 'ode18630730'" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, "source": [ "### Download the IIIF Manifest" ] @@ -495,13 +515,17 @@ "source": [ "If we look at the [SACHA API description](https://iiif.onb.ac.at/api#_manifestrequestprocessor), we see that the link for the IIIF manifest has to look like this:\n", "\n", - "`http://iiif.onb.ac.at/presentation/ANNO/lmz18710902/manifest`" + "`http://iiif.onb.ac.at/presentation/ANNO/ode18630730/manifest`" ] }, { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, "outputs": [], "source": [ "import requests" @@ -510,30 +534,47 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "r = requests.get(f'http://iiif.onb.ac.at/presentation/ANNO/{manifest_id}/manifest')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, "outputs": [ { "data": { "text/plain": [ "{'@context': 
'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/manifest',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/manifest',\n", " '@type': 'sc:Manifest',\n", - " 'label': 'Leitmeritzer Zeitung 1871-09-02',\n", + " 'label': 'Ost-Deutsche Post 1863-07-30',\n", " 'metadata': [{'label': [{'@value': 'Id', '@language': 'en'},\n", " {'@value': 'Id', '@language': 'ger'}],\n", - " 'value': 'lmz18710902'},\n", + " 'value': 'ode18630730'},\n", " {'label': [{'@value': 'Title', '@language': 'en'},\n", " {'@value': 'Titel', '@language': 'ger'}],\n", - " 'value': 'Leitmeritzer Zeitung'},\n", + " 'value': 'Ost-Deutsche Post'},\n", " {'label': [{'@value': 'Type', '@language': 'en'},\n", " {'@value': 'Typ', '@language': 'ger'}],\n", " 'value': 'newspaper'},\n", " {'label': [{'@value': 'Place of Publications', '@language': 'en'},\n", " {'@value': 'Erscheinungsort', '@language': 'ger'}],\n", - " 'value': \"Litoměřice (Leitmeritz)\"},\n", + " 'value': \"Wien\"},\n", " {'label': [{'@value': 'Date Issued', '@language': 'en'},\n", " {'@value': 'Erscheinungsdatum', '@language': 'ger'}],\n", - " 'value': '1871-09-02'},\n", + " 'value': '1863-07-30'},\n", " {'label': [{'@value': 'Subject Heading', '@language': 'en'},\n", " {'@value': 'Schlagworte', '@language': 'ger'}],\n", " 'value': \"Tageszeitung\"},\n", @@ -543,398 +584,161 @@ " {'label': [{'@value': 'Languages', '@language': 'en'},\n", " {'@value': 'Sprachen', '@language': 'ger'}],\n", " 'value': 'ger'}],\n", - " 'description': 'Leitmeritzer Zeitung 1871-09-02',\n", + " 'description': 'Ost-Deutsche Post 1863-07-30',\n", " 'viewingDirection': 'left-to-right',\n", " 'viewingHint': 'paged',\n", " 'license': 'http://creativecommons.org/publicdomain/mark/1.0/',\n", " 'attribution': [{'@value': 'Austrian National Library', '@language': 'en'},\n", " {'@value': 'Österreichische Nationalbibliothek', '@language': 'ger'}],\n", " 'logo': 
'https://iiif.onb.ac.at/logo/',\n", - " 'seeAlso': [{'@id': 'http://anno.onb.ac.at/cgi-content/anno_pdf.pl?aid=lmz&datum=18710902',\n", + " 'seeAlso': [{'@id': 'http://anno.onb.ac.at/cgi-content/anno_pdf.pl?aid=ode&datum=18630730',\n", " 'format': 'application/pdf'},\n", - " {'@id': 'http://anno.onb.ac.at/cgi-content/anno?aid=lmz&datum=18710902',\n", + " {'@id': 'http://anno.onb.ac.at/cgi-content/anno?aid=ode&datum=18630730',\n", " 'format': 'text/html'},\n", - " {'@id': 'http://data.onb.ac.at/ANNO/lmz18710902.rdf',\n", + " {'@id': 'http://data.onb.ac.at/ANNO/ode18630730.rdf',\n", " 'format': 'application/rdf+xml'}],\n", " 'sequences': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/sequence/normal',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/sequence/normal',\n", " '@type': 'sc:Sequence',\n", - " 'startCanvas': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000001',\n", + " 'startCanvas': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000001',\n", " 'canvases': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000001',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000001',\n", " '@type': 'sc:Canvas',\n", " 'label': '00000001',\n", - " 'height': 3788,\n", - " 'width': 2819,\n", - " 'metadata': [{'label': 'Resolution', 'value': '0dpi'},\n", - " {'label': 'Color Depth', 'value': '0bpp'}],\n", + " 'height': 6148,\n", + " 'width': 4456,\n", + " 'metadata': [{'label': 'Resolution', 'value': '300dpi'},\n", + " {'label': 'Color Depth', 'value': '8bpp'}],\n", " 'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/annotation/00000001',\n", + " '@id': 
'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/annotation/00000001',\n", " '@type': 'oa:Annotation',\n", " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000001/full/full/0/default.jpg',\n", + " 'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/ode18630730/00000001/full/full/0/default.jpg',\n", " '@type': 'dctypes:Image',\n", - " 'height': 3788,\n", - " 'width': 2819,\n", + " 'height': 6148,\n", + " 'width': 4456,\n", " 'format': 'image/jpeg',\n", " 'service': {'@context': 'https://iiif.io/api/image/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000001',\n", + " '@id': 'https://iiif.onb.ac.at/images/ANNO/ode18630730/00000001',\n", " 'profile': 'https://iiif.io/api/image/2/level2.json'}},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000001'}],\n", + " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000001'}],\n", " 'otherContent': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000001.json',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000001.json',\n", " '@type': 'sc:AnnotationList',\n", " 'resources': [{'@type': 'oa:Annotation',\n", " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000001.xml',\n", + " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000001.xml',\n", " '@type': 'dctypes:Text',\n", " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000001'}]}]},\n", + " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000001'}]}]},\n", " {'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 
'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000002',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000002',\n", " '@type': 'sc:Canvas',\n", " 'label': '00000002',\n", - " 'height': 3802,\n", - " 'width': 2822,\n", - " 'metadata': [{'label': 'Resolution', 'value': '0dpi'},\n", - " {'label': 'Color Depth', 'value': '0bpp'}],\n", + " 'height': 6176,\n", + " 'width': 4444,\n", + " 'metadata': [{'label': 'Resolution', 'value': '300dpi'},\n", + " {'label': 'Color Depth', 'value': '8bpp'}],\n", " 'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/annotation/00000002',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/annotation/00000002',\n", " '@type': 'oa:Annotation',\n", " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000002/full/full/0/default.jpg',\n", + " 'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/ode18630730/00000002/full/full/0/default.jpg',\n", " '@type': 'dctypes:Image',\n", - " 'height': 3802,\n", - " 'width': 2822,\n", + " 'height': 6176,\n", + " 'width': 4444,\n", " 'format': 'image/jpeg',\n", " 'service': {'@context': 'https://iiif.io/api/image/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000002',\n", + " '@id': 'https://iiif.onb.ac.at/images/ANNO/ode18630730/00000002',\n", " 'profile': 'https://iiif.io/api/image/2/level2.json'}},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000002'}],\n", + " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000002'}],\n", " 'otherContent': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000002.json',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000002.json',\n", " '@type': 
'sc:AnnotationList',\n", " 'resources': [{'@type': 'oa:Annotation',\n", " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000002.xml',\n", + " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000002.xml',\n", " '@type': 'dctypes:Text',\n", " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000002'}]}]},\n", + " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000002'}]}]},\n", " {'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000003',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000003',\n", " '@type': 'sc:Canvas',\n", " 'label': '00000003',\n", - " 'height': 3788,\n", - " 'width': 2819,\n", - " 'metadata': [{'label': 'Resolution', 'value': '0dpi'},\n", - " {'label': 'Color Depth', 'value': '0bpp'}],\n", + " 'height': 6148,\n", + " 'width': 4456,\n", + " 'metadata': [{'label': 'Resolution', 'value': '300dpi'},\n", + " {'label': 'Color Depth', 'value': '8bpp'}],\n", " 'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/annotation/00000003',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/annotation/00000003',\n", " '@type': 'oa:Annotation',\n", " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000003/full/full/0/default.jpg',\n", + " 'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/ode18630730/00000003/full/full/0/default.jpg',\n", " '@type': 'dctypes:Image',\n", - " 'height': 3788,\n", - " 'width': 2819,\n", + " 'height': 6148,\n", + " 'width': 4456,\n", " 'format': 'image/jpeg',\n", " 'service': {'@context': 'https://iiif.io/api/image/2/context.json',\n", - " '@id': 
'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000003',\n", + " '@id': 'https://iiif.onb.ac.at/images/ANNO/ode18630730/00000003',\n", " 'profile': 'https://iiif.io/api/image/2/level2.json'}},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000003'}],\n", + " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000003'}],\n", " 'otherContent': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000003.json',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000003.json',\n", " '@type': 'sc:AnnotationList',\n", " 'resources': [{'@type': 'oa:Annotation',\n", " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000003.xml',\n", + " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000003.xml',\n", " '@type': 'dctypes:Text',\n", " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000003'}]}]},\n", + " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000003'}]}]},\n", " {'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000004',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000004',\n", " '@type': 'sc:Canvas',\n", " 'label': '00000004',\n", - " 'height': 3802,\n", - " 'width': 2822,\n", - " 'metadata': [{'label': 'Resolution', 'value': '0dpi'},\n", - " {'label': 'Color Depth', 'value': '0bpp'}],\n", - " 'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/annotation/00000004',\n", - " '@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 
'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000004/full/full/0/default.jpg',\n", - " '@type': 'dctypes:Image',\n", - " 'height': 3802,\n", - " 'width': 2822,\n", - " 'format': 'image/jpeg',\n", - " 'service': {'@context': 'https://iiif.io/api/image/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000004',\n", - " 'profile': 'https://iiif.io/api/image/2/level2.json'}},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000004'}],\n", - " 'otherContent': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000004.json',\n", - " '@type': 'sc:AnnotationList',\n", - " 'resources': [{'@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000004.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000004'}]}]},\n", - " {'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000005',\n", - " '@type': 'sc:Canvas',\n", - " 'label': '00000005',\n", - " 'height': 3788,\n", - " 'width': 2819,\n", - " 'metadata': [{'label': 'Resolution', 'value': '0dpi'},\n", - " {'label': 'Color Depth', 'value': '0bpp'}],\n", - " 'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/annotation/00000005',\n", - " '@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000005/full/full/0/default.jpg',\n", - " '@type': 'dctypes:Image',\n", - " 'height': 3788,\n", - " 'width': 2819,\n", - " 'format': 'image/jpeg',\n", - " 'service': {'@context': 
'https://iiif.io/api/image/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000005',\n", - " 'profile': 'https://iiif.io/api/image/2/level2.json'}},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000005'}],\n", - " 'otherContent': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000005.json',\n", - " '@type': 'sc:AnnotationList',\n", - " 'resources': [{'@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000005.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000005'}]}]},\n", - " {'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000006',\n", - " '@type': 'sc:Canvas',\n", - " 'label': '00000006',\n", - " 'height': 3802,\n", - " 'width': 2822,\n", - " 'metadata': [{'label': 'Resolution', 'value': '0dpi'},\n", - " {'label': 'Color Depth', 'value': '0bpp'}],\n", + " 'height': 6176,\n", + " 'width': 4416,\n", + " 'metadata': [{'label': 'Resolution', 'value': '300dpi'},\n", + " {'label': 'Color Depth', 'value': '8bpp'}],\n", " 'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/annotation/00000006',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/annotation/00000004',\n", " '@type': 'oa:Annotation',\n", " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000006/full/full/0/default.jpg',\n", + " 'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/ode18630730/00000004/full/full/0/default.jpg',\n", " '@type': 'dctypes:Image',\n", - " 'height': 
3802,\n", - " 'width': 2822,\n", + " 'height': 6176,\n", + " 'width': 4416,\n", " 'format': 'image/jpeg',\n", " 'service': {'@context': 'https://iiif.io/api/image/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000006',\n", + " '@id': 'https://iiif.onb.ac.at/images/ANNO/ode18630730/00000004',\n", " 'profile': 'https://iiif.io/api/image/2/level2.json'}},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000006'}],\n", + " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000004'}],\n", " 'otherContent': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000006.json',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000004.json',\n", " '@type': 'sc:AnnotationList',\n", " 'resources': [{'@type': 'oa:Annotation',\n", " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000006.xml',\n", + " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000004.xml',\n", " '@type': 'dctypes:Text',\n", " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000006'}]}]},\n", - " {'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000007',\n", - " '@type': 'sc:Canvas',\n", - " 'label': '00000007',\n", - " 'height': 3788,\n", - " 'width': 2819,\n", - " 'metadata': [{'label': 'Resolution', 'value': '0dpi'},\n", - " {'label': 'Color Depth', 'value': '0bpp'}],\n", - " 'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/annotation/00000007',\n", - " '@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 
'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000007/full/full/0/default.jpg',\n", - " '@type': 'dctypes:Image',\n", - " 'height': 3788,\n", - " 'width': 2819,\n", - " 'format': 'image/jpeg',\n", - " 'service': {'@context': 'https://iiif.io/api/image/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000007',\n", - " 'profile': 'https://iiif.io/api/image/2/level2.json'}},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000007'}],\n", - " 'otherContent': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000007.json',\n", - " '@type': 'sc:AnnotationList',\n", - " 'resources': [{'@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000007.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000007'}]}]},\n", - " {'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000008',\n", - " '@type': 'sc:Canvas',\n", - " 'label': '00000008',\n", - " 'height': 3802,\n", - " 'width': 2822,\n", - " 'metadata': [{'label': 'Resolution', 'value': '0dpi'},\n", - " {'label': 'Color Depth', 'value': '0bpp'}],\n", - " 'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/annotation/00000008',\n", - " '@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000008/full/full/0/default.jpg',\n", - " '@type': 'dctypes:Image',\n", - " 'height': 3802,\n", - " 'width': 2822,\n", - " 'format': 'image/jpeg',\n", - " 'service': {'@context': 
'https://iiif.io/api/image/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000008',\n", - " 'profile': 'https://iiif.io/api/image/2/level2.json'}},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000008'}],\n", - " 'otherContent': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000008.json',\n", - " '@type': 'sc:AnnotationList',\n", - " 'resources': [{'@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000008.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000008'}]}]},\n", - " {'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000009',\n", - " '@type': 'sc:Canvas',\n", - " 'label': '00000009',\n", - " 'height': 3788,\n", - " 'width': 2819,\n", - " 'metadata': [{'label': 'Resolution', 'value': '0dpi'},\n", - " {'label': 'Color Depth', 'value': '0bpp'}],\n", - " 'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/annotation/00000009',\n", - " '@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000009/full/full/0/default.jpg',\n", - " '@type': 'dctypes:Image',\n", - " 'height': 3788,\n", - " 'width': 2819,\n", - " 'format': 'image/jpeg',\n", - " 'service': {'@context': 'https://iiif.io/api/image/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000009',\n", - " 'profile': 'https://iiif.io/api/image/2/level2.json'}},\n", - " 'on': 
'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000009'}],\n", - " 'otherContent': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000009.json',\n", - " '@type': 'sc:AnnotationList',\n", - " 'resources': [{'@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000009.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000009'}]}]},\n", - " {'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000010',\n", - " '@type': 'sc:Canvas',\n", - " 'label': '00000010',\n", - " 'height': 3802,\n", - " 'width': 2822,\n", - " 'metadata': [{'label': 'Resolution', 'value': '0dpi'},\n", - " {'label': 'Color Depth', 'value': '0bpp'}],\n", - " 'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/annotation/00000010',\n", - " '@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000010/full/full/0/default.jpg',\n", - " '@type': 'dctypes:Image',\n", - " 'height': 3802,\n", - " 'width': 2822,\n", - " 'format': 'image/jpeg',\n", - " 'service': {'@context': 'https://iiif.io/api/image/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000010',\n", - " 'profile': 'https://iiif.io/api/image/2/level2.json'}},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000010'}],\n", - " 'otherContent': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000010.json',\n", - " 
'@type': 'sc:AnnotationList',\n", - " 'resources': [{'@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000010.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000010'}]}]},\n", - " {'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000011',\n", - " '@type': 'sc:Canvas',\n", - " 'label': '00000011',\n", - " 'height': 3788,\n", - " 'width': 2819,\n", - " 'metadata': [{'label': 'Resolution', 'value': '0dpi'},\n", - " {'label': 'Color Depth', 'value': '0bpp'}],\n", - " 'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/annotation/00000011',\n", - " '@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000011/full/full/0/default.jpg',\n", - " '@type': 'dctypes:Image',\n", - " 'height': 3788,\n", - " 'width': 2819,\n", - " 'format': 'image/jpeg',\n", - " 'service': {'@context': 'https://iiif.io/api/image/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000011',\n", - " 'profile': 'https://iiif.io/api/image/2/level2.json'}},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000011'}],\n", - " 'otherContent': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000011.json',\n", - " '@type': 'sc:AnnotationList',\n", - " 'resources': [{'@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000011.xml',\n", - " '@type': 'dctypes:Text',\n", - " 
'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000011'}]}]},\n", - " {'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000012',\n", - " '@type': 'sc:Canvas',\n", - " 'label': '00000012',\n", - " 'height': 3802,\n", - " 'width': 2822,\n", - " 'metadata': [{'label': 'Resolution', 'value': '0dpi'},\n", - " {'label': 'Color Depth', 'value': '0bpp'}],\n", - " 'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/annotation/00000012',\n", - " '@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000012/full/full/0/default.jpg',\n", - " '@type': 'dctypes:Image',\n", - " 'height': 3802,\n", - " 'width': 2822,\n", - " 'format': 'image/jpeg',\n", - " 'service': {'@context': 'https://iiif.io/api/image/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/images/ANNO/lmz18710902/00000012',\n", - " 'profile': 'https://iiif.io/api/image/2/level2.json'}},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000012'}],\n", - " 'otherContent': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", - " '@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000012.json',\n", - " '@type': 'sc:AnnotationList',\n", - " 'resources': [{'@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000012.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000012'}]}]}]}]}" + " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000004'}]}]}]}]}" ] }, - "execution_count": 5, + 
"execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "r = requests.get('http://iiif.onb.ac.at/presentation/ANNO/lmz18710902/manifest')\n", "r.json()" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, "source": [ "There's a lot of information in there. We need the info blocks with links to ALTO-XML resources.\n", "\n", @@ -943,8 +747,12 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": {}, + "execution_count": 7, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, "outputs": [], "source": [ "from jsonpath_ng import parse" @@ -952,8 +760,12 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": {}, + "execution_count": 8, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, "outputs": [], "source": [ "def jp(http_response, parser):\n", @@ -962,8 +774,12 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, + "execution_count": 9, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, "outputs": [], "source": [ "resource_parser = parse('$.sequences[*].canvases[*].otherContent[*].resources')" @@ -973,7 +789,10 @@ "cell_type": "code", "execution_count": 10, "metadata": { - "scrolled": true + "scrolled": true, + "slideshow": { + "slide_type": "subslide" + } }, "outputs": [ { @@ -981,76 +800,28 @@ "text/plain": [ "[[{'@type': 'oa:Annotation',\n", " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000001.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000001'}],\n", - " [{'@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000002.xml',\n", + " 'resource': {'@id': 
'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000001.xml',\n", " '@type': 'dctypes:Text',\n", " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000002'}],\n", + " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000001'}],\n", " [{'@type': 'oa:Annotation',\n", " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000003.xml',\n", + " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000002.xml',\n", " '@type': 'dctypes:Text',\n", " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000003'}],\n", + " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000002'}],\n", " [{'@type': 'oa:Annotation',\n", " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000004.xml',\n", + " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000003.xml',\n", " '@type': 'dctypes:Text',\n", " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000004'}],\n", + " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000003'}],\n", " [{'@type': 'oa:Annotation',\n", " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000005.xml',\n", + " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000004.xml',\n", " '@type': 'dctypes:Text',\n", " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000005'}],\n", - " [{'@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 
'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000006.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000006'}],\n", - " [{'@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000007.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000007'}],\n", - " [{'@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000008.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000008'}],\n", - " [{'@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000009.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000009'}],\n", - " [{'@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000010.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000010'}],\n", - " [{'@type': 'oa:Annotation',\n", - " 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000011.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000011'}],\n", - " [{'@type': 'oa:Annotation',\n", - 
" 'motivation': 'sc:painting',\n", - " 'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000012.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/canvas/00000012'}]]" + " 'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000004'}]]" ] }, "execution_count": 10, @@ -1072,7 +843,11 @@ { "cell_type": "code", "execution_count": 11, - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, "outputs": [], "source": [ "all_resources = parse('$.sequences[*].canvases[*].otherContent[*].resources[*].resource')" @@ -1088,40 +863,16 @@ { "data": { "text/plain": [ - "[{'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000001.xml',\n", + "[{'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000001.xml',\n", " '@type': 'dctypes:Text',\n", " 'format': 'application/xml+alto'},\n", - " {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000002.xml',\n", + " {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000002.xml',\n", " '@type': 'dctypes:Text',\n", " 'format': 'application/xml+alto'},\n", - " {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000003.xml',\n", + " {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000003.xml',\n", " '@type': 'dctypes:Text',\n", " 'format': 'application/xml+alto'},\n", - " {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000004.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000005.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000006.xml',\n", - " '@type': 'dctypes:Text',\n", - " 
'format': 'application/xml+alto'},\n", - " {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000007.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000008.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000009.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000010.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000011.xml',\n", - " '@type': 'dctypes:Text',\n", - " 'format': 'application/xml+alto'},\n", - " {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000012.xml',\n", + " {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000004.xml',\n", " '@type': 'dctypes:Text',\n", " 'format': 'application/xml+alto'}]" ] @@ -1137,14 +888,18 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, "source": [ "Filter just the ones with format `application/xml+alto`, and there only the `@id`:" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -1153,27 +908,19 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000001.xml',\n", - " 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000002.xml',\n", - " 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000003.xml',\n", - " 
'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000004.xml',\n", - " 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000005.xml',\n", - " 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000006.xml',\n", - " 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000007.xml',\n", - " 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000008.xml',\n", - " 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000009.xml',\n", - " 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000010.xml',\n", - " 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000011.xml',\n", - " 'https://iiif.onb.ac.at/presentation/ANNO/lmz18710902/resource/00000012.xml']" + "['https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000001.xml',\n", + " 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000002.xml',\n", + " 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000003.xml',\n", + " 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000004.xml']" ] }, - "execution_count": 25, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1184,14 +931,18 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, "source": [ "### Download the ALTO Files" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -1205,8 +956,12 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": {}, + "execution_count": 16, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, "outputs": [ { "data": { @@ -1214,7 +969,7 @@ "{}" ] }, - "execution_count": 27, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1225,8 +980,12 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": {}, + "execution_count": 17, + "metadata": { + 
"slideshow": { + "slide_type": "fragment" + } + }, "outputs": [ { "data": { @@ -1234,7 +993,7 @@ "" ] }, - "execution_count": 28, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1245,8 +1004,12 @@ }, { "cell_type": "code", - "execution_count": 29, - "metadata": {}, + "execution_count": 18, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, "outputs": [ { "data": { @@ -1254,7 +1017,7 @@ "False" ] }, - "execution_count": 29, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1265,21 +1028,29 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, "source": [ "Uh oh." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, "source": [ "### Convert the ALTO-XML to TXT" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -1300,18 +1071,22 @@ }, { "cell_type": "code", - "execution_count": 31, - "metadata": {}, + "execution_count": 20, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, "outputs": [ { "ename": "OSError", - "evalue": "Error reading file 'http://iiif.onb.ac.at/presentation/ANNO/apr18750223/resource/00000002.xml': failed to load HTTP resource", + "evalue": "Error reading file 'http://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000002.xml': failed to load HTTP resource", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m 
\u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0malto_to_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'http://iiif.onb.ac.at/presentation/ANNO/apr18750223/resource/00000002.xml'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m\u001b[0m in \u001b[0;36malto_to_text\u001b[0;34m(raw_alto_text)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0malto_to_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_alto_text\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0malto\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxml\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxmlns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0malto_tools\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0malto_parse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_alto_text\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0malto_extract_text_lines\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxml\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxmlns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0malto_to_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'http://iiif.onb.ac.at/presentation/ANNO/{manifest_id}/resource/00000002.xml'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36malto_to_text\u001b[0;34m(raw_alto_text)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mdef\u001b[0m 
\u001b[0malto_to_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_alto_text\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0malto\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxml\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxmlns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0malto_tools\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0malto_parse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_alto_text\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0malto_extract_text_lines\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxml\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxmlns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/labs/pydays19/alto_tools.py\u001b[0m in \u001b[0;36malto_parse\u001b[0;34m(alto)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;34m\"\"\" Convert ALTO xml file to element tree \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 19\u001b[0;31m \u001b[0mxml\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0metree\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0malto\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 20\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0metree\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mParseError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m sys.stdout.write('\\nERROR: Failed parsing \"%s\" - '\n", "\u001b[0;32msrc/lxml/etree.pyx\u001b[0m in \u001b[0;36mlxml.etree.parse\u001b[0;34m()\u001b[0m\n", "\u001b[0;32msrc/lxml/parser.pxi\u001b[0m in 
\u001b[0;36mlxml.etree._parseDocument\u001b[0;34m()\u001b[0m\n", @@ -1321,23 +1096,17 @@ "\u001b[0;32msrc/lxml/parser.pxi\u001b[0m in \u001b[0;36mlxml.etree._ParserContext._handleParseResultDoc\u001b[0;34m()\u001b[0m\n", "\u001b[0;32msrc/lxml/parser.pxi\u001b[0m in \u001b[0;36mlxml.etree._handleParseResult\u001b[0;34m()\u001b[0m\n", "\u001b[0;32msrc/lxml/parser.pxi\u001b[0m in \u001b[0;36mlxml.etree._raiseParseError\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mOSError\u001b[0m: Error reading file 'http://iiif.onb.ac.at/presentation/ANNO/apr18750223/resource/00000002.xml': failed to load HTTP resource" + "\u001b[0;31mOSError\u001b[0m: Error reading file 'http://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000002.xml': failed to load HTTP resource" ] } ], "source": [ - "print(alto_to_text('http://iiif.onb.ac.at/presentation/ANNO/apr18750223/resource/00000002.xml'))" + "print(alto_to_text(f'http://iiif.onb.ac.at/presentation/ANNO/{manifest_id}/resource/00000002.xml'))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { + "celltoolbar": "Slideshow", "kernelspec": { "display_name": "Python 3", "language": "python", diff --git a/html-versions/3 - Images and Text.html b/html-versions/3 - Images and Text.html new file mode 100644 index 0000000..17c25ea --- /dev/null +++ b/html-versions/3 - Images and Text.html @@ -0,0 +1,13415 @@ + + + + +3 - Images and Text + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+
+
+

In this block

    +
  • Overview IIIF
  • +
  • Overview OCR formats
  • +
+ +
+
+
+
+
+
+
    +
  • Example: Create IIIF collection from SPARQL query result
  • +
  • Example: Download pre-downsized images for machine learning
  • +
  • Example: Download OCR text
  • +
+ +
+
+
+
+
+
+

Overview IIIF

http://iiif.io/

+ +
+
+
+
+
+
+

What is IIIF?

+
+
+
+
+
+
+
    +
  • International Image Interoperability Framework (http://iiif.io/ - well written, worth a read)
  • +
  • Standardised method of describing and delivering images over the web
  • +
  • Community that develops APIs and implements them in software
  • +
+ +
+
+
+
+
+
+

+

Image courtesy of https://github.com/IIIF/training, CC-BY 4.0

+ +
+
+
+
+
+
+

Why would I use this?

+
+
+
+
+
+
+

If you want to display images

    +
  • If you want to use one of several nice viewers for images (zoom, rotate, fullscreen out of the box)
  • +
  • If you want to include image data hosted elsewhere
  • +
+ +
+
+
+
+
+
+

If you want to process images

    +
  • If you want structured access to potentially huge sets of images
  • +
  • If you want included metadata
  • +
  • If you want to resize images before downloading
  • +
+ +
+
+
+
+
+
+

How would I use this?

+
+
+
+
+
+
+ + +
+
+
+
+
+
+ + +
+
+
+
+
+
+ + +
+
+
+
+
+
+

Pics or didn't happen!

+
+
+
+
+
+
+ + +
+
+
+
+
+
+ + +
+
+
+
+
+
+
+
+
+ + +
+
+
+
+
+
+ + +
+
+
+
+
+
+

Overview OCR formats

+
+
+
+
+
+
+ + +
+
+
+
+
+
+
    +
  • 3 ALTO main elements
      +
    • <Description>
        +
      • metadata and general settings (e.g. measurement units) about the ALTO file
      • +
      +
    • +
    • <Styles>
        +
      • text and paragraph styles
      • +
      +
    • +
    • <Layout>
        +
      • content information
      • +
      • subdivided into <Page> elements
      • +
      +
    • +
    +
  • +
+ +
+
+
+
+
+
+

ALTO page element

+ +
+
+
+
+
+
+
    +
  • hOCR
      +
    • alternative to ALTO
    • +
    • based on XHTML
    • +
    • not used in the ONB Labs
    • +
    +
  • +
+ +
+
+
+
+
+ + + + + + diff --git a/html-versions/3 - Images and Text.slides.html b/html-versions/3 - Images and Text.slides.html new file mode 100644 index 0000000..34bdf55 --- /dev/null +++ b/html-versions/3 - Images and Text.slides.html @@ -0,0 +1,13603 @@ + + + + + + + + + + + + +3 - Images and Text slides + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+
+
+
+

In this block

    +
  • Overview IIIF
  • +
  • Overview OCR formats
  • +
+ +
+
+
+
+
+
+
    +
  • Example: Create IIIF collection from SPARQL query result
  • +
  • Example: Download pre-downsized images for machine learning
  • +
  • Example: Download OCR text
  • +
+ +
+
+
+
+
+
+

Overview IIIF

http://iiif.io/

+ +
+
+
+
+
+
+

What is IIIF?

+
+
+
+
+
+
+
    +
  • International Image Interoperability Framework (http://iiif.io/ - well written, worth a read)
  • +
  • Standardised method of describing and delivering images over the web
  • +
  • Community that develops APIs and implements them in software
  • +
+ +
+
+
+
+
+
+

+

Image courtesy of https://github.com/IIIF/training, CC-BY 4.0

+ +
+
+
+
+
+
+

Why would I use this?

+
+
+
+
+
+
+

If you want to display images

    +
  • If you want to use one of several nice viewers for images (zoom, rotate, fullscreen out of the box)
  • +
  • If you want to include image data hosted elsewhere
  • +
+ +
+
+
+
+
+
+

If you want to process images

    +
  • If you want structured access to potentially huge sets of images
  • +
  • If you want included metadata
  • +
  • If you want to resize images before downloading
  • +
+ +
+
+
+
+
+
+

How would I use this?

+
+
+
+
+
+
+ + +
+
+
+
+
+
+ + +
+
+
+
+
+
+ + +
+
+
+
+
+
+

Pics or didn't happen!

+
+
+
+
+
+
+ + +
+
+
+
+
+
+ + +
+
+
+
+
+
+
+
+
+ + +
+
+
+
+
+
+ + +
+
+
+
+
+
+

Overview OCR formats

+
+
+
+
+
+
+ + +
+
+
+
+
+
+
    +
  • 3 ALTO main elements
      +
    • <Description>
        +
      • metadata and general settings (e.g. measurement units) about the ALTO file
      • +
      +
    • +
    • <Styles>
        +
      • text and paragraph styles
      • +
      +
    • +
    • <Layout>
        +
      • content information
      • +
      • subdivided into <Page> elements
      • +
      +
    • +
    +
  • +
+ +
+
+
+
+
+
+

ALTO page element

+ +
+
+
+
+
+
+
    +
  • hOCR
      +
    • alternative to ALTO
    • +
    • based on XHTML
    • +
    • not used in the ONB Labs
    • +
    +
  • +
+ +
+
+
+
+
+ + + + + + + diff --git a/html-versions/3.1 - IIIF Collection from SPARQL.html b/html-versions/3.1 - IIIF Collection from SPARQL.html new file mode 100644 index 0000000..78881a9 --- /dev/null +++ b/html-versions/3.1 - IIIF Collection from SPARQL.html @@ -0,0 +1,13359 @@ + + + + +3.1 - IIIF Collection from SPARQL + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+
In [1]:
+
+
+
import requests
+import pandas as pd
+from SPARQLWrapper import SPARQLWrapper, JSON
+import json
+
+ +
+
+
+ +
+
+
+
+

Set the SPARQL-Endpoint:

+ + +
+
+
+
+
+
In [2]:
+
+
+
anno_lod_endpoint = "https://lod.onb.ac.at/sparql/anno"
+
+ +
+
+
+ +
+
+
+
+

Methods to query the endpoint and build the dataframe:

+ +
+
+
+
+
+
In [3]:
+
+
+
def get_sparql_result(service, query):
+    sparql = SPARQLWrapper(service)
+    sparql.setQuery(query)
+    sparql.setReturnFormat(JSON)
+    return sparql.query()
+
+def get_sparql_dataframe(service, query):
+    result = get_sparql_result(service, query)
+
+    processed_results = result.convert()
+    cols = processed_results['head']['vars']
+
+    out = []
+    for row in processed_results['results']['bindings']:
+        item = []
+        for c in cols:
+            item.append(row.get(c, {}).get('value'))
+        out.append(item)
+
+    return pd.DataFrame(out, columns=cols)
+
+ +
+
+
+ +
+
+
+
+

Select all newspapers and periodicals with the subject heading Statistik:

+ +
+
+
+
+
+
In [4]:
+
+
+
query = '''
+PREFIX dc: <http://purl.org/dc/elements/1.1/>
+PREFIX edm: <http://www.europeana.eu/schemas/edm/>
+PREFIX dcterms: <http://purl.org/dc/terms/>
+SELECT ?title ?subjectURI ?manifest 
+WHERE {?subjectURI dc:subject <http://d-nb.info/gnd/4056995-0> .
+       ?subjectURI dc:title ?title .
+       ?subjectURI edm:isShownBy ?firstpage .
+       ?subjectURI edm:rights <http://creativecommons.org/publicdomain/mark/1.0/> .
+       ?firstpage dcterms:isReferencedBy ?manifest
+}'''
+
+ +
+
+
+ +
+
+
+
+

Get the list of IIIF manifest URLs:

+ +
+
+
+
+
+
In [5]:
+
+
+
df = get_sparql_dataframe(anno_lod_endpoint, query)
+manifests = list(df['manifest'])
+manifests
+
+ +
+
+
+ +
+
+ + +
+ +
Out[5]:
+ + + + +
+
['http://iiif.onb.ac.at/presentation/ANNO/stm1875ag0001/manifest',
+ 'http://iiif.onb.ac.at/presentation/ANNO/stm1876ag0001/manifest',
+ 'http://iiif.onb.ac.at/presentation/ANNO/stm1877ag0001/manifest',
+ 'http://iiif.onb.ac.at/presentation/ANNO/stm1878ag0001/manifest']
+
+ +
+ +
+
+ +
+
+
+
+

Function to create a SACHA Collection (https://iiif.onb.ac.at/api#_collectionspostjsonprocessor):

+ +
+
+
+
+
+
In [6]:
+
+
+
def create_collection(description, list_of_manifest_ids_or_ids):
+    j = {
+        "description": description,
+        "elements": list_of_manifest_ids_or_ids
+    }
+    creation_link = 'https://iiif.onb.ac.at/presentation/collection'
+    result = requests.post(creation_link, json=j)
+    if result.status_code == 201:
+        print('SUCCESS: Create collection {}'.format(result.json()['url']))
+        print('View collection in Mirador: https://iiif.onb.ac.at/view/collection/mirador/' + result.json()['url'].split('/').pop())
+    elif result.status_code == 400:
+        print('ERROR: Request error creating collection')
+        print(result.text)
+    elif result.status_code == 500:
+        print('ERROR: Server error creating collection')
+        print(result.text)
+    else:
+        print('ERROR: General error creating collection, HTTP status = {}'.format(result.status_code))
+
+ +
+
+
+ +
+
+
+
+

Create the SACHA Collection:

+ +
+
+
+
+
+
In [7]:
+
+
+
create_collection("newspaper with subject heading Statistik", manifests)
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
SUCCESS: Create collection https://iiif.onb.ac.at/presentation/collection/R9kE0IcrIE
+View collection in Mirador: https://iiif.onb.ac.at/view/collection/mirador/R9kE0IcrIE
+
+
+
+ +
+
+ +
+
+
+ + + + + + diff --git a/html-versions/3.2 - Images - Download pre-downsized images for machine learning.html b/html-versions/3.2 - Images - Download pre-downsized images for machine learning.html new file mode 100644 index 0000000..8aa7a55 --- /dev/null +++ b/html-versions/3.2 - Images - Download pre-downsized images for machine learning.html @@ -0,0 +1,15023 @@ + + + + +3.2 - Images - Download pre-downsized images for machine learning + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+
+

3.2 - Images - Download pre-downsized images for machine learning

I want to download a bunch of small images, already scaled down for my CNN

+

https://labs.onb.ac.at/en/dataset/akon/

+

https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/akon_postcards_public_domain.csv.bz2

+

https://github.com/h2non/jsonpath-ng

+ +
+
+
+
+
+
+

Let's say you got a bunch of old timey scenery photographs. +And you want to extract all images containing mountains, why not. +And, because you can, you want an AI to do all the dirty work for you.

+

What does that have to do with this workshop?

+

You can use the historic postcards from the ONB Labs as training data for your AI.

+ +
+
+
+
+
+
+

Disclaimer: The AI-part is beyond the scope of this notebook, and would blow up the size of the venv considerably.

+

If you want instructions on actually performing the training, take a look at

+ +

One way to do it: Download a VGG16 network that's pre-trained on ImageNet, remove the last layer (the actual classifier), add your own output layer with 2 outputs ('mountain', 'no mountain') and train that one.

+

Now back to the show.

+ +
+
+
+
+
+
+

What do we have to do?

+
    +
  • Download Metadata
      +
    • List of all available postcards
    • +
    • Info about the 'mountain-ness' of postcards
    • +
    +
  • +
  • Create Download Links
      +
    • To fetch all images
    • +
    +
  • +
  • Split Into Two Sets
      +
    • Mountain and non-mountain
    • +
    +
  • +
  • Download Images
  • +
+ +
+
+
+
+
+
+

Download Metadata

+
+
+
+
+
+
+

Download the metadata set from the ONB Labs

+ +
+
+
+
+
+
In [1]:
+
+
+
import pandas as pd
+
+# Let pandas show all available columns
+pd.set_option('display.max_columns', 50)
+# Pandas can read data directly from web links, even compressed files
+meta = pd.read_csv('https://labs.onb.ac.at/gitlab/labs-team/' \
+                   'raw-metadata/raw/master/akon_postcards_public_domain.csv.bz2', compression='bz2')
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
/home/oida/labs/pydays19/venv/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3049: DtypeWarning: Columns (13) have mixed types. Specify dtype option on import or set low_memory=False.
+  interactivity=interactivity, compiler=compiler, result=result)
+
+
+
+ +
+
+ +
+
+
+
In [2]:
+
+
+
meta.sample(6)
+
+ +
+
+
+ +
+
+ + +
+ +
Out[2]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Unnamed: 0akon_ididaltitudebuildingcitycolorcommentmountainotherphotographerpublisherpublisher_placeregionwater_bodyyearinventory_numbersignaturerevision_datedatefeature_classfeature_codegeoname_idlatitudelongitudenamecountry_idadmin_name_1admin_code_1geo
66836683AK121_35280931NaNZwingerDresdenFalsev. 1907NaNNaNNaNNaNNaNNaNNaNNaNNaNGeogr. Topogr. Bilder-Samml. 1943, 74022014-08-25 13:52:35.479vor 1907PPPLA2935022.051.0508913.73832DresdenDENaNNaN51.05089, 13.73832
10601060AK074_28745904NaNNaNSolingenTrue1908 gelNaNKaiser Wilhelm-BrückeNaNNaNNaNNaNNaNNaNNaNNaN2014-08-19 15:22:42.160gelaufen 1908PPPLA32831580.051.173437.08450SolingenDENordrhein-Westfalen0751.17343, 7.0845
3422534225AK087_16954994NaNNaNVenezia, Piazza S. MarcoFalse1925 gelNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN2014-08-25 09:26:12.544gelaufen 1925PPPLA3164603.045.4371312.33265VeneciaITNaNNaN45.43713, 12.33265
2025020250AK030_36717883NaNNaNVorder StoderFalseNaNTodtengebirge, Spitzmauer, Kleiner Priel, Groß...NaNNaNLedermannWienNaNNaN1909.0NaNNaN2014-08-04 07:59:10.2351909PPPL2762185.047.7133714.22712VorderstoderATNaNNaN47.71337, 14.22712
1998119981AK029_17317088NaNNaNPöggstallFalse1903 gelNaNNaNNaNHofmeisterPöggstallNaNNaNNaNNaNNaN2014-08-04 07:59:10.223gelaufen 1903PPPLA32768616.048.3166715.18333PöggstallATNaNNaN48.31667, 15.18333
3049230492AK088_05555510NaNNaNNeutitschein, ObertorstrasseFalse1920 gelNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN2014-08-28 13:39:02.860gelaufen 1920PPPL3069305.049.5943818.01028NeutitscheinCZNaNNaN49.59438, 18.01028
+
+
+ +
+ +
+
+ +
+
+
+
+

Ok, we have metadata. And look, there's a column mountain:

+ +
+
+
+
+
+
In [3]:
+
+
+
meta.sample(5)[['akon_id', 'mountain']]
+
+ +
+
+
+ +
+
+ + +
+ +
Out[3]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
akon_idmountain
33148AK076_442Watzmann, Hochkalter
29503AK070_327NaN
5663AK075_470NaN
31604AK107_561NaN
8748AK091_235NaN
+
+
+ +
+ +
+
+ +
+
+
+
+

Later, we'll split the dataset in two using the data in this column.

+ +
+
+
+
+
+
+ +
+
+
+
+
+
+

The SACHA project provides an API for accessing digitized objects of the National Library via IIIF. +The online documentation for the API is here: https://iiif.onb.ac.at/api.

+

We're especially interested in the possibility to serve manifests: https://iiif.onb.ac.at/api#_manifestrequestprocessor:

+ +
GET /presentation/{projectName}/{id}/manifest
+ +
+
+
+
+
+
+ +
GET /presentation/{projectName}/{id}/manifest
+

The projectName is AKON ('AnsichtsKarten ONline'), the id is the akon_id.

+

See also https://iiif.onb.ac.at/api#_digitization_projects.

+ +
+
+
+
+
+
+ +
+
+
+
+
+
In [4]:
+
+
+
def akon_id_to_manifest_link(akon_id):
+    return f'https://iiif.onb.ac.at/presentation/AKON/{akon_id}/manifest'
+
+ +
+
+
+ +
+
+
+
In [5]:
+
+
+
akon_id_to_manifest_link('AK024_176')
+
+ +
+
+
+ +
+
+ + +
+ +
Out[5]:
+ + + + +
+
'https://iiif.onb.ac.at/presentation/AKON/AK024_176/manifest'
+
+ +
+ +
+
+ +
+
+
+
+

Let's test the link

+ +
+
+
+
+
+
In [6]:
+
+
+
import requests
+
+r = requests.get(akon_id_to_manifest_link('AK024_176'))
+r.json()
+
+ +
+
+
+ +
+
+ + +
+ +
Out[6]:
+ + + + +
+
{'@context': 'https://iiif.io/api/presentation/2/context.json',
+ '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/manifest',
+ '@type': 'sc:Manifest',
+ 'label': 'Wien, III',
+ 'metadata': [{'label': [{'@value': 'Id', '@language': 'en'},
+    {'@value': 'Id', '@language': 'ger'}],
+   'value': 'AK024_176'},
+  {'label': [{'@value': 'Title', '@language': 'en'},
+    {'@value': 'Titel', '@language': 'ger'}],
+   'value': 'Wien, III'},
+  {'label': [{'@value': 'Place', '@language': 'en'},
+    {'@value': 'Ort', '@language': 'ger'}],
+   'value': "<a href='https://sws.geonames.org/2773040'>Landstraße</a>"},
+  {'label': [{'@value': 'Publisher', '@language': 'en'},
+    {'@value': 'Verlag', '@language': 'ger'}],
+   'value': 'Ledermann'},
+  {'label': [{'@value': 'Place of Publications', '@language': 'en'},
+    {'@value': 'Erscheinungsort', '@language': 'ger'}],
+   'value': 'Wien'},
+  {'label': [{'@value': 'Year', '@language': 'en'},
+    {'@value': 'Jahr', '@language': 'ger'}],
+   'value': '1906'},
+  {'label': [{'@value': 'Disseminator', '@language': 'en'},
+    {'@value': 'Anbieter', '@language': 'ger'}],
+   'value': "<a href='https://akon.onb.ac.at/'>Ansichtskarten Online</a>"},
+  {'label': [{'@value': 'Physical Location', '@language': 'en'},
+    {'@value': 'Standort', '@language': 'ger'}],
+   'value': 'ÖNB'}],
+ 'description': 'Russische Kirche',
+ 'viewingDirection': 'left-to-right',
+ 'viewingHint': 'paged',
+ 'license': 'http://creativecommons.org/publicdomain/mark/1.0/',
+ 'attribution': [{'@value': 'Austrian National Library', '@language': 'en'},
+  {'@value': 'Österreichische Nationalbibliothek', '@language': 'ger'}],
+ 'logo': 'https://iiif.onb.ac.at/logo/',
+ 'seeAlso': [{'@id': 'http://data.onb.ac.at/AKON/AK024_176',
+   'format': 'text/html'},
+  {'@id': 'http://data.onb.ac.at/AKON/AK024_176.rdf',
+   'format': 'application/rdf+xml'}],
+ 'sequences': [{'@context': 'https://iiif.io/api/presentation/2/context.json',
+   '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/sequence/normal',
+   '@type': 'sc:Sequence',
+   'startCanvas': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176',
+   'canvases': [{'@context': 'https://iiif.io/api/presentation/2/context.json',
+     '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176',
+     '@type': 'sc:Canvas',
+     'label': 'Wien, III',
+     'height': 1681,
+     'width': 1082,
+     'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',
+       '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/annotation/176',
+       '@type': 'oa:Annotation',
+       'motivation': 'sc:painting',
+       'resource': {'@id': 'https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/full/0/native.jpg',
+        '@type': 'dctypes:Image',
+        'height': 1681,
+        'width': 1082,
+        'format': 'image/jpeg',
+        'service': {'@context': 'https://iiif.io/api/image/2/context.json',
+         '@id': 'https://iiif.onb.ac.at/images/AKON/AK024_176/176',
+         'profile': 'https://iiif.io/api/image/2/level2.json'}},
+       'on': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176'}]}]}]}
+
+ +
+ +
+
+ +
+
+
+
+

The manifest link seems to work. Let's add manifest links for all postcards to the dataframe:

+ +
+
+
+
+
+
In [7]:
+
+
+
meta['manifest_link'] = meta['akon_id'].apply(akon_id_to_manifest_link)
+
+ +
+
+
+ +
+
+
+
In [8]:
+
+
+
meta.sample(6)[['akon_id', 'manifest_link']]
+
+ +
+
+
+ +
+
+ + +
+ +
Out[8]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
akon_idmanifest_link
32242AK049_538https://iiif.onb.ac.at/presentation/AKON/AK049...
10827AK001_237https://iiif.onb.ac.at/presentation/AKON/AK001...
14148AK009_081https://iiif.onb.ac.at/presentation/AKON/AK009...
8074AK087_246https://iiif.onb.ac.at/presentation/AKON/AK087...
33232AK082_006https://iiif.onb.ac.at/presentation/AKON/AK082...
22877AK040_083https://iiif.onb.ac.at/presentation/AKON/AK040...
+
+
+ +
+ +
+
+ +
+
+
+
+ +
+
+
+
+
+
+

Let's take a look at that manifest again:

+ +
+
+
+
+
+
In [9]:
+
+
+
r = requests.get(akon_id_to_manifest_link('AK024_176'))
+r.json()
+
+ +
+
+
+ +
+
+ + +
+ +
Out[9]:
+ + + + +
+
{'@context': 'https://iiif.io/api/presentation/2/context.json',
+ '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/manifest',
+ '@type': 'sc:Manifest',
+ 'label': 'Wien, III',
+ 'metadata': [{'label': [{'@value': 'Id', '@language': 'en'},
+    {'@value': 'Id', '@language': 'ger'}],
+   'value': 'AK024_176'},
+  {'label': [{'@value': 'Title', '@language': 'en'},
+    {'@value': 'Titel', '@language': 'ger'}],
+   'value': 'Wien, III'},
+  {'label': [{'@value': 'Place', '@language': 'en'},
+    {'@value': 'Ort', '@language': 'ger'}],
+   'value': "<a href='https://sws.geonames.org/2773040'>Landstraße</a>"},
+  {'label': [{'@value': 'Publisher', '@language': 'en'},
+    {'@value': 'Verlag', '@language': 'ger'}],
+   'value': 'Ledermann'},
+  {'label': [{'@value': 'Place of Publications', '@language': 'en'},
+    {'@value': 'Erscheinungsort', '@language': 'ger'}],
+   'value': 'Wien'},
+  {'label': [{'@value': 'Year', '@language': 'en'},
+    {'@value': 'Jahr', '@language': 'ger'}],
+   'value': '1906'},
+  {'label': [{'@value': 'Disseminator', '@language': 'en'},
+    {'@value': 'Anbieter', '@language': 'ger'}],
+   'value': "<a href='https://akon.onb.ac.at/'>Ansichtskarten Online</a>"},
+  {'label': [{'@value': 'Physical Location', '@language': 'en'},
+    {'@value': 'Standort', '@language': 'ger'}],
+   'value': 'ÖNB'}],
+ 'description': 'Russische Kirche',
+ 'viewingDirection': 'left-to-right',
+ 'viewingHint': 'paged',
+ 'license': 'http://creativecommons.org/publicdomain/mark/1.0/',
+ 'attribution': [{'@value': 'Austrian National Library', '@language': 'en'},
+  {'@value': 'Österreichische Nationalbibliothek', '@language': 'ger'}],
+ 'logo': 'https://iiif.onb.ac.at/logo/',
+ 'seeAlso': [{'@id': 'http://data.onb.ac.at/AKON/AK024_176',
+   'format': 'text/html'},
+  {'@id': 'http://data.onb.ac.at/AKON/AK024_176.rdf',
+   'format': 'application/rdf+xml'}],
+ 'sequences': [{'@context': 'https://iiif.io/api/presentation/2/context.json',
+   '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/sequence/normal',
+   '@type': 'sc:Sequence',
+   'startCanvas': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176',
+   'canvases': [{'@context': 'https://iiif.io/api/presentation/2/context.json',
+     '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176',
+     '@type': 'sc:Canvas',
+     'label': 'Wien, III',
+     'height': 1681,
+     'width': 1082,
+     'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',
+       '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/annotation/176',
+       '@type': 'oa:Annotation',
+       'motivation': 'sc:painting',
+       'resource': {'@id': 'https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/full/0/native.jpg',
+        '@type': 'dctypes:Image',
+        'height': 1681,
+        'width': 1082,
+        'format': 'image/jpeg',
+        'service': {'@context': 'https://iiif.io/api/image/2/context.json',
+         '@id': 'https://iiif.onb.ac.at/images/AKON/AK024_176/176',
+         'profile': 'https://iiif.io/api/image/2/level2.json'}},
+       'on': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176'}]}]}]}
+
+ +
+ +
+
+ +
+
+
+
+

We need to collect all @ids from all resources from all images from all canvases.

+

That's tedious by hand. We'll use jsonpath-ng:

+ +
+
+
+
+
+
In [10]:
+
+
+
from jsonpath_ng import jsonpath, parse
+
+image_id_jp = parse('$.sequences[*].canvases[*].images[*].resource.@id')
+
+ +
+
+
+ +
+
+
+
In [11]:
+
+
+
[match.value for match in image_id_jp.find(r.json())]
+
+ +
+
+
+ +
+
+ + +
+ +
Out[11]:
+ + + + +
+
['https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/full/0/native.jpg']
+
+ +
+ +
+
+ +
+
+
+
+

All of this in one function:

+ +
+
+
+
+
+
In [12]:
+
+
+
image_id_jp = parse('$.sequences[*].canvases[*].images[*].resource.@id')
+
+def image_links_for_manifest_link(manifest_link):
+    r = requests.get(manifest_link)
+    try:
+        json = r.json()
+    except:
+        # default to empty on exceptions - makes batch processing easier in pandas
+        json = {}
+    image_links = [match.value for match in image_id_jp.find(json)]
+    return image_links
+
+ +
+
+
+ +
+
+
+
+

Let's test it:

+ +
+
+
+
+
+
In [13]:
+
+
+
random_akon_id = meta.sample().iloc[0]['akon_id']
+manifest_link = akon_id_to_manifest_link(random_akon_id)
+image_links_for_manifest_link(manifest_link)
+
+ +
+
+
+ +
+
+ + +
+ +
Out[13]:
+ + + + +
+
['https://iiif.onb.ac.at/images/AKON/AK036_284/284/full/full/0/native.jpg']
+
+ +
+ +
+
+ +
+
+
+
+

Looking good.

+ +
+
+
+
+
+
+

Now let's add the image links to the dataframe...

+

...actually, let's not do that now, because it takes a while (upwards of 10 minutes). Let's cheat instead, skip this step and load the resulting dataframe directly.

+ +
+
+
+
+
+
In [14]:
+
+
+
# %%time
+# meta['image_links'] = meta['manifest_link'].apply(image_links_for_manifest_link)
+
+ +
+
+
+ +
+
+
+
In [15]:
+
+
+
import json
+
+def load_json(s):
+    try:
+        return json.loads(s.replace("'", '"'))
+    except:
+        return []
+
+meta = pd.read_csv('postcards_with_image_links.csv.bz2', compression='bz2', converters={
+    'image_links': load_json
+})
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
/home/oida/labs/pydays19/venv/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3049: DtypeWarning: Columns (14) have mixed types. Specify dtype option on import or set low_memory=False.
+  interactivity=interactivity, compiler=compiler, result=result)
+
+
+
+ +
+
+ +
+
+
+
In [16]:
+
+
+
meta.sample(10)
+
+ +
+
+
+ +
+
+ + +
+ +
Out[16]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Unnamed: 0Unnamed: 0.1akon_ididaltitudebuildingcitycolorcommentmountainotherphotographerpublisherpublisher_placeregionwater_bodyyearinventory_numbersignaturerevision_datedatefeature_classfeature_codegeoname_idlatitudelongitudenamecountry_idadmin_name_1admin_code_1geomanifest_linkimage_links
243243243AK111_47675139NaNNaNRochlitzFalsev. 1907Rochlitzer BergNaNNaNNaNNaNNaNNaNNaNNaNNiederösterreichische Landesbibliothek 16722014-09-05 11:30:43.299vor 1907THLL2846260.051.0267812.77079Rochlitzer BergDENaNNaN51.02678, 12.77079https://iiif.onb.ac.at/presentation/AKON/AK111...[https://iiif.onb.ac.at/images/AKON/AK111_476/...
348093480934809AK073_57845523NaNKgl. ResidenzWürzburgFalse1909 gelNaNNaNNaNMartinNürnbergNaNNaNNaNNaNNaN2014-08-19 14:22:35.340gelaufen 1909PPPLA22805615.049.793919.95121WürzburgDEBayern0249.79391, 9.95121https://iiif.onb.ac.at/presentation/AKON/AK073...[https://iiif.onb.ac.at/images/AKON/AK073_578/...
180691806918069AK023_14513445NaNNaNVillachTrueNaNMittagskogelNaNNaNNaNNaNNaNNaN1912.0NaNNaN2014-08-04 07:59:10.1561912PPPLA22762372.046.6102813.85583VillachATNaNNaN46.61028, 13.85583https://iiif.onb.ac.at/presentation/AKON/AK023...[https://iiif.onb.ac.at/images/AKON/AK023_145/...
455445544554AK034_08620003693.0Chorherrensift VorauVorauFalseNaNNaNNaNNaNRazaVorauNaNNaN1924.0NaNNaN2014-09-16 14:48:11.4551924SMSTY2762297.047.4000015.90000Stift VorauATNaNNaN47.4, 15.9https://iiif.onb.ac.at/presentation/AKON/AK034...[https://iiif.onb.ac.at/images/AKON/AK034_086/...
209072090720907AK032_49719311NaNSchloss PurgstallNaNFalseNaNNaNNaNNaNNaNNaNNaNNaN1918.0NaNNaN2014-08-04 07:59:10.2571918AADM37873031.048.0551315.13316Purgstall an der ErlaufATNaNNaN48.05513, 15.13316https://iiif.onb.ac.at/presentation/AKON/AK032...[https://iiif.onb.ac.at/images/AKON/AK032_497/...
513651365136AK111_05474715NaNNaNKindbergFalse1901 gelNaNNaNNaNNaNNaNNaNNaNNaNNaNNiederösterreichische Landesbibliothek 16642014-09-05 10:17:42.132gelaufen 1901PPPLA32774437.047.5000015.45000KindbergATNaNNaN47.5, 15.45https://iiif.onb.ac.at/presentation/AKON/AK111...[https://iiif.onb.ac.at/images/AKON/AK111_054/...
387138713871AK125_38183488601.0Hans Hackl's Gasthof zum JaidhausHinterstoderFalseNaNNaNNaNNaNNaNNaNNaNNaN1911.0NaNNationalbibliothek Karten Abteilung 58622014-09-12 16:07:31.7801911PPPL2776235.047.6995714.15468HinterstoderATNaNNaN47.69957, 14.15468https://iiif.onb.ac.at/presentation/AKON/AK125...[https://iiif.onb.ac.at/images/AKON/AK125_381/...
117411741174AK116_23577922NaNBurgruine GarsGars a. KampFalse1913 gelNaNNaNNaNKiennastGarsNaNNaNNaN79/59 KNaN2014-09-09 12:22:52.928gelaufen 1913PPPLA32778845.048.5833315.65000Gars am KampATNaNNaN48.58333, 15.65https://iiif.onb.ac.at/presentation/AKON/AK116...[https://iiif.onb.ac.at/images/AKON/AK116_235/...
189718971897AK118_37665136NaNNaNNaNFalse1925 gelNaNNaNNaNNaNNaNNaNGrundlseeNaN11/44 Kt.Geogr. Topogr. Bilder-Samml. 1944, 41442014-09-10 07:51:30.611gelaufen 1925HLK2777424.047.6333313.86667GrundlseeATNaNNaN47.63333, 13.86667https://iiif.onb.ac.at/presentation/AKON/AK118...[https://iiif.onb.ac.at/images/AKON/AK118_376/...
332433324333243AK083_21752264NaNNaNHöllenthalFalsev 1905NaNNaNNaNJohannesPartenkirchen-GarmischNaNNaNNaNNaNNaN2014-08-26 12:14:56.005vor 1905TCRQ2900507.047.4333311.01667Höllental KarDEBayern0247.43333, 11.01667https://iiif.onb.ac.at/presentation/AKON/AK083...[https://iiif.onb.ac.at/images/AKON/AK083_217/...
+
+
+ +
+ +
+
+ +
+
+
+
+

Split Into Two Sets

+
+
+
+
+
+
+

We'll split the dataframe into two: One with mountains, one without.

+ +
+
+
+
+
+
In [17]:
+
+
+
nomountain = meta[ meta['mountain'].isnull() ]
+mountain = meta[ ~ meta['mountain'].isnull() ]
+
+ +
+
+
+ +
+
+
+
In [18]:
+
+
+
len(meta), len(nomountain), len(mountain)
+
+ +
+
+
+ +
+
+ + +
+ +
Out[18]:
+ + + + +
+
(34846, 29271, 5575)
+
+ +
+ +
+
+ +
+
+
+
+

Yeah, that adds up.

+ +
+
+
+
+
+
+

Download

+
+
+
+
+
+
+

Ok, so what's left to do?

+
    +
  • Download all image data into two separate directories for training
  • +
  • Resize the images for the CNN used
  • +
+

VGG16 and VGG19 expect 224x224 pixel RGB images.

+ +
+
+
+
+
+
+

Luckily, IIIF allows us to request images already resized to our needs. That saves bandwidth, time and code complexity.

+

According to the standard we can use the size parameter to resize the image exactly to the dimensions we need.

+

The links, before and after, would be:

+

https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/full/0/native.jpg

+

https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/224,224/0/native.jpg

+ +
+
+
+
+
+
+

Let's try it:

+ +
+
+
+
+
+
In [19]:
+
+
+
r = requests.get('https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/224,224/0/native.jpg')
+
+ +
+
+
+ +
+
+
+
In [20]:
+
+
+
from IPython.display import display, Image
+
+ +
+
+
+ +
+
+
+
In [21]:
+
+
+
display(Image(r.content))
+
+ +
+
+
+ +
+
+ + +
+ +
+ + + + +
+ +
+ +
+ +
+
+ +
+
+
+
+

That looks about right.

+ +
+
+
+
+
+
+

Download to file:

+ +
+
+
+
+
+
In [22]:
+
+
+
import shutil
+
+def download_to_file(url, filename):
+    with requests.get(url, stream=True) as r:
+        with open(filename, 'wb') as fh:
+            shutil.copyfileobj(r.raw, fh)
+
+def sized_link(iiif_url, size='224,224'):
+    frags = iiif_url.split('/')
+    frags[-3] = size
+    return '/'.join(frags)
+
+ +
+
+
+ +
+
+
+
+

Test that:

+ +
+
+
+
+
+
In [23]:
+
+
+
link = sized_link('https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/full/0/native.jpg')
+download_to_file(link, 'testimg.jpg')
+
+ +
+
+
+ +
+
+
+
In [24]:
+
+
+
with open('testimg.jpg', 'rb') as fh:
+    display(Image(fh.read()))
+
+ +
+
+
+ +
+
+ + +
+ +
+ + + + +
+ +
+ +
+ +
+
+ +
+
+
+
+

Create directories:

+ +
+
+
+
+
+
In [25]:
+
+
+
import os
+
# One directory per class. makedirs also creates the parent './images', and
# exist_ok=True keeps the cell from failing when it is run a second time.
os.makedirs('./images/mountain', exist_ok=True)
os.makedirs('./images/nomountain', exist_ok=True)
+
+ +
+
+
+ +
+
+
+
+

Now let's download!

+ +
+
+
+
+
+
In [26]:
+
+
+
# For this demonstration we'll just take 10 images each
# Both classes are handled by the same loop; only the target directory
# and the source DataFrame differ.
for label, frame in (('mountain', mountain), ('nomountain', nomountain)):
    for _, row in frame.sample(10).iterrows():
        akon_id = row['akon_id']
        # A row carries a list of image links; n keeps their file names
        # distinct within one akon_id.
        for n, link in enumerate(row['image_links']):
            file_name = f'./images/{label}/{akon_id}_{n}.jpg'
            download_to_file(sized_link(link), file_name)
            # One dot per downloaded file as a minimal progress indicator.
            print('.', end='')
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
....................
+
+
+ +
+
+ +
+
+
+ + + + + + diff --git a/html-versions/3.3 - Text - Download OCR Text.html b/html-versions/3.3 - Text - Download OCR Text.html new file mode 100644 index 0000000..8ba241d --- /dev/null +++ b/html-versions/3.3 - Text - Download OCR Text.html @@ -0,0 +1,14210 @@ + + + + +3.3 - Text - Download OCR Text + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+
+

3.3 - Text - Download OCR Text

I need loads of text from old newspapers, preferably with loads of errors due to bad OCR.

+

https://labs.onb.ac.at/en/dataset/anno/

+

https://github.com/cneud/alto-tools

+ +
+
+
+
+
+
+

In order to get to this text, we have to

+
    +
  • Find a newspaper issue we'd like to harvest
  • +
  • Download the IIIF manifest for this newspaper issue
  • +
  • Download the ALTO-XML files for this newspaper issue
  • +
  • Convert the ALTO-XML to TXT
  • +
+ +
+
+
+
+
+
+

Find a Newspaper Issue

+
+
+
+
+
+
+

Let's take a look at the ONB Labs' historic newspapers

+ +
+
+
+
+
+
In [1]:
+
+
+
import pandas as pd
+
+meta = pd.read_csv('https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/anno_labs_issues.csv.bz2', compression='bz2')
+
+ +
+
+
+ +
+
+
+
In [2]:
+
+
+
meta.sample(10)
+
+ +
+
+
+ +
+
+ + +
+ +
Out[2]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
manifest_idaidyeardaydc_titledc_title_additionalsubjectsplace_of_publicationslanguagesdc_type...meta_typeini_typemodification_datetimelonger_page_iddc_datelink_pdflink_oldhas_ocrmeta_idpage_count
47898bor18220712bor182218220712Amtliches Cursblatt der Wiener BörseNaNWirtschaftWiendenewspaper...zeitungenanno2013-04-22 14:12:2801822-07-12http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=bor...0817502
149247lvb18740301lvb187418740301Linzer VolksblattNaNTageszeitungLinzdenewspaper...zeitungenanno2010-11-29 09:39:0401874-03-01http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=lvb...17766816
48126bor18230417bor182318230417Amtliches Cursblatt der Wiener BörseNaNWirtschaftWiendenewspaper...zeitungenanno2013-04-22 14:13:3001823-04-17http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=bor...0819782
80491neu18670531neu186718670531Die NeuzeitNaNTageszeitungWiendenewspaper...zeitungenanno2012-11-19 11:38:1101867-05-31http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=neu...130997712
138634joe18710415joe187118710415Jörgel BriefeNaNWochenzeitungWiendenewspaper...zeitungenanno2009-04-02 10:54:2701871-04-15http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=joe...176272416
75189iwe18730304iwe187318730304Illustrirtes Wiener ExtrablattNaNTageszeitungWiendenewspaper...zeitungenanno2014-07-25 11:26:1301873-03-04http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=iwe...12208358
193395wtz18680917wtz186818680917Theaterzettel (Oper und Burgtheater in Wien)NaNKultur, Kunst, Theater, MusikWiendenewspaper...zeitungenanno2014-03-21 10:28:5901868-09-17http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=wtz...09738451
69095ode18630730ode186318630730Ost-Deutsche PostNaNTageszeitungWiendenewspaper...zeitungenanno2018-08-29 09:15:1101863-07-30http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=ode...11834584
125850hum18450125hum184518450125Der HumoristNaNHumor, Satire, GeschichteWiendenewspaper...zeitungenanno2003-11-20 12:06:0101845-01-25http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=hum...172914812
152963mop18540720mop185418540720Morgen-PostNaNTageszeitungWiendenewspaper...zeitungenanno2012-12-11 13:45:4901854-07-20http://anno.onb.ac.at/cgi-content/anno_pdf.pl?...http://anno.onb.ac.at/cgi-content/anno?aid=mop...18011954
+

10 rows × 21 columns

+
+
+ +
+ +
+
+ +
+
+
+
+

Let's go with the Ost-Deutsche Post issue from the 30th of July 1863

+ +
+
+
+
+
+
In [3]:
+
+
+
manifest_id = 'ode18630730'
+
+ +
+
+
+ +
+
+
+
+

Download the IIIF Manifest

+
+
+
+
+
+
+

If we look at the SACHA API description, we see that the link for the IIIF manifest has to look like this:

+

http://iiif.onb.ac.at/presentation/ANNO/ode18630730/manifest

+ +
+
+
+
+
+
In [4]:
+
+
+
import requests
+
+ +
+
+
+ +
+
+
+
In [5]:
+
+
+
r = requests.get(f'http://iiif.onb.ac.at/presentation/ANNO/{manifest_id}/manifest')
+
+ +
+
+
+ +
+
+
+
In [6]:
+
+
+
r.json()
+
+ +
+
+
+ +
+
+ + +
+ +
Out[6]:
+ + + + +
+
{'@context': 'https://iiif.io/api/presentation/2/context.json',
+ '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/manifest',
+ '@type': 'sc:Manifest',
+ 'label': 'Ost-Deutsche Post 1863-07-30',
+ 'metadata': [{'label': [{'@value': 'Id', '@language': 'en'},
+    {'@value': 'Id', '@language': 'ger'}],
+   'value': 'ode18630730'},
+  {'label': [{'@value': 'Title', '@language': 'en'},
+    {'@value': 'Titel', '@language': 'ger'}],
+   'value': 'Ost-Deutsche Post'},
+  {'label': [{'@value': 'Type', '@language': 'en'},
+    {'@value': 'Typ', '@language': 'ger'}],
+   'value': 'newspaper'},
+  {'label': [{'@value': 'Place of Publications', '@language': 'en'},
+    {'@value': 'Erscheinungsort', '@language': 'ger'}],
+   'value': "<a href='http://d-nb.info/gnd/4066009-6'>Wien</a>"},
+  {'label': [{'@value': 'Date Issued', '@language': 'en'},
+    {'@value': 'Erscheinungsdatum', '@language': 'ger'}],
+   'value': '1863-07-30'},
+  {'label': [{'@value': 'Subject Heading', '@language': 'en'},
+    {'@value': 'Schlagworte', '@language': 'ger'}],
+   'value': "<a href='http://d-nb.info/gnd/4067510-5'>Tageszeitung</a>"},
+  {'label': [{'@value': 'Disseminator', '@language': 'en'},
+    {'@value': 'Anbieter', '@language': 'ger'}],
+   'value': "<a href='http://anno.onb.ac.at/'>Austrian Newspapers Online</a>"},
+  {'label': [{'@value': 'Languages', '@language': 'en'},
+    {'@value': 'Sprachen', '@language': 'ger'}],
+   'value': 'ger'}],
+ 'description': 'Ost-Deutsche Post 1863-07-30',
+ 'viewingDirection': 'left-to-right',
+ 'viewingHint': 'paged',
+ 'license': 'http://creativecommons.org/publicdomain/mark/1.0/',
+ 'attribution': [{'@value': 'Austrian National Library', '@language': 'en'},
+  {'@value': 'Österreichische Nationalbibliothek', '@language': 'ger'}],
+ 'logo': 'https://iiif.onb.ac.at/logo/',
+ 'seeAlso': [{'@id': 'http://anno.onb.ac.at/cgi-content/anno_pdf.pl?aid=ode&datum=18630730',
+   'format': 'application/pdf'},
+  {'@id': 'http://anno.onb.ac.at/cgi-content/anno?aid=ode&datum=18630730',
+   'format': 'text/html'},
+  {'@id': 'http://data.onb.ac.at/ANNO/ode18630730.rdf',
+   'format': 'application/rdf+xml'}],
+ 'sequences': [{'@context': 'https://iiif.io/api/presentation/2/context.json',
+   '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/sequence/normal',
+   '@type': 'sc:Sequence',
+   'startCanvas': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000001',
+   'canvases': [{'@context': 'https://iiif.io/api/presentation/2/context.json',
+     '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000001',
+     '@type': 'sc:Canvas',
+     'label': '00000001',
+     'height': 6148,
+     'width': 4456,
+     'metadata': [{'label': 'Resolution', 'value': '300dpi'},
+      {'label': 'Color Depth', 'value': '8bpp'}],
+     'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',
+       '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/annotation/00000001',
+       '@type': 'oa:Annotation',
+       'motivation': 'sc:painting',
+       'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/ode18630730/00000001/full/full/0/default.jpg',
+        '@type': 'dctypes:Image',
+        'height': 6148,
+        'width': 4456,
+        'format': 'image/jpeg',
+        'service': {'@context': 'https://iiif.io/api/image/2/context.json',
+         '@id': 'https://iiif.onb.ac.at/images/ANNO/ode18630730/00000001',
+         'profile': 'https://iiif.io/api/image/2/level2.json'}},
+       'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000001'}],
+     'otherContent': [{'@context': 'https://iiif.io/api/presentation/2/context.json',
+       '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000001.json',
+       '@type': 'sc:AnnotationList',
+       'resources': [{'@type': 'oa:Annotation',
+         'motivation': 'sc:painting',
+         'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000001.xml',
+          '@type': 'dctypes:Text',
+          'format': 'application/xml+alto'},
+         'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000001'}]}]},
+    {'@context': 'https://iiif.io/api/presentation/2/context.json',
+     '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000002',
+     '@type': 'sc:Canvas',
+     'label': '00000002',
+     'height': 6176,
+     'width': 4444,
+     'metadata': [{'label': 'Resolution', 'value': '300dpi'},
+      {'label': 'Color Depth', 'value': '8bpp'}],
+     'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',
+       '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/annotation/00000002',
+       '@type': 'oa:Annotation',
+       'motivation': 'sc:painting',
+       'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/ode18630730/00000002/full/full/0/default.jpg',
+        '@type': 'dctypes:Image',
+        'height': 6176,
+        'width': 4444,
+        'format': 'image/jpeg',
+        'service': {'@context': 'https://iiif.io/api/image/2/context.json',
+         '@id': 'https://iiif.onb.ac.at/images/ANNO/ode18630730/00000002',
+         'profile': 'https://iiif.io/api/image/2/level2.json'}},
+       'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000002'}],
+     'otherContent': [{'@context': 'https://iiif.io/api/presentation/2/context.json',
+       '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000002.json',
+       '@type': 'sc:AnnotationList',
+       'resources': [{'@type': 'oa:Annotation',
+         'motivation': 'sc:painting',
+         'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000002.xml',
+          '@type': 'dctypes:Text',
+          'format': 'application/xml+alto'},
+         'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000002'}]}]},
+    {'@context': 'https://iiif.io/api/presentation/2/context.json',
+     '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000003',
+     '@type': 'sc:Canvas',
+     'label': '00000003',
+     'height': 6148,
+     'width': 4456,
+     'metadata': [{'label': 'Resolution', 'value': '300dpi'},
+      {'label': 'Color Depth', 'value': '8bpp'}],
+     'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',
+       '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/annotation/00000003',
+       '@type': 'oa:Annotation',
+       'motivation': 'sc:painting',
+       'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/ode18630730/00000003/full/full/0/default.jpg',
+        '@type': 'dctypes:Image',
+        'height': 6148,
+        'width': 4456,
+        'format': 'image/jpeg',
+        'service': {'@context': 'https://iiif.io/api/image/2/context.json',
+         '@id': 'https://iiif.onb.ac.at/images/ANNO/ode18630730/00000003',
+         'profile': 'https://iiif.io/api/image/2/level2.json'}},
+       'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000003'}],
+     'otherContent': [{'@context': 'https://iiif.io/api/presentation/2/context.json',
+       '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000003.json',
+       '@type': 'sc:AnnotationList',
+       'resources': [{'@type': 'oa:Annotation',
+         'motivation': 'sc:painting',
+         'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000003.xml',
+          '@type': 'dctypes:Text',
+          'format': 'application/xml+alto'},
+         'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000003'}]}]},
+    {'@context': 'https://iiif.io/api/presentation/2/context.json',
+     '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000004',
+     '@type': 'sc:Canvas',
+     'label': '00000004',
+     'height': 6176,
+     'width': 4416,
+     'metadata': [{'label': 'Resolution', 'value': '300dpi'},
+      {'label': 'Color Depth', 'value': '8bpp'}],
+     'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',
+       '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/annotation/00000004',
+       '@type': 'oa:Annotation',
+       'motivation': 'sc:painting',
+       'resource': {'@id': 'https://iiif.onb.ac.at/images/ANNO/ode18630730/00000004/full/full/0/default.jpg',
+        '@type': 'dctypes:Image',
+        'height': 6176,
+        'width': 4416,
+        'format': 'image/jpeg',
+        'service': {'@context': 'https://iiif.io/api/image/2/context.json',
+         '@id': 'https://iiif.onb.ac.at/images/ANNO/ode18630730/00000004',
+         'profile': 'https://iiif.io/api/image/2/level2.json'}},
+       'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000004'}],
+     'otherContent': [{'@context': 'https://iiif.io/api/presentation/2/context.json',
+       '@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000004.json',
+       '@type': 'sc:AnnotationList',
+       'resources': [{'@type': 'oa:Annotation',
+         'motivation': 'sc:painting',
+         'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000004.xml',
+          '@type': 'dctypes:Text',
+          'format': 'application/xml+alto'},
+         'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000004'}]}]}]}]}
+
+ +
+ +
+
+ +
+
+
+
+

There's a lot of information in there. We need the info blocks with links to ALTO-XML resources.

+

Let's use jsonpath-ng for that.

+ +
+
+
+
+
+
In [7]:
+
+
+
from jsonpath_ng import parse
+
+ +
+
+
+ +
+
+
+
In [8]:
+
+
+
def jp(http_response, parser):
    """Apply a JSONPath *parser* to the JSON body of *http_response*.

    Args:
        http_response: Object whose ``json()`` method returns the decoded
            body.
        parser: Object whose ``find()`` returns matches that expose a
            ``value`` attribute.

    Returns:
        List holding the ``value`` of every match, in the order found.
    """
    document = http_response.json()
    hits = parser.find(document)
    return [hit.value for hit in hits]
+
+ +
+
+
+ +
+
+
+
In [9]:
+
+
+
resource_parser = parse('$.sequences[*].canvases[*].otherContent[*].resources')
+
+ +
+
+
+ +
+
+
+
In [10]:
+
+
+
jp(r, resource_parser)
+
+ +
+
+
+ +
+
+ + +
+ +
Out[10]:
+ + + + +
+
[[{'@type': 'oa:Annotation',
+   'motivation': 'sc:painting',
+   'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000001.xml',
+    '@type': 'dctypes:Text',
+    'format': 'application/xml+alto'},
+   'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000001'}],
+ [{'@type': 'oa:Annotation',
+   'motivation': 'sc:painting',
+   'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000002.xml',
+    '@type': 'dctypes:Text',
+    'format': 'application/xml+alto'},
+   'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000002'}],
+ [{'@type': 'oa:Annotation',
+   'motivation': 'sc:painting',
+   'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000003.xml',
+    '@type': 'dctypes:Text',
+    'format': 'application/xml+alto'},
+   'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000003'}],
+ [{'@type': 'oa:Annotation',
+   'motivation': 'sc:painting',
+   'resource': {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000004.xml',
+    '@type': 'dctypes:Text',
+    'format': 'application/xml+alto'},
+   'on': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/canvas/00000004'}]]
+
+ +
+ +
+
+ +
+
+
+
+

Not quite there yet.

+ +
+
+
+
+
+
In [11]:
+
+
+
all_resources = parse('$.sequences[*].canvases[*].otherContent[*].resources[*].resource')
+
+ +
+
+
+ +
+
+
+
In [12]:
+
+
+
jp(r, all_resources)
+
+ +
+
+
+ +
+
+ + +
+ +
Out[12]:
+ + + + +
+
[{'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000001.xml',
+  '@type': 'dctypes:Text',
+  'format': 'application/xml+alto'},
+ {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000002.xml',
+  '@type': 'dctypes:Text',
+  'format': 'application/xml+alto'},
+ {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000003.xml',
+  '@type': 'dctypes:Text',
+  'format': 'application/xml+alto'},
+ {'@id': 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000004.xml',
+  '@type': 'dctypes:Text',
+  'format': 'application/xml+alto'}]
+
+ +
+ +
+
+ +
+
+
+
+

Keep only the resources whose format is application/xml+alto, and from those take just the @id:

+ +
+
+
+
+
+
In [13]:
+
+
+
# Keep the @id of every ALTO resource. .get() is used because a resource
# entry without a 'format' key should simply be skipped, not raise KeyError.
ids = [d['@id'] for d in jp(r, all_resources) if d.get('format') == 'application/xml+alto']
+
+ +
+
+
+ +
+
+
+
In [14]:
+
+
+
ids
+
+ +
+
+
+ +
+
+ + +
+ +
Out[14]:
+ + + + +
+
['https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000001.xml',
+ 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000002.xml',
+ 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000003.xml',
+ 'https://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000004.xml']
+
+ +
+ +
+
+ +
+
+
+
+

Download the ALTO Files

+
+
+
+
+
+
In [15]:
+
+
+
# Maps each ALTO-XML link to the text of its response body.
alto_storage = {}

for xml_link in ids:
    r = requests.get(xml_link)
    if r.ok:
        alto_storage[xml_link] = r.text
    else:
        # Report the failure instead of skipping the page without a trace.
        print(f'Could not download {xml_link}: HTTP {r.status_code}')
+
+ +
+
+
+ +
+
+
+
In [16]:
+
+
+
alto_storage
+
+ +
+
+
+ +
+
+ + +
+ +
Out[16]:
+ + + + +
+
{}
+
+ +
+ +
+
+ +
+
+
+
In [17]:
+
+
+
r
+
+ +
+
+
+ +
+
+ + +
+ +
Out[17]:
+ + + + +
+
<Response [400]>
+
+ +
+ +
+
+ +
+
+
+
In [18]:
+
+
+
r.ok
+
+ +
+
+
+ +
+
+ + +
+ +
Out[18]:
+ + + + +
+
False
+
+ +
+ +
+
+ +
+
+
+
+

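Uh oh. None of the requests succeeded (the last one came back with HTTP 400), so alto_storage is still empty.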

+ +
+
+
+
+
+
+

Convert the ALTO-XML to TXT

+
+
+
+
+
+
In [19]:
+
+
+
import alto_tools
+
def alto_extract_text_lines(xml, xmlns):
    """Collect the text of every ALTO ``TextLine`` found in *xml*.

    Args:
        xml: Parsed element tree (or element) that offers ``iterfind``.
        xmlns: Namespace URI used by the ALTO document.

    Returns:
        One string with a line per ``TextLine``. The words of a line are
        the ``CONTENT`` attributes of its ``String`` children, joined by
        single spaces.
    """
    nsdict = {'alto': xmlns}
    text_lines = []
    for text_line in xml.iterfind('.//alto:TextLine', nsdict):
        words = [
            word_el.attrib['CONTENT']
            for word_el in text_line.findall('alto:String', nsdict)
            # A String without CONTENT would put None into join() below.
            if 'CONTENT' in word_el.attrib
        ]
        text_lines.append(' '.join(words))
    return '\n'.join(text_lines)
+
def alto_to_text(raw_alto_text):
    """Parse an ALTO document and return its text, one line per TextLine.

    NOTE(review): despite the parameter name, the recorded traceback shows
    ``alto_tools.alto_parse`` passing its argument to ``etree.parse``, so
    this presumably needs a path, URL or file object rather than the XML
    text itself - confirm against alto_tools.

    Args:
        raw_alto_text: Value handed unchanged to ``alto_tools.alto_parse``.

    Returns:
        The string produced by ``alto_extract_text_lines``.
    """
    _source, tree, namespace = alto_tools.alto_parse(raw_alto_text)
    return alto_extract_text_lines(tree, namespace)
+
+ +
+
+
+ +
+
+
+
In [20]:
+
+
+
print(alto_to_text(f'http://iiif.onb.ac.at/presentation/ANNO/{manifest_id}/resource/00000002.xml'))
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
+---------------------------------------------------------------------------
+OSError                                   Traceback (most recent call last)
+<ipython-input-20-939f29efb91d> in <module>
+----> 1 print(alto_to_text(f'http://iiif.onb.ac.at/presentation/ANNO/{manifest_id}/resource/00000002.xml'))
+
+<ipython-input-19-bfbcde0cf185> in alto_to_text(raw_alto_text)
+     10 
+     11 def alto_to_text(raw_alto_text):
+---> 12     alto, xml, xmlns = alto_tools.alto_parse(raw_alto_text)
+     13     return alto_extract_text_lines(xml, xmlns)
+
+~/labs/pydays19/alto_tools.py in alto_parse(alto)
+     17     """ Convert ALTO xml file to element tree """
+     18     try:
+---> 19         xml = etree.parse(alto)
+     20     except etree.ParseError as e:
+     21         sys.stdout.write('\nERROR: Failed parsing "%s" - '
+
+src/lxml/etree.pyx in lxml.etree.parse()
+
+src/lxml/parser.pxi in lxml.etree._parseDocument()
+
+src/lxml/parser.pxi in lxml.etree._parseDocumentFromURL()
+
+src/lxml/parser.pxi in lxml.etree._parseDocFromFile()
+
+src/lxml/parser.pxi in lxml.etree._BaseParser._parseDocFromFile()
+
+src/lxml/parser.pxi in lxml.etree._ParserContext._handleParseResultDoc()
+
+src/lxml/parser.pxi in lxml.etree._handleParseResult()
+
+src/lxml/parser.pxi in lxml.etree._raiseParseError()
+
+OSError: Error reading file 'http://iiif.onb.ac.at/presentation/ANNO/ode18630730/resource/00000002.xml': failed to load HTTP resource
+
+
+ +
+
+ +
+
+
+ + + + + + -- GitLab