diff --git a/3.2 - Images - Download pre-downsized images for machine learning.ipynb b/3.2 - Images - Download pre-downsized images for machine learning.ipynb index 3fb93fdd171345f112cdc06be3c58d1e77a6b7c9..413b54ed22705fb64444ab2e136cfb54377dbdf8 100644 --- a/3.2 - Images - Download pre-downsized images for machine learning.ipynb +++ b/3.2 - Images - Download pre-downsized images for machine learning.ipynb @@ -14,7 +14,9 @@ "\n", "[https://labs.onb.ac.at/en/dataset/akon/](https://labs.onb.ac.at/en/dataset/akon/)\n", "\n", - "[https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/akon_postcards_public_domain.csv.bz2](https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/akon_postcards_public_domain.csv.bz2)" + "[https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/akon_postcards_public_domain.csv.bz2](https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/akon_postcards_public_domain.csv.bz2)\n", + "\n", + "[https://github.com/h2non/jsonpath-ng](https://github.com/h2non/jsonpath-ng)" ] }, { @@ -26,19 +28,12 @@ }, "source": [ "Let's say you got a bunch of old timey scenery photographs.\n", - "And you want all images of lakes, why not.\n", - "And you want an AI to do all the dirty work for you, because you can." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "fragment" - } - }, - "source": [ - "You can use the historic postcards from the ONB Labs as training data for your AI." + "And you want to extract all images of lakes, why not.\n", + "And, because you can, you want an AI to do all the dirty work for you.\n", + "\n", + "What does that have to do with this workshop?\n", + "\n", + "*You can use the **historic postcards** from the ONB Labs as training data for your AI.*" ] }, { @@ -61,6 +56,26 @@ "Now back to the show." 
] }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "What do we have to do?\n", + "\n", + "* **Download Metadata**\n", + " * List of all available postcards\n", + " * Info about the 'lake-ness' of postcards\n", + "* **Create Download Links**\n", + " * To fetch all images\n", + "* **Split Into Two Sets**\n", + " * Lakes and non-lakes\n", + "* **Download Images**" + ] + }, { "cell_type": "markdown", "metadata": { @@ -81,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -96,14 +111,16 @@ "source": [ "import pandas as pd\n", "\n", + "# Let pandas show all available columns\n", "pd.set_option('display.max_columns', 50)\n", + "# Pandas can read data directly from web links, even compressed files\n", "meta = pd.read_csv('https://labs.onb.ac.at/gitlab/labs-team/' \\\n", " 'raw-metadata/raw/master/akon_postcards_public_domain.csv.bz2', compression='bz2')" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": { "slideshow": { "slide_type": "subslide" @@ -165,51 +182,50 @@ " \n", " \n", " \n", - " 9839\n", - " 9839\n", - " AK095_271\n", - " 60715\n", - " NaN\n", + " 21958\n", + " 21958\n", + " AK036_452\n", + " 21573\n", + " 355.0\n", " NaN\n", - " Bastia, Rue des Terasses\n", - " False\n", - " 1903 gel\n", + " Piesting\n", + " True\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " Ledermann\n", + " Wien\n", " NaN\n", " NaN\n", + " 1906.0\n", " NaN\n", " NaN\n", + " 2014-08-04 07:59:10.288\n", + " 1906\n", + " P\n", + " PPLA3\n", + " 2771869.0\n", + " 47.87358\n", + " 16.12510\n", + " Piesting\n", + " AT\n", " NaN\n", " NaN\n", - " 2014-09-02 07:50:01.517\n", - " gelaufen 1903\n", - " P\n", - " PPLA2\n", - " 3034640.0\n", - " 42.70278\n", - " 9.45000\n", - " Bastia\n", - " FR\n", - " Korsika\n", - " A5\n", - " 42.70278, 9.45\n", + " 47.87358, 16.1251\n", " \n", " \n", - " 17826\n", - " 17826\n", - " 
AK080_449\n", - " 50329\n", + " 31428\n", + " 31428\n", + " AK102_214\n", + " 66938\n", " NaN\n", " NaN\n", - " Roma, Via Appia Nuova\n", + " Schaffhausen\n", " False\n", - " v 1905\n", - " NaN\n", - " NaN\n", + " v. 1907\n", " NaN\n", + " Hohfluh\n", " NaN\n", " NaN\n", " NaN\n", @@ -217,214 +233,223 @@ " NaN\n", " NaN\n", " NaN\n", - " 2014-08-25 15:48:21.484\n", - " vor 1905\n", + " Geogr. Topogr. Bilder-Samml. 1951, 1093\n", + " 2014-09-03 10:50:31.524\n", + " vor 1907\n", " P\n", - " PPLC\n", - " 3169070.0\n", - " 41.89474\n", - " 12.48390\n", - " Roma\n", - " IT\n", + " PPLA\n", + " 2658761.0\n", + " 47.69732\n", + " 8.63493\n", + " Schaffhausen\n", + " CH\n", " NaN\n", " NaN\n", - " 41.89474, 12.4839\n", + " 47.69732, 8.63493\n", " \n", " \n", - " 20639\n", - " 20639\n", - " AK032_101\n", - " 18815\n", + " 13828\n", + " 13828\n", + " AK008_015\n", + " 4271\n", " NaN\n", - " Sechshaus\n", - " Weidling\n", - " True\n", - " v 1907\n", " NaN\n", + " Eiskaarspitze\n", + " False\n", + " NaN\n", + " Dachstein\n", " NaN\n", " NaN\n", " Ledermann\n", " Wien\n", " NaN\n", " NaN\n", + " 1921.0\n", " NaN\n", " NaN\n", - " NaN\n", - " 2014-08-04 07:59:10.250\n", - " vor 1907\n", - " P\n", - " PPL\n", - " 2761741.0\n", - " 48.29100\n", - " 16.30865\n", - " Weidling\n", + " 2014-08-04 07:59:09.895\n", + " 1921\n", + " T\n", + " MT\n", + " 2775701.0\n", + " 47.47545\n", + " 13.60588\n", + " Dachstein\n", " AT\n", - " Niederösterreich\n", - " 03\n", - " 48.291, 16.30865\n", + " NaN\n", + " NaN\n", + " 47.47545, 13.60588\n", " \n", " \n", - " 28262\n", - " 28262\n", - " AK061_487\n", - " 36913\n", + " 22725\n", + " 22725\n", + " AK039_299\n", + " 23224\n", " NaN\n", " NaN\n", - " Pontebba\n", + " Attnang\n", " False\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " Topf\n", + " Attnang-Puchheim\n", " NaN\n", " NaN\n", + " 1919.0\n", " NaN\n", " NaN\n", - " 1918.0\n", - " NaN\n", - " Geogr. Topogr. Bilder-Samml. 
1944, 18587\n", - " 2014-08-04 07:59:10.412\n", - " 1918\n", + " 2014-08-04 07:59:10.309\n", + " 1919\n", " P\n", - " PPLA3\n", - " 3170297.0\n", - " 46.50540\n", - " 13.30622\n", - " Pontebba\n", - " IT\n", + " PPLX\n", + " 2782285.0\n", + " 48.01667\n", + " 13.71667\n", + " Attnang\n", + " AT\n", " NaN\n", " NaN\n", - " 46.5054, 13.30622\n", + " 48.01667, 13.71667\n", " \n", " \n", - " 33213\n", - " 33213\n", - " AK080_268\n", - " 50142\n", - " NaN\n", - " NaN\n", - " New York\n", - " True\n", - " v 1905\n", - " NaN\n", + " 32047\n", + " 32047\n", + " AK067_186\n", + " 40906\n", " NaN\n", " NaN\n", + " Salzburg\n", + " False\n", + " 1918 gel\n", + " Mönchsberg\n", " NaN\n", " NaN\n", + " Würthle & Sohn Nachfolger G. m. b. H\n", + " Salzburg\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " 2014-08-25 15:27:14.452\n", - " vor 1905\n", + " 2014-08-12 14:04:34.797\n", + " gelaufen 1918\n", " P\n", - " PPL\n", - " 5128581.0\n", - " 40.71427\n", - " -74.00597\n", - " New York City\n", - " US\n", + " PPLA\n", + " 2766824.0\n", + " 47.79941\n", + " 13.04399\n", + " Salzburg\n", + " AT\n", " NaN\n", " NaN\n", - " 40.71427, -74.00597\n", + " 47.79941, 13.04399\n", " \n", " \n", - " 757\n", - " 757\n", - " AK115_107\n", - " 77157\n", + " 17407\n", + " 17407\n", + " AK020_443\n", + " 11927\n", " NaN\n", - " Corte S. 
Ilario e Duomo\n", - " Görz\n", - " False\n", - " 1900 gel\n", " NaN\n", + " Saalfelden\n", + " False\n", + " v 1907\n", " NaN\n", " NaN\n", " NaN\n", + " Ledermann\n", + " Wien\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " 79/66 G\n", " NaN\n", - " 2014-09-09 10:20:27.631\n", - " gelaufen 1900\n", + " 2014-08-04 07:59:10.121\n", + " vor 1907\n", " P\n", - " PPLA2\n", - " 3175986.0\n", - " 45.94088\n", - " 13.62167\n", - " Gorizia\n", - " IT\n", + " PPLA3\n", + " 2766922.0\n", + " 47.42681\n", + " 12.84800\n", + " Saalfelden am Steinernen Meer\n", + " AT\n", " NaN\n", " NaN\n", - " 45.94088, 13.62167\n", + " 47.42681, 12.848\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Unnamed: 0 akon_id id altitude building \\\n", - "9839 9839 AK095_271 60715 NaN NaN \n", - "17826 17826 AK080_449 50329 NaN NaN \n", - "20639 20639 AK032_101 18815 NaN Sechshaus \n", - "28262 28262 AK061_487 36913 NaN NaN \n", - "33213 33213 AK080_268 50142 NaN NaN \n", - "757 757 AK115_107 77157 NaN Corte S. Ilario e Duomo \n", + " Unnamed: 0 akon_id id altitude building city color \\\n", + "21958 21958 AK036_452 21573 355.0 NaN Piesting True \n", + "31428 31428 AK102_214 66938 NaN NaN Schaffhausen False \n", + "13828 13828 AK008_015 4271 NaN NaN Eiskaarspitze False \n", + "22725 22725 AK039_299 23224 NaN NaN Attnang False \n", + "32047 32047 AK067_186 40906 NaN NaN Salzburg False \n", + "17407 17407 AK020_443 11927 NaN NaN Saalfelden False \n", "\n", - " city color comment mountain other photographer \\\n", - "9839 Bastia, Rue des Terasses False 1903 gel NaN NaN NaN \n", - "17826 Roma, Via Appia Nuova False v 1905 NaN NaN NaN \n", - "20639 Weidling True v 1907 NaN NaN NaN \n", - "28262 Pontebba False NaN NaN NaN NaN \n", - "33213 New York True v 1905 NaN NaN NaN \n", - "757 Görz False 1900 gel NaN NaN NaN \n", + " comment mountain other photographer \\\n", + "21958 NaN NaN NaN NaN \n", + "31428 v. 
1907 NaN Hohfluh NaN \n", + "13828 NaN Dachstein NaN NaN \n", + "22725 NaN NaN NaN NaN \n", + "32047 1918 gel Mönchsberg NaN NaN \n", + "17407 v 1907 NaN NaN NaN \n", "\n", - " publisher publisher_place region water_body year inventory_number \\\n", - "9839 NaN NaN NaN NaN NaN NaN \n", - "17826 NaN NaN NaN NaN NaN NaN \n", - "20639 Ledermann Wien NaN NaN NaN NaN \n", - "28262 NaN NaN NaN NaN 1918.0 NaN \n", - "33213 NaN NaN NaN NaN NaN NaN \n", - "757 NaN NaN NaN NaN NaN 79/66 G \n", + " publisher publisher_place region \\\n", + "21958 Ledermann Wien NaN \n", + "31428 NaN NaN NaN \n", + "13828 Ledermann Wien NaN \n", + "22725 Topf Attnang-Puchheim NaN \n", + "32047 Würthle & Sohn Nachfolger G. m. b. H Salzburg NaN \n", + "17407 Ledermann Wien NaN \n", "\n", - " signature revision_date \\\n", - "9839 NaN 2014-09-02 07:50:01.517 \n", - "17826 NaN 2014-08-25 15:48:21.484 \n", - "20639 NaN 2014-08-04 07:59:10.250 \n", - "28262 Geogr. Topogr. Bilder-Samml. 1944, 18587 2014-08-04 07:59:10.412 \n", - "33213 NaN 2014-08-25 15:27:14.452 \n", - "757 NaN 2014-09-09 10:20:27.631 \n", + " water_body year inventory_number \\\n", + "21958 NaN 1906.0 NaN \n", + "31428 NaN NaN NaN \n", + "13828 NaN 1921.0 NaN \n", + "22725 NaN 1919.0 NaN \n", + "32047 NaN NaN NaN \n", + "17407 NaN NaN NaN \n", + "\n", + " signature revision_date \\\n", + "21958 NaN 2014-08-04 07:59:10.288 \n", + "31428 Geogr. Topogr. Bilder-Samml. 
1951, 1093 2014-09-03 10:50:31.524 \n", + "13828 NaN 2014-08-04 07:59:09.895 \n", + "22725 NaN 2014-08-04 07:59:10.309 \n", + "32047 NaN 2014-08-12 14:04:34.797 \n", + "17407 NaN 2014-08-04 07:59:10.121 \n", "\n", " date feature_class feature_code geoname_id latitude \\\n", - "9839 gelaufen 1903 P PPLA2 3034640.0 42.70278 \n", - "17826 vor 1905 P PPLC 3169070.0 41.89474 \n", - "20639 vor 1907 P PPL 2761741.0 48.29100 \n", - "28262 1918 P PPLA3 3170297.0 46.50540 \n", - "33213 vor 1905 P PPL 5128581.0 40.71427 \n", - "757 gelaufen 1900 P PPLA2 3175986.0 45.94088 \n", + "21958 1906 P PPLA3 2771869.0 47.87358 \n", + "31428 vor 1907 P PPLA 2658761.0 47.69732 \n", + "13828 1921 T MT 2775701.0 47.47545 \n", + "22725 1919 P PPLX 2782285.0 48.01667 \n", + "32047 gelaufen 1918 P PPLA 2766824.0 47.79941 \n", + "17407 vor 1907 P PPLA3 2766922.0 47.42681 \n", "\n", - " longitude name country_id admin_name_1 admin_code_1 \\\n", - "9839 9.45000 Bastia FR Korsika A5 \n", - "17826 12.48390 Roma IT NaN NaN \n", - "20639 16.30865 Weidling AT Niederösterreich 03 \n", - "28262 13.30622 Pontebba IT NaN NaN \n", - "33213 -74.00597 New York City US NaN NaN \n", - "757 13.62167 Gorizia IT NaN NaN \n", + " longitude name country_id admin_name_1 \\\n", + "21958 16.12510 Piesting AT NaN \n", + "31428 8.63493 Schaffhausen CH NaN \n", + "13828 13.60588 Dachstein AT NaN \n", + "22725 13.71667 Attnang AT NaN \n", + "32047 13.04399 Salzburg AT NaN \n", + "17407 12.84800 Saalfelden am Steinernen Meer AT NaN \n", "\n", - " geo \n", - "9839 42.70278, 9.45 \n", - "17826 41.89474, 12.4839 \n", - "20639 48.291, 16.30865 \n", - "28262 46.5054, 13.30622 \n", - "33213 40.71427, -74.00597 \n", - "757 45.94088, 13.62167 " + " admin_code_1 geo \n", + "21958 NaN 47.87358, 16.1251 \n", + "31428 NaN 47.69732, 8.63493 \n", + "13828 NaN 47.47545, 13.60588 \n", + "22725 NaN 48.01667, 13.71667 \n", + "32047 NaN 47.79941, 13.04399 \n", + "17407 NaN 47.42681, 12.848 " ] }, - "execution_count": 6, + "execution_count": 
4, "metadata": {}, "output_type": "execute_result" } @@ -446,7 +471,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -476,50 +501,50 @@ " \n", " \n", " \n", - " 7017\n", - " AK082_070\n", - " NaN\n", + " 19411\n", + " AK027_436\n", + " Millstätter See\n", " \n", " \n", - " 17570\n", - " AK021_135\n", + " 24183\n", + " AK045_467\n", " NaN\n", " \n", " \n", - " 19626\n", - " AK028_223\n", + " 7038\n", + " AK082_171\n", " NaN\n", " \n", " \n", - " 4202\n", - " AK058_235\n", + " 21425\n", + " AK035_008\n", " NaN\n", " \n", " \n", - " 17094\n", - " AK019_205\n", + " 8853\n", + " AK107_209\n", " NaN\n", " \n", " \n", - " 29221\n", - " AK068_474\n", - " NaN\n", + " 27306\n", + " AK057_094\n", + " Csorba See\n", " \n", " \n", "\n", "" ], "text/plain": [ - " akon_id water_body\n", - "7017 AK082_070 NaN\n", - "17570 AK021_135 NaN\n", - "19626 AK028_223 NaN\n", - "4202 AK058_235 NaN\n", - "17094 AK019_205 NaN\n", - "29221 AK068_474 NaN" + " akon_id water_body\n", + "19411 AK027_436 Millstätter See\n", + "24183 AK045_467 NaN\n", + "7038 AK082_171 NaN\n", + "21425 AK035_008 NaN\n", + "8853 AK107_209 NaN\n", + "27306 AK057_094 Csorba See" ] }, - "execution_count": 20, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -550,28 +575,578 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "ONB Labs' own `SACHA` project provides an API for accessing digitized objects via IIIF.\n", + "The [SACHA project](https://iiif.onb.ac.at/) provides an API for accessing digitized objects of the National Library via IIIF.\n", "The online documentation for the API is here: [https://iiif.onb.ac.at/api](https://iiif.onb.ac.at/api).\n", "\n", - "We're especially interested in the possibility to serve manifests: [https://iiif.onb.ac.at/api#_manifestrequestprocessor](https://iiif.onb.ac.at/api#_manifestrequestprocessor).\n", + "We're especially interested in the possibility to serve manifests: 
[https://iiif.onb.ac.at/api#_manifestrequestprocessor](https://iiif.onb.ac.at/api#_manifestrequestprocessor):\n", "\n", - "The project name is `AKON`, see also [https://iiif.onb.ac.at/api#_digitization_projects)(https://iiif.onb.ac.at/api#_digitization_projects)." + "```\n", + "GET /presentation/{projectName}/{id}/manifest\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "```\n", + "GET /presentation/{projectName}/{id}/manifest\n", + "```\n", + "\n", + "The `projectName` is `AKON` ('AnsichtsKarten ONline'), the `id` is the `akon_id`.\n", + "\n", + "See also [https://iiif.onb.ac.at/api#_digitization_projects](https://iiif.onb.ac.at/api#_digitization_projects)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "### Create Download Links for Manifests" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def akon_id_to_manifest_link(akon_id):\n", + " return f'https://iiif.onb.ac.at/presentation/AKON/{akon_id}/manifest'" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'https://iiif.onb.ac.at/presentation/AKON/AK024_176/manifest'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "akon_id_to_manifest_link('AK024_176')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Let's test the link" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "scrolled": true, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", + " '@id': 
'https://iiif.onb.ac.at/presentation/AKON/AK024_176/manifest',\n", + " '@type': 'sc:Manifest',\n", + " 'label': 'Wien, III',\n", + " 'metadata': [{'label': [{'@value': 'Id', '@language': 'en'},\n", + " {'@value': 'Id', '@language': 'ger'}],\n", + " 'value': 'AK024_176'},\n", + " {'label': [{'@value': 'Title', '@language': 'en'},\n", + " {'@value': 'Titel', '@language': 'ger'}],\n", + " 'value': 'Wien, III'},\n", + " {'label': [{'@value': 'Place', '@language': 'en'},\n", + " {'@value': 'Ort', '@language': 'ger'}],\n", + " 'value': \"Landstraße\"},\n", + " {'label': [{'@value': 'Publisher', '@language': 'en'},\n", + " {'@value': 'Verlag', '@language': 'ger'}],\n", + " 'value': 'Ledermann'},\n", + " {'label': [{'@value': 'Place of Publications', '@language': 'en'},\n", + " {'@value': 'Erscheinungsort', '@language': 'ger'}],\n", + " 'value': 'Wien'},\n", + " {'label': [{'@value': 'Year', '@language': 'en'},\n", + " {'@value': 'Jahr', '@language': 'ger'}],\n", + " 'value': '1906'},\n", + " {'label': [{'@value': 'Disseminator', '@language': 'en'},\n", + " {'@value': 'Anbieter', '@language': 'ger'}],\n", + " 'value': \"Ansichtskarten Online\"},\n", + " {'label': [{'@value': 'Physical Location', '@language': 'en'},\n", + " {'@value': 'Standort', '@language': 'ger'}],\n", + " 'value': 'ÖNB'}],\n", + " 'description': 'Russische Kirche',\n", + " 'viewingDirection': 'left-to-right',\n", + " 'viewingHint': 'paged',\n", + " 'license': 'http://creativecommons.org/publicdomain/mark/1.0/',\n", + " 'attribution': [{'@value': 'Austrian National Library', '@language': 'en'},\n", + " {'@value': 'Österreichische Nationalbibliothek', '@language': 'ger'}],\n", + " 'logo': 'https://iiif.onb.ac.at/logo/',\n", + " 'seeAlso': [{'@id': 'http://data.onb.ac.at/AKON/AK024_176',\n", + " 'format': 'text/html'},\n", + " {'@id': 'http://data.onb.ac.at/AKON/AK024_176.rdf',\n", + " 'format': 'application/rdf+xml'}],\n", + " 'sequences': [{'@context': 
'https://iiif.io/api/presentation/2/context.json',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/sequence/normal',\n", + " '@type': 'sc:Sequence',\n", + " 'startCanvas': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176',\n", + " 'canvases': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176',\n", + " '@type': 'sc:Canvas',\n", + " 'label': 'Wien, III',\n", + " 'height': 1681,\n", + " 'width': 1082,\n", + " 'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/annotation/176',\n", + " '@type': 'oa:Annotation',\n", + " 'motivation': 'sc:painting',\n", + " 'resource': {'@id': 'https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/full/0/native.jpg',\n", + " '@type': 'dctypes:Image',\n", + " 'height': 1681,\n", + " 'width': 1082,\n", + " 'format': 'image/jpeg',\n", + " 'service': {'@context': 'https://iiif.io/api/image/2/context.json',\n", + " '@id': 'https://iiif.onb.ac.at/images/AKON/AK024_176/176',\n", + " 'profile': 'https://iiif.io/api/image/2/level2.json'}},\n", + " 'on': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176'}]}]}]}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import requests\n", + "\n", + "r = requests.get(akon_id_to_manifest_link('AK024_176'))\n", + "r.json()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "The manifest link seems to work. Let's add the manifest links to the dataframe." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "meta['manifest_link'] = meta['akon_id'].apply(akon_id_to_manifest_link)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
akon_idmanifest_link
34406AK072_262https://iiif.onb.ac.at/presentation/AKON/AK072...
30373AK085_299https://iiif.onb.ac.at/presentation/AKON/AK085...
18098AK023_263https://iiif.onb.ac.at/presentation/AKON/AK023...
3152AK122_506https://iiif.onb.ac.at/presentation/AKON/AK122...
31981AK097_136https://iiif.onb.ac.at/presentation/AKON/AK097...
160AK111_325https://iiif.onb.ac.at/presentation/AKON/AK111...
\n", + "
" + ], + "text/plain": [ + " akon_id manifest_link\n", + "34406 AK072_262 https://iiif.onb.ac.at/presentation/AKON/AK072...\n", + "30373 AK085_299 https://iiif.onb.ac.at/presentation/AKON/AK085...\n", + "18098 AK023_263 https://iiif.onb.ac.at/presentation/AKON/AK023...\n", + "3152 AK122_506 https://iiif.onb.ac.at/presentation/AKON/AK122...\n", + "31981 AK097_136 https://iiif.onb.ac.at/presentation/AKON/AK097...\n", + "160 AK111_325 https://iiif.onb.ac.at/presentation/AKON/AK111..." + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta.sample(6)[['akon_id', 'manifest_link']]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "### Collect Image Links" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at that manifest again:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/manifest',\n", + " '@type': 'sc:Manifest',\n", + " 'label': 'Wien, III',\n", + " 'metadata': [{'label': [{'@value': 'Id', '@language': 'en'},\n", + " {'@value': 'Id', '@language': 'ger'}],\n", + " 'value': 'AK024_176'},\n", + " {'label': [{'@value': 'Title', '@language': 'en'},\n", + " {'@value': 'Titel', '@language': 'ger'}],\n", + " 'value': 'Wien, III'},\n", + " {'label': [{'@value': 'Place', '@language': 'en'},\n", + " {'@value': 'Ort', '@language': 'ger'}],\n", + " 'value': \"Landstraße\"},\n", + " {'label': [{'@value': 'Publisher', '@language': 'en'},\n", + " {'@value': 'Verlag', '@language': 'ger'}],\n", + " 'value': 'Ledermann'},\n", + " {'label': [{'@value': 'Place of Publications', '@language': 'en'},\n", + " {'@value': 'Erscheinungsort', '@language': 'ger'}],\n", + " 'value': 
'Wien'},\n", + " {'label': [{'@value': 'Year', '@language': 'en'},\n", + " {'@value': 'Jahr', '@language': 'ger'}],\n", + " 'value': '1906'},\n", + " {'label': [{'@value': 'Disseminator', '@language': 'en'},\n", + " {'@value': 'Anbieter', '@language': 'ger'}],\n", + " 'value': \"Ansichtskarten Online\"},\n", + " {'label': [{'@value': 'Physical Location', '@language': 'en'},\n", + " {'@value': 'Standort', '@language': 'ger'}],\n", + " 'value': 'ÖNB'}],\n", + " 'description': 'Russische Kirche',\n", + " 'viewingDirection': 'left-to-right',\n", + " 'viewingHint': 'paged',\n", + " 'license': 'http://creativecommons.org/publicdomain/mark/1.0/',\n", + " 'attribution': [{'@value': 'Austrian National Library', '@language': 'en'},\n", + " {'@value': 'Österreichische Nationalbibliothek', '@language': 'ger'}],\n", + " 'logo': 'https://iiif.onb.ac.at/logo/',\n", + " 'seeAlso': [{'@id': 'http://data.onb.ac.at/AKON/AK024_176',\n", + " 'format': 'text/html'},\n", + " {'@id': 'http://data.onb.ac.at/AKON/AK024_176.rdf',\n", + " 'format': 'application/rdf+xml'}],\n", + " 'sequences': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/sequence/normal',\n", + " '@type': 'sc:Sequence',\n", + " 'startCanvas': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176',\n", + " 'canvases': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176',\n", + " '@type': 'sc:Canvas',\n", + " 'label': 'Wien, III',\n", + " 'height': 1681,\n", + " 'width': 1082,\n", + " 'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n", + " '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/annotation/176',\n", + " '@type': 'oa:Annotation',\n", + " 'motivation': 'sc:painting',\n", + " 'resource': {'@id': 'https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/full/0/native.jpg',\n", + " '@type': 
'dctypes:Image',\n", + " 'height': 1681,\n", + " 'width': 1082,\n", + " 'format': 'image/jpeg',\n", + " 'service': {'@context': 'https://iiif.io/api/image/2/context.json',\n", + " '@id': 'https://iiif.onb.ac.at/images/AKON/AK024_176/176',\n", + " 'profile': 'https://iiif.io/api/image/2/level2.json'}},\n", + " 'on': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176'}]}]}]}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r = requests.get(akon_id_to_manifest_link('AK024_176'))\n", + "r.json()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "We need to collect all `@id`s from all `resource`s from all `images` from all `canvases`.\n", + "\n", + "That's tedious by hand. We'll use `jsonpath-ng`:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ - "PROJ = 'AKON'" + "from jsonpath_ng import jsonpath, parse\n", + "\n", + "image_id_jp = parse('$.sequences[*].canvases[*].images[*].resource.@id')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/full/0/native.jpg']" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[match.value for match in image_id_jp.find(r.json())]" ] }, { "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "All of this in one function:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, "metadata": {}, + "outputs": [], "source": [ - "**TODO**" + "image_id_jp = parse('$.sequences[*].canvases[*].images[*].resource.@id')\n", + "\n", + "def image_links_for_manifest_link(manifest_link):\n", + " r = requests.get(manifest_link)\n", + " try:\n", 
+ " json = r.json()\n", + " except:\n", + " # default to empty on any errors\n", + " # makes batch processing easier in pandas\n", + " json = {}\n", + " image_links = [match.value for match in image_id_jp.find(json)]\n", + " return image_links" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/full/0/native.jpg']" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "image_links_for_manifest_link(akon_id_to_manifest_link('AK024_176'))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['https://iiif.onb.ac.at/images/AKON/AK111_325/325/full/full/0/native.jpg']" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "image_links_for_manifest_link(akon_id_to_manifest_link('AK111_325'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looking good." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "source": [ + "Now let's add the image links to the dataframe...\n", + "\n", + "...actually, let's not do that now, because it takes a while (upwards of 10 minutes). Let's cheat instead, skip this step and load the resulting dataframe directly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 7min 26s, sys: 18 s, total: 7min 44s\n", + "Wall time: 12min 34s\n" + ] + } + ], + "source": [ + "# %%time\n", + "# meta['image_links'] = meta['manifest_link'].apply(image_links_for_manifest_link)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/kst/tmp/dingsdi/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3049: DtypeWarning: Columns (14) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " interactivity=interactivity, compiler=compiler, result=result)\n" + ] + } + ], + "source": [ + "meta = pd.read_csv('postcards_with_image_links.csv.bz2', compression='bz2')" ] }, { @@ -616,6 +1191,63 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Just The Code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "\n", + "def akon_id_to_manifest_link(akon_id):\n", + " return f'https://iiif.onb.ac.at/presentation/AKON/{akon_id}/manifest'\n", + "\n", + "\n", + "meta = pd.read_csv('https://labs.onb.ac.at/gitlab/labs-team/' \\\n", + " 'raw-metadata/raw/master/akon_postcards_public_domain.csv.bz2', compression='bz2')\n", + "meta['manifest_link'] = meta['akon_id'].apply(akon_id_to_manifest_link)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "import requests\n", + "from jsonpath_ng import jsonpath, parse\n", + "\n", + "\n", + "image_id_jp = 
parse('$.sequences[*].canvases[*].images[*].resource.@id')\n", + "\n", + "def image_links_for_manifest_link(manifest_link):\n", + " r = requests.get(manifest_link)\n", + " try:\n", + " json = r.json()\n", + " except:\n", + " json = {}\n", + " return [match.value for match in image_id_jp.find(json)]\n", + "\n", + "\n", + "meta['image_links'] = meta['manifest_link'].apply(image_links_for_manifest_link)" + ] } ], "metadata": { diff --git a/postcards_with_image_links.csv.bz2 b/postcards_with_image_links.csv.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..849c645900f67c6e8ee7ba99d490100532dd9841 Binary files /dev/null and b/postcards_with_image_links.csv.bz2 differ