diff --git a/3.2 - Images - Download pre-downsized images for machine learning.ipynb b/3.2 - Images - Download pre-downsized images for machine learning.ipynb
index 3fb93fdd171345f112cdc06be3c58d1e77a6b7c9..413b54ed22705fb64444ab2e136cfb54377dbdf8 100644
--- a/3.2 - Images - Download pre-downsized images for machine learning.ipynb
+++ b/3.2 - Images - Download pre-downsized images for machine learning.ipynb
@@ -14,7 +14,9 @@
"\n",
"[https://labs.onb.ac.at/en/dataset/akon/](https://labs.onb.ac.at/en/dataset/akon/)\n",
"\n",
- "[https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/akon_postcards_public_domain.csv.bz2](https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/akon_postcards_public_domain.csv.bz2)"
+ "[https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/akon_postcards_public_domain.csv.bz2](https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/akon_postcards_public_domain.csv.bz2)\n",
+ "\n",
+ "[https://github.com/h2non/jsonpath-ng](https://github.com/h2non/jsonpath-ng)"
]
},
{
@@ -26,19 +28,12 @@
},
"source": [
"Let's say you got a bunch of old timey scenery photographs.\n",
- "And you want all images of lakes, why not.\n",
- "And you want an AI to do all the dirty work for you, because you can."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "slideshow": {
- "slide_type": "fragment"
- }
- },
- "source": [
- "You can use the historic postcards from the ONB Labs as training data for your AI."
+ "And you want to extract all images of lakes, why not.\n",
+ "And, because you can, you want an AI to do all the dirty work for you.\n",
+ "\n",
+ "What does that have to do with this workshop?\n",
+ "\n",
+ "*You can use the **historic postcards** from the ONB Labs as training data for your AI.*"
]
},
{
@@ -61,6 +56,26 @@
"Now back to the show."
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "subslide"
+ }
+ },
+ "source": [
+ "What do we have to do?\n",
+ "\n",
+ "* **Download Metadata**\n",
+ " * List of all available postcards\n",
+ " * Info about the 'lake-ness' of postcards\n",
+ "* **Create Download Links**\n",
+ " * To fetch all images\n",
+ "* **Split Into Two Sets**\n",
+ " * Lakes and non-lakes\n",
+ "* **Download Images**"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {
@@ -81,7 +96,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 1,
"metadata": {},
"outputs": [
{
@@ -96,14 +111,16 @@
"source": [
"import pandas as pd\n",
"\n",
+ "# Let pandas show all available columns\n",
"pd.set_option('display.max_columns', 50)\n",
+ "# Pandas can read data directly from web links, even compressed files\n",
"meta = pd.read_csv('https://labs.onb.ac.at/gitlab/labs-team/' \\\n",
" 'raw-metadata/raw/master/akon_postcards_public_domain.csv.bz2', compression='bz2')"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 4,
"metadata": {
"slideshow": {
"slide_type": "subslide"
@@ -165,51 +182,50 @@
" \n",
"
\n",
" \n",
- " 9839 | \n",
- " 9839 | \n",
- " AK095_271 | \n",
- " 60715 | \n",
- " NaN | \n",
+ " 21958 | \n",
+ " 21958 | \n",
+ " AK036_452 | \n",
+ " 21573 | \n",
+ " 355.0 | \n",
" NaN | \n",
- " Bastia, Rue des Terasses | \n",
- " False | \n",
- " 1903 gel | \n",
+ " Piesting | \n",
+ " True | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
+ " Ledermann | \n",
+ " Wien | \n",
" NaN | \n",
" NaN | \n",
+ " 1906.0 | \n",
" NaN | \n",
" NaN | \n",
+ " 2014-08-04 07:59:10.288 | \n",
+ " 1906 | \n",
+ " P | \n",
+ " PPLA3 | \n",
+ " 2771869.0 | \n",
+ " 47.87358 | \n",
+ " 16.12510 | \n",
+ " Piesting | \n",
+ " AT | \n",
" NaN | \n",
" NaN | \n",
- " 2014-09-02 07:50:01.517 | \n",
- " gelaufen 1903 | \n",
- " P | \n",
- " PPLA2 | \n",
- " 3034640.0 | \n",
- " 42.70278 | \n",
- " 9.45000 | \n",
- " Bastia | \n",
- " FR | \n",
- " Korsika | \n",
- " A5 | \n",
- " 42.70278, 9.45 | \n",
+ " 47.87358, 16.1251 | \n",
"
\n",
" \n",
- " 17826 | \n",
- " 17826 | \n",
- " AK080_449 | \n",
- " 50329 | \n",
+ " 31428 | \n",
+ " 31428 | \n",
+ " AK102_214 | \n",
+ " 66938 | \n",
" NaN | \n",
" NaN | \n",
- " Roma, Via Appia Nuova | \n",
+ " Schaffhausen | \n",
" False | \n",
- " v 1905 | \n",
- " NaN | \n",
- " NaN | \n",
+ " v. 1907 | \n",
" NaN | \n",
+ " Hohfluh | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
@@ -217,214 +233,223 @@
" NaN | \n",
" NaN | \n",
" NaN | \n",
- " 2014-08-25 15:48:21.484 | \n",
- " vor 1905 | \n",
+ " Geogr. Topogr. Bilder-Samml. 1951, 1093 | \n",
+ " 2014-09-03 10:50:31.524 | \n",
+ " vor 1907 | \n",
" P | \n",
- " PPLC | \n",
- " 3169070.0 | \n",
- " 41.89474 | \n",
- " 12.48390 | \n",
- " Roma | \n",
- " IT | \n",
+ " PPLA | \n",
+ " 2658761.0 | \n",
+ " 47.69732 | \n",
+ " 8.63493 | \n",
+ " Schaffhausen | \n",
+ " CH | \n",
" NaN | \n",
" NaN | \n",
- " 41.89474, 12.4839 | \n",
+ " 47.69732, 8.63493 | \n",
"
\n",
" \n",
- " 20639 | \n",
- " 20639 | \n",
- " AK032_101 | \n",
- " 18815 | \n",
+ " 13828 | \n",
+ " 13828 | \n",
+ " AK008_015 | \n",
+ " 4271 | \n",
" NaN | \n",
- " Sechshaus | \n",
- " Weidling | \n",
- " True | \n",
- " v 1907 | \n",
" NaN | \n",
+ " Eiskaarspitze | \n",
+ " False | \n",
+ " NaN | \n",
+ " Dachstein | \n",
" NaN | \n",
" NaN | \n",
" Ledermann | \n",
" Wien | \n",
" NaN | \n",
" NaN | \n",
+ " 1921.0 | \n",
" NaN | \n",
" NaN | \n",
- " NaN | \n",
- " 2014-08-04 07:59:10.250 | \n",
- " vor 1907 | \n",
- " P | \n",
- " PPL | \n",
- " 2761741.0 | \n",
- " 48.29100 | \n",
- " 16.30865 | \n",
- " Weidling | \n",
+ " 2014-08-04 07:59:09.895 | \n",
+ " 1921 | \n",
+ " T | \n",
+ " MT | \n",
+ " 2775701.0 | \n",
+ " 47.47545 | \n",
+ " 13.60588 | \n",
+ " Dachstein | \n",
" AT | \n",
- " Niederösterreich | \n",
- " 03 | \n",
- " 48.291, 16.30865 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 47.47545, 13.60588 | \n",
"
\n",
" \n",
- " 28262 | \n",
- " 28262 | \n",
- " AK061_487 | \n",
- " 36913 | \n",
+ " 22725 | \n",
+ " 22725 | \n",
+ " AK039_299 | \n",
+ " 23224 | \n",
" NaN | \n",
" NaN | \n",
- " Pontebba | \n",
+ " Attnang | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
+ " Topf | \n",
+ " Attnang-Puchheim | \n",
" NaN | \n",
" NaN | \n",
+ " 1919.0 | \n",
" NaN | \n",
" NaN | \n",
- " 1918.0 | \n",
- " NaN | \n",
- " Geogr. Topogr. Bilder-Samml. 1944, 18587 | \n",
- " 2014-08-04 07:59:10.412 | \n",
- " 1918 | \n",
+ " 2014-08-04 07:59:10.309 | \n",
+ " 1919 | \n",
" P | \n",
- " PPLA3 | \n",
- " 3170297.0 | \n",
- " 46.50540 | \n",
- " 13.30622 | \n",
- " Pontebba | \n",
- " IT | \n",
+ " PPLX | \n",
+ " 2782285.0 | \n",
+ " 48.01667 | \n",
+ " 13.71667 | \n",
+ " Attnang | \n",
+ " AT | \n",
" NaN | \n",
" NaN | \n",
- " 46.5054, 13.30622 | \n",
+ " 48.01667, 13.71667 | \n",
"
\n",
" \n",
- " 33213 | \n",
- " 33213 | \n",
- " AK080_268 | \n",
- " 50142 | \n",
- " NaN | \n",
- " NaN | \n",
- " New York | \n",
- " True | \n",
- " v 1905 | \n",
- " NaN | \n",
+ " 32047 | \n",
+ " 32047 | \n",
+ " AK067_186 | \n",
+ " 40906 | \n",
" NaN | \n",
" NaN | \n",
+ " Salzburg | \n",
+ " False | \n",
+ " 1918 gel | \n",
+ " Mönchsberg | \n",
" NaN | \n",
" NaN | \n",
+ " Würthle & Sohn Nachfolger G. m. b. H | \n",
+ " Salzburg | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
- " 2014-08-25 15:27:14.452 | \n",
- " vor 1905 | \n",
+ " 2014-08-12 14:04:34.797 | \n",
+ " gelaufen 1918 | \n",
" P | \n",
- " PPL | \n",
- " 5128581.0 | \n",
- " 40.71427 | \n",
- " -74.00597 | \n",
- " New York City | \n",
- " US | \n",
+ " PPLA | \n",
+ " 2766824.0 | \n",
+ " 47.79941 | \n",
+ " 13.04399 | \n",
+ " Salzburg | \n",
+ " AT | \n",
" NaN | \n",
" NaN | \n",
- " 40.71427, -74.00597 | \n",
+ " 47.79941, 13.04399 | \n",
"
\n",
" \n",
- " 757 | \n",
- " 757 | \n",
- " AK115_107 | \n",
- " 77157 | \n",
+ " 17407 | \n",
+ " 17407 | \n",
+ " AK020_443 | \n",
+ " 11927 | \n",
" NaN | \n",
- " Corte S. Ilario e Duomo | \n",
- " Görz | \n",
- " False | \n",
- " 1900 gel | \n",
" NaN | \n",
+ " Saalfelden | \n",
+ " False | \n",
+ " v 1907 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
+ " Ledermann | \n",
+ " Wien | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
- " 79/66 G | \n",
" NaN | \n",
- " 2014-09-09 10:20:27.631 | \n",
- " gelaufen 1900 | \n",
+ " 2014-08-04 07:59:10.121 | \n",
+ " vor 1907 | \n",
" P | \n",
- " PPLA2 | \n",
- " 3175986.0 | \n",
- " 45.94088 | \n",
- " 13.62167 | \n",
- " Gorizia | \n",
- " IT | \n",
+ " PPLA3 | \n",
+ " 2766922.0 | \n",
+ " 47.42681 | \n",
+ " 12.84800 | \n",
+ " Saalfelden am Steinernen Meer | \n",
+ " AT | \n",
" NaN | \n",
" NaN | \n",
- " 45.94088, 13.62167 | \n",
+ " 47.42681, 12.848 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " Unnamed: 0 akon_id id altitude building \\\n",
- "9839 9839 AK095_271 60715 NaN NaN \n",
- "17826 17826 AK080_449 50329 NaN NaN \n",
- "20639 20639 AK032_101 18815 NaN Sechshaus \n",
- "28262 28262 AK061_487 36913 NaN NaN \n",
- "33213 33213 AK080_268 50142 NaN NaN \n",
- "757 757 AK115_107 77157 NaN Corte S. Ilario e Duomo \n",
+ " Unnamed: 0 akon_id id altitude building city color \\\n",
+ "21958 21958 AK036_452 21573 355.0 NaN Piesting True \n",
+ "31428 31428 AK102_214 66938 NaN NaN Schaffhausen False \n",
+ "13828 13828 AK008_015 4271 NaN NaN Eiskaarspitze False \n",
+ "22725 22725 AK039_299 23224 NaN NaN Attnang False \n",
+ "32047 32047 AK067_186 40906 NaN NaN Salzburg False \n",
+ "17407 17407 AK020_443 11927 NaN NaN Saalfelden False \n",
"\n",
- " city color comment mountain other photographer \\\n",
- "9839 Bastia, Rue des Terasses False 1903 gel NaN NaN NaN \n",
- "17826 Roma, Via Appia Nuova False v 1905 NaN NaN NaN \n",
- "20639 Weidling True v 1907 NaN NaN NaN \n",
- "28262 Pontebba False NaN NaN NaN NaN \n",
- "33213 New York True v 1905 NaN NaN NaN \n",
- "757 Görz False 1900 gel NaN NaN NaN \n",
+ " comment mountain other photographer \\\n",
+ "21958 NaN NaN NaN NaN \n",
+ "31428 v. 1907 NaN Hohfluh NaN \n",
+ "13828 NaN Dachstein NaN NaN \n",
+ "22725 NaN NaN NaN NaN \n",
+ "32047 1918 gel Mönchsberg NaN NaN \n",
+ "17407 v 1907 NaN NaN NaN \n",
"\n",
- " publisher publisher_place region water_body year inventory_number \\\n",
- "9839 NaN NaN NaN NaN NaN NaN \n",
- "17826 NaN NaN NaN NaN NaN NaN \n",
- "20639 Ledermann Wien NaN NaN NaN NaN \n",
- "28262 NaN NaN NaN NaN 1918.0 NaN \n",
- "33213 NaN NaN NaN NaN NaN NaN \n",
- "757 NaN NaN NaN NaN NaN 79/66 G \n",
+ " publisher publisher_place region \\\n",
+ "21958 Ledermann Wien NaN \n",
+ "31428 NaN NaN NaN \n",
+ "13828 Ledermann Wien NaN \n",
+ "22725 Topf Attnang-Puchheim NaN \n",
+ "32047 Würthle & Sohn Nachfolger G. m. b. H Salzburg NaN \n",
+ "17407 Ledermann Wien NaN \n",
"\n",
- " signature revision_date \\\n",
- "9839 NaN 2014-09-02 07:50:01.517 \n",
- "17826 NaN 2014-08-25 15:48:21.484 \n",
- "20639 NaN 2014-08-04 07:59:10.250 \n",
- "28262 Geogr. Topogr. Bilder-Samml. 1944, 18587 2014-08-04 07:59:10.412 \n",
- "33213 NaN 2014-08-25 15:27:14.452 \n",
- "757 NaN 2014-09-09 10:20:27.631 \n",
+ " water_body year inventory_number \\\n",
+ "21958 NaN 1906.0 NaN \n",
+ "31428 NaN NaN NaN \n",
+ "13828 NaN 1921.0 NaN \n",
+ "22725 NaN 1919.0 NaN \n",
+ "32047 NaN NaN NaN \n",
+ "17407 NaN NaN NaN \n",
+ "\n",
+ " signature revision_date \\\n",
+ "21958 NaN 2014-08-04 07:59:10.288 \n",
+ "31428 Geogr. Topogr. Bilder-Samml. 1951, 1093 2014-09-03 10:50:31.524 \n",
+ "13828 NaN 2014-08-04 07:59:09.895 \n",
+ "22725 NaN 2014-08-04 07:59:10.309 \n",
+ "32047 NaN 2014-08-12 14:04:34.797 \n",
+ "17407 NaN 2014-08-04 07:59:10.121 \n",
"\n",
" date feature_class feature_code geoname_id latitude \\\n",
- "9839 gelaufen 1903 P PPLA2 3034640.0 42.70278 \n",
- "17826 vor 1905 P PPLC 3169070.0 41.89474 \n",
- "20639 vor 1907 P PPL 2761741.0 48.29100 \n",
- "28262 1918 P PPLA3 3170297.0 46.50540 \n",
- "33213 vor 1905 P PPL 5128581.0 40.71427 \n",
- "757 gelaufen 1900 P PPLA2 3175986.0 45.94088 \n",
+ "21958 1906 P PPLA3 2771869.0 47.87358 \n",
+ "31428 vor 1907 P PPLA 2658761.0 47.69732 \n",
+ "13828 1921 T MT 2775701.0 47.47545 \n",
+ "22725 1919 P PPLX 2782285.0 48.01667 \n",
+ "32047 gelaufen 1918 P PPLA 2766824.0 47.79941 \n",
+ "17407 vor 1907 P PPLA3 2766922.0 47.42681 \n",
"\n",
- " longitude name country_id admin_name_1 admin_code_1 \\\n",
- "9839 9.45000 Bastia FR Korsika A5 \n",
- "17826 12.48390 Roma IT NaN NaN \n",
- "20639 16.30865 Weidling AT Niederösterreich 03 \n",
- "28262 13.30622 Pontebba IT NaN NaN \n",
- "33213 -74.00597 New York City US NaN NaN \n",
- "757 13.62167 Gorizia IT NaN NaN \n",
+ " longitude name country_id admin_name_1 \\\n",
+ "21958 16.12510 Piesting AT NaN \n",
+ "31428 8.63493 Schaffhausen CH NaN \n",
+ "13828 13.60588 Dachstein AT NaN \n",
+ "22725 13.71667 Attnang AT NaN \n",
+ "32047 13.04399 Salzburg AT NaN \n",
+ "17407 12.84800 Saalfelden am Steinernen Meer AT NaN \n",
"\n",
- " geo \n",
- "9839 42.70278, 9.45 \n",
- "17826 41.89474, 12.4839 \n",
- "20639 48.291, 16.30865 \n",
- "28262 46.5054, 13.30622 \n",
- "33213 40.71427, -74.00597 \n",
- "757 45.94088, 13.62167 "
+ " admin_code_1 geo \n",
+ "21958 NaN 47.87358, 16.1251 \n",
+ "31428 NaN 47.69732, 8.63493 \n",
+ "13828 NaN 47.47545, 13.60588 \n",
+ "22725 NaN 48.01667, 13.71667 \n",
+ "32047 NaN 47.79941, 13.04399 \n",
+ "17407 NaN 47.42681, 12.848 "
]
},
- "execution_count": 6,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -446,7 +471,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -476,50 +501,50 @@
" \n",
" \n",
" \n",
- " 7017 | \n",
- " AK082_070 | \n",
- " NaN | \n",
+ " 19411 | \n",
+ " AK027_436 | \n",
+ " Millstätter See | \n",
"
\n",
" \n",
- " 17570 | \n",
- " AK021_135 | \n",
+ " 24183 | \n",
+ " AK045_467 | \n",
" NaN | \n",
"
\n",
" \n",
- " 19626 | \n",
- " AK028_223 | \n",
+ " 7038 | \n",
+ " AK082_171 | \n",
" NaN | \n",
"
\n",
" \n",
- " 4202 | \n",
- " AK058_235 | \n",
+ " 21425 | \n",
+ " AK035_008 | \n",
" NaN | \n",
"
\n",
" \n",
- " 17094 | \n",
- " AK019_205 | \n",
+ " 8853 | \n",
+ " AK107_209 | \n",
" NaN | \n",
"
\n",
" \n",
- " 29221 | \n",
- " AK068_474 | \n",
- " NaN | \n",
+ " 27306 | \n",
+ " AK057_094 | \n",
+ " Csorba See | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " akon_id water_body\n",
- "7017 AK082_070 NaN\n",
- "17570 AK021_135 NaN\n",
- "19626 AK028_223 NaN\n",
- "4202 AK058_235 NaN\n",
- "17094 AK019_205 NaN\n",
- "29221 AK068_474 NaN"
+ " akon_id water_body\n",
+ "19411 AK027_436 Millstätter See\n",
+ "24183 AK045_467 NaN\n",
+ "7038 AK082_171 NaN\n",
+ "21425 AK035_008 NaN\n",
+ "8853 AK107_209 NaN\n",
+ "27306 AK057_094 Csorba See"
]
},
- "execution_count": 20,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -550,28 +575,578 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "ONB Labs' own `SACHA` project provides an API for accessing digitized objects via IIIF.\n",
+ "The [SACHA project](https://iiif.onb.ac.at/) provides an API for accessing digitized objects of the National Library via IIIF.\n",
"The online documentation for the API is here: [https://iiif.onb.ac.at/api](https://iiif.onb.ac.at/api).\n",
"\n",
- "We're especially interested in the possibility to serve manifests: [https://iiif.onb.ac.at/api#_manifestrequestprocessor](https://iiif.onb.ac.at/api#_manifestrequestprocessor).\n",
+ "We're especially interested in the endpoint that serves manifests: [https://iiif.onb.ac.at/api#_manifestrequestprocessor](https://iiif.onb.ac.at/api#_manifestrequestprocessor):\n",
"\n",
- "The project name is `AKON`, see also [https://iiif.onb.ac.at/api#_digitization_projects)(https://iiif.onb.ac.at/api#_digitization_projects)."
+ "```\n",
+ "GET /presentation/{projectName}/{id}/manifest\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "subslide"
+ }
+ },
+ "source": [
+ "```\n",
+ "GET /presentation/{projectName}/{id}/manifest\n",
+ "```\n",
+ "\n",
+ "The `projectName` is `AKON` ('AnsichtsKarten ONline'), the `id` is the `akon_id`.\n",
+ "\n",
+ "See also [https://iiif.onb.ac.at/api#_digitization_projects](https://iiif.onb.ac.at/api#_digitization_projects)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "subslide"
+ }
+ },
+ "source": [
+ "### Create Download Links for Manifests"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def akon_id_to_manifest_link(akon_id):\n",
+ "    \"\"\"Return the IIIF presentation manifest URL for the AKON postcard `akon_id`.\"\"\"\n",
+ "    manifest_link = f'https://iiif.onb.ac.at/presentation/AKON/{akon_id}/manifest'\n",
+ "    return manifest_link"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "fragment"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'https://iiif.onb.ac.at/presentation/AKON/AK024_176/manifest'"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "akon_id_to_manifest_link('AK024_176')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "subslide"
+ }
+ },
+ "source": [
+ "Let's test the link"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "scrolled": true,
+ "slideshow": {
+ "slide_type": "fragment"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'@context': 'https://iiif.io/api/presentation/2/context.json',\n",
+ " '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/manifest',\n",
+ " '@type': 'sc:Manifest',\n",
+ " 'label': 'Wien, III',\n",
+ " 'metadata': [{'label': [{'@value': 'Id', '@language': 'en'},\n",
+ " {'@value': 'Id', '@language': 'ger'}],\n",
+ " 'value': 'AK024_176'},\n",
+ " {'label': [{'@value': 'Title', '@language': 'en'},\n",
+ " {'@value': 'Titel', '@language': 'ger'}],\n",
+ " 'value': 'Wien, III'},\n",
+ " {'label': [{'@value': 'Place', '@language': 'en'},\n",
+ " {'@value': 'Ort', '@language': 'ger'}],\n",
+ " 'value': \"Landstraße\"},\n",
+ " {'label': [{'@value': 'Publisher', '@language': 'en'},\n",
+ " {'@value': 'Verlag', '@language': 'ger'}],\n",
+ " 'value': 'Ledermann'},\n",
+ " {'label': [{'@value': 'Place of Publications', '@language': 'en'},\n",
+ " {'@value': 'Erscheinungsort', '@language': 'ger'}],\n",
+ " 'value': 'Wien'},\n",
+ " {'label': [{'@value': 'Year', '@language': 'en'},\n",
+ " {'@value': 'Jahr', '@language': 'ger'}],\n",
+ " 'value': '1906'},\n",
+ " {'label': [{'@value': 'Disseminator', '@language': 'en'},\n",
+ " {'@value': 'Anbieter', '@language': 'ger'}],\n",
+ " 'value': \"Ansichtskarten Online\"},\n",
+ " {'label': [{'@value': 'Physical Location', '@language': 'en'},\n",
+ " {'@value': 'Standort', '@language': 'ger'}],\n",
+ " 'value': 'ÖNB'}],\n",
+ " 'description': 'Russische Kirche',\n",
+ " 'viewingDirection': 'left-to-right',\n",
+ " 'viewingHint': 'paged',\n",
+ " 'license': 'http://creativecommons.org/publicdomain/mark/1.0/',\n",
+ " 'attribution': [{'@value': 'Austrian National Library', '@language': 'en'},\n",
+ " {'@value': 'Österreichische Nationalbibliothek', '@language': 'ger'}],\n",
+ " 'logo': 'https://iiif.onb.ac.at/logo/',\n",
+ " 'seeAlso': [{'@id': 'http://data.onb.ac.at/AKON/AK024_176',\n",
+ " 'format': 'text/html'},\n",
+ " {'@id': 'http://data.onb.ac.at/AKON/AK024_176.rdf',\n",
+ " 'format': 'application/rdf+xml'}],\n",
+ " 'sequences': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n",
+ " '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/sequence/normal',\n",
+ " '@type': 'sc:Sequence',\n",
+ " 'startCanvas': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176',\n",
+ " 'canvases': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n",
+ " '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176',\n",
+ " '@type': 'sc:Canvas',\n",
+ " 'label': 'Wien, III',\n",
+ " 'height': 1681,\n",
+ " 'width': 1082,\n",
+ " 'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n",
+ " '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/annotation/176',\n",
+ " '@type': 'oa:Annotation',\n",
+ " 'motivation': 'sc:painting',\n",
+ " 'resource': {'@id': 'https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/full/0/native.jpg',\n",
+ " '@type': 'dctypes:Image',\n",
+ " 'height': 1681,\n",
+ " 'width': 1082,\n",
+ " 'format': 'image/jpeg',\n",
+ " 'service': {'@context': 'https://iiif.io/api/image/2/context.json',\n",
+ " '@id': 'https://iiif.onb.ac.at/images/AKON/AK024_176/176',\n",
+ " 'profile': 'https://iiif.io/api/image/2/level2.json'}},\n",
+ " 'on': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176'}]}]}]}"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import requests\n",
+ "\n",
+ "r = requests.get(akon_id_to_manifest_link('AK024_176'))\n",
+ "r.json()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "subslide"
+ }
+ },
+ "source": [
+ "The manifest link seems to work. Let's add the manifest links to the dataframe."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "fragment"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "meta['manifest_link'] = meta['akon_id'].apply(akon_id_to_manifest_link)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "fragment"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " akon_id | \n",
+ " manifest_link | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 34406 | \n",
+ " AK072_262 | \n",
+ " https://iiif.onb.ac.at/presentation/AKON/AK072... | \n",
+ "
\n",
+ " \n",
+ " 30373 | \n",
+ " AK085_299 | \n",
+ " https://iiif.onb.ac.at/presentation/AKON/AK085... | \n",
+ "
\n",
+ " \n",
+ " 18098 | \n",
+ " AK023_263 | \n",
+ " https://iiif.onb.ac.at/presentation/AKON/AK023... | \n",
+ "
\n",
+ " \n",
+ " 3152 | \n",
+ " AK122_506 | \n",
+ " https://iiif.onb.ac.at/presentation/AKON/AK122... | \n",
+ "
\n",
+ " \n",
+ " 31981 | \n",
+ " AK097_136 | \n",
+ " https://iiif.onb.ac.at/presentation/AKON/AK097... | \n",
+ "
\n",
+ " \n",
+ " 160 | \n",
+ " AK111_325 | \n",
+ " https://iiif.onb.ac.at/presentation/AKON/AK111... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " akon_id manifest_link\n",
+ "34406 AK072_262 https://iiif.onb.ac.at/presentation/AKON/AK072...\n",
+ "30373 AK085_299 https://iiif.onb.ac.at/presentation/AKON/AK085...\n",
+ "18098 AK023_263 https://iiif.onb.ac.at/presentation/AKON/AK023...\n",
+ "3152 AK122_506 https://iiif.onb.ac.at/presentation/AKON/AK122...\n",
+ "31981 AK097_136 https://iiif.onb.ac.at/presentation/AKON/AK097...\n",
+ "160 AK111_325 https://iiif.onb.ac.at/presentation/AKON/AK111..."
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "meta.sample(6)[['akon_id', 'manifest_link']]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "subslide"
+ }
+ },
+ "source": [
+ "### Collect Image Links"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's take a look at that manifest again:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'@context': 'https://iiif.io/api/presentation/2/context.json',\n",
+ " '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/manifest',\n",
+ " '@type': 'sc:Manifest',\n",
+ " 'label': 'Wien, III',\n",
+ " 'metadata': [{'label': [{'@value': 'Id', '@language': 'en'},\n",
+ " {'@value': 'Id', '@language': 'ger'}],\n",
+ " 'value': 'AK024_176'},\n",
+ " {'label': [{'@value': 'Title', '@language': 'en'},\n",
+ " {'@value': 'Titel', '@language': 'ger'}],\n",
+ " 'value': 'Wien, III'},\n",
+ " {'label': [{'@value': 'Place', '@language': 'en'},\n",
+ " {'@value': 'Ort', '@language': 'ger'}],\n",
+ " 'value': \"Landstraße\"},\n",
+ " {'label': [{'@value': 'Publisher', '@language': 'en'},\n",
+ " {'@value': 'Verlag', '@language': 'ger'}],\n",
+ " 'value': 'Ledermann'},\n",
+ " {'label': [{'@value': 'Place of Publications', '@language': 'en'},\n",
+ " {'@value': 'Erscheinungsort', '@language': 'ger'}],\n",
+ " 'value': 'Wien'},\n",
+ " {'label': [{'@value': 'Year', '@language': 'en'},\n",
+ " {'@value': 'Jahr', '@language': 'ger'}],\n",
+ " 'value': '1906'},\n",
+ " {'label': [{'@value': 'Disseminator', '@language': 'en'},\n",
+ " {'@value': 'Anbieter', '@language': 'ger'}],\n",
+ " 'value': \"Ansichtskarten Online\"},\n",
+ " {'label': [{'@value': 'Physical Location', '@language': 'en'},\n",
+ " {'@value': 'Standort', '@language': 'ger'}],\n",
+ " 'value': 'ÖNB'}],\n",
+ " 'description': 'Russische Kirche',\n",
+ " 'viewingDirection': 'left-to-right',\n",
+ " 'viewingHint': 'paged',\n",
+ " 'license': 'http://creativecommons.org/publicdomain/mark/1.0/',\n",
+ " 'attribution': [{'@value': 'Austrian National Library', '@language': 'en'},\n",
+ " {'@value': 'Österreichische Nationalbibliothek', '@language': 'ger'}],\n",
+ " 'logo': 'https://iiif.onb.ac.at/logo/',\n",
+ " 'seeAlso': [{'@id': 'http://data.onb.ac.at/AKON/AK024_176',\n",
+ " 'format': 'text/html'},\n",
+ " {'@id': 'http://data.onb.ac.at/AKON/AK024_176.rdf',\n",
+ " 'format': 'application/rdf+xml'}],\n",
+ " 'sequences': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n",
+ " '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/sequence/normal',\n",
+ " '@type': 'sc:Sequence',\n",
+ " 'startCanvas': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176',\n",
+ " 'canvases': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n",
+ " '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176',\n",
+ " '@type': 'sc:Canvas',\n",
+ " 'label': 'Wien, III',\n",
+ " 'height': 1681,\n",
+ " 'width': 1082,\n",
+ " 'images': [{'@context': 'https://iiif.io/api/presentation/2/context.json',\n",
+ " '@id': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/annotation/176',\n",
+ " '@type': 'oa:Annotation',\n",
+ " 'motivation': 'sc:painting',\n",
+ " 'resource': {'@id': 'https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/full/0/native.jpg',\n",
+ " '@type': 'dctypes:Image',\n",
+ " 'height': 1681,\n",
+ " 'width': 1082,\n",
+ " 'format': 'image/jpeg',\n",
+ " 'service': {'@context': 'https://iiif.io/api/image/2/context.json',\n",
+ " '@id': 'https://iiif.onb.ac.at/images/AKON/AK024_176/176',\n",
+ " 'profile': 'https://iiif.io/api/image/2/level2.json'}},\n",
+ " 'on': 'https://iiif.onb.ac.at/presentation/AKON/AK024_176/canvas/176'}]}]}]}"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "r = requests.get(akon_id_to_manifest_link('AK024_176'))\n",
+ "r.json()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "subslide"
+ }
+ },
+ "source": [
+ "We need to collect all `@id`s from all `resource`s from all `images` from all `canvases`.\n",
+ "\n",
+ "That's tedious by hand. We'll use `jsonpath-ng`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
- "PROJ = 'AKON'"
+ "from jsonpath_ng import jsonpath, parse\n",
+ "\n",
+ "image_id_jp = parse('$.sequences[*].canvases[*].images[*].resource.@id')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "fragment"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/full/0/native.jpg']"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "[match.value for match in image_id_jp.find(r.json())]"
]
},
{
"cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "subslide"
+ }
+ },
+ "source": [
+ "All of this in one function:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
"metadata": {},
+ "outputs": [],
"source": [
- "**TODO**"
+ "image_id_jp = parse('$.sequences[*].canvases[*].images[*].resource.@id')\n",
+ "\n",
+ "def image_links_for_manifest_link(manifest_link, timeout=30):\n",
+ "    \"\"\"Return the image links listed in the IIIF manifest at `manifest_link`.\n",
+ "\n",
+ "    `timeout` is the number of seconds to wait for the server.\n",
+ "    An empty list is returned when the manifest cannot be fetched or is not\n",
+ "    valid JSON, which makes batch processing in pandas easier.\n",
+ "    \"\"\"\n",
+ "    try:\n",
+ "        response = requests.get(manifest_link, timeout=timeout)\n",
+ "        manifest = response.json()\n",
+ "    except (requests.RequestException, ValueError):\n",
+ "        # Covers network failures, timeouts and non-JSON bodies. Unlike a bare\n",
+ "        # `except:`, Ctrl-C (KeyboardInterrupt) can still stop a long batch run.\n",
+ "        manifest = {}\n",
+ "    return [match.value for match in image_id_jp.find(manifest)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "fragment"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/full/0/native.jpg']"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "image_links_for_manifest_link(akon_id_to_manifest_link('AK024_176'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "fragment"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['https://iiif.onb.ac.at/images/AKON/AK111_325/325/full/full/0/native.jpg']"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "image_links_for_manifest_link(akon_id_to_manifest_link('AK111_325'))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Looking good."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "subslide"
+ }
+ },
+ "source": [
+ "Now let's add the image links to the dataframe...\n",
+ "\n",
+ "...actually, let's not do that now, because it takes a while (upwards of 10 minutes). Let's cheat instead, skip this step and load the resulting dataframe directly."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "fragment"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 7min 26s, sys: 18 s, total: 7min 44s\n",
+ "Wall time: 12min 34s\n"
+ ]
+ }
+ ],
+ "source": [
+ "# %%time\n",
+ "# meta['image_links'] = meta['manifest_link'].apply(image_links_for_manifest_link)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/kst/tmp/dingsdi/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3049: DtypeWarning: Columns (14) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " interactivity=interactivity, compiler=compiler, result=result)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# low_memory=False parses the file in one pass so pandas infers a single dtype per\n",
+ "# column; without it this load emits a DtypeWarning about mixed types.\n",
+ "# NOTE(review): list-valued columns such as image_links presumably come back from CSV\n",
+ "# as plain strings -- confirm before relying on them being Python lists.\n",
+ "meta = pd.read_csv('postcards_with_image_links.csv.bz2', compression='bz2', low_memory=False)"
]
},
{
@@ -616,6 +1191,63 @@
"metadata": {},
"outputs": [],
"source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "# Just The Code"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "\n",
+ "def akon_id_to_manifest_link(akon_id):\n",
+ "    \"\"\"Return the IIIF presentation manifest URL for the AKON postcard `akon_id`.\"\"\"\n",
+ "    manifest_link = f'https://iiif.onb.ac.at/presentation/AKON/{akon_id}/manifest'\n",
+ "    return manifest_link\n",
+ "\n",
+ "\n",
+ "# Fetch the bz2-compressed postcard metadata straight from the ONB Labs GitLab.\n",
+ "meta = pd.read_csv(\n",
+ "    'https://labs.onb.ac.at/gitlab/labs-team/'\n",
+ "    'raw-metadata/raw/master/akon_postcards_public_domain.csv.bz2',\n",
+ "    compression='bz2',\n",
+ ")\n",
+ "# One manifest link per postcard, derived from its akon_id.\n",
+ "meta['manifest_link'] = meta['akon_id'].apply(akon_id_to_manifest_link)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "subslide"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "from jsonpath_ng import jsonpath, parse\n",
+ "\n",
+ "\n",
+ "image_id_jp = parse('$.sequences[*].canvases[*].images[*].resource.@id')\n",
+ "\n",
+ "def image_links_for_manifest_link(manifest_link, timeout=30):\n",
+ "    \"\"\"Return the image links listed in the IIIF manifest at `manifest_link`.\n",
+ "\n",
+ "    `timeout` is the number of seconds to wait for the server.\n",
+ "    An empty list is returned when the manifest cannot be fetched or is not\n",
+ "    valid JSON, which makes batch processing in pandas easier.\n",
+ "    \"\"\"\n",
+ "    try:\n",
+ "        response = requests.get(manifest_link, timeout=timeout)\n",
+ "        manifest = response.json()\n",
+ "    except (requests.RequestException, ValueError):\n",
+ "        # Covers network failures, timeouts and non-JSON bodies. Unlike a bare\n",
+ "        # `except:`, Ctrl-C (KeyboardInterrupt) can still stop a long batch run.\n",
+ "        manifest = {}\n",
+ "    return [match.value for match in image_id_jp.find(manifest)]\n",
+ "\n",
+ "\n",
+ "meta['image_links'] = meta['manifest_link'].apply(image_links_for_manifest_link)"
+ ]
}
],
"metadata": {
diff --git a/postcards_with_image_links.csv.bz2 b/postcards_with_image_links.csv.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..849c645900f67c6e8ee7ba99d490100532dd9841
Binary files /dev/null and b/postcards_with_image_links.csv.bz2 differ