Commit 4dd667c5 authored by Andreas's avatar Andreas

new stuff

parent b9ce798b
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"import necessary modules"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import json\n",
"import time"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"load metadata files from Webarchive Austria"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def load_json(url, timeout=30):\n",
"    \"\"\"Download `url` and return its decoded JSON body.\n",
"\n",
"    `timeout` (seconds) is handed to requests.get so that an unresponsive\n",
"    server cannot block the notebook forever. raise_for_status() turns an\n",
"    HTTP error status into an exception before the body is parsed.\n",
"    \"\"\"\n",
"    reply = requests.get(url, timeout=timeout)\n",
"    reply.raise_for_status()\n",
"    return reply.json()\n",
"\n",
"\n",
"# Metadata describing the selective crawls and the event crawls.\n",
"selective_crawls = load_json('https://webarchiv.onb.ac.at/data/selective.json')\n",
"event_crawls = load_json('https://webarchiv.onb.ac.at/data/events.json')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now you can extract metadata from these files"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Show the names of all crawls"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Selective Crawls\n\nFrau/Gender\nPolitikkollektion\nMedienkollektion\n\nEvent Crawls\n\n100 Jahre Republik 2018\nEU-Ratsvorsitz 2018\nBundespräsidenten-Wahl 2016\nFlüchtlingskrise 2015\nSong Contest 2015\nErster Weltkrieg 2014\nEU-Wahl 2014\nOlympia 2014\nOlympia 2010\nBundespräsidenten-Wahl 2010\nEU-Wahl 2009\n"
]
}
],
"source": [
"# Print one headed list of crawl names per crawl type.\n",
"crawl_lists = (\n",
"    ('Selective Crawls\\n', selective_crawls),\n",
"    ('\\nEvent Crawls\\n', event_crawls),\n",
")\n",
"for heading, crawls in crawl_lists:\n",
"    print(heading)\n",
"    for crawl in crawls:\n",
"        print(crawl['name'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Show all seeds of Song Contest 2015 event"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"http://conchitawurst.com\nhttp://conchitawurst.com/news/\nhttp://de.wikipedia.org/wiki/Eurovision_Song_Contest_2015\nhttp://de.wikipedia.org/wiki/The_Makemakes\nhttp://debatte.orf.at/stories/1754300/\nhttp://derstandard.at/r2000004087668/Song-Contest-2015\nhttp://derstandard.at/r2000015639396/Eurovisions---Song-Contest-Blog\nhttp://diepresse.com/home/kultur/songcontest/index.do\nhttp://goodnight.at/magazin/kultur/516-60-eurovision-songcontest-in-wien\nhttp://kundendienst.orf.at/programm/fernsehen/orf1/songcontest2015.html\nhttp://kurier.at/kultur/song-contest\nhttp://medienportal.univie.ac.at/uniview/forschung/detailansicht/artikel/song-contest-zwischen-pop-und-politik/\nhttp://oe3.orf.at/stories/2679710\nhttp://orf.at/\nhttp://orf.at/stories/2254095/\nhttp://orf.at/stories/2254095/2254088/\nhttp://orf.at/stories/2254095/2254098/\nhttp://songcontest.orf.at/\nhttp://steiermark.orf.at/news/stories/2684488/\nhttp://tvthek.orf.at/topic/Eurovision-Song-Contest-2015/9137507\nhttp://tvthek.orf.at/topic/Oesterreich-beim-Song-Contest/9174892\nhttp://wien.orf.at/news/stories/2648040/\nhttp://wien.orf.at/news/stories/2648857/\nhttp://wien.orf.at/news/stories/2649515/\nhttp://wien.orf.at/news/stories/2661128/\nhttp://wien.orf.at/news/stories/2661370/\nhttp://wien.orf.at/news/stories/2661798/\nhttp://wien.orf.at/news/stories/2661804/\nhttp://wien.orf.at/news/stories/2661845/\nhttp://wien.orf.at/news/stories/2661855/\nhttp://wien.orf.at/news/stories/2661910/\nhttp://wien.orf.at/news/stories/2662093/\nhttp://wien.orf.at/news/stories/2662947/\nhttp://wien.orf.at/news/stories/2663106/\nhttp://wien.orf.at/news/stories/2664083/\nhttp://wien.orf.at/news/stories/2666541/\nhttp://wien.orf.at/news/stories/2666581/\nhttp://wien.orf.at/news/stories/2668462/\nhttp://wien.orf.at/news/stories/2670490/\nhttp://wien.orf.at/news/stories/2676170/\nhttp://wien.orf.at/news/stories/2678309/\nhttp://wien.orf.at/news/stories/2679672/\nhttp://wien.orf.at/news/stories/2680454/\nhttp://wien.orf.at/news/s
tories/2682245/\nhttp://wien.orf.at/news/stories/2683703/\nhttp://wien.orf.at/news/stories/2684423/\nhttp://wien.orf.at/news/stories/2687164/\nhttp://wien.orf.at/news/stories/2706977/\nhttp://wien.orf.at/news/tags/esc\nhttp://www.apa.at/Site/ESC2015/index.html\nhttp://www.austria-trend.at/blog/\nhttp://www.bmlfuw.gv.at/umwelt/nachhaltigkeit/green-events/projekte/kultur_und_musik/green_song_contest.html\nhttp://www.eurovision.tv/\nhttp://www.heute.at/song+contest./\nhttp://www.krone.at/song-contest\nhttp://www.news.at/a/song-contest-telegramm-2015\nhttp://www.oe24.at/kultur/song-contest\nhttp://www.oeticket.com/de/kuenstler/eurovision-song-contest-1020/profile.html\nhttp://www.picturedesk.com/ESC2015\nhttp://www.railtours.at/sonderangebote/oesterreich/wien/eurovision-song-contest-2015\nhttp://www.railtours.at/sonderangebote/oesterreich/wien/eurovision-song-contest-2015-family-show-finale.html\nhttp://www.railtours.at/sonderangebote/oesterreich/wien/eurovision-song-contest-2015-jury-show-finale.html\nhttp://www.railtours.at/sonderangebote/oesterreich/wien/eurovision-song-contest-2015-public-viewing-in-wien.html\nhttp://www.railtours.at/sonderangebote/oesterreich/wien/eurovision-song-contest-2015-semifinale-1-2.html\nhttp://www.songcontest.at/\nhttp://www.stadthalle.com/de/schauen/events/75/Eurovision-Song-Contest\nhttp://www.vienna.at/specials/eurovision-song-contest\nhttp://www.wien-event.at/events/song-contest/\nhttp://www.wien-ticket.at/de/service/Eurovision-Song-Contest-Wiener-Stadthalle-Wien\nhttp://www.wien.info/de/musik-buehne/eurovision-song-contest\nhttp://www.wienerlinien.at/eportal2/ep/tab.do/pageTypeId/74690\nhttp://www.wienerzeitung.at/dossiers/eurovision_song_contest_2015/\nhttps://instagram.com/esc_buildingbridges/\nhttps://instagram.com/themakemakes/\nhttps://twitter.com/ESC2015ORF\nhttps://twitter.com/Eurovision\nhttps://twitter.com/hashtag/eurovision?src=hash\nhttps://www.facebook.com/ESCBuildingBridges\nhttps://www.facebook.com/EurovisionSongContest
\nhttps://www.facebook.com/themakemakes\nhttps://www.stadthalle.com/de/schauen/events/75/Eurovision-Song-Contest\nhttps://www.wien.gv.at/kultur/esc/\n"
]
}
],
"source": [
"# Select the event crawl(s) named 'Song Contest 2015', then print every\n",
"# seed of every group of the selected crawl(s).\n",
"song_contest_crawls = [c for c in event_crawls if c['name'] == 'Song Contest 2015']\n",
"for crawl in song_contest_crawls:\n",
"    for group in crawl['groups']:\n",
"        for seed in group['seeds']:\n",
"            print(seed)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a WebarchivSession object, which provides convenience methods for accessing the API with your API key"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"from getpass import getpass\n",
"\n",
"from webarchiv import WebarchivSession\n",
"\n",
"# Keep the API key out of the notebook: read it from the environment\n",
"# variable WEBARCHIV_API_KEY, or ask for it interactively when that\n",
"# variable is unset or empty.\n",
"apikey = os.environ.get('WEBARCHIV_API_KEY') or getpass('Webarchiv API key: ')\n",
"w = WebarchivSession(apikey)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Submit a URL search to get all archived capture dates of the requested URL"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Address whose archived captures are looked up.\n",
"url = 'http://www.onb.ac.at'\n",
"\n",
"# Keep the full response object; it is inspected step by step below.\n",
"response = w.wayback_search(url)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The search always returns the full response. Checking for status_code 200 before extracting the response is always a good idea"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Raise instead of calling exit(1): an exception fails this cell with a\n",
"# traceback and stops a Run All, so later cells never read a failed response.\n",
"if response.status_code != 200:\n",
"    raise RuntimeError(\"Something went wrong ... (HTTP status {})\".format(response.status_code))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now it is safe to extract the response\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the total number of captures of the requested URL"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2426\n"
]
}
],
"source": [
"# The decoded response carries the number of captures under 'total'.\n",
"total_captures = response.json()['total']\n",
"print(total_captures)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the archive URL of the oldest capture date of the requested URL"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"https://webarchiv.onb.ac.at/web/20090916183601/http://www.onb.ac.at\n"
]
}
],
"source": [
"import datetime\n",
"\n",
"# The first hit is used as the oldest capture (presumably the hits are\n",
"# sorted oldest first - TODO confirm). Its 'c' field is passed to\n",
"# fromtimestamp(), i.e. it is treated as a Unix timestamp.\n",
"# NOTE(review): fromtimestamp() converts to local time - confirm whether\n",
"# the archive URL expects UTC instead.\n",
"first_hit = response.json()['hits'][0]\n",
"capturedate = datetime.datetime.fromtimestamp(first_hit['c']).strftime('%Y%m%d%H%M%S')\n",
"\n",
"# The archive URL is built as <base>/<YYYYmmddHHMMSS>/<requested URL>.\n",
"captureurl = 'https://webarchiv.onb.ac.at/web/' + capturedate + '/' + url\n",
"\n",
"print(captureurl)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"from getpass import getpass\n",
"\n",
"from webarchiv import WebarchivSession\n",
"\n",
"# Keep the API key out of the notebook: read it from the environment\n",
"# variable WEBARCHIV_API_KEY, or ask for it interactively when that\n",
"# variable is unset or empty.\n",
"apikey = os.environ.get('WEBARCHIV_API_KEY') or getpass('Webarchiv API key: ')\n",
"w = WebarchivSession(apikey)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Submit a fulltext search to get a list of tophits per domain ordered by the largest number of hits"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Search terms for the fulltext query.\n",
"query = \"Nationalbibliothek Prunksaal Schwarzenegger\"\n",
"response = w.fulltext_search(query)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The search always returns the full response. Checking for status_code 200 before extracting the response is always a good idea"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Raise instead of calling exit(1): an exception fails this cell with a\n",
"# traceback and stops a Run All, so later cells never read a failed response.\n",
"if response.status_code != 200:\n",
"    raise RuntimeError(\"Something went wrong ... (HTTP status {})\".format(response.status_code))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now it is safe to extract the response"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the top hit of the domain with the largest number of hits that contains the words Nationalbibliothek, Prunksaal and Schwarzenegger"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"http://www.onb.ac.at/services/21696.htm\n"
]
}
],
"source": [
"# Take the first entry of 'hits' and show its 'value' field.\n",
"top_hit = response.json()['hits'][0]\n",
"print(top_hit['value'])\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import requests\n",
"import json\n",
"import time\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a WebarchivSession object, which provides convenience methods for accessing the API with your API key"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from getpass import getpass\n",
"\n",
"from webarchiv import WebarchivSession\n",
"\n",
"# Keep the API key out of the notebook: read it from the environment\n",
"# variable WEBARCHIV_API_KEY, or ask for it interactively when that\n",
"# variable is unset or empty.\n",
"apikey = os.environ.get('WEBARCHIV_API_KEY') or getpass('Webarchiv API key: ')\n",
"w = WebarchivSession(apikey)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"load metadata files from Webarchive Austria"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def load_json(url, timeout=30):\n",
"    \"\"\"Download `url` and return its decoded JSON body.\n",
"\n",
"    `timeout` (seconds) is handed to requests.get so that an unresponsive\n",
"    server cannot block the notebook forever. raise_for_status() turns an\n",
"    HTTP error status into an exception before the body is parsed.\n",
"    \"\"\"\n",
"    reply = requests.get(url, timeout=timeout)\n",
"    reply.raise_for_status()\n",
"    return reply.json()\n",
"\n",
"\n",
"# Metadata describing the selective crawls and the event crawls.\n",
"selective_crawls = load_json('https://webarchiv.onb.ac.at/data/selective.json')\n",
"event_crawls = load_json('https://webarchiv.onb.ac.at/data/events.json')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the number of captures of all seeds of the Event Crawl Song Contest 2015"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"74 Captures for http://conchitawurst.com\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"8 Captures for http://conchitawurst.com/news/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"6 Captures for http://de.wikipedia.org/wiki/Eurovision_Song_Contest_2015\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"5 Captures for http://de.wikipedia.org/wiki/The_Makemakes\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"32 Captures for http://debatte.orf.at/stories/1754300/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"59 Captures for http://derstandard.at/r2000004087668/Song-Contest-2015\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"32 Captures for http://derstandard.at/r2000015639396/Eurovisions---Song-Contest-Blog\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"238 Captures for http://diepresse.com/home/kultur/songcontest/index.do\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"5 Captures for http://goodnight.at/magazin/kultur/516-60-eurovision-songcontest-in-wien\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"32 Captures for http://kundendienst.orf.at/programm/fernsehen/orf1/songcontest2015.html\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"150 Captures for http://kurier.at/kultur/song-contest\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"11 Captures for http://medienportal.univie.ac.at/uniview/forschung/detailansicht/artikel/song-contest-zwischen-pop-und-politik/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"32 Captures for http://oe3.orf.at/stories/2679710\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"9976 Captures for http://orf.at/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"32 Captures for http://orf.at/stories/2254095/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"33 Captures for http://orf.at/stories/2254095/2254088/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"33 Captures for http://orf.at/stories/2254095/2254098/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1383 Captures for http://songcontest.orf.at/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"35 Captures for http://steiermark.orf.at/news/stories/2684488/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"32 Captures for http://tvthek.orf.at/topic/Eurovision-Song-Contest-2015/9137507\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"32 Captures for http://tvthek.orf.at/topic/Oesterreich-beim-Song-Contest/9174892\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"35 Captures for http://wien.orf.at/news/stories/2648040/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"38 Captures for http://wien.orf.at/news/stories/2648857/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"42 Captures for http://wien.orf.at/news/stories/2649515/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"36 Captures for http://wien.orf.at/news/stories/2661128/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"43 Captures for http://wien.orf.at/news/stories/2661370/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"39 Captures for http://wien.orf.at/news/stories/2661798/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"43 Captures for http://wien.orf.at/news/stories/2661804/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"40 Captures for http://wien.orf.at/news/stories/2661845/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"38 Captures for http://wien.orf.at/news/stories/2661855/\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [