Commit f679092a authored by Andreas

new stuff

parent fbd15c05
This source diff could not be displayed because it is too large. You can view the blob instead.
import sys
import time
import requests
from requests import HTTPError
import datetime
......@@ -40,7 +41,9 @@ class WebarchivSession:
self.api_key = api_key
self.token = None
self.open_fulltext_queries = {}
self.open_wayback_queries = {}
self.finished_fulltext_queries = {}
self.finished_wayback_queries = {}
def connect(self):
"""
......@@ -130,7 +133,43 @@ class WebarchivSession:
print('Query for "{}" not added'.format(query_string))
def query_wayback_search(self, query_string, from_=None, to_=None):
    """
    Start a wayback search query in the Webarchive.

    The request is sent to the '/search/wayback' operation and the
    response object is handed back to the caller unchanged.

    :param query_string: String to search for
    :param from_: Optional earliest date bound for the search
                  in the format YYYYMM.
    :param to_: Optional latest date bound for the search
                in the format YYYYMM.
    :return: The response returned by self._get(), or None if the
             request failed with an HTTPError.
    """
    params = {'q': query_string}
    # Only send the date bounds that were actually supplied.
    if from_:
        params['from'] = from_
    if to_:
        params['to'] = to_

    try:
        print('search for ', params)
        return self._get(op='/search/wayback', params=params)
    except HTTPError as e:
        self._display_http_error(e)
        # Name the failed query; the previous message had no placeholder,
        # so the query string was silently dropped.
        print('Error: query for "{}" not added'.format(query_string))
        return None
def status_query(self, resp):
    """
    Ask the Webarchive for the current status of a previously started query.

    The request id and the query type are read from the JSON body of
    ``resp``; the type selects which status operation is called.

    :param resp: Response object whose JSON body contains the keys
                 'requestid' and 'type'.
    :return: The response returned by self._get() for the status request.
    :raises ValueError: If 'type' is neither 1 (fulltext) nor 2 (wayback).
    """
    # Parse the body once instead of once per field.
    payload = resp.json()
    requestid = payload['requestid']
    query_type = payload['type']
    print('call status for "{}"'.format(requestid))

    status_ops = {
        1: '/search/status/fulltext',
        2: '/search/status/wayback',
    }
    if query_type not in status_ops:
        # Previously this fell through and failed with UnboundLocalError.
        raise ValueError('Unknown query type: {!r}'.format(query_type))
    return self._get(op=status_ops[query_type], params={'requestid': requestid})
def status_open_queries(self):
"""
......@@ -149,7 +188,25 @@ class WebarchivSession:
elif r.status_code == 202:
print('Query for "{}" is still running'.format(query_string))
for (timestamp, query_string), old_response in list(self.open_wayback_queries.items()):
requestid = old_response.json()['requestid']
r = self._get(op='/search/status/wayback', params={'requestid': requestid})
if r.status_code == 200:
self.finished_wayback_queries[(timestamp, query_string)] = r.json()
print('Query for "{}" done'.format(query_string))
del(self.open_wayback_queries[(timestamp, query_string)])
elif r.status_code == 202:
print('Query for "{}" is still running'.format(query_string))
if __name__ == '__main__':
    # Demo: start a wayback search and poll once per second until it is done.
    # NOTE(review): the API key below is hard-coded in the source; consider
    # loading it from the environment or a config file instead.
    # noinspection SpellCheckingInspection
    w = WebarchivSession('Zz2tQls7fuaocX2pjrfc2npojqbGwXL2')
    response = w.query_wayback_search("http://www.onb.ac.at")
    if response is None:
        # The search call may come back without a response when the request
        # failed; stop here instead of crashing on None.status_code.
        sys.exit(1)
    print(response.status_code)
    while response.status_code != 200:
        print(response.status_code)
        time.sleep(1)
        response = w.status_query(response)
    print(response.json()['total'])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment