Newer
Older
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import json
import sys
import time

import requests
from requests import HTTPError
# strftime/strptime pattern for 14-digit timestamps: YYYYMMDDhhmmss.
# NOTE(review): not referenced anywhere in the visible code - confirm it is still needed.
_datetime_format_string = '%Y%m%d%H%M%S'
class SessionTimeoutError(Exception):
    """
    Exception type named for a timed-out API session.

    NOTE(review): nothing in the visible code raises or catches it -
    confirm the intended use.
    """
class WebarchivSession:
    """
    Session for the Webarchive REST API served under ``base_url``.

    Stores an API key and the token obtained with it, attaches both to
    every GET request and offers fulltext and wayback searches that poll
    the matching status endpoint until the result is available.
    """

    @property
    def version(self):
        """
        Current protocol version
        """
        return '0.1.0'

    @property
    def base_url(self):
        """
        Protocol, domain and path prefix for the Webarchive API,
        with a single positional format string placeholder
        for the REST operation and parameters.
        """
        return 'https://webarchiv.onb.ac.at/api/{}'

    @property
    def _error_template(self):
        """
        A format string for displaying HTTP Errors.
        Must contain one placeholder 'status_code' for the HTTP status code.
        Must contain one placeholder 'response_text' for the body of the response.
        """
        return 'HTTP ERROR - status code {status_code}\n----\n{response_text}\n----\n\n'

    def __init__(self, api_key):
        """
        :param api_key: API key sent with the authentication request
                        and with every GET request.
        """
        self.api_key = api_key
        # Set by connect(); stays None until authentication succeeded.
        self.token = None

    def connect(self):
        """
        Connect to the Webarchive API, request and save a token.

        An HTTPError raised during authentication is printed to stderr
        and not re-raised; the stored token is left unchanged in that case.
        """
        try:
            self.token = self._authenticate()
        except HTTPError as e:
            self._display_http_error(e)

    def _authentication_body(self):
        """
        Build the JSON request body for the authentication call.

        Serialised with json.dumps so that quotes or backslashes in the
        API key are escaped instead of producing invalid JSON.

        :return: JSON document as a string.
        """
        return json.dumps({
            'apikey': self.api_key,
            'fingerprint': 'string',
            'version': self.version,
        })

    def _authenticate(self):
        """
        POST the API key to the 'authentication' operation.

        :return: The value stored under 't' in the JSON response.
        :raises HTTPError: If the response status code is not 201.
        """
        r = requests.post(
            self.base_url.format('authentication'),
            data=self._authentication_body(),
            headers={
                'content-type': 'application/json',
                'accept': 'application/ld+json',
            },
        )
        if r.status_code == 201:
            return r.json()['t']
        raise HTTPError(response=r)

    def _add_api_key_and_token(self, params_dict: dict):
        """
        Add the saved api key and token to a given dictionary.

        :param params_dict: A dictionary that's probably used
                            as a 'params' keyword parameter for calling requests.get().
        :return: The same dictionary extended by 'apikey' and 't' keys.
        """
        params_dict['apikey'] = self.api_key
        params_dict['t'] = self.token
        return params_dict

    def _display_http_error(self, e: 'HTTPError'):
        """
        Print status code and body of the response attached to an
        HTTPError to stderr, formatted with ``_error_template``.

        :param e: The error to display; its ``response`` attribute is read.
        """
        print(self._error_template.format(status_code=e.response.status_code,
                                          response_text=e.response.text),
              file=sys.stderr)

    def _get(self, op, auto_connect=True, **kwargs):
        """
        Send a GET request for an API operation with api key and token added.

        On a 403 response the session reconnects once (if ``auto_connect``
        is true) and repeats the request. For 400, 410 and a 403 that is not
        retried, a message is printed to stderr.

        :param op: Operation path that is inserted verbatim into ``base_url``.
        :param auto_connect: Whether a 403 response triggers one
                             reconnect-and-retry.
        :param kwargs: Passed on to requests.get(); 'params' must be a dict
                       (or be omitted).
        :return: The response of the last request made, whatever its status.
        """
        # Copy, so the api key and token never leak into the caller's dict.
        params = dict(kwargs.pop('params', None) or {})
        kwargs['params'] = self._add_api_key_and_token(params)
        # NOTE(review): callers pass ops with a leading slash, which yields
        # '.../api//search/...' - confirm the server expects this form.
        r = requests.get(self.base_url.format(op), **kwargs)
        if r.ok:
            return r
        if r.status_code == 403:
            if auto_connect:
                self.connect()
                # The retry picks up the fresh token; auto_connect=False
                # prevents an endless reconnect loop.
                return self._get(op=op, auto_connect=False, **kwargs)
            print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr)
        elif r.status_code == 400:
            print('Bad request', file=sys.stderr)
        elif r.status_code == 410:
            print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr)
        # Always hand the response back so callers can inspect status_code.
        return r

    def _search(self, op, query_string, from_=None, to_=None):
        """
        Shared implementation of fulltext_search() and wayback_search().

        :param op: Search operation path passed to _get().
        :param query_string: String to search for
        :param from_: Optional earliest date bound, sent as 'from'.
        :param to_: Optional latest date bound, sent as 'to'.
        :return: The response returned by waitForResponse(), or None when
                 an HTTPError was caught and reported.
        """
        params = {'q': query_string}
        if from_:
            params['from'] = from_
        if to_:
            params['to'] = to_
        try:
            response = self._get(op=op, params=params)
            return self.waitForResponse(response)
        except HTTPError as e:
            self._display_http_error(e)
            print('Query for "{}" not added'.format(query_string))
            return None

    # NOTE(review): the 'def' lines of fulltext_search and wayback_search were
    # missing from the reviewed source; the signatures are rebuilt from the
    # docstrings and the calls in the __main__ section - confirm.
    def fulltext_search(self, query_string, from_=None, to_=None):
        """
        Start a fulltext search query in the Webarchive and wait for its result.

        :param query_string: String to search for
        :param from_: Optional earliest date bound for the search
                      in the format YYYYMM.
        :param to_: Optional latest date bound for the search
                    in the format YYYYMM.
        :return: The final response of the query, or None if an HTTPError
                 was caught and reported.
        """
        return self._search('/search/fulltext', query_string, from_, to_)

    def wayback_search(self, query_string, from_=None, to_=None):
        """
        Start a wayback search query in the Webarchive and wait for its result.

        :param query_string: String to search for
        :param from_: Optional earliest date bound for the search
                      in the format YYYYMM.
        :param to_: Optional latest date bound for the search
                    in the format YYYYMM.
        :return: The final response of the query, or None if an HTTPError
                 was caught and reported.
        """
        return self._search('/search/wayback', query_string, from_, to_)

    def waitForResponse(self, response, poll_interval=0.2):
        """
        Poll the status endpoint until a query has finished.

        :param response: Response of the search request or of a previous
                         status request.
        :param poll_interval: Seconds to sleep before each status request.
        :return: The first response with status code 200, or the first
                 response with an error status (400 and above), which
                 cannot be polled any further.
        """
        while response.status_code != 200:
            if response.status_code >= 400:
                # An error body is not a pollable query description.
                return response
            time.sleep(poll_interval)
            response = self.status_query(response)
        return response

    def status_query(self, resp):
        """
        Request the current status of the query described by a response.

        :param resp: Response whose JSON body holds 'requestid' and 'type';
                     type 1 selects the fulltext status operation, type 2
                     the wayback one.
        :return: The response of the status request.
        :raises ValueError: If 'type' is neither 1 nor 2.
        """
        body = resp.json()
        request_id = body['requestid']
        query_type = body['type']
        if query_type == 1:
            op = '/search/status/fulltext'
        elif query_type == 2:
            op = '/search/status/wayback'
        else:
            raise ValueError('Unknown query type: {!r}'.format(query_type))
        return self._get(op=op, params={'requestid': request_id})
if __name__ == '__main__':
    # Example run: one fulltext search, then print the number of captures.
    # NOTE(review): the API key is embedded in the source - consider
    # loading it from configuration instead.
    # noinspection SpellCheckingInspection
    w = WebarchivSession('Zz2tQls7fuaocX2pjrfc2npojqbGwXL2')
    # Alternative example queries:
    # response = w.wayback_search("http://www.onb.ac.at")
    # response = w.wayback_search("http://frauenhetz.jetzt")
    # response = w.wayback_search("x")
    response = w.fulltext_search("Nationalbibliothek Prunksaal Schwarzenegger")
    if response is None:
        # The search methods return None after reporting an HTTPError.
        print("Error: no response received")
    elif response.status_code == 200:
        print(response.json()['total'], " Captures")
    else:
        print("Error ", response.status_code)