Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import datetime
import json
import sys

import requests
from requests import HTTPError
# strftime pattern (YYYYMMDDHHMMSS) used to timestamp a fulltext query at the
# moment it is registered; the timestamp is part of the query's dictionary key.
_datetime_format_string = '%Y%m%d%H%M%S'
class SessionTimeoutError(Exception):
    """
    Error type for a timed-out API session.

    NOTE(review): presumably meant for an expired token; it is not raised
    anywhere in the code visible here - confirm against callers.
    """
class WebarchivSession:
    """
    Client session for the Webarchive REST API.

    Holds the API key and the token obtained via connect(), and keeps track
    of fulltext queries in two dictionaries keyed by
    (timestamp, query_string):

    * open_fulltext_queries: the response returned when the query was started
    * finished_fulltext_queries: the decoded JSON result of a finished query
    """

    @property
    def version(self):
        """
        Current protocol version
        """
        return '0.1.0'

    @property
    def base_url(self):
        """
        Protocol, domain and path prefix for the Webarchive API,
        with a single positional format string placeholder
        for the REST operation and parameters.
        """
        return 'https://webarchiv.onb.ac.at/api/{}'

    @property
    def _error_template(self):
        """
        A format string for displaying HTTP Errors.
        Must contain one placeholder 'status_code' for the HTTP status code.
        Must contain one placeholder 'response_text' for the body of the response.
        """
        return 'HTTP ERROR - status code {status_code}\n----\n{response_text}\n----\n\n'

    def __init__(self, api_key):
        """
        Create a session object. No request is sent until connect()
        or one of the query methods is called.

        :param api_key: API key sent with every request.
        """
        self.api_key = api_key
        self.token = None
        self.open_fulltext_queries = {}
        self.finished_fulltext_queries = {}

    def _url(self, op):
        """
        Build the full URL for a REST operation.

        :param op: Operation path, with or without leading slashes.
        :return: base_url with the operation filled in.
        """
        # base_url already ends in '/', so drop leading slashes from op to
        # avoid '.../api//search/...'.
        return self.base_url.format(op.lstrip('/'))

    def _auth_payload(self):
        """
        Serialize the JSON body of the authentication request.

        :return: JSON string with 'apikey', 'fingerprint' and 'version'.
        """
        # json.dumps escapes quotes and backslashes in the API key, which
        # plain string formatting would not.
        return json.dumps({
            'apikey': self.api_key,
            'fingerprint': 'string',
            'version': self.version,
        })

    def connect(self):
        """
        Connect to the Webarchive API, request and save a token.

        An HTTPError during authentication is printed to stderr and not
        re-raised; the saved token is left unchanged in that case.
        """
        try:
            self.token = self._authenticate()
        except HTTPError as e:
            self._display_http_error(e)

    def _authenticate(self):
        """
        POST the API key to the 'authentication' operation.

        :return: The value of the 't' key of the JSON response.
        :raises HTTPError: If the response status code is not 201.
        """
        r = requests.post(
            self._url('authentication'),
            data=self._auth_payload(),
            headers={
                'content-type': 'application/json',
                'accept': 'application/ld+json',
            },
        )
        if r.status_code != 201:
            raise HTTPError(response=r)
        return r.json()['t']

    def _add_api_key_and_token(self, params_dict: dict):
        """
        Add the saved api key and token to a given dictionary.

        :param params_dict: A dictionary that's probably used
            as a 'params' keyword parameter for calling requests.get().
        :return: The same dictionary extended by 'apikey' and 't' keys.
        """
        params_dict['apikey'] = self.api_key
        params_dict['t'] = self.token
        return params_dict

    def _display_http_error(self, e: 'HTTPError'):
        """
        Print status code and response body of an HTTPError to stderr.

        :param e: The error; its 'response' attribute must be set.
        """
        print(self._error_template.format(status_code=e.response.status_code,
                                          response_text=e.response.text),
              file=sys.stderr)

    def _get(self, op, auto_connect=True, **kwargs):
        """
        Send a GET request with api key and token added to the parameters.

        :param op: REST operation path, inserted into base_url.
        :param auto_connect: If True, a 403 response triggers one call to
            connect() followed by a single retry.
        :param kwargs: Passed on to requests.get(); 'params' is extended
            by 'apikey' and 't'.
        :return: The response, if its status is ok.
        :raises HTTPError: For every response whose status is not ok.
        """
        # Work on a copy so the caller's dict never receives key and token;
        # on a retry this also replaces the stale token with the new one.
        params = dict(kwargs.pop('params', None) or {})
        kwargs['params'] = self._add_api_key_and_token(params)
        r = requests.get(self._url(op), **kwargs)
        if r.ok:
            return r
        if r.status_code == 403:
            if auto_connect:
                self.connect()
                return self._get(op=op, auto_connect=False, **kwargs)
            print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr)
        elif r.status_code == 410:
            print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr)
        raise HTTPError(response=r)

    def query_fulltext_search(self, query_string, from_=None, to_=None):
        """
        Start a fulltext search query in the Webarchive.

        The current status of running queries can be read via status_open_queries().
        An HTTPError is printed to stderr and the query is not registered.

        :param query_string: String to search for
        :param from_: Optional earliest date bound for the search
            in the format YYYYMM.
        :param to_: Optional latest date bound for the search
            in the format YYYYMM.
        :return: None
        """
        params = {'q': query_string}
        if from_:
            params['from'] = from_
        if to_:
            params['to'] = to_
        try:
            r = self._get(op='/search/fulltext', params=params)
        except HTTPError as e:
            self._display_http_error(e)
            print('Query for "{}" not added'.format(query_string))
            return
        # The start response is kept because status_open_queries() reads its
        # 'requestid' to poll for the result.
        key = (datetime.datetime.now().strftime(_datetime_format_string), query_string)
        self.open_fulltext_queries[key] = r
        print('Query for "{}" added. Message:"{}"'.format(query_string, r.json()['message']))

    def query_wayback_search(self, query_string, from_=None, to_=None):
        """
        Not implemented yet.

        :raises NotImplementedError: Always.
        """
        raise NotImplementedError

    def status_open_queries(self):
        """
        Request the current status of running queries from the Webarchive.

        Finished queries are moved from 'open_*' to 'finished_*' queues.
        A query whose status request fails with an HTTPError is reported on
        stderr and stays open; the remaining queries are still checked.

        :return: None
        """
        # Iterate over a snapshot because finished entries are deleted
        # from the dictionary inside the loop.
        for key, old_response in list(self.open_fulltext_queries.items()):
            query_string = key[1]
            requestid = old_response.json()['requestid']
            try:
                r = self._get(op='/search/status/fulltext', params={'requestid': requestid})
            except HTTPError as e:
                self._display_http_error(e)
                print('Status of query for "{}" could not be retrieved'.format(query_string),
                      file=sys.stderr)
                continue
            if r.status_code == 200:
                self.finished_fulltext_queries[key] = r.json()
                print('Query for "{}" done'.format(query_string))
                del self.open_fulltext_queries[key]
            elif r.status_code == 202:
                print('Query for "{}" is still running'.format(query_string))
if __name__ == '__main__':
    # Script entry point: only constructs a session object; the constructor
    # stores the key and sends no request.
    # NOTE(review): the API key is hard-coded in source - consider loading it
    # from the environment or a config file instead of committing it.
    # noinspection SpellCheckingInspection
    w = WebarchivSession('Zz2tQls7fuaocX2pjrfc2npojqbGwXL2')