import sys
import datetime

import requests
from requests import HTTPError

# strftime pattern used to timestamp queries when they are registered.
_datetime_format_string = '%Y%m%d%H%M%S'


class SessionTimeoutError(Exception):
    """Exception type declared for session timeouts.

    It is not raised anywhere in this module.
    """
    pass


class WebarchivSession:
    """
    Client session for the Webarchive REST API.

    Holds the API key and the token obtained from the 'authentication'
    operation, and keeps track of fulltext search queries that were
    submitted and of those that have finished.
    """

    @property
    def version(self):
        """
        Current protocol version
        """
        return '0.1.0'

    @property
    def base_url(self):
        """
        Protocol, domain and path prefix for the Webarchive API,
        with a single positional format string placeholder
        for the REST operation and parameters.
        """
        return 'https://webarchiv.onb.ac.at/api/{}'

    @property
    def _error_template(self):
        """
        A format string for displaying HTTP Errors.
        Must contain one placeholder 'status_code' for the HTTP status code.
        Must contain one placeholder 'response_text' for the body of the response.
        """
        return 'HTTP ERROR - status code {status_code}\n----\n{response_text}\n----\n\n'

    def __init__(self, api_key):
        """
        Create a session that is not yet connected.

        :param api_key: API key sent with the authentication request and
            with every subsequent GET request.
        """
        self.api_key = api_key
        # Set by connect(); stays None until authentication succeeds.
        self.token = None
        # (timestamp, query_string) -> response of the request that started the query
        self.open_fulltext_queries = {}
        # (timestamp, query_string) -> decoded JSON body of the final status response
        self.finished_fulltext_queries = {}

    def connect(self):
        """
        Connect to the Webarchive API, request and save a token.

        An HTTPError raised during authentication is not propagated: it is
        printed to stderr and the previously stored token is left unchanged.
        """
        try:
            self.token = self._authenticate()
        except HTTPError as e:
            self._display_http_error(e)

    def _authenticate(self):
        """
        POST the API key and protocol version to the 'authentication' operation.

        :return: The value of the 't' key of the JSON response body.
        :raises HTTPError: If the response status code is not 201.
        """
        r = requests.post(
            self.base_url.format('authentication'),
            # Let requests serialise the payload so that quotes or backslashes
            # in the API key cannot break (or inject into) the JSON document.
            json={
                'apikey': self.api_key,
                'fingerprint': 'string',
                'version': self.version,
            },
            headers={
                'content-type': 'application/json',
                'accept': 'application/ld+json',
            },
        )
        if r.status_code == 201:
            return r.json()['t']
        raise HTTPError(response=r)

    def _add_api_key_and_token(self, params_dict: dict):
        """
        Add the saved api key and token to a given dictionary.

        The dictionary is modified in place.

        :param params_dict: A dictionary that's probably used as a 'params'
            keyword parameter for calling requests.get().
        :return: The same dictionary extended by 'apikey' and 't' keys.
        """
        params_dict['apikey'] = self.api_key
        params_dict['t'] = self.token
        return params_dict

    def _display_http_error(self, e: HTTPError):
        """
        Print the status code and body of a failed response to stderr.

        :param e: The error to display; its 'response' attribute must be set.
        """
        print(
            self._error_template.format(status_code=e.response.status_code,
                                        response_text=e.response.text),
            file=sys.stderr,
        )

    def _get(self, op, auto_connect=True, **kwargs):
        """
        Send an authenticated GET request for a REST operation.

        :param op: Operation (path and parameters) inserted into base_url.
        :param auto_connect: If true, a 403 response triggers one call to
            connect() followed by a single retry of the request.
        :param kwargs: Passed through to requests.get(). An optional 'params'
            dictionary is copied and extended by the api key and token.
        :return: The response, if its status is OK.
        :raises HTTPError: For every response that is not OK (after the
            optional retry).
        """
        # Authenticate a copy so the caller's dictionary is never mutated and
        # does not end up carrying the credentials.
        params = dict(kwargs.pop('params', None) or {})
        kwargs['params'] = self._add_api_key_and_token(params)

        r = requests.get(self.base_url.format(op), **kwargs)
        if r.ok:
            return r

        if r.status_code == 403:
            if auto_connect:
                # The token may be missing or stale: fetch a new one and retry
                # exactly once. The retry re-adds the (new) token to the params.
                self.connect()
                return self._get(op=op, auto_connect=False, **kwargs)
            print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr)
        elif r.status_code == 410:
            print('The requested API Version (via X-API-VERSION Header) is not available',
                  file=sys.stderr)
        raise HTTPError(response=r)

    def query_fulltext_search(self, query_string, from_=None, to_=None):
        """
        Start a fulltext search query in the Webarchive.

        The current status of running queries can be read via status_open_queries().
        HTTP errors are printed to stderr and not propagated.

        :param query_string: String to search for
        :param from_: Optional earliest date bound for the search in the format YYYYMM.
        :param to_: Optional latest date bound for the search in the format YYYYMM.
        :return: None
        """
        params = {'q': query_string}
        if from_:
            params['from'] = from_
        if to_:
            params['to'] = to_

        # NOTE(review): the leading slash yields '.../api//search/fulltext' when
        # inserted into base_url -- confirm this is what the API expects.
        try:
            r = self._get(op='/search/fulltext', params=params)
        except HTTPError as e:
            self._display_http_error(e)
            print('Query for "{}" not added'.format(query_string))
        else:
            timestamp = datetime.datetime.now().strftime(_datetime_format_string)
            self.open_fulltext_queries[(timestamp, query_string)] = r
            print('Query for "{}" added. Message:"{}"'.format(query_string, r.json()['message']))

    def query_wayback_search(self, query_string, from_=None, to_=None):
        """
        Not implemented.

        :raises NotImplementedError: Always.
        """
        raise NotImplementedError

    def status_open_queries(self):
        """
        Request the current status of running queries from the Webarchive.

        Finished queries are moved from 'open_*' to 'finished_*' queues.

        :return: None
        :raises HTTPError: If a status request fails; queries that were not
            checked yet stay in the open queue.
        """
        # Iterate over a snapshot because finished entries are removed
        # from the dictionary inside the loop.
        for key, old_response in list(self.open_fulltext_queries.items()):
            _timestamp, query_string = key
            requestid = old_response.json()['requestid']
            r = self._get(op='/search/status/fulltext', params={'requestid': requestid})
            if r.status_code == 200:
                self.finished_fulltext_queries[key] = r.json()
                print('Query for "{}" done'.format(query_string))
                self.open_fulltext_queries.pop(key)
            elif r.status_code == 202:
                print('Query for "{}" is still running'.format(query_string))


if __name__ == '__main__':
    # NOTE(review): this looks like a real API key committed to source control;
    # consider loading it from the environment or a config file instead.
    # noinspection SpellCheckingInspection
    w = WebarchivSession('Zz2tQls7fuaocX2pjrfc2npojqbGwXL2')