import sys
import time

import requests
from requests import HTTPError

_datetime_format_string = '%Y%m%d%H%M%S'


class SessionTimeoutError(Exception):
    """Raised when waiting for a query result exceeds the allowed time."""


class WebarchivSession:
    """
    Client session for the Webarchive REST API.

    Holds the API key and the token obtained from the 'authentication'
    operation and adds both to every GET request.
    """

    @property
    def version(self):
        """
        Current protocol version
        """
        return '0.1.0'

    @property
    def base_url(self):
        """
        Protocol, domain and path prefix for the Webarchive API,
        with a single positional format string placeholder
        for the REST operation and parameters.
        """
        return 'https://webarchiv.onb.ac.at/api/{}'

    @property
    def _error_template(self):
        """
        A format string for displaying HTTP Errors.

        Must contain one placeholder 'status_code' for the HTTP status code.
        Must contain one placeholder 'response_text' for the body of the
        response.
        """
        return 'HTTP ERROR - status code {status_code}\n----\n{response_text}\n----\n\n'

    def __init__(self, api_key):
        """
        :param api_key: The API key sent with the authentication request and
            with every subsequent GET request.
        """
        self.api_key = api_key
        self.token = None

    def connect(self):
        """
        Connect to the Webarchive API, request and save a token.

        An HTTPError raised during authentication is printed to stderr and
        not re-raised; in that case the saved token is left unchanged.
        """
        try:
            self.token = self._authenticate()
        except HTTPError as e:
            self._display_http_error(e)

    def _authenticate(self):
        """
        Request a new token from the 'authentication' operation.

        :return: The value of the 't' key of the JSON response.
        :raises HTTPError: If the response status code is not 201.
        """
        # Let requests serialise the payload so that special characters in
        # the API key (quotes, backslashes) cannot produce invalid JSON.
        payload = {
            'apikey': self.api_key,
            'fingerprint': 'string',
            'version': self.version,
        }
        r = requests.post(
            self.base_url.format('authentication'),
            json=payload,
            headers={
                'content-type': 'application/json',
                'accept': 'application/ld+json',
            },
        )
        if r.status_code == 201:
            return r.json()['t']
        raise HTTPError(response=r)

    def _add_api_key_and_token(self, params_dict: dict):
        """
        Add the saved api key and token to a given dictionary.

        :param params_dict: A dictionary that's probably used as a 'params'
            keyword parameter for calling requests.get().
        :return: The same dictionary extended by 'apikey' and 't' keys.
        """
        params_dict['apikey'] = self.api_key
        params_dict['t'] = self.token
        return params_dict

    def _display_http_error(self, e: HTTPError):
        """
        Print the status code and body of a failed response to stderr.

        :param e: The HTTPError whose attached response is displayed.
        """
        print(self._error_template.format(status_code=e.response.status_code,
                                          response_text=e.response.text),
              file=sys.stderr)

    def _get(self, op, auto_connect=True, **kwargs):
        """
        Send a GET request for an API operation, adding api key and token.

        :param op: The REST operation, inserted into base_url.
        :param auto_connect: If True, a 403 response triggers one call to
            connect() followed by a single retry of the request.
        :param kwargs: Passed on to requests.get(). A 'params' dictionary,
            if given, is copied before api key and token are added.
        :return: The response if it is ok, or if its status code is
            400, 403 (after the optional retry) or 410.
        :raises HTTPError: For every other non-ok status code.
        """
        # Work on a copy so the caller's dictionary never receives the
        # credentials; also tolerates params=None.
        params = dict(kwargs.pop('params', None) or {})
        kwargs['params'] = self._add_api_key_and_token(params)

        r = requests.get(self.base_url.format(op), **kwargs)
        if r.ok:
            return r

        if r.status_code == 403:
            if auto_connect:
                # The token may be missing or expired: fetch a new one and
                # retry exactly once.
                self.connect()
                return self._get(op=op, auto_connect=False, **kwargs)
            print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr)
            return r
        if r.status_code == 400:
            print('Bad request', file=sys.stderr)
            return r
        if r.status_code == 410:
            print('The requested API Version (via X-API-VERSION Header) is not available',
                  file=sys.stderr)
            return r
        raise HTTPError(response=r)

    def _search(self, op, query_string, from_=None, to_=None):
        """
        Start a search via the given operation and wait for its result.

        Shared implementation of fulltext_search() and wayback_search().

        :return: The response returned by waitForResponse(), or None if an
            HTTPError was raised (the error is printed).
        """
        params = {'q': query_string}
        if from_:
            params['from'] = from_
        if to_:
            params['to'] = to_

        try:
            response = self._get(op=op, params=params)
            return self.waitForResponse(response)
        except HTTPError as e:
            self._display_http_error(e)
            print('Query for "{}" not added'.format(query_string))
            return None

    def fulltext_search(self, query_string, from_=None, to_=None):
        """
        Run a fulltext search query in the Webarchive and wait for the result.

        :param query_string: String to search for
        :param from_: Optional earliest date bound for the search
            in the format YYYYMM.
        :param to_: Optional latest date bound for the search
            in the format YYYYMM.
        :return: The response returned by waitForResponse(), or None if an
            HTTPError was raised.
        """
        return self._search('/search/fulltext', query_string, from_, to_)

    def wayback_search(self, query_string, from_=None, to_=None):
        """
        Run a wayback search query in the Webarchive and wait for the result.

        :param query_string: String to search for
        :param from_: Optional earliest date bound for the search
            in the format YYYYMM.
        :param to_: Optional latest date bound for the search
            in the format YYYYMM.
        :return: The response returned by waitForResponse(), or None if an
            HTTPError was raised.
        """
        return self._search('/search/wayback', query_string, from_, to_)

    def waitForResponse(self, response, timeout=None, poll_interval=0.2):
        """
        Poll the status operation until the query answers with status 200.

        :param response: The response of the request that started the query.
        :param timeout: Optional maximum number of seconds to keep polling.
            None (the default) polls without a time limit.
        :param poll_interval: Seconds to sleep between two status requests.
        :return: The first response with status code 200, or the first
            response with an error status code (>= 400).
        :raises SessionTimeoutError: If 'timeout' is given and exceeded.
        """
        deadline = None if timeout is None else time.monotonic() + timeout

        while response.status_code != 200:
            # Error responses (_get can hand back 400, 403 and 410) are
            # returned as they are instead of being polled as a pending query.
            if response.status_code >= 400:
                return response
            if deadline is not None and time.monotonic() >= deadline:
                raise SessionTimeoutError(
                    'No result after {} seconds'.format(timeout))
            time.sleep(poll_interval)
            response = self.status_query(response)
        return response

    def status_query(self, resp):
        """
        Request the current status of the query described by a response.

        :param resp: A response whose JSON body contains the keys
            'requestid' and 'type' (1 = fulltext, 2 = wayback).
        :return: The response of the matching status operation.
        :raises ValueError: If 'type' is neither 1 nor 2.
        """
        payload = resp.json()
        request_id = payload['requestid']
        query_type = payload['type']

        if query_type == 1:
            op = '/search/status/fulltext'
        elif query_type == 2:
            op = '/search/status/wayback'
        else:
            raise ValueError('Unknown query type: {!r}'.format(query_type))
        return self._get(op=op, params={'requestid': request_id})


if __name__ == '__main__':
    # NOTE(review): the API key is hard-coded in the source; consider
    # loading it from the environment or a config file instead.
    # noinspection SpellCheckingInspection
    w = WebarchivSession('Zz2tQls7fuaocX2pjrfc2npojqbGwXL2')

    # Other example queries:
    #   w.wayback_search("http://www.onb.ac.at")
    #   w.wayback_search("http://frauenhetz.jetzt")
    #   w.wayback_search("x")
    response = w.fulltext_search("Nationalbibliothek Prunksaal Schwarzenegger")

    if response is None:
        # The search methods return None after printing an HTTPError.
        print("Error: no response received")
    elif response.status_code == 200:
        print(response.json()['total'], " Captures")
    else:
        print("Error ", response.status_code)