Skip to content
webarchiv.py 5.66 KiB
Newer Older
Stefan Karner's avatar
Stefan Karner committed
import sys
import requests
from requests import HTTPError
import datetime

# strftime pattern (YYYYMMDDhhmmss) for the timestamp part of the
# (timestamp, query_string) keys under which fulltext queries are tracked.
_datetime_format_string = '%Y%m%d%H%M%S'


class SessionTimeoutError(Exception):
    """
    Exception type named for a timed-out API session.

    It adds nothing to Exception; the code visible in this module
    defines it but does not raise it.
    """


class WebarchivSession:
    """
    A session with the Webarchive REST API.

    Holds the API key and the token returned by the 'authentication'
    operation and adds both to every GET request. Started fulltext queries
    are tracked in two dictionaries keyed by (timestamp, query_string):

    - open_fulltext_queries: the response that accepted each query
    - finished_fulltext_queries: the decoded JSON result of each finished query
    """

    @property
    def version(self):
        """
        Current protocol version
        """
        return '0.1.0'

    @property
    def base_url(self):
        """
        Protocol, domain and path prefix for the Webarchive API,
        with a single positional format string placeholder
        for the REST operation and parameters.
        """
        return 'https://webarchiv.onb.ac.at/api/{}'

    @property
    def _error_template(self):
        """
        A format string for displaying HTTP Errors.
        Must contain one placeholder 'status_code' for the HTTP status code.
        Must contain one placeholder 'response_text' for the body of the response.
        """
        return 'HTTP ERROR - status code {status_code}\n----\n{response_text}\n----\n\n'

    def __init__(self, api_key):
        """
        Create a session that is not yet connected (no token).

        :param api_key: API key sent with the authentication request
          and with every GET request.
        """
        self.api_key = api_key
        self.token = None
        self.open_fulltext_queries = {}
        self.finished_fulltext_queries = {}

    def connect(self):
        """
        Connect to the Webarchive API, request and save a token.

        If authentication fails, the HTTP error is printed to stderr
        and the saved token is left unchanged.
        """
        try:
            self.token = self._authenticate()
        except HTTPError as e:
            self._display_http_error(e)

    def _url_for(self, op):
        """
        Build the full URL for a REST operation.

        Leading slashes of the operation are dropped: base_url already has a
        slash in front of its placeholder, so '/search/fulltext' would
        otherwise yield '.../api//search/fulltext'.

        :param op: REST operation, with or without a leading slash.
        :return: The URL as a string.
        """
        return self.base_url.format(op.lstrip('/'))

    def _authentication_payload(self):
        """
        Build the body of the authentication request.

        :return: A dictionary with the keys 'apikey', 'fingerprint' and 'version'.
        """
        return {
            'apikey': self.api_key,
            'fingerprint': 'string',
            'version': self.version,
        }

    def _authenticate(self):
        """
        Request a token from the 'authentication' operation.

        :return: The value of the 't' field of the JSON response.
        :raises HTTPError: If the response status code is not 201.
        """
        # The body is serialised by requests (json=...), so quotes or
        # backslashes in the API key cannot produce invalid JSON.
        r = requests.post(self._url_for('authentication'),
                          json=self._authentication_payload(),
                          headers={
                              'content-type': 'application/json',
                              'accept': 'application/ld+json'
                          })
        if r.status_code == 201:
            return r.json()['t']
        raise HTTPError(response=r)

    def _add_api_key_and_token(self, params_dict: dict):
        """
        Add the saved api key and token to a given dictionary.

        :param params_dict: A dictionary that's probably used
          as a 'params' keyword parameter for calling requests.get().
        :return: The same dictionary extended by 'apikey' and 't' keys.
        """
        params_dict['apikey'] = self.api_key
        params_dict['t'] = self.token
        return params_dict

    def _display_http_error(self, e: 'HTTPError'):
        """
        Print the status code and body of a failed response to stderr.

        :param e: An HTTPError whose 'response' attribute is set.
        """
        print(self._error_template.format(status_code=e.response.status_code,
                                          response_text=e.response.text),
              file=sys.stderr)

    def _get(self, op, auto_connect=True, **kwargs):
        """
        Send a GET request for a REST operation with api key and token added.

        :param op: REST operation, formatted into base_url.
        :param auto_connect: If True, a 403 response triggers one call to
          connect() followed by a single retry of the request.
        :param kwargs: Passed on to requests.get(); 'params' is extended
          by the api key and token.
        :return: The response, if its status is ok.
        :raises HTTPError: For every response whose status is not ok
          (after the retry, if one was made).
        """
        # Work on a copy so the caller's dictionary never receives the credentials.
        params = dict(kwargs.pop('params', None) or {})
        kwargs['params'] = self._add_api_key_and_token(params)
        r = requests.get(self._url_for(op), **kwargs)
        if r.ok:
            return r

        if r.status_code == 403:
            if auto_connect:
                # The token may be missing or no longer accepted: fetch a new one, retry once.
                self.connect()
                return self._get(op=op, auto_connect=False, **kwargs)
            print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr)
        elif r.status_code == 410:
            print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr)
        raise HTTPError(response=r)

    def query_fulltext_search(self, query_string, from_=None, to_=None):
        """
        Start a fulltext search query in the Webarchive.
        The current status of running queries can be read via status_open_queries().

        HTTP errors are printed to stderr and the query is not added.

        :param query_string: String to search for
        :param from_: Optional earliest date bound for the search
          in the format YYYYMM.
        :param to_: Optional latest date bound for the search
          in the format YYYYMM.
        :return: None
        """
        params = {'q': query_string}
        if from_:
            params['from'] = from_
        if to_:
            params['to'] = to_

        try:
            r = self._get(op='/search/fulltext', params=params)
        except HTTPError as e:
            self._display_http_error(e)
            print('Query for "{}" not added'.format(query_string))
            return

        timestamp = datetime.datetime.now().strftime(_datetime_format_string)
        self.open_fulltext_queries[(timestamp, query_string)] = r
        # A response without a 'message' field must not fail a query that is already registered.
        print('Query for "{}" added. Message:"{}"'.format(query_string, r.json().get('message', '')))

    def query_wayback_search(self, query_string, from_=None, to_=None):
        """
        Not implemented yet.

        :raises NotImplementedError: Always.
        """
        raise NotImplementedError

    def status_open_queries(self):
        """
        Request the current status of running queries from the Webarchive.
        Finished queries are moved from 'open_*' to 'finished_*' queues.

        A query whose status request fails with an HTTP error stays open;
        the error is printed to stderr and the remaining queries are still checked.

        :return: None
        """
        # Iterate over a snapshot, because finished queries are deleted inside the loop.
        for key, old_response in list(self.open_fulltext_queries.items()):
            _, query_string = key
            requestid = old_response.json()['requestid']
            try:
                r = self._get(op='/search/status/fulltext', params={'requestid': requestid})
            except HTTPError as e:
                self._display_http_error(e)
                print('Status of query for "{}" could not be read'.format(query_string), file=sys.stderr)
                continue

            if r.status_code == 200:
                self.finished_fulltext_queries[key] = r.json()
                print('Query for "{}" done'.format(query_string))
                del self.open_fulltext_queries[key]
            elif r.status_code == 202:
                print('Query for "{}" is still running'.format(query_string))


if __name__ == '__main__':
    import os

    # The API key is a credential and must not be stored in the source file;
    # it is read from the environment instead.
    api_key = os.environ.get('WEBARCHIV_API_KEY')
    if not api_key:
        sys.exit('Set the WEBARCHIV_API_KEY environment variable to your Webarchive API key.')
    w = WebarchivSession(api_key)