# webarchiv.py
import datetime
import json
import sys
import time

import requests
from requests import HTTPError

# strftime pattern (YYYYMMDDHHMMSS, e.g. '20240131235959') used to build the
# timestamp part of the (timestamp, query_string) keys of the query dictionaries.
_datetime_format_string = '%Y%m%d%H%M%S'


class SessionTimeoutError(Exception):
    """
    Exception type named for session timeouts.

    It adds nothing to Exception; the code visible in this module
    does not raise it.
    """


class WebarchivSession:
    """
    Client session for the Webarchive REST API.

    Stores the API key and the token obtained by connect() and keeps
    track of search queries in dictionaries keyed by
    (timestamp, query_string):

    - open_fulltext_queries / open_wayback_queries map to the response
      that was returned when the query was started.
    - finished_fulltext_queries / finished_wayback_queries map to the
      decoded JSON body of the status response of a finished query.
    """

    @property
    def version(self):
        """
        Current protocol version
        """
        return '0.1.0'

    @property
    def base_url(self):
        """
        Protocol, domain and path prefix for the Webarchive API,
        with a single positional format string placeholder
        for the REST operation and parameters.
        """
        return 'https://webarchiv.onb.ac.at/api/{}'

    @property
    def _error_template(self):
        """
        A format string for displaying HTTP Errors.
        Must contain one placeholder 'status_code' for the HTTP status code.
        Must contain one placeholder 'response_text' for the body of the response.
        """
        return 'HTTP ERROR - status code {status_code}\n----\n{response_text}\n----\n\n'

    def __init__(self, api_key):
        """
        Create a session that is not yet connected (token is None).

        :param api_key: API key sent with the authentication request
          and with every GET request.
        """
        self.api_key = api_key
        self.token = None
        self.open_fulltext_queries = {}
        self.open_wayback_queries = {}
        self.finished_fulltext_queries = {}
        self.finished_wayback_queries = {}

    def connect(self):
        """
        Connect to the Webarchive API, request and save a token.

        An HTTPError raised during authentication is printed to stderr
        and not re-raised; the saved token is left unchanged in that case.
        """
        try:
            self.token = self._authenticate()
        except HTTPError as e:
            self._display_http_error(e)

    def _authenticate(self):
        """
        Post the API key and protocol version to the 'authentication'
        operation.

        :return: The value of the 't' field of the JSON response.
        :raises HTTPError: If the response status code is not 201.
        """
        # Serialise with json.dumps so that an API key containing quotes or
        # backslashes still yields a valid JSON document.
        payload = {
            'apikey': self.api_key,
            'fingerprint': 'string',
            'version': self.version,
        }
        r = requests.post(self.base_url.format('authentication'),
                          data=json.dumps(payload),
                          headers={
                              'content-type': 'application/json',
                              'accept': 'application/ld+json'
                          })
        if r.status_code == 201:
            return r.json()['t']
        raise HTTPError(response=r)

    def _add_api_key_and_token(self, params_dict: dict):
        """
        Add the saved api key and token to a given dictionary.

        :param params_dict: A dictionary that's probably used
          as a 'params' keyword parameter for calling requests.get().
        :return: The same dictionary extended by 'apikey' and 't' keys.
        """
        params_dict['apikey'] = self.api_key
        params_dict['t'] = self.token
        return params_dict

    def _display_http_error(self, e: 'HTTPError'):
        """
        Print the status code and body of the response attached to an
        HTTPError to stderr, formatted with _error_template.

        :param e: The error to display; its 'response' attribute is read.
        """
        print(self._error_template.format(status_code=e.response.status_code,
                                          response_text=e.response.text),
              file=sys.stderr)

    def _get(self, op, auto_connect=True, **kwargs):
        """
        Send a GET request for an API operation with api key and token
        added to the query parameters.

        :param op: REST operation, inserted into base_url.
        :param auto_connect: If True, a 403 response triggers one call to
          connect() followed by a single retry of the request.
        :param kwargs: Passed on to requests.get(); the 'params' dictionary
          (if any) is extended in place by 'apikey' and 't'.
        :return: The response, if its 'ok' attribute is true.
        :raises HTTPError: For every other response.
        """
        kwargs['params'] = self._add_api_key_and_token(kwargs.pop('params', {}))
        r = requests.get(self.base_url.format(op), **kwargs)
        if r.ok:
            return r

        if r.status_code == 403:
            if auto_connect:
                # The token may be missing or expired: fetch a new one and
                # retry exactly once (auto_connect=False stops the recursion).
                self.connect()
                return self._get(op=op, auto_connect=False, **kwargs)
            print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr)
        elif r.status_code == 410:
            print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr)
        raise HTTPError(response=r)

    @staticmethod
    def _search_params(query_string, from_=None, to_=None):
        """
        Build the query parameters shared by the search operations.

        :param query_string: Value of the 'q' parameter.
        :param from_: Value of the 'from' parameter; omitted if falsy.
        :param to_: Value of the 'to' parameter; omitted if falsy.
        :return: A new dictionary with the parameters.
        """
        params = {'q': query_string}
        if from_:
            params['from'] = from_
        if to_:
            params['to'] = to_
        return params

    def query_fulltext_search(self, query_string, from_=None, to_=None):
        """
        Start a fulltext search query in the Webarchive.
        The current status of running queries can be read via status_open_queries().

        An HTTPError is printed to stderr and not re-raised; the query is
        not added to the open queries in that case.

        :param query_string: String to search for
        :param from_: Optional earliest date bound for the search
          in the format YYYYMM.
        :param to_: Optional latest date bound for the search
          in the format YYYYMM.
        :return: None
        """
        params = self._search_params(query_string, from_, to_)

        try:
            r = self._get(op='/search/fulltext', params=params)
            self.open_fulltext_queries[(datetime.datetime.now().strftime(_datetime_format_string),
                                        query_string)] = r
            print('Query for "{}" added. Message:"{}"'.format(query_string, r.json()['message']))
        except HTTPError as e:
            self._display_http_error(e)
            print('Query for "{}" not added'.format(query_string))

    def query_wayback_search(self, query_string, from_=None, to_=None):
        """
        Start a wayback search query in the Webarchive.
        The query is registered in open_wayback_queries, so its current
        status can be read via status_open_queries(); the response is also
        returned so that it can be passed to status_query().

        An HTTPError is printed to stderr and not re-raised.

        :param query_string: String to search for
        :param from_: Optional earliest date bound for the search
          in the format YYYYMM.
        :param to_: Optional latest date bound for the search
          in the format YYYYMM.
        :return: The response of the search request, or None if the
          request failed with an HTTPError.
        """
        params = self._search_params(query_string, from_, to_)

        try:
            print('search for ', params)
            r = self._get(op='/search/wayback', params=params)
        except HTTPError as e:
            self._display_http_error(e)
            print('Query for "{}" not added'.format(query_string))
            return None

        self.open_wayback_queries[(datetime.datetime.now().strftime(_datetime_format_string),
                                   query_string)] = r
        return r

    def status_query(self, resp):
        """
        Request the status of the query a previous response belongs to.

        :param resp: A response whose JSON body contains 'requestid' and
          'type' (1 selects the fulltext status operation, 2 the wayback one).
        :return: The response of the status request.
        :raises ValueError: If 'type' is neither 1 nor 2.
        :raises HTTPError: If the status request fails.
        """
        body = resp.json()
        requestid = body['requestid']
        query_type = body['type']
        print('call status for "{}"'.format(requestid))

        if query_type == 1:
            op = '/search/status/fulltext'
        elif query_type == 2:
            op = '/search/status/wayback'
        else:
            raise ValueError('Unknown query type: {!r}'.format(query_type))
        return self._get(op=op, params={'requestid': requestid})

    def _poll_open_queries(self, op, open_queries, finished_queries):
        """
        Request the status of every query in open_queries via the given
        status operation and move the finished ones to finished_queries.

        :param op: Status operation passed to _get().
        :param open_queries: Dictionary of running queries; modified in place.
        :param finished_queries: Dictionary of finished queries; modified in place.
        :return: None
        :raises HTTPError: If a status request fails.
        """
        # Iterate over a snapshot because finished entries are deleted
        # from open_queries inside the loop.
        for key, old_response in list(open_queries.items()):
            query_string = key[1]
            requestid = old_response.json()['requestid']
            r = self._get(op=op, params={'requestid': requestid})
            if r.status_code == 200:
                finished_queries[key] = r.json()
                print('Query for "{}" done'.format(query_string))
                del open_queries[key]
            elif r.status_code == 202:
                print('Query for "{}" is still running'.format(query_string))

    def status_open_queries(self):
        """
        Request the current status of running queries from the Webarchive.
        Finished queries are moved from 'open_*' to 'finished_*' queues.

        :return: None
        :raises HTTPError: If a status request fails.
        """
        self._poll_open_queries('/search/status/fulltext',
                                self.open_fulltext_queries,
                                self.finished_fulltext_queries)
        self._poll_open_queries('/search/status/wayback',
                                self.open_wayback_queries,
                                self.finished_wayback_queries)

# Demo: run a wayback search against the live API when executed as a script.

if __name__ == '__main__':
    # Demo: start a wayback search and poll its status once per second
    # until the API answers with status code 200, then print the total.
    # NOTE(review): the API key is hard-coded in the source; consider
    # loading it from configuration instead of committing it.
    # noinspection SpellCheckingInspection
    w = WebarchivSession('Zz2tQls7fuaocX2pjrfc2npojqbGwXL2')
    response = w.query_wayback_search("http://www.onb.ac.at")
    if response is None:
        # query_wayback_search() returns None after printing the HTTP error;
        # exit cleanly instead of failing on response.status_code.
        sys.exit(1)
    print(response.status_code)
    while response.status_code != 200:
        print(response.status_code)
        time.sleep(1)
        response = w.status_query(response)

    print(response.json()['total'])