import sys
import time
import requests
from requests import HTTPError

# strftime/strptime pattern for 14-digit timestamps (YYYYMMDDhhmmss).
# NOTE(review): not referenced by any other code visible in this file.
_datetime_format_string = '%Y%m%d%H%M%S'


class SessionTimeoutError(Exception):
    """
    Exception type for timed-out sessions.

    NOTE(review): no code visible in this file raises or catches it.
    """


class WebarchivSession:
    @property
    def version(self):
        """
        Current protocol version
        """
        return '0.1.0'

    @property
    def api_path(self):
        """
        Protocol, domain and path prefix for the Webarchive API,
        with a single positional format string placeholder
        for the REST operation and parameters.
        """
        return 'https://webarchiv.onb.ac.at/api/'

    @property
    def base_url(self):
        """
        Protocol, domain and path prefix for the Webarchive API,
        with a single positional format string placeholder
        for the REST operation and parameters.
36
        """
onbpre's avatar
onbpre committed
37
        return self.api_path + '/{}'

    @property
    def _error_template(self):
        """
        A format string for displaying HTTP Errors.
        Must contain one placeholder 'status_code' for the HTTP status code.
        Must contain one placeholder 'response_text' for the body of the response.
        """
        return 'HTTP ERROR - status code {status_code}\n----\n{response_text}\n----\n\n'

    def __init__(self, api_key):
        self.api_key = api_key
        self.token = None

    def connect(self):
        """
        Authenticate against the Webarchive API and remember the token.

        An HTTPError raised during authentication is reported through
        ``_display_http_error`` and not re-raised; the stored token is
        left unchanged in that case.
        """
        try:
            fresh_token = self._authenticate()
        except HTTPError as err:
            self._display_http_error(err)
        else:
            self.token = fresh_token

    def _authenticate(self):
        """
        Request a new token from the ``authentication`` endpoint.

        :return: value of the 't' field of the JSON response
        :raises HTTPError: if the server answers with any status other than 201
        """
        # Let requests serialise the payload. Formatting the JSON document by
        # hand produces an invalid body when the API key contains '"' or '\'.
        payload = {
            'apikey': self.api_key,
            'fingerprint': 'string',
            'version': self.version,
        }
        r = requests.post(self.base_url.format('authentication'),
                          json=payload,
                          headers={
                              'content-type': 'application/json',
                              'accept': 'application/ld+json'
                          })
        if r.status_code != 201:
            raise HTTPError(response=r)
        return r.json()['t']

    def _add_api_key_and_token(self, params_dict: dict):
        """
        Add the saved api key and token to a given dictionary.

        :param params_dict: A dictionary that's probably used
          as a 'params' keyword parameter for calling requests.get().
        :return: The same dictionary extended by 'apikey' and 't' keys.
        """
        params_dict['apikey'] = self.api_key
        params_dict['t'] = self.token
        return params_dict

    def _display_http_error(self, e: HTTPError):
        """
        Write a formatted description of an HTTP error to stderr.

        :param e: error whose ``response`` supplies the status code and body
        """
        failed_response = e.response
        message = self._error_template.format(
            status_code=failed_response.status_code,
            response_text=failed_response.text,
        )
        print(message, file=sys.stderr)

    def _get(self, op, auto_connect=True, **kwargs):
        """
        Send a GET request to an API operation, adding API key and token.

        :param op: operation path that is inserted into ``base_url``
        :param auto_connect: if True, a 403 response triggers one
          re-authentication via ``connect()`` followed by a single retry
        :param kwargs: passed through to ``requests.get()``; the 'params'
          entry, if present, is extended by the 'apikey' and 't' keys
        :return: the HTTP response object, for error statuses as well
        """
        # Work on a copy so the caller's dictionary is not modified and does
        # not end up holding the API key and token.
        own_params = dict(kwargs.pop('params', None) or {})
        kwargs['params'] = self._add_api_key_and_token(own_params)

        r = requests.get(self.base_url.format(op), **kwargs)
        if r.ok:
            return r

        if r.status_code == 403:
            if auto_connect:
                # The token may have expired: fetch a new one and retry once.
                self.connect()
                return self._get(op=op, auto_connect=False, **kwargs)
            print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr)
        elif r.status_code == 400:
            print('Bad request', file=sys.stderr)
        elif r.status_code == 410:
            print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr)

        # Error responses are returned, not raised; callers inspect
        # status_code themselves.
        return r

    def fulltext_search(self, query_string, from_=None, to_=None):
        """
        Run a fulltext search in the Webarchive and wait for its result.

        :param query_string: String to search for
        :param from_: Optional earliest date bound for the search
          in the format YYYYMM.
        :param to_: Optional latest date bound for the search
          in the format YYYYMM.
        :return: HTTP Response object; None if an HTTPError was caught
        """
        date_bounds = (('from', from_), ('to', to_))
        params = {'q': query_string}
        # Falsy bounds (None, '') are left out of the query entirely.
        params.update((key, bound) for key, bound in date_bounds if bound)

        try:
            return self.waitForResponse(
                self._get(op='/search/fulltext', params=params))
        except HTTPError as err:
            self._display_http_error(err)
            print('Query for "{}" not added'.format(query_string))

    def wayback_search(self, query_string, from_=None, to_=None):
        """
        Start a wayback search query in the Webarchive.

        :param query_string: String to search for
        :param from_: Optional earliest date bound for the search
          in the format YYYYMM.
        :param to_: Optional latest date bound for the search
          in the format YYYYMM.
        :return: HTTP Response object; None if an HTTPError was caught
        """
        params = {'q': query_string}
        if from_:
            params['from'] = from_
        if to_:
            params['to'] = to_

        try:
            response = self._get(op='/search/wayback', params=params)
            return self.waitForResponse(response)

        except HTTPError as e:
            self._display_http_error(e)
            # The previous message 'Error:' had no placeholder, so the query
            # was silently dropped; use the same wording as fulltext_search.
            print('Query for "{}" not added'.format(query_string))

    def waitForResponse(self, response, poll_interval=0.5):
        """
        Poll the status endpoint until the server delivers a final response.

        :param response: HTTP response of the request to wait for
        :param poll_interval: seconds to sleep before each status query
        :return: the first response with status 200, or the first error
          response (status 400 or above)
        """
        # Stop on every error status, not only on 400: such a response is
        # final, and polling with it would loop forever or fail while
        # reading the request id from its body.
        while response.status_code != 200 and response.status_code < 400:
            time.sleep(poll_interval)
            response = self.status_query(response)

        return response

    def status_query(self, resp):
        """
        Send the polling request that matches the type of a pending request.

        :param resp: response whose JSON body holds the 'requestid' and
          'type' of the pending request (type 1: fulltext, type 2: wayback)
        :return: HTTP response of the status request
        :raises ValueError: if the body reports a type other than 1 or 2
        """
        body = resp.json()  # parse the body once instead of once per field
        request_id = body['requestid']
        request_type = body['type']  # local name must not shadow builtin `type`

        status_ops = {
            1: '/search/status/fulltext',
            2: '/search/status/wayback',
        }
        if request_type not in status_ops:
            # Without this check an unknown type reaches the return statement
            # with no response assigned and fails with UnboundLocalError.
            raise ValueError('Unknown request type: {!r}'.format(request_type))

        return self._get(op=status_ops[request_type], params={'requestid': request_id})

    def domain_name_search(self, query_string, page_=1, pagesize_=100):
        """
        Start a domain name search in the Webarchive.

        :param query_string: String to search for
        :param page_: Page number; works together with the page size to
          control the offset of the records returned. Default value is 1
        :param pagesize_: Page size; works together with the page number to
          control the offset of the records returned and limits how many
          results each request returns. Default value is 100
        :return: HTTP Response object; None if an HTTPError was caught
        """
        params = {'q': query_string}
        # Falsy values (0, None) are not sent at all.
        if page_:
            params['page'] = page_
        if pagesize_:
            params['pagesize'] = pagesize_

        try:
            response = self._get(op='/search/domainname', params=params)
            return self.waitForResponse(response)

        except HTTPError as e:
            self._display_http_error(e)
            # The previous message 'Error:' had no placeholder, so the query
            # was silently dropped; use the same wording as fulltext_search.
            print('Query for "{}" not added'.format(query_string))

    def histogram_search(self, query_string, interval_=3, from_=None, to_=None):
        """
        Start a histogram query for a fulltext search in the Webarchive.

        :param query_string: String to search for
        :param interval_: Value sent as the 'interval' query parameter.
          Default value is 3 (NOTE(review): meaning of the value is defined
          by the API, confirm against its documentation)
        :param from_: Optional value sent as the 'from' query parameter
        :param to_: Optional value sent as the 'to' query parameter
        :return: HTTP Response object; None if an HTTPError was caught
        """
        params = {'q': query_string}
        # Falsy values (0, None, '') are not sent at all.
        if interval_:
            params['interval'] = interval_
        if from_:
            params['from'] = from_
        if to_:
            params['to'] = to_

        try:
            response = self._get(op='/search/fulltext/histogram', params=params)
            return self.waitForResponse(response)

        except HTTPError as e:
            self._display_http_error(e)
            # The previous message 'Error:' had no placeholder, so the query
            # was silently dropped; use the same wording as fulltext_search.
            print('Query for "{}" not added'.format(query_string))

    def getSnapshotUrl(self, seed, capture, onlysvg):
        return self.api_path + 'snapshot?capture=' + capture + '&t=' + self.token + '&apikey=' + self.api_key + '&onlysvg=' + onlysvg + '&seed=' + seed;

    def savePage(self, url):
        """
        Post a URL to the ``savepage`` endpoint of the Webarchive API.

        ``connect()`` is called first, so every call requests a new token.

        :param url: value sent as the 'url' field of the request body
        :return: HTTP response object of the POST request
        """
        self.connect()
        # Let requests serialise the payload. Formatting the JSON document by
        # hand produces an invalid (or injectable) body when the URL
        # contains '"' or '\'.
        payload = {
            'apikey': self.api_key,
            't': self.token,
            'url': url,
        }
        r = requests.post(self.base_url.format('savepage'),
                          json=payload,
                          headers={
                              'content-type': 'application/json',
                              'accept': 'application/ld+json'
                          })
        return r