import hashlib
import json
import sys
import time
from urllib.parse import urlencode

import requests
from requests import HTTPError


_datetime_format_string = '%Y%m%d%H%M%S'


# Extractor types sent as 'extractortype' by the fragment checksum operations
EXTRACTOR_TEXT = 1
EXTRACTOR_HTML = 2
EXTRACTOR_BINARY = 3

# Modes for TextExtractor
POSITIONLEN_MODE = 1
POSITION_MODE = 2
REGEX_MODE = 3

# searchmodes
SEARCHMODE_WEBARCHIV = 1
SEARCHMODE_INTERNETARCHIVE = 2
SEARCHMODE_WEBARCHIV_INTERNETARCHIVE = 3


class SessionTimeoutError(Exception):
    """
    Error type for a timed-out session.

    WebarchivSession itself does not raise it.
    """
    pass


class WebarchivSession:
    """
    Session for the Webarchive REST API.

    Holds the api key and the token obtained by connect(), adds both to
    every request and re-authenticates once when a request is answered
    with HTTP 403.
    """

    # Maps the 'type' field of a pending-request body to the operation
    # that reports the status of that request.
    _STATUS_OPERATIONS = {
        1: '/search/status/fulltext',
        2: '/search/status/wayback',
        5: '/fragment/checksum/status',
    }

    @property
    def version(self):
        """
        Current protocol version
        """
        return '0.1.0'

    @property
    def api_path(self):
        """
        Protocol, domain and path prefix for the Webarchive API,
        without a trailing slash.
        """
        return 'https://webarchiv.onb.ac.at/api'

    @property
    def base_url(self):
        """
        Protocol, domain and path prefix for the Webarchive API,
        with a single positional format string placeholder
        for the REST operation and parameters.
        """
        return self.api_path + '{}'

    @property
    def _error_template(self):
        """
        A format string for displaying HTTP Errors.
        Must contain one placeholder 'status_code' for the HTTP status code.
        Must contain one placeholder 'response_text' for the body of the response.
        """
        return 'HTTP ERROR - status code {status_code}\n----\n{response_text}\n----\n\n'

    def __init__(self, api_key, allow_tracking=False):
        """
        Create a session that is not yet connected.

        :param api_key: Api key that is sent with every request.
        :param allow_tracking: If true, a hash of this machine's hardware
          address is sent as fingerprint when authenticating.
        """
        self.api_key = api_key
        self.allow_tracking = allow_tracking
        self.token = None

    def connect(self):
        """
        Connect to the Webarchive API, request and save a token.

        An HTTPError raised while authenticating is printed to stderr
        and not re-raised; the saved token is left unchanged in that case.
        """
        try:
            self.token = self._authenticate()
        except HTTPError as e:
            self._display_http_error(e)

    def _authenticate(self):
        """
        Request a new token from the '/authentication' operation.

        :return: The value of the 't' field of the response body.
        :raises HTTPError: If the response status code is not 201.
        """
        if self.allow_tracking:
            from uuid import getnode as get_mac
            fingerprint = hashlib.sha256(str(get_mac()).encode('utf-8')).hexdigest()
        else:
            fingerprint = ''

        # json.dumps escapes quotes and backslashes inside the values;
        # formatting them into a JSON template by hand does not.
        payload = json.dumps({
            'apikey': self.api_key,
            'fingerprint': fingerprint,
            'version': self.version,
        })
        r = requests.post(self.base_url.format('/authentication'),
                          data=payload,
                          headers={
                              'content-type': 'application/json',
                              'accept': 'application/ld+json'
                          })
        if r.status_code == 201:
            return r.json()['t']
        raise HTTPError(response=r)

    def _add_api_key_and_token(self, params_dict: dict):
        """
        Add the saved api key and token to a given dictionary.

        :param params_dict: A dictionary that's probably used
          as a 'params' keyword parameter for calling requests.get().
        :return: The same dictionary extended by 'apikey' and 't' keys.
        """
        params_dict['apikey'] = self.api_key
        params_dict['t'] = self.token
        return params_dict

    def _display_http_error(self, e: 'HTTPError'):
        """
        Print status code and body of the response attached to e to stderr.
        """
        print(self._error_template.format(status_code=e.response.status_code,
                                          response_text=e.response.text),
              file=sys.stderr)

    @staticmethod
    def _handle_response_errors(r):
        """
        Print a message to stderr for the status codes 403, 400 and 410.

        :param r: HTTP Response object
        :return: The same response object, whatever its status code is.
        """
        messages = {
            403: 'Forbidden. Invalid Token or ApiKey transmitted',
            400: 'Bad request',
            410: 'The requested API Version (via X-API-VERSION Header) is not available',
        }
        message = messages.get(r.status_code)
        if message is not None:
            print(message, file=sys.stderr)
        return r

    @staticmethod
    def _with_date_range(params, from_, to_):
        """
        Add 'from' and 'to' to params for each bound that is truthy.

        :return: The same dictionary.
        """
        if from_:
            params['from'] = from_
        if to_:
            params['to'] = to_
        return params

    def _get(self, op, auto_connect=True, **kwargs):
        """
        Send a GET request for the operation op with api key and token added.

        :param op: Path of the REST operation, starting with a slash.
        :param auto_connect: If true, a response with status code 403
          triggers connect() and one retry.
        :param kwargs: Passed on to requests.get(); 'params' is extended
          by the api key and token.
        :return: HTTP Response object, also for unsuccessful requests.
        """
        kwargs['params'] = self._add_api_key_and_token(kwargs.pop('params', None) or {})
        r = requests.get(self.base_url.format(op), **kwargs)
        if r.ok:
            return r
        if r.status_code == 403 and auto_connect:
            # Fetch a new token and retry exactly once.
            self.connect()
            return self._get(op=op, auto_connect=False, **kwargs)
        return self._handle_response_errors(r)

    def _post(self, op, auto_connect=True, json: dict = None, **kwargs):
        """
        Send a POST request for the operation op with a JSON body
        that has api key and token added.

        :param op: Path of the REST operation, starting with a slash.
        :param auto_connect: If true, a response with status code 403
          triggers connect() and one retry.
        :param json: Dictionary sent as JSON body; it is extended in place
          by the api key and token.
        :param kwargs: Passed on to requests.post(); 'headers' is replaced.
        :return: HTTP Response object, also for unsuccessful requests.
        """
        kwargs['json'] = self._add_api_key_and_token(json if json else {})
        kwargs['headers'] = {
            'content-type': 'application/json',
            'accept': 'application/ld+json'
        }
        r = requests.post(self.base_url.format(op), **kwargs)
        if r.ok:
            return r
        if r.status_code == 403 and auto_connect:
            # Fetch a new token and retry exactly once.
            self.connect()
            return self._post(op=op, auto_connect=False, **kwargs)
        return self._handle_response_errors(r)

    def fulltext_search(self, query_string, from_=None, to_=None):
        """
        Start a fulltext search query in the Webarchive.

        :param query_string: String to search for
        :param from_: Optional earliest date bound for the search
          in the format YYYYMM.
        :param to_: Optional latest date bound for the search
          in the format YYYYMM.
        :return: HTTP Response object, or None if an HTTPError was displayed
        """
        params = self._with_date_range({'q': query_string}, from_, to_)
        try:
            response = self._get(op='/search/fulltext', params=params)
            return self.wait_for_response(response)
        except HTTPError as e:
            self._display_http_error(e)
            print('Query for "{}" not added'.format(query_string))

    def fulltext_search_within_domain(self, query_string, domain, from_=None, to_=None):
        """
        Start a fulltext seed search query in the Webarchive.

        :param query_string: String to search for
        :param domain: Search only within this domain name
        :param from_: Optional earliest date bound for the search
          in the format YYYYMM.
        :param to_: Optional latest date bound for the search
          in the format YYYYMM.
        :return: HTTP Response object, or None if an HTTPError was displayed
        """
        params = self._with_date_range({'q': query_string, 'g': domain}, from_, to_)
        try:
            response = self._get(op='/search/fulltext/seed', params=params)
            return self.wait_for_response(response)
        except HTTPError as e:
            self._display_http_error(e)

    def fulltext_search_within_url(self, query_string, url, pagesize=10, from_=None, to_=None):
        """
        Start a fulltext capture search query in the Webarchive.

        :param query_string: String to search for
        :param url: Search only captures starting at this exact web address
        :param pagesize: Sent as 'pagesize' parameter of the request
        :param from_: Optional earliest date bound for the search
          in the format YYYYMM.
        :param to_: Optional latest date bound for the search
          in the format YYYYMM.
        :return: HTTP Response object, or None if an HTTPError was displayed
        """
        params = self._with_date_range(
            {'q': query_string, 'g': url, 'pagesize': pagesize}, from_, to_)
        try:
            response = self._get(op='/search/fulltext/capture', params=params)
            return self.wait_for_response(response)
        except HTTPError as e:
            self._display_http_error(e)

    def wayback_search(self, query_string, from_=None, to_=None, mode_=SEARCHMODE_WEBARCHIV):
        """
        Start a wayback search query in the Webarchive.

        :param query_string: String to search for
        :param from_: Optional earliest date bound for the search
          in the format YYYYMM.
        :param to_: Optional latest date bound for the search
          in the format YYYYMM.
        :param mode_: One of the SEARCHMODE_* constants, sent as 'mode'
        :return: HTTP Response object, or None if an HTTPError was displayed
        """
        params = self._with_date_range({'q': query_string}, from_, to_)
        params['mode'] = mode_
        try:
            response = self._get(op='/search/wayback', params=params)
            return self.wait_for_response(response)
        except HTTPError as e:
            self._display_http_error(e)
            print('Error: {}'.format(query_string))

    def wait_for_response(self, response):
        """
        Polls until the server responds with a result

        :param response: HTTP Response object of the request to wait for
        :return: The first response with status code 200, or the first
          response with a status code of 400 or above.
        """
        while response.status_code != 200:
            if response.status_code >= 400:
                # An error response is final; polling it again would fail
                # in status_query() or never terminate.
                return response
            time.sleep(0.5)
            response = self.status_query(response)
        return response

    def status_query(self, resp):
        """
        this is the polling request for the given type of request

        :param resp: HTTP Response object whose JSON body has the fields
          'requestid' and 'type'
        :return: HTTP Response object of the status operation
        :raises NotImplementedError: If the 'type' is not known to this client
        """
        body = resp.json()
        requestid = body['requestid']
        type_ = body['type']
        try:
            op = self._STATUS_OPERATIONS[type_]
        except (KeyError, TypeError):
            raise NotImplementedError(
                f'Unknown status query type {type_} - Please update client.') from None
        return self._get(op=op, params={'requestid': requestid})

    def domain_name_search(self, query_string, page_=1, pagesize_=100):
        """
        Start a domain name search in the Webarchive.

        :param query_string: String to search for
        :param page_: The page number parameter works with the page size parameter to control the offset of the records returned in the results. Default value is 1
        :param pagesize_: The page size parameter works with the page number parameter to control the offset of the records returned in the results. It also controls how many results are returned with each request. Default value is 100
        :return: HTTP Response object, or None if an HTTPError was displayed
        """
        params = {'q': query_string}
        if page_:
            params['page'] = page_
        if pagesize_:
            params['pagesize'] = pagesize_

        try:
            response = self._get(op='/search/domainname', params=params)
            return self.wait_for_response(response)
        except HTTPError as e:
            self._display_http_error(e)
            print('Error: {}'.format(query_string))

    def histogram_search(self, query_string, interval_=3, from_=None, to_=None):
        """
        Start a fulltext histogram query in the Webarchive.

        :param query_string: String to search for
        :param interval_: Sent as 'interval' parameter if truthy
        :param from_: Optional earliest date bound, sent as 'from'
        :param to_: Optional latest date bound, sent as 'to'
        :return: HTTP Response object, or None if an HTTPError was displayed
        """
        params = {'q': query_string}
        if interval_:
            params['interval'] = interval_
        self._with_date_range(params, from_, to_)

        try:
            response = self._get(op='/search/fulltext/histogram', params=params)
            return self.wait_for_response(response)
        except HTTPError as e:
            self._display_http_error(e)
            print('Error: {}'.format(query_string))

    def get_snapshot_url(self, seed, capture, onlysvg):
        """
        Build the URL of the '/snapshot' operation for a capture.

        The saved token and api key are part of the URL, so connect()
        should have been called before. All values are URL-encoded.

        :param seed: Sent as 'seed' parameter
        :param capture: Sent as 'capture' parameter
        :param onlysvg: Sent as 'onlysvg' parameter
        :return: The URL as string
        """
        query = urlencode({
            'capture': capture,
            't': self.token,
            'apikey': self.api_key,
            'onlysvg': onlysvg,
            'seed': seed,
        })
        # api_path has no trailing slash, so the operation needs a leading one.
        return self.api_path + '/snapshot?' + query

    @staticmethod
    def result_contains_seeds(response):
        """
        :return: True if the JSON body of response has the 'subtype' 2,
          False otherwise or if the body cannot be read.
        """
        try:
            return response.json()['subtype'] == 2
        except Exception:
            return False

    @staticmethod
    def result_contains_captures(response):
        """
        :return: True if the JSON body of response has the 'subtype' 3,
          False otherwise or if the body cannot be read.
        """
        try:
            return response.json()['subtype'] == 3
        except Exception:
            return False

    def save_page(self, url):
        """
        Post url to the '/savepage' operation.

        :return: HTTP Response object, or None if an HTTPError was displayed
        """
        try:
            return self._post(op='/savepage', json={"url": url})
        except HTTPError as e:
            self._display_http_error(e)

    def _fragment_checksum(self, op, payload):
        """
        Post a fragment checksum request and poll until its result is there.

        :param op: Path of the fragment checksum operation
        :param payload: Dictionary sent as JSON body
        :return: HTTP Response object, or None if an HTTPError was displayed
        """
        try:
            response = self._post(op=op, json=payload)
            if response.status_code >= 400:
                # A rejected request has no request id to ask the status for.
                return response
            response = self.status_query(response)
            return self.wait_for_response(response)
        except HTTPError as e:
            self._display_http_error(e)

    def fragment_checksum_html(self, seed, capture, selector, occurrence):
        """
        Request a checksum with the HTML extractor.

        The arguments are sent under the keys 'seed', 'capture',
        'selector' and 'occurrence'.

        :return: HTTP Response object, or None if an HTTPError was displayed
        """
        return self._fragment_checksum('/fragment/checksum/html', {
            "seed": seed,
            "capture": capture,
            "selector": selector,
            "occurrence": occurrence,
            "extractortype": EXTRACTOR_HTML
        })

    def fragment_checksum_binary(self, seed, capture):
        """
        Request a checksum with the binary extractor.

        The arguments are sent under the keys 'seed' and 'capture'.

        :return: HTTP Response object, or None if an HTTPError was displayed
        """
        return self._fragment_checksum('/fragment/checksum/binary', {
            "seed": seed,
            "capture": capture,
            "extractortype": EXTRACTOR_BINARY
        })

    def fragment_checksum_text_positionlen(self, seed, capture, pos, len):
        """
        Request a checksum with the text extractor in POSITIONLEN_MODE.

        The arguments are sent under the keys 'seed', 'capture',
        'pos' and 'len'.

        :return: HTTP Response object, or None if an HTTPError was displayed
        """
        return self._fragment_checksum('/fragment/checksum/text', {
            "seed": seed,
            "capture": capture,
            "mode": POSITIONLEN_MODE,
            "pos": pos,
            "len": len,
            "extractortype": EXTRACTOR_TEXT
        })

    def fragment_checksum_text_position(self, seed, capture, pos):
        """
        Request a checksum with the text extractor in POSITION_MODE.

        The arguments are sent under the keys 'seed', 'capture' and 'pos'.

        :return: HTTP Response object, or None if an HTTPError was displayed
        """
        return self._fragment_checksum('/fragment/checksum/text', {
            "seed": seed,
            "capture": capture,
            "mode": POSITION_MODE,
            "pos": pos,
            "extractortype": EXTRACTOR_TEXT
        })

    def fragment_checksum_text_regex(self, seed, capture, regexpattern, occurrence):
        """
        Request a checksum with the text extractor in REGEX_MODE.

        The arguments are sent under the keys 'seed', 'capture',
        'regexpattern' and 'occurrence'.

        :return: HTTP Response object, or None if an HTTPError was displayed
        """
        return self._fragment_checksum('/fragment/checksum/text', {
            "seed": seed,
            "capture": capture,
            "mode": REGEX_MODE,
            "regexpattern": regexpattern,
            "occurrence": occurrence,
            "extractortype": EXTRACTOR_TEXT
        })

    def create_watchlist(self, urls):
        """
        Post urls to the '/watchlist' operation.

        :return: HTTP Response object, or None if an HTTPError was displayed
        """
        try:
            return self._post(op='/watchlist', json={"urls": urls})
        except HTTPError as e:
            self._display_http_error(e)