diff --git a/webarchiv.py b/webarchiv.py index 7e906d39ff9cdfda26c13486c1c6f49537c67d4d..0388094db403c976f71fee0e7deed9bd5fe3867b 100644 --- a/webarchiv.py +++ b/webarchiv.py @@ -7,6 +7,11 @@ from requests import HTTPError _datetime_format_string = '%Y%m%d%H%M%S' +EXTRACTOR_TEXT = 1 +EXTRACTOR_HTML = 2 +EXTRACTOR_BINARY = 3 + + class SessionTimeoutError(Exception): pass @@ -103,49 +108,45 @@ class WebarchivSession: response_text=e.response.text), file=sys.stderr) + @staticmethod + def _handle_response_errors(r): + if r.status_code == 403: + print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr) + return r + elif r.status_code == 400: + print('Bad request', file=sys.stderr) + return r + elif r.status_code == 410: + print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr) + return r + def _get(self, op, auto_connect=True, **kwargs, ): kwargs['params'] = self._add_api_key_and_token(kwargs.pop('params', {})) - # kwargs = self._add_api_key_and_token(kwargs) r = requests.get(self.base_url.format(op), **kwargs) if r.ok: return r + elif r.status_code == 403 and auto_connect: + self.connect() + return self._get(op=op, auto_connect=False, **kwargs) else: - if r.status_code == 403: - if auto_connect: - self.connect() - return self._get(op=op, auto_connect=False, **kwargs) - else: - print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr) - return r - elif r.status_code == 400: - print('Bad request', file=sys.stderr) - return r - elif r.status_code == 410: - print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr) - return r - - def _post(self, op, jsondata, auto_connect=True): - r = requests.post(self.base_url.format(op), jsondata, - headers={ - 'content-type': 'application/json', - 'accept': 'application/ld+json' - }) + return self._handle_response_errors(r) + + def _post(self, op, auto_connect=True, json: dict = None, **kwargs): + if not json: + json = {} + kwargs['json'] = self._add_api_key_and_token(json) + kwargs['headers'] = { + 'content-type': 'application/json', + 'accept': 'application/ld+json' + } + r = requests.post(self.base_url.format(op), **kwargs) if r.ok: return r + elif r.status_code == 403 and auto_connect: + self.connect() + return self._post(op=op, auto_connect=False, **kwargs) else: - if r.status_code == 403: - if auto_connect: - self.connect() - return self._post(op=op, jsondata=jsondata, auto_connect=False) - else: - print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr) - return r - elif r.status_code == 400: - print('Bad request', file=sys.stderr) - return r - elif r.status_code == 410: - print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr) - return r + return self._handle_response_errors(r) def fulltext_search(self, query_string, from_=None, to_=None): """ @@ -166,7 +167,7 @@ class WebarchivSession: try: response = self._get(op='/search/fulltext', params=params) - return self.waitForResponse(response) + return self.wait_for_response(response) except HTTPError as e: self._display_http_error(e) @@ -192,7 +193,7 @@ class WebarchivSession: try: response = self._get(op='/search/fulltext/seed', params=params) - return self.waitForResponse(response) + return self.wait_for_response(response) except HTTPError as e: self._display_http_error(e) @@ -217,7 +218,7 @@ class WebarchivSession: try: response = self._get(op='/search/fulltext/capture', params=params) - return self.waitForResponse(response) + return self.wait_for_response(response) except HTTPError as e: self._display_http_error(e) @@ -241,18 +242,15 @@ class WebarchivSession: try: response = self._get(op='/search/wayback', params=params) - return self.waitForResponse(response) + return self.wait_for_response(response) except HTTPError as e: self._display_http_error(e) print('Error:'.format(query_string)) - def waitForResponse(self, response): + def wait_for_response(self, response): """ Polls until the server responds with a result - - :param response: String to search for - :return: response """ if response.status_code == 400: return response @@ -265,12 +263,11 @@ class WebarchivSession: def status_query(self, resp): """ - this is the pollingrequest for the given typen of request - - :param resp: String to search for - :return: response + this is the polling request for the given type of request """ - requestid = resp.json()['requestid'] + j = resp.json() + context = j['context'] + requestid = j['requestid'] type_ = resp.json()['type'] if type_ == 1: r = self._get(op='/search/status/fulltext', params={'requestid': requestid}) @@ -300,7 +297,7 @@ class WebarchivSession: try: response = self._get(op='/search/domainname', params=params) - return self.waitForResponse(response) + return self.wait_for_response(response) except HTTPError as e: self._display_http_error(e) @@ -323,30 +320,30 @@ class WebarchivSession: try: response = self._get(op='/search/fulltext/histogram', params=params) - return self.waitForResponse(response) + return self.wait_for_response(response) except HTTPError as e: self._display_http_error(e) print('Error:'.format(query_string)) - def getSnapshotUrl(self, seed, capture, onlysvg): + def get_snapshot_url(self, seed, capture, onlysvg): return self.api_path + 'snapshot?capture=' + capture + '&t=' + self.token + '&apikey=' + self.api_key + '&onlysvg=' + onlysvg + '&seed=' + seed @staticmethod - def resultContainsSeeds(response): + def result_contains_seeds(response): try: return response.json()['subtype'] == 2 except: return False @staticmethod - def resultContainsCaptures(response): + def result_contains_captures(response): try: return response.json()['subtype'] == 3 except: return False - def savePage(self, url): + def save_page(self, url): self.connect() r = requests.post(self.base_url.format('savepage'), data='''{{ @@ -361,21 +358,17 @@ class WebarchivSession: ) return r - def fragmentChecksumHtml(self, seed, capture, selector, occurrence): + def fragment_checksum_html(self, seed, capture, selector, occurrence): try: - response = self._post(op='/fragment/checksum/html', jsondata='''{{ - "apikey": "{api_key}", - "t": "{token}", - "seed": "{seed}", - "capture": "{capture}", - "selector": "{selector}", - "occurrence": "{occurrence}", - "extractortype": 2 - - }}'''.format(api_key=self.api_key, token=self.token, seed=seed, capture=capture, - selector=selector, occurrence=occurrence)) + response = self._post(op='/fragment/checksum/html', json={ + "seed": seed, + "capture": capture, + "selector": selector, + "occurrence": occurrence, + "extractortype": EXTRACTOR_HTML + }) response = self.status_query(response) - return self.waitForResponse(response) + return self.wait_for_response(response) except HTTPError as e: self._display_http_error(e) @@ -401,7 +394,7 @@ if __name__ == '__main__': for capture in response.json()['hits']: capturedate = capture['c'] - resp = w.fragmentChecksumHtml(url, capturedate, ".odd td", 3); + resp = w.fragment_checksum_html(url, capturedate, ".odd td", 3) checksum = resp.json()['checksum'] returncode = resp.json()['returncode']