From bc6d08fd4ccc47fa36c59a1ba5319b9e2d76c1b8 Mon Sep 17 00:00:00 2001 From: Stefan Karner Date: Wed, 22 May 2019 15:23:10 +0200 Subject: [PATCH 1/3] Start _post method; add constants for extractor methods --- webarchiv.py | 57 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/webarchiv.py b/webarchiv.py index ee597e8..4b355e6 100644 --- a/webarchiv.py +++ b/webarchiv.py @@ -7,6 +7,11 @@ from requests import HTTPError _datetime_format_string = '%Y%m%d%H%M%S' +EXTRACTOR_TEXT = 1 +EXTRACTOR_CSS = 2 +EXTRACTOR_BINARY = 3 + + class SessionTimeoutError(Exception): pass @@ -102,27 +107,39 @@ class WebarchivSession: print(self._error_template.format(status_code=e.response.status_code, response_text=e.response.text), file=sys.stderr) - + + def _handle_response_errors(self, r): + if r.status_code == 403: + print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr) + return r + elif r.status_code == 400: + print('Bad request', file=sys.stderr) + return r + elif r.status_code == 410: + print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr) + return r + def _get(self, op, auto_connect=True, **kwargs, ): kwargs['params'] = self._add_api_key_and_token(kwargs.pop('params', {})) - # kwargs = self._add_api_key_and_token(kwargs) r = requests.get(self.base_url.format(op), **kwargs) if r.ok: return r + elif r.status_code == 403 and auto_connect: + self.connect() + return self._get(op=op, auto_connect=False, **kwargs) else: - if r.status_code == 403: - if auto_connect: - self.connect() - return self._get(op=op, auto_connect=False, **kwargs) - else: - print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr) - return r - elif r.status_code == 400: - print('Bad request', file=sys.stderr) - return r - elif r.status_code == 410: - print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr) + return self._handle_response_errors(r) + + def _post(self, op, auto_connect=True, **kwargs): + kwargs['json'] = self._add_api_key_and_token(kwargs.pop('json', {})) + r = requests.post(self.base_url.format(op), **kwargs) + if r.ok: return r + elif r.status_code == 403 and auto_connect: + self.connect() + return self._post(op=op, auto_connect=False, **kwargs) + else: + return self._handle_response_errors(r) def fulltext_search(self, query_string, from_=None, to_=None): """ @@ -227,9 +244,6 @@ class WebarchivSession: def waitForResponse(self, response): """ Polls until the server responds with a result - - :param response: String to search for - :return: response """ if response.status_code == 400: return response @@ -242,12 +256,11 @@ class WebarchivSession: def status_query(self, resp): """ - this is the pollingrequest for the given typen of request - - :param resp: String to search for - :return: response + this is the polling request for the given type of request """ - requestid = resp.json()['requestid'] + j = resp.json() + context = j['context'] + requestid = j['requestid'] type_ = resp.json()['type'] if type_ == 1: r = self._get(op='/search/status/fulltext', params={'requestid': requestid}) -- GitLab From 8dc2b37deeaced031b8a89cb5bab930d8458d769 Mon Sep 17 00:00:00 2001 From: Stefan Karner Date: Wed, 22 May 2019 15:52:09 +0200 Subject: [PATCH 2/3] Refactor fragmentChecksumHtml --- webarchiv.py | 55 ++++++++++++++++------------------------------------ 1 file changed, 17 insertions(+), 38 deletions(-) diff --git a/webarchiv.py b/webarchiv.py index 36ca857..b166098 100644 --- a/webarchiv.py +++ b/webarchiv.py @@ -8,7 +8,7 @@ _datetime_format_string = '%Y%m%d%H%M%S' EXTRACTOR_TEXT = 1 -EXTRACTOR_CSS = 2 +EXTRACTOR_HTML = 2 EXTRACTOR_BINARY = 3 @@ -130,8 +130,14 @@ class WebarchivSession: else: return self._handle_response_errors(r) - def _post(self, op, auto_connect=True, **kwargs): - kwargs['json'] = self._add_api_key_and_token(kwargs.pop('json', {})) + def _post(self, op, auto_connect=True, json: dict = None, **kwargs): + if not json: + json = {} + kwargs['json'] = self._add_api_key_and_token(json) + kwargs['headers'] = { + 'content-type': 'application/json', + 'accept': 'application/ld+json' + } r = requests.post(self.base_url.format(op), **kwargs) if r.ok: return r @@ -141,29 +147,6 @@ class WebarchivSession: else: return self._handle_response_errors(r) - def _post(self, op, jsondata, auto_connect=True): - r = requests.post(self.base_url.format(op), jsondata, - headers={ - 'content-type': 'application/json', - 'accept': 'application/ld+json' - }) - if r.ok: - return r - else: - if r.status_code == 403: - if auto_connect: - self.connect() - return self._post(op=op, jsondata=jsondata, auto_connect=False) - else: - print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr) - return r - elif r.status_code == 400: - print('Bad request', file=sys.stderr) - return r - elif r.status_code == 410: - print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr) - return r - def fulltext_search(self, query_string, from_=None, to_=None): """ Start a fulltext search query in the Webarchive. @@ -376,17 +359,13 @@ class WebarchivSession: def fragmentChecksumHtml(self, seed, capture, selector, occurrence): try: - response = self._post(op='/fragment/checksum/html', jsondata='''{{ - "apikey": "{api_key}", - "t": "{token}", - "seed": "{seed}", - "capture": "{capture}", - "selector": "{selector}", - "occurrence": "{occurrence}", - "extractortype": 2 - - }}'''.format(api_key=self.api_key, token=self.token, seed=seed, capture=capture, - selector=selector, occurrence=occurrence)) + response = self._post(op='/fragment/checksum/html', json={ + "seed": seed, + "capture": capture, + "selector": selector, + "occurrence": occurrence, + "extractortype": EXTRACTOR_HTML + }) response = self.status_query(response) return self.waitForResponse(response) except HTTPError as e: @@ -414,7 +393,7 @@ if __name__ == '__main__': for capture in response.json()['hits']: capturedate = capture['c'] - resp = w.fragmentChecksumHtml(url, capturedate, ".odd td", 3); + resp = w.fragmentChecksumHtml(url, capturedate, ".odd td", 3) checksum = resp.json()['checksum'] returncode = resp.json()['returncode'] -- GitLab From 60c8c0ff2ff1f9efcdba0503cd525f5457bcf283 Mon Sep 17 00:00:00 2001 From: Stefan Karner Date: Wed, 22 May 2019 15:58:16 +0200 Subject: [PATCH 3/3] Refactor method names --- webarchiv.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/webarchiv.py b/webarchiv.py index b166098..0388094 100644 --- a/webarchiv.py +++ b/webarchiv.py @@ -107,8 +107,9 @@ class WebarchivSession: print(self._error_template.format(status_code=e.response.status_code, response_text=e.response.text), file=sys.stderr) - - def _handle_response_errors(self, r): + + @staticmethod + def _handle_response_errors(r): if r.status_code == 403: print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr) return r @@ -166,7 +167,7 @@ class WebarchivSession: try: response = self._get(op='/search/fulltext', params=params) - return self.waitForResponse(response) + return self.wait_for_response(response) except HTTPError as e: self._display_http_error(e) @@ -192,7 +193,7 @@ class WebarchivSession: try: response = self._get(op='/search/fulltext/seed', params=params) - return self.waitForResponse(response) + return self.wait_for_response(response) except HTTPError as e: self._display_http_error(e) @@ -217,7 +218,7 @@ class WebarchivSession: try: response = self._get(op='/search/fulltext/capture', params=params) - return self.waitForResponse(response) + return self.wait_for_response(response) except HTTPError as e: self._display_http_error(e) @@ -241,13 +242,13 @@ class WebarchivSession: try: response = self._get(op='/search/wayback', params=params) - return self.waitForResponse(response) + return self.wait_for_response(response) except HTTPError as e: self._display_http_error(e) print('Error:'.format(query_string)) - def waitForResponse(self, response): + def wait_for_response(self, response): """ Polls until the server responds with a result """ @@ -296,7 +297,7 @@ class WebarchivSession: try: response = self._get(op='/search/domainname', params=params) - return self.waitForResponse(response) + return self.wait_for_response(response) except HTTPError as e: self._display_http_error(e) @@ -319,30 +320,30 @@ class WebarchivSession: try: response = self._get(op='/search/fulltext/histogram', params=params) - return self.waitForResponse(response) + return self.wait_for_response(response) except HTTPError as e: self._display_http_error(e) print('Error:'.format(query_string)) - def getSnapshotUrl(self, seed, capture, onlysvg): + def get_snapshot_url(self, seed, capture, onlysvg): return self.api_path + 'snapshot?capture=' + capture + '&t=' + self.token + '&apikey=' + self.api_key + '&onlysvg=' + onlysvg + '&seed=' + seed @staticmethod - def resultContainsSeeds(response): + def result_contains_seeds(response): try: return response.json()['subtype'] == 2 except: return False @staticmethod - def resultContainsCaptures(response): + def result_contains_captures(response): try: return response.json()['subtype'] == 3 except: return False - def savePage(self, url): + def save_page(self, url): self.connect() r = requests.post(self.base_url.format('savepage'), data='''{{ @@ -357,7 +358,7 @@ class WebarchivSession: ) return r - def fragmentChecksumHtml(self, seed, capture, selector, occurrence): + def fragment_checksum_html(self, seed, capture, selector, occurrence): try: response = self._post(op='/fragment/checksum/html', json={ "seed": seed, @@ -367,7 +368,7 @@ class WebarchivSession: "extractortype": EXTRACTOR_HTML }) response = self.status_query(response) - return self.waitForResponse(response) + return self.wait_for_response(response) except HTTPError as e: self._display_http_error(e) @@ -393,7 +394,7 @@ if __name__ == '__main__': for capture in response.json()['hits']: capturedate = capture['c'] - resp = w.fragmentChecksumHtml(url, capturedate, ".odd td", 3) + resp = w.fragment_checksum_html(url, capturedate, ".odd td", 3) checksum = resp.json()['checksum'] returncode = resp.json()['returncode'] -- GitLab