From 8dc2b37deeaced031b8a89cb5bab930d8458d769 Mon Sep 17 00:00:00 2001 From: Stefan Karner Date: Wed, 22 May 2019 15:52:09 +0200 Subject: [PATCH] Refactor fragmentChecksumHtml --- webarchiv.py | 55 ++++++++++++++++------------------------------------ 1 file changed, 17 insertions(+), 38 deletions(-) diff --git a/webarchiv.py b/webarchiv.py index 36ca857..b166098 100644 --- a/webarchiv.py +++ b/webarchiv.py @@ -8,7 +8,7 @@ _datetime_format_string = '%Y%m%d%H%M%S' EXTRACTOR_TEXT = 1 -EXTRACTOR_CSS = 2 +EXTRACTOR_HTML = 2 EXTRACTOR_BINARY = 3 @@ -130,8 +130,14 @@ class WebarchivSession: else: return self._handle_response_errors(r) - def _post(self, op, auto_connect=True, **kwargs): - kwargs['json'] = self._add_api_key_and_token(kwargs.pop('json', {})) + def _post(self, op, auto_connect=True, json: dict = None, **kwargs): + if not json: + json = {} + kwargs['json'] = self._add_api_key_and_token(json) + kwargs['headers'] = { + 'content-type': 'application/json', + 'accept': 'application/ld+json' + } r = requests.post(self.base_url.format(op), **kwargs) if r.ok: return r @@ -141,29 +147,6 @@ class WebarchivSession: else: return self._handle_response_errors(r) - def _post(self, op, jsondata, auto_connect=True): - r = requests.post(self.base_url.format(op), jsondata, - headers={ - 'content-type': 'application/json', - 'accept': 'application/ld+json' - }) - if r.ok: - return r - else: - if r.status_code == 403: - if auto_connect: - self.connect() - return self._post(op=op, jsondata=jsondata, auto_connect=False) - else: - print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr) - return r - elif r.status_code == 400: - print('Bad request', file=sys.stderr) - return r - elif r.status_code == 410: - print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr) - return r - def fulltext_search(self, query_string, from_=None, to_=None): """ Start a fulltext search query in the Webarchive. @@ -376,17 +359,13 @@ class WebarchivSession: def fragmentChecksumHtml(self, seed, capture, selector, occurrence): try: - response = self._post(op='/fragment/checksum/html', jsondata='''{{ - "apikey": "{api_key}", - "t": "{token}", - "seed": "{seed}", - "capture": "{capture}", - "selector": "{selector}", - "occurrence": "{occurrence}", - "extractortype": 2 - - }}'''.format(api_key=self.api_key, token=self.token, seed=seed, capture=capture, - selector=selector, occurrence=occurrence)) + response = self._post(op='/fragment/checksum/html', json={ + "seed": seed, + "capture": capture, + "selector": selector, + "occurrence": occurrence, + "extractortype": EXTRACTOR_HTML + }) response = self.status_query(response) return self.waitForResponse(response) except HTTPError as e: @@ -414,7 +393,7 @@ if __name__ == '__main__': for capture in response.json()['hits']: capturedate = capture['c'] - resp = w.fragmentChecksumHtml(url, capturedate, ".odd td", 3); + resp = w.fragmentChecksumHtml(url, capturedate, ".odd td", 3) checksum = resp.json()['checksum'] returncode = resp.json()['returncode'] -- GitLab