diff --git a/webarchiv.py b/webarchiv.py index 36ca857e846f617ebd65d80a103a6d9291bc0b7f..b166098fe935e3e7e22707ced8a1825f0edc3009 100644 --- a/webarchiv.py +++ b/webarchiv.py @@ -8,7 +8,7 @@ _datetime_format_string = '%Y%m%d%H%M%S' EXTRACTOR_TEXT = 1 -EXTRACTOR_CSS = 2 +EXTRACTOR_HTML = 2 EXTRACTOR_BINARY = 3 @@ -130,8 +130,14 @@ class WebarchivSession: else: return self._handle_response_errors(r) - def _post(self, op, auto_connect=True, **kwargs): - kwargs['json'] = self._add_api_key_and_token(kwargs.pop('json', {})) + def _post(self, op, auto_connect=True, json: dict = None, **kwargs): + if not json: + json = {} + kwargs['json'] = self._add_api_key_and_token(json) + kwargs['headers'] = { + 'content-type': 'application/json', + 'accept': 'application/ld+json' + } r = requests.post(self.base_url.format(op), **kwargs) if r.ok: return r @@ -141,29 +147,6 @@ class WebarchivSession: else: return self._handle_response_errors(r) - def _post(self, op, jsondata, auto_connect=True): - r = requests.post(self.base_url.format(op), jsondata, - headers={ - 'content-type': 'application/json', - 'accept': 'application/ld+json' - }) - if r.ok: - return r - else: - if r.status_code == 403: - if auto_connect: - self.connect() - return self._post(op=op, jsondata=jsondata, auto_connect=False) - else: - print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr) - return r - elif r.status_code == 400: - print('Bad request', file=sys.stderr) - return r - elif r.status_code == 410: - print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr) - return r - def fulltext_search(self, query_string, from_=None, to_=None): """ Start a fulltext search query in the Webarchive. @@ -376,17 +359,13 @@ class WebarchivSession: def fragmentChecksumHtml(self, seed, capture, selector, occurrence): try: - response = self._post(op='/fragment/checksum/html', jsondata='''{{ - "apikey": "{api_key}", - "t": "{token}", - "seed": "{seed}", - "capture": "{capture}", - "selector": "{selector}", - "occurrence": "{occurrence}", - "extractortype": 2 - - }}'''.format(api_key=self.api_key, token=self.token, seed=seed, capture=capture, - selector=selector, occurrence=occurrence)) + response = self._post(op='/fragment/checksum/html', json={ + "seed": seed, + "capture": capture, + "selector": selector, + "occurrence": occurrence, + "extractortype": EXTRACTOR_HTML + }) response = self.status_query(response) return self.waitForResponse(response) except HTTPError as e: @@ -414,7 +393,7 @@ if __name__ == '__main__': for capture in response.json()['hits']: capturedate = capture['c'] - resp = w.fragmentChecksumHtml(url, capturedate, ".odd td", 3); + resp = w.fragmentChecksumHtml(url, capturedate, ".odd td", 3) checksum = resp.json()['checksum'] returncode = resp.json()['returncode']