def _post(self, op, jsondata, auto_connect=True):
    """
    Send a POST request with a JSON body to the Webarchive API.

    :param op: operation path that is inserted into ``self.base_url`` via ``str.format``.
    :param jsondata: request body; passed to ``requests.post`` unchanged as the ``data``
        argument, so it has to be an already serialized JSON document.
    :param auto_connect: when true, a 403 answer triggers one call to ``self.connect()``
        followed by exactly one retry (the retry runs with ``auto_connect=False``).
    :return: the ``requests`` response object. It is returned for failed requests as
        well, so callers can always inspect ``status_code``.
    """
    r = requests.post(self.base_url.format(op), jsondata,
                      headers={
                          'content-type': 'application/json',
                          'accept': 'application/ld+json'
                      })
    if r.ok:
        return r

    if r.status_code == 403 and auto_connect:
        # NOTE(review): jsondata is resent unchanged. If the body embeds a token that
        # connect() replaces, the retry still carries the old one - confirm with callers.
        self.connect()
        return self._post(op=op, jsondata=jsondata, auto_connect=False)

    known_errors = {
        403: 'Forbidden. Invalid Token or ApiKey transmitted',
        400: 'Bad request',
        410: 'The requested API Version (via X-API-VERSION Header) is not available',
    }
    message = known_errors.get(r.status_code)
    if message is None:
        # Previously every other status (404, 500, ...) fell through and returned None.
        message = f'Unexpected HTTP status {r.status_code}'
    print(message, file=sys.stderr)
    return r
def fragmentChecksumHtml(self, seed, capture, selector, occurrence):
    """
    Request the checksum of an HTML fragment of an archived capture.

    The request is posted to ``/fragment/checksum/html``; the answer is then handed to
    ``self.status_query`` and ``self.waitForResponse``.

    :param seed: URL of the archived page.
    :param capture: identifier of the capture, sent as a string.
    :param selector: selector naming the fragment (e.g. ``".odd td"``).
    :param occurrence: which match of the selector to use; sent as a string, exactly as
        the previous string template did.
    :return: whatever ``self.waitForResponse`` returns, or ``None`` when an
        ``HTTPError`` was raised and reported through ``self._display_http_error``.
    """
    # Local import: keeps this method usable without touching the module's import block.
    import json

    # Serialize with json.dumps instead of a string template so that quotes or
    # backslashes inside selector/seed (e.g. 'a[href="x"]') cannot break the JSON body.
    # Values stay strings, and extractortype stays the integer 2, as before.
    payload = json.dumps({
        "apikey": str(self.api_key),
        "t": str(self.token),
        "seed": str(seed),
        "capture": str(capture),
        "selector": str(selector),
        "occurrence": str(occurrence),
        "extractortype": 2,
    })

    try:
        response = self._post(op='/fragment/checksum/html', jsondata=payload)
        response = self.status_query(response)
        return self.waitForResponse(response)
    except HTTPError as e:
        self._display_http_error(e)
        return None


if __name__ == '__main__':
    import os

    # NOTE(review): an API key is committed in this file. Prefer supplying it through
    # the WEBARCHIV_API_KEY environment variable; the literal is only kept as fallback
    # so the demo keeps working unchanged.
    # noinspection SpellCheckingInspection
    w = WebarchivSession(os.environ.get('WEBARCHIV_API_KEY', '2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c'))

    url = "http://sport.orf.at/l/stories/2003717/"
    response = w.wayback_search(url, "20110101000000", "20120401000000")

    if response.status_code != 200:
        print("Error ", response.status_code)
        # SystemExit instead of exit(): the exit() helper is only injected by the site module.
        raise SystemExit(1)

    # Parse the search result once instead of once per access.
    search_result = response.json()
    print(search_result['total'])

    print(url)

    # Walk over all captures and print one only when its fragment checksum differs
    # from the checksum of the previously processed capture.
    lastchecksum = ''
    for capture in search_result['hits']:
        capturedate = capture['c']

        resp = w.fragmentChecksumHtml(url, capturedate, ".odd td", 3)
        if resp is None:
            # The HTTP error was already reported by fragmentChecksumHtml.
            continue

        fragment = resp.json()

        # presumably returncode 2 marks a capture without a usable checksum - TODO confirm
        if fragment['returncode'] == 2:
            continue

        checksum = fragment['checksum']
        if checksum != lastchecksum:
            print(fragment)
            print("http://wayback/web/" + capturedate + "/" + url)
            print(capturedate + " " + checksum)

        lastchecksum = checksum

    print("end")