def _post(self, op, jsondata, auto_connect=True):
    """
    Send a JSON POST request to the API operation ``op``.

    The URL is built with ``self.base_url.format(op)`` and ``jsondata`` is
    sent unchanged as the request body.

    :param op: operation path inserted into ``self.base_url``
    :param jsondata: already serialized JSON request body
    :param auto_connect: when true, a 403 answer triggers one call to
        ``self.connect()`` followed by a single retry of the same request
    :return: the ``requests`` response object, for successful and failed
        requests alike
    """
    r = requests.post(
        self.base_url.format(op),
        data=jsondata,
        headers={
            'content-type': 'application/json',
            'accept': 'application/ld+json',
        },
    )
    if r.ok:
        return r

    if r.status_code == 403:
        if auto_connect:
            # Retry exactly once; auto_connect=False prevents an endless loop.
            # NOTE(review): the retry re-sends the identical jsondata. If the
            # caller embedded a token in the body it is not refreshed here --
            # confirm whether connect() changes the token.
            self.connect()
            return self._post(op=op, jsondata=jsondata, auto_connect=False)
        print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr)
    elif r.status_code == 400:
        print('Bad request', file=sys.stderr)
    elif r.status_code == 410:
        print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr)

    # Hand back the response for every status code so that callers never
    # receive None for an error that is not listed above (e.g. 404 or 500).
    return r
def fragmentChecksumHtml(self, seed, capture, selector, occurrence):
    """
    Post a request to ``/fragment/checksum/html`` and wait for its result.

    Method of ``WebarchivSession``. The request body carries the session's
    ``api_key`` and ``token`` together with the arguments below and a fixed
    ``extractortype`` of 2.

    :param seed: seed of the capture (the demo below passes a URL)
    :param capture: capture identifier (the demo passes the ``'c'`` value of
        a wayback search hit)
    :param selector: selector of the fragment -- presumably a CSS selector,
        the demo passes ``".odd td"``; confirm against the API documentation
    :param occurrence: which occurrence of the selector to use; sent as a
        string
    :return: whatever ``self.waitForResponse`` returns, or ``None`` when an
        ``HTTPError`` was caught and reported
    """
    import json

    # str.format() stringified every value, so keep doing that to leave the
    # wire format unchanged; json.dumps then takes care of escaping quotes,
    # backslashes and non-ASCII characters that used to break the payload.
    text_fields = {
        "apikey": self.api_key,
        "t": self.token,
        "seed": seed,
        "capture": capture,
        "selector": selector,
        "occurrence": occurrence,
    }
    payload = {key: str(value) for key, value in text_fields.items()}
    payload["extractortype"] = 2

    try:
        response = self._post(op='/fragment/checksum/html', jsondata=json.dumps(payload))
        response = self.status_query(response)
        return self.waitForResponse(response)
    except HTTPError as e:
        self._display_http_error(e)


if __name__ == '__main__':
    # Demo: list the captures of one page in which a fragment changed.
    # NOTE(review): an API key is hard-coded here; consider revoking it and
    # reading the key from the environment or a config file instead.
    # noinspection SpellCheckingInspection
    w = WebarchivSession('2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c')
    url = "http://sport.orf.at/l/stories/2003717/"
    response = w.wayback_search(url, "20110101000000", "20120401000000")

    if response.status_code != 200:
        print("Error ", response.status_code)
        sys.exit(1)

    search_result = response.json()
    print(search_result['total'])

    print(url)

    lastchecksum = ''
    for capture in search_result['hits']:
        capturedate = capture['c']

        resp = w.fragmentChecksumHtml(url, capturedate, ".odd td", 3)
        if resp is None:
            # An HTTPError was already reported by fragmentChecksumHtml.
            continue

        fragment = resp.json()
        checksum = fragment['checksum']
        returncode = fragment['returncode']

        if returncode == 2:
            continue

        # Only report captures whose fragment differs from the previous one.
        if checksum != lastchecksum:
            print(fragment)
            print("http://wayback/web/" + capturedate + "/" + url)
            print(capturedate + " " + checksum)

        lastchecksum = checksum

    print("end")