Commit 4668defb authored by onbpre

fragment checksumming

parent 8859522e
......@@ -124,6 +124,29 @@ class WebarchivSession:
print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr)
return r
def _post(self, op, jsondata, auto_connect=True):
    """
    Send a POST request with a JSON body to the API operation ``op``.

    :param op: operation path that is substituted into ``self.base_url``
    :param jsondata: request body, handed to ``requests.post`` unchanged
    :param auto_connect: when True, a 403 answer triggers one call to
        ``self.connect()`` followed by a single retry of the same request
    :return: the ``requests`` response object, for successful and for
        failed requests alike
    """
    r = requests.post(self.base_url.format(op), jsondata,
                      headers={
                          'content-type': 'application/json',
                          'accept': 'application/ld+json'
                      })
    if r.ok:
        return r

    if r.status_code == 403:
        if auto_connect:
            # Reconnect and retry exactly once; auto_connect=False stops the recursion.
            # NOTE(review): jsondata is resent unchanged - if the body embeds a
            # token that connect() replaces, the retry may be rejected again; confirm.
            self.connect()
            return self._post(op=op, jsondata=jsondata, auto_connect=False)
        print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr)
    elif r.status_code == 400:
        print('Bad request', file=sys.stderr)
    elif r.status_code == 410:
        print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr)

    # Always hand the response back: previously any other error status
    # (e.g. 404 or 500) fell off the end of the method and produced None.
    return r
def fulltext_search(self, query_string, from_=None, to_=None):
"""
Start a fulltext search query in the Webarchive.
......@@ -253,6 +276,8 @@ class WebarchivSession:
r = self._get(op='/search/status/fulltext', params={'requestid': requestid})
elif type_ == 2:
r = self._get(op='/search/status/wayback', params={'requestid': requestid})
elif type_ == 5:
r = self._get(op='/fragment/checksum/status', params={'requestid': requestid})
else:
raise NotImplementedError(f'Unknown status query type {type_} - Please update client.')
......@@ -335,3 +360,59 @@ class WebarchivSession:
}
)
return r
def fragmentChecksumHtml(self, seed, capture, selector, occurrence):
    """
    Request a checksum for an HTML fragment of an archived capture.

    Posts the request to ``/fragment/checksum/html``, then passes the answer
    through ``self.status_query`` and ``self.waitForResponse``.

    :param seed: sent as the ``seed`` field of the request
    :param capture: sent as the ``capture`` field of the request
    :param selector: sent as the ``selector`` field of the request
    :param occurrence: sent as the ``occurrence`` field (transmitted as a string)
    :return: whatever ``self.waitForResponse`` returns, or None when an
        ``HTTPError`` was raised and reported via ``self._display_http_error``
    """
    import json  # local import so this method does not depend on the file's import block

    # Values are stringified exactly as the former str.format() template did,
    # but json.dumps takes care of escaping: selectors such as a[href="x"]
    # used to produce an invalid JSON document.
    text_fields = {
        'apikey': self.api_key,
        't': self.token,
        'seed': seed,
        'capture': capture,
        'selector': selector,
        'occurrence': occurrence,
    }
    payload = {key: str(value) for key, value in text_fields.items()}
    payload['extractortype'] = 2

    try:
        response = self._post(op='/fragment/checksum/html', jsondata=json.dumps(payload))
        response = self.status_query(response)
        return self.waitForResponse(response)
    except HTTPError as e:
        self._display_http_error(e)
if __name__ == '__main__':
    # Demo run: look up the captures of one URL in a fixed time range and print
    # every capture whose fragment checksum differs from the previous one.
    # NOTE(review): the API key is hard-coded in the source - consider loading
    # it from the environment or a config file instead.
    # noinspection SpellCheckingInspection
    w = WebarchivSession('2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c')

    # response = w.wayback_search("http://www.onb.ac.at")
    # response = w.wayback_search("http://frauenhetz.jetzt")
    url = "http://sport.orf.at/l/stories/2003717/"
    response = w.wayback_search(url, "20110101000000", "20120401000000")
    # response = w.wayback_search("x")

    if response.status_code != 200:
        print("Error ", response.status_code)
        # sys.exit instead of the site-provided exit(), which is not always available
        sys.exit(1)

    search_result = response.json()  # parse the body once
    print(search_result['total'])
    print(url)

    lastchecksum = ''
    for capture in search_result['hits']:
        capturedate = capture['c']
        resp = w.fragmentChecksumHtml(url, capturedate, ".odd td", 3)
        if resp is None:
            # fragmentChecksumHtml returns None after reporting an HTTPError
            continue

        fragment = resp.json()
        # Captures answered with returncode 2 are skipped before the checksum is read.
        # NOTE(review): the meaning of returncode 2 is not visible here - confirm.
        if fragment['returncode'] == 2:
            continue

        checksum = fragment['checksum']
        if checksum != lastchecksum:
            print(fragment)
            print("http://wayback/web/" + capturedate + "/" + url)
            print(capturedate + " " + checksum)
            lastchecksum = checksum

    print("end")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment