Skip to content
Commits on Source (6)
......@@ -7,6 +7,11 @@ from requests import HTTPError
# strftime/strptime pattern for a compact timestamp: year, month, day, hour, minute, second.
_datetime_format_string = '%Y%m%d%H%M%S'
# Codes for the "extractortype" request field; fragment_checksum_html sends EXTRACTOR_HTML.
# NOTE(review): TEXT and BINARY presumably select other extractor kinds -- confirm against the API.
EXTRACTOR_TEXT = 1
EXTRACTOR_HTML = 2
EXTRACTOR_BINARY = 3
class SessionTimeoutError(Exception):
    """Dedicated exception type named for session timeouts; adds no behaviour to ``Exception``."""
......@@ -103,49 +108,45 @@ class WebarchivSession:
response_text=e.response.text),
file=sys.stderr)
@staticmethod
def _handle_response_errors(r):
    """Print a diagnostic for a known API error status and return the response.

    :param r: response object exposing ``status_code``
    :return: ``r`` itself, whatever its status code
    """
    # Status codes this client can explain, with the message printed for each.
    known_errors = {
        403: 'Forbidden. Invalid Token or ApiKey transmitted',
        400: 'Bad request',
        410: 'The requested API Version (via X-API-VERSION Header) is not available',
    }
    message = known_errors.get(r.status_code)
    if message is not None:
        print(message, file=sys.stderr)
    # Statuses without a message (e.g. 404, 500) are returned as well, so
    # callers that go on to read ``status_code`` never receive None.
    return r
def _get(self, op, auto_connect=True, **kwargs):
    """Send a GET request for the API operation *op*.

    The ``params`` keyword argument (an empty dict when absent) is passed
    through ``self._add_api_key_and_token`` before sending; every other
    keyword argument is forwarded to ``requests.get`` unchanged.

    :param op: value substituted into ``self.base_url``
    :param auto_connect: when true, a 403 answer triggers ``self.connect()``
        followed by exactly one retry
    :return: the response when it is ok, otherwise the result of
        ``self._handle_response_errors``
    """
    kwargs['params'] = self._add_api_key_and_token(kwargs.pop('params', {}))
    r = requests.get(self.base_url.format(op), **kwargs)
    if r.ok:
        return r
    if r.status_code == 403 and auto_connect:
        # Reconnect and retry once; auto_connect=False on the retry keeps a
        # persistent 403 from recursing forever.
        self.connect()
        return self._get(op=op, auto_connect=False, **kwargs)
    return self._handle_response_errors(r)

def _post(self, op, auto_connect=True, json: dict = None, **kwargs):
    """Send a POST request with a JSON body for the API operation *op*.

    The body (an empty dict when *json* is falsy) is passed through
    ``self._add_api_key_and_token`` before sending. JSON content-type and
    JSON-LD accept headers are set by default; headers supplied by the
    caller are merged on top of these defaults instead of being discarded.

    :param op: value substituted into ``self.base_url``
    :param auto_connect: when true, a 403 answer triggers ``self.connect()``
        followed by exactly one retry
    :param json: request body as a dict
    :return: the response when it is ok, otherwise the result of
        ``self._handle_response_errors``
    """
    kwargs['json'] = self._add_api_key_and_token(json if json else {})
    headers = {
        'content-type': 'application/json',
        'accept': 'application/ld+json',
    }
    # Caller-provided headers win over the defaults.
    headers.update(kwargs.get('headers') or {})
    kwargs['headers'] = headers
    r = requests.post(self.base_url.format(op), **kwargs)
    if r.ok:
        return r
    if r.status_code == 403 and auto_connect:
        # kwargs already carries the prepared 'json' and 'headers', so the
        # retry re-sends the same request after reconnecting.
        self.connect()
        return self._post(op=op, auto_connect=False, **kwargs)
    return self._handle_response_errors(r)
def fulltext_search(self, query_string, from_=None, to_=None):
"""
......@@ -166,7 +167,7 @@ class WebarchivSession:
try:
response = self._get(op='/search/fulltext', params=params)
return self.waitForResponse(response)
return self.wait_for_response(response)
except HTTPError as e:
self._display_http_error(e)
......@@ -192,7 +193,7 @@ class WebarchivSession:
try:
response = self._get(op='/search/fulltext/seed', params=params)
return self.waitForResponse(response)
return self.wait_for_response(response)
except HTTPError as e:
self._display_http_error(e)
......@@ -217,7 +218,7 @@ class WebarchivSession:
try:
response = self._get(op='/search/fulltext/capture', params=params)
return self.waitForResponse(response)
return self.wait_for_response(response)
except HTTPError as e:
self._display_http_error(e)
......@@ -241,18 +242,15 @@ class WebarchivSession:
try:
response = self._get(op='/search/wayback', params=params)
return self.waitForResponse(response)
return self.wait_for_response(response)
except HTTPError as e:
self._display_http_error(e)
print('Error:'.format(query_string))
def waitForResponse(self, response):
def wait_for_response(self, response):
"""
Polls until the server responds with a result
:param response: String to search for
:return: response
"""
if response.status_code == 400:
return response
......@@ -265,12 +263,11 @@ class WebarchivSession:
def status_query(self, resp):
"""
this is the pollingrequest for the given typen of request
:param resp: String to search for
:return: response
this is the polling request for the given type of request
"""
requestid = resp.json()['requestid']
j = resp.json()
context = j['context']
requestid = j['requestid']
type_ = resp.json()['type']
if type_ == 1:
r = self._get(op='/search/status/fulltext', params={'requestid': requestid})
......@@ -300,7 +297,7 @@ class WebarchivSession:
try:
response = self._get(op='/search/domainname', params=params)
return self.waitForResponse(response)
return self.wait_for_response(response)
except HTTPError as e:
self._display_http_error(e)
......@@ -323,30 +320,30 @@ class WebarchivSession:
try:
response = self._get(op='/search/fulltext/histogram', params=params)
return self.waitForResponse(response)
return self.wait_for_response(response)
except HTTPError as e:
self._display_http_error(e)
print('Error:'.format(query_string))
def get_snapshot_url(self, seed, capture, onlysvg):
    """Build the URL of the ``snapshot`` endpoint for one capture of a seed.

    The query string carries ``capture``, the session token (``t``), the API
    key (``apikey``), ``onlysvg`` and ``seed``, in that order, and is
    appended to ``self.api_path``.

    :param seed: value of the ``seed`` query parameter
    :param capture: value of the ``capture`` query parameter
    :param onlysvg: value of the ``onlysvg`` query parameter
    :return: the complete URL as a string
    """
    from urllib.parse import urlencode  # local import: keeps the method self-contained

    # urlencode percent-escapes each value, so a seed that itself contains
    # '&', '?' or '#' cannot split or truncate the query string; it also
    # stringifies non-str values instead of raising TypeError.
    query = urlencode([
        ('capture', capture),
        ('t', self.token),
        ('apikey', self.api_key),
        ('onlysvg', onlysvg),
        ('seed', seed),
    ])
    return self.api_path + 'snapshot?' + query
@staticmethod
def result_contains_seeds(response):
    """Tell whether the JSON body of *response* has ``subtype`` equal to 2.

    :param response: object whose ``json()`` returns the decoded body
    :return: ``True`` only when the body's ``subtype`` is 2; ``False`` when
        it differs, is missing, or the body cannot be read
    """
    try:
        return response.json()['subtype'] == 2
    except (ValueError, LookupError, TypeError, AttributeError):
        # ValueError: body is not valid JSON; LookupError: no 'subtype'
        # entry; TypeError/AttributeError: body or response has an
        # unexpected shape. Interrupts and SystemExit still propagate.
        return False
@staticmethod
def result_contains_captures(response):
    """Tell whether the JSON body of *response* has ``subtype`` equal to 3.

    :param response: object whose ``json()`` returns the decoded body
    :return: ``True`` only when the body's ``subtype`` is 3; ``False`` when
        it differs, is missing, or the body cannot be read
    """
    try:
        return response.json()['subtype'] == 3
    except (ValueError, LookupError, TypeError, AttributeError):
        # ValueError: body is not valid JSON; LookupError: no 'subtype'
        # entry; TypeError/AttributeError: body or response has an
        # unexpected shape. Interrupts and SystemExit still propagate.
        return False
def savePage(self, url):
def save_page(self, url):
self.connect()
r = requests.post(self.base_url.format('savepage'),
data='''{{
......@@ -361,21 +358,17 @@ class WebarchivSession:
)
return r
def fragment_checksum_html(self, seed, capture, selector, occurrence):
    """Request the checksum of an HTML fragment and wait for the result.

    Posts the request to ``/fragment/checksum/html`` with extractor type
    ``EXTRACTOR_HTML``, passes the answer to ``status_query`` and then to
    ``wait_for_response``.

    :param seed: sent as the ``seed`` field
    :param capture: sent as the ``capture`` field
    :param selector: sent as the ``selector`` field
        (NOTE(review): presumably a CSS selector -- confirm)
    :param occurrence: sent as the ``occurrence`` field
    :return: the value returned by ``wait_for_response``; ``None`` when an
        ``HTTPError`` was caught and reported via ``_display_http_error``
    """
    payload = {
        "seed": seed,
        "capture": capture,
        "selector": selector,
        "occurrence": occurrence,
        "extractortype": EXTRACTOR_HTML,
    }
    try:
        submitted = self._post(op='/fragment/checksum/html', json=payload)
        polled = self.status_query(submitted)
        return self.wait_for_response(polled)
    except HTTPError as e:
        self._display_http_error(e)
......@@ -401,7 +394,7 @@ if __name__ == '__main__':
for capture in response.json()['hits']:
capturedate = capture['c']
resp = w.fragmentChecksumHtml(url, capturedate, ".odd td", 3);
resp = w.fragment_checksum_html(url, capturedate, ".odd td", 3)
checksum = resp.json()['checksum']
returncode = resp.json()['returncode']
......