Skip to content
Commits on Source (6)
...@@ -7,6 +7,11 @@ from requests import HTTPError ...@@ -7,6 +7,11 @@ from requests import HTTPError
_datetime_format_string = '%Y%m%d%H%M%S' _datetime_format_string = '%Y%m%d%H%M%S'
# Extractor type codes. fragment_checksum_html sends EXTRACTOR_HTML as the
# "extractortype" field of its request payload.
# NOTE(review): TEXT and BINARY are presumably the other values the API
# accepts for that field -- confirm against the API documentation.
EXTRACTOR_TEXT = 1
EXTRACTOR_HTML = 2
EXTRACTOR_BINARY = 3
class SessionTimeoutError(Exception): class SessionTimeoutError(Exception):
pass pass
...@@ -103,49 +108,45 @@ class WebarchivSession: ...@@ -103,49 +108,45 @@ class WebarchivSession:
response_text=e.response.text), response_text=e.response.text),
file=sys.stderr) file=sys.stderr)
@staticmethod
def _handle_response_errors(r):
    """
    Report a failed HTTP response on stderr and hand it back to the caller.

    :param r: response object exposing a ``status_code`` attribute
    :return: the same response object, for every status code
    """
    known_errors = {
        403: 'Forbidden. Invalid Token or ApiKey transmitted',
        400: 'Bad request',
        410: 'The requested API Version (via X-API-VERSION Header) is not available',
    }
    message = known_errors.get(r.status_code)
    if message is None:
        # Statuses without a dedicated message (404, 500, ...) used to fall
        # through and return None, which broke callers that go on to read
        # response.status_code. Report them generically instead.
        message = 'Request failed with HTTP status {}'.format(r.status_code)
    print(message, file=sys.stderr)
    return r
def _get(self, op, auto_connect=True, **kwargs, ): def _get(self, op, auto_connect=True, **kwargs, ):
kwargs['params'] = self._add_api_key_and_token(kwargs.pop('params', {})) kwargs['params'] = self._add_api_key_and_token(kwargs.pop('params', {}))
# kwargs = self._add_api_key_and_token(kwargs)
r = requests.get(self.base_url.format(op), **kwargs) r = requests.get(self.base_url.format(op), **kwargs)
if r.ok: if r.ok:
return r return r
elif r.status_code == 403 and auto_connect:
self.connect()
return self._get(op=op, auto_connect=False, **kwargs)
else: else:
if r.status_code == 403: return self._handle_response_errors(r)
if auto_connect:
self.connect() def _post(self, op, auto_connect=True, json: dict = None, **kwargs):
return self._get(op=op, auto_connect=False, **kwargs) if not json:
else: json = {}
print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr) kwargs['json'] = self._add_api_key_and_token(json)
return r kwargs['headers'] = {
elif r.status_code == 400: 'content-type': 'application/json',
print('Bad request', file=sys.stderr) 'accept': 'application/ld+json'
return r }
elif r.status_code == 410: r = requests.post(self.base_url.format(op), **kwargs)
print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr)
return r
def _post(self, op, jsondata, auto_connect=True):
r = requests.post(self.base_url.format(op), jsondata,
headers={
'content-type': 'application/json',
'accept': 'application/ld+json'
})
if r.ok: if r.ok:
return r return r
elif r.status_code == 403 and auto_connect:
self.connect()
return self._post(op=op, auto_connect=False, **kwargs)
else: else:
if r.status_code == 403: return self._handle_response_errors(r)
if auto_connect:
self.connect()
return self._post(op=op, jsondata=jsondata, auto_connect=False)
else:
print('Forbidden. Invalid Token or ApiKey transmitted', file=sys.stderr)
return r
elif r.status_code == 400:
print('Bad request', file=sys.stderr)
return r
elif r.status_code == 410:
print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr)
return r
def fulltext_search(self, query_string, from_=None, to_=None): def fulltext_search(self, query_string, from_=None, to_=None):
""" """
...@@ -166,7 +167,7 @@ class WebarchivSession: ...@@ -166,7 +167,7 @@ class WebarchivSession:
try: try:
response = self._get(op='/search/fulltext', params=params) response = self._get(op='/search/fulltext', params=params)
return self.waitForResponse(response) return self.wait_for_response(response)
except HTTPError as e: except HTTPError as e:
self._display_http_error(e) self._display_http_error(e)
...@@ -192,7 +193,7 @@ class WebarchivSession: ...@@ -192,7 +193,7 @@ class WebarchivSession:
try: try:
response = self._get(op='/search/fulltext/seed', params=params) response = self._get(op='/search/fulltext/seed', params=params)
return self.waitForResponse(response) return self.wait_for_response(response)
except HTTPError as e: except HTTPError as e:
self._display_http_error(e) self._display_http_error(e)
...@@ -217,7 +218,7 @@ class WebarchivSession: ...@@ -217,7 +218,7 @@ class WebarchivSession:
try: try:
response = self._get(op='/search/fulltext/capture', params=params) response = self._get(op='/search/fulltext/capture', params=params)
return self.waitForResponse(response) return self.wait_for_response(response)
except HTTPError as e: except HTTPError as e:
self._display_http_error(e) self._display_http_error(e)
...@@ -241,18 +242,15 @@ class WebarchivSession: ...@@ -241,18 +242,15 @@ class WebarchivSession:
try: try:
response = self._get(op='/search/wayback', params=params) response = self._get(op='/search/wayback', params=params)
return self.waitForResponse(response) return self.wait_for_response(response)
except HTTPError as e: except HTTPError as e:
self._display_http_error(e) self._display_http_error(e)
print('Error:'.format(query_string)) print('Error:'.format(query_string))
def waitForResponse(self, response): def wait_for_response(self, response):
""" """
Polls until the server responds with a result Polls until the server responds with a result
:param response: String to search for
:return: response
""" """
if response.status_code == 400: if response.status_code == 400:
return response return response
...@@ -265,12 +263,11 @@ class WebarchivSession: ...@@ -265,12 +263,11 @@ class WebarchivSession:
def status_query(self, resp): def status_query(self, resp):
""" """
this is the pollingrequest for the given typen of request this is the polling request for the given type of request
:param resp: String to search for
:return: response
""" """
requestid = resp.json()['requestid'] j = resp.json()
context = j['context']
requestid = j['requestid']
type_ = resp.json()['type'] type_ = resp.json()['type']
if type_ == 1: if type_ == 1:
r = self._get(op='/search/status/fulltext', params={'requestid': requestid}) r = self._get(op='/search/status/fulltext', params={'requestid': requestid})
...@@ -300,7 +297,7 @@ class WebarchivSession: ...@@ -300,7 +297,7 @@ class WebarchivSession:
try: try:
response = self._get(op='/search/domainname', params=params) response = self._get(op='/search/domainname', params=params)
return self.waitForResponse(response) return self.wait_for_response(response)
except HTTPError as e: except HTTPError as e:
self._display_http_error(e) self._display_http_error(e)
...@@ -323,30 +320,30 @@ class WebarchivSession: ...@@ -323,30 +320,30 @@ class WebarchivSession:
try: try:
response = self._get(op='/search/fulltext/histogram', params=params) response = self._get(op='/search/fulltext/histogram', params=params)
return self.waitForResponse(response) return self.wait_for_response(response)
except HTTPError as e: except HTTPError as e:
self._display_http_error(e) self._display_http_error(e)
print('Error:'.format(query_string)) print('Error:'.format(query_string))
def getSnapshotUrl(self, seed, capture, onlysvg): def get_snapshot_url(self, seed, capture, onlysvg):
return self.api_path + 'snapshot?capture=' + capture + '&t=' + self.token + '&apikey=' + self.api_key + '&onlysvg=' + onlysvg + '&seed=' + seed return self.api_path + 'snapshot?capture=' + capture + '&t=' + self.token + '&apikey=' + self.api_key + '&onlysvg=' + onlysvg + '&seed=' + seed
@staticmethod @staticmethod
def resultContainsSeeds(response): def result_contains_seeds(response):
try: try:
return response.json()['subtype'] == 2 return response.json()['subtype'] == 2
except: except:
return False return False
@staticmethod @staticmethod
def resultContainsCaptures(response): def result_contains_captures(response):
try: try:
return response.json()['subtype'] == 3 return response.json()['subtype'] == 3
except: except:
return False return False
def savePage(self, url): def save_page(self, url):
self.connect() self.connect()
r = requests.post(self.base_url.format('savepage'), r = requests.post(self.base_url.format('savepage'),
data='''{{ data='''{{
...@@ -361,21 +358,17 @@ class WebarchivSession: ...@@ -361,21 +358,17 @@ class WebarchivSession:
) )
return r return r
def fragmentChecksumHtml(self, seed, capture, selector, occurrence): def fragment_checksum_html(self, seed, capture, selector, occurrence):
try: try:
response = self._post(op='/fragment/checksum/html', jsondata='''{{ response = self._post(op='/fragment/checksum/html', json={
"apikey": "{api_key}", "seed": seed,
"t": "{token}", "capture": capture,
"seed": "{seed}", "selector": selector,
"capture": "{capture}", "occurrence": occurrence,
"selector": "{selector}", "extractortype": EXTRACTOR_HTML
"occurrence": "{occurrence}", })
"extractortype": 2
}}'''.format(api_key=self.api_key, token=self.token, seed=seed, capture=capture,
selector=selector, occurrence=occurrence))
response = self.status_query(response) response = self.status_query(response)
return self.waitForResponse(response) return self.wait_for_response(response)
except HTTPError as e: except HTTPError as e:
self._display_http_error(e) self._display_http_error(e)
...@@ -401,7 +394,7 @@ if __name__ == '__main__': ...@@ -401,7 +394,7 @@ if __name__ == '__main__':
for capture in response.json()['hits']: for capture in response.json()['hits']:
capturedate = capture['c'] capturedate = capture['c']
resp = w.fragmentChecksumHtml(url, capturedate, ".odd td", 3); resp = w.fragment_checksum_html(url, capturedate, ".odd td", 3)
checksum = resp.json()['checksum'] checksum = resp.json()['checksum']
returncode = resp.json()['returncode'] returncode = resp.json()['returncode']
......