Skip to content
Commits on Source (8)
import sys
import time
import requests
import hashlib
from requests import HTTPError
_datetime_format_string = '%Y%m%d%H%M%S'
......@@ -45,8 +46,9 @@ class WebarchivSession:
"""
return 'HTTP ERROR - status code {status_code}\n----\n{response_text}\n----\n\n'
def __init__(self, api_key, allow_tracking=False):
    """
    Store the session settings; no request is sent from here.

    :param api_key: API key, kept on the instance as ``api_key``.
    :param allow_tracking: Kept on the instance as ``allow_tracking``.
                           When true, ``_authenticate`` sends a SHA-256 hash
                           of this machine's MAC address as the fingerprint;
                           otherwise an empty fingerprint is sent.
                           Defaults to False, so existing callers that pass
                           only ``api_key`` keep working unchanged.
    """
    self.api_key = api_key
    self.allow_tracking = allow_tracking
    # No token is held yet (presumably filled in once authentication has run).
    self.token = None
def connect(self):
......@@ -59,12 +61,21 @@ class WebarchivSession:
self._display_http_error(e)
def _authenticate(self):
if self.allow_tracking:
from uuid import getnode as get_mac
mac = get_mac()
sha256 = hashlib.sha256()
sha256.update(str(mac).encode('utf-8'))
fingerprint = sha256.hexdigest()
else:
fingerprint = ''
r = requests.post(self.base_url.format('authentication'),
data='''{{
"apikey": "{api_key}",
"fingerprint": "string",
"fingerprint": "{fingerprint}",
"version": "{version}"
}}'''.format(api_key=self.api_key, version=self.version),
}}'''.format(api_key=self.api_key, version=self.version, fingerprint=fingerprint),
headers={
'content-type': 'application/json',
'accept': 'application/ld+json'
......@@ -112,19 +123,17 @@ class WebarchivSession:
elif r.status_code == 410:
print('The requested API Version (via X-API-VERSION Header) is not available', file=sys.stderr)
return r
raise HTTPError(response=r)
def fulltext_search(self, query_string, from_=None, to_=None):
"""
Start a fulltext search query in the Webarchive.
The current status of running queries can be read via status_open_queries().
:param query_string: String to search for
:param from_: Optional earliest date bound for the search
in the format YYYYMM.
:param to_: Optional latest date bound for the search
in the format YYYYMM.
:return: None
:return: HTTP Response object
"""
params = {'q': query_string}
if from_:
......@@ -140,17 +149,66 @@ class WebarchivSession:
self._display_http_error(e)
print('Query for "{}" not added'.format(query_string))
def fulltext_search_within_domain(self, query_string, domain, from_=None, to_=None):
    """
    Start a fulltext seed search query in the Webarchive.

    Sends a GET to ``/search/fulltext/seed`` and hands the reply to
    ``waitForResponse``.

    :param query_string: String to search for (sent as ``q``)
    :param domain: Search only within this domain name (sent as ``g``)
    :param from_: Optional earliest date bound for the search
                  in the format YYYYMM.
    :param to_: Optional latest date bound for the search
                in the format YYYYMM.
    :return: HTTP Response object as returned by ``waitForResponse``,
             or None when an HTTPError was raised (the error is reported
             through ``_display_http_error``).
    """
    params = {'q': query_string, 'g': domain}
    # The date bounds are only sent when the caller supplied them.
    if from_:
        params['from'] = from_
    if to_:
        params['to'] = to_

    try:
        response = self._get(op='/search/fulltext/seed', params=params)
        return self.waitForResponse(response)
    except HTTPError as e:
        self._display_http_error(e)
        return None
def fulltext_search_within_url(self, query_string, url, pagesize=10, from_=None, to_=None):
    """
    Start a fulltext capture search query in the Webarchive.

    Sends a GET to ``/search/fulltext/capture`` and hands the reply to
    ``waitForResponse``.

    :param query_string: String to search for (sent as ``q``)
    :param url: Search only captures starting at this exact web address
                (sent as ``g``)
    :param pagesize: Value sent as the ``pagesize`` request parameter.
                     Default value is 10
    :param from_: Optional earliest date bound for the search
                  in the format YYYYMM.
    :param to_: Optional latest date bound for the search
                in the format YYYYMM.
    :return: HTTP Response object as returned by ``waitForResponse``,
             or None when an HTTPError was raised (the error is reported
             through ``_display_http_error``).
    """
    params = {'q': query_string, 'g': url, 'pagesize': pagesize}
    # The date bounds are only sent when the caller supplied them.
    if from_:
        params['from'] = from_
    if to_:
        params['to'] = to_

    try:
        response = self._get(op='/search/fulltext/capture', params=params)
        return self.waitForResponse(response)
    except HTTPError as e:
        self._display_http_error(e)
        return None
def wayback_search(self, query_string, from_=None, to_=None):
"""
Start a wayback search query in the Webarchive.
The current status of running queries can be read via status_open_queries().
:param query_string: String to search for
:param from_: Optional earliest date bound for the search
in the format YYYYMM.
:param to_: Optional latest date bound for the search
in the format YYYYMM.
:return: None
:return: HTTP Response object
"""
params = {'q': query_string}
if from_:
......@@ -186,22 +244,23 @@ class WebarchivSession:
"""
this is the pollingrequest for the given typen of request
:param response: String to search for
:param resp: String to search for
:return: response
"""
requestid = resp.json()['requestid'];
type = resp.json()['type']
if type == 1:
requestid = resp.json()['requestid']
type_ = resp.json()['type']
if type_ == 1:
r = self._get(op='/search/status/fulltext', params={'requestid': requestid})
elif type == 2:
elif type_ == 2:
r = self._get(op='/search/status/wayback', params={'requestid': requestid})
else:
raise NotImplementedError(f'Unknown status query type {type_} - Please update client.')
return r
def domain_name_search(self, query_string, page_=1, pagesize_=100):
"""
Start a domain name search in the Webarchive.
The current status of running queries can be read via status_open_queries().
:param query_string: String to search for
:param page_: The page number parameter works with the page size parameter to control the offset of the records returned in the results. Default value is 1
......@@ -225,11 +284,8 @@ class WebarchivSession:
def histogram_search(self, query_string, interval_=3, from_=None, to_=None):
"""
Start a domain name search in the Webarchive.
The current status of running queries can be read via status_open_queries().
:param query_string: String to search for
:param page_: The page number parameter works with the page size parameter to control the offset of the records returned in the results. Default value is 1
:param pagesize_: The page size parameter works with the page number parameter to control the offset of the records returned in the results. It also controls how many results are returned with each request. Default value is 10
:return: result as json
"""
params = {'q': query_string}
......@@ -249,7 +305,21 @@ class WebarchivSession:
print('Error:'.format(query_string))
def getSnapshotUrl(self, seed, capture, onlysvg):
    """
    Build the URL of the ``snapshot`` endpoint below ``self.api_path``.

    All parts are joined by plain string concatenation, so every argument
    (and ``self.token`` / ``self.api_key``) must already be a ``str``;
    anything else raises TypeError.

    NOTE(review): the values are inserted verbatim, without URL-encoding.
    A seed or capture containing ``&`` or ``#`` would alter the query
    string - confirm whether callers encode beforehand.

    :param seed: Value of the ``seed`` query parameter
    :param capture: Value of the ``capture`` query parameter
    :param onlysvg: Value of the ``onlysvg`` query parameter (as a string)
    :return: The assembled URL as a string
    """
    return (
        self.api_path
        + 'snapshot?capture=' + capture
        + '&t=' + self.token
        + '&apikey=' + self.api_key
        + '&onlysvg=' + onlysvg
        + '&seed=' + seed
    )
@staticmethod
def resultContainsSeeds(response):
try:
return response.json()['subtype'] == 2
except:
return False
@staticmethod
def resultContainsCaptures(response):
try:
return response.json()['subtype'] == 3
except:
return False
def savePage(self, url):
self.connect()
......@@ -264,17 +334,4 @@ class WebarchivSession:
'accept': 'application/ld+json'
}
)
return r;
if __name__ == '__main__':
    # Manual smoke test: save one page and print the outcome.
    import os

    # The API key comes from the environment so that no credential is
    # committed together with the source.
    api_key = os.environ.get('WEBARCHIV_API_KEY')
    if not api_key:
        sys.exit('Please set the WEBARCHIV_API_KEY environment variable.')

    w = WebarchivSession(api_key)
    # Other calls that can be tried here: w.wayback_search(...),
    # w.histogram_search(...).
    response = w.savePage("http://www.onb.ac.at")
    if response.status_code == 201:
        print(response.json())
    else:
        print("Error ", response.status_code)
return r