Skip to content
Commits on Source (4)
%% Cell type:markdown id: tags:
%% Cell type:code id: tags:
``` python
import requests
import json
```
%% Cell type:markdown id: tags:
Create a WebarchivSession Object with convenience methods for easy access with your API-Key
%% Cell type:code id: tags:
``` python
from webarchiv import WebarchivSession
apikey = '2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c'
w = WebarchivSession(apikey)
```
%% Cell type:raw id: tags:
request archiving a Webpage
%% Cell type:code id: tags:
``` python
response = w.savePage("http://www.onb.ac.at")
response = w.save_page("http://www.onb.ac.at")
if response.status_code == 201:
print(response.json())
else:
print("Error ", response.status_code)
```
%% Output
{'nomination_id': 247, 'seed': 'http://www.onb.ac.at', 'nominationtype': 5}
{'nomination_id': 374, 'seed': 'http://www.onb.ac.at', 'nominationtype': 5}
%% Cell type:code id: tags:
``` python
```
......
%% Cell type:markdown id: tags:
%% Cell type:code id: tags:
``` python
import requests
import json
```
%% Cell type:markdown id: tags:
Create a WebarchivSession Object with convenience methods for easy access with your API-Key
%% Cell type:code id: tags:
``` python
from webarchiv import WebarchivSession
apikey = '2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c'
w = WebarchivSession(apikey)
```
%% Cell type:code id: tags:
``` python
url = "http://sport.orf.at/l/stories/2003717/"
response = w.wayback_search("http://sport.orf.at/l/stories/2003717/", "20110101000000", "20120401000000")
```
%% Cell type:code id: tags:
``` python
if response.status_code != 200:
print("Error ", response.status_code)
```
%% Cell type:code id: tags:
``` python
lastchecksum = ''
captures = []
for capture in response.json()['hits']:
capturedate = capture['c']
resp = w.fragment_checksum_html(url, capturedate, ".odd td", 3)
checksum = resp.json()['checksum']
returncode = resp.json()['returncode']
if returncode != 0:
continue
if checksum != lastchecksum:
#print(resp.json())
print("http://wayback/web/" + capturedate + "/" + url)
capture = {"url": url, "timestamp": capturedate}
captures.append(capture)
#print(capturedate + " " + checksum)
lastchecksum = checksum
if len(captures) > 0:
response = w.create_watchlist(captures)
print ("A watchlist with all captures mentioned above was generated. The code for this watchlist is " + response.json() + ". " )
print("end")
```
%% Output
http://wayback/web/20110401202828/http://sport.orf.at/l/stories/2003717/
http://wayback/web/20110704202825/http://sport.orf.at/l/stories/2003717/
A watchlist with all captures mentioned above was generated. The code for this watchlist is Zp.
end
......@@ -2,6 +2,7 @@ import sys
import time
import requests
import hashlib
import json
from requests import HTTPError
_datetime_format_string = '%Y%m%d%H%M%S'
......@@ -11,6 +12,10 @@ EXTRACTOR_TEXT = 1
EXTRACTOR_HTML = 2
EXTRACTOR_BINARY = 3
# Modes for TextExtractor
POSITIONLEN_MODE = 1
POSITION_MODE = 2
REGEX_MODE = 3
class SessionTimeoutError(Exception):
pass
......@@ -344,19 +349,13 @@ class WebarchivSession:
return False
def save_page(self, url):
self.connect()
r = requests.post(self.base_url.format('savepage'),
data='''{{
"apikey": "{api_key}",
"t": "{token}",
"url": "{url}"
}}'''.format(api_key=self.api_key, token=self.token, url=url),
headers={
'content-type': 'application/json',
'accept': 'application/ld+json'
}
)
return r
try:
response = self._post(op='/savepage', json={
"url": url
})
return response
except HTTPError as e:
self._display_http_error(e)
def fragment_checksum_html(self, seed, capture, selector, occurrence):
try:
......@@ -372,40 +371,69 @@ class WebarchivSession:
except HTTPError as e:
self._display_http_error(e)
def fragment_checksum_binary(self, seed, capture):
try:
response = self._post(op='/fragment/checksum/binary', json={
"seed": seed,
"capture": capture,
"extractortype": EXTRACTOR_BINARY
})
response = self.status_query(response)
return self.wait_for_response(response)
except HTTPError as e:
self._display_http_error(e)
if __name__ == '__main__':
# noinspection SpellCheckingInspection
w = WebarchivSession('2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c')
# response = w.wayback_search("http://www.onb.ac.at")
# response = w.wayback_search("http://frauenhetz.jetzt")
url = "http://sport.orf.at/l/stories/2003717/"
response = w.wayback_search("http://sport.orf.at/l/stories/2003717/", "20110101000000", "20120401000000")
# response = w.wayback_search("x")
if response.status_code != 200:
print("Error ", response.status_code)
exit(1)
print(response.json()['total'])
print(url)
lastchecksum = ''
for capture in response.json()['hits']:
capturedate = capture['c']
def fragment_checksum_text_positionlen(self, seed, capture, pos, len):
try:
response = self._post(op='/fragment/checksum/text', json={
"seed": seed,
"capture": capture,
"mode": POSITIONLEN_MODE,
"pos": pos,
"len": len,
"extractortype": EXTRACTOR_TEXT
})
response = self.status_query(response)
return self.wait_for_response(response)
except HTTPError as e:
self._display_http_error(e)
resp = w.fragment_checksum_html(url, capturedate, ".odd td", 3)
checksum = resp.json()['checksum']
returncode = resp.json()['returncode']
def fragment_checksum_text_position(self, seed, capture, pos):
try:
response = self._post(op='/fragment/checksum/text', json={
"seed": seed,
"capture": capture,
"mode": POSITION_MODE,
"pos": pos,
"extractortype": EXTRACTOR_TEXT
})
response = self.status_query(response)
return self.wait_for_response(response)
except HTTPError as e:
self._display_http_error(e)
if returncode == 2:
continue
def fragment_checksum_text_regex(self, seed, capture, regexpattern, occurrence):
try:
response = self._post(op='/fragment/checksum/text', json={
"seed": seed,
"capture": capture,
"mode": REGEX_MODE,
"regexpattern": regexpattern,
"occurrence": occurrence,
"extractortype": EXTRACTOR_TEXT
})
response = self.status_query(response)
return self.wait_for_response(response)
except HTTPError as e:
self._display_http_error(e)
if checksum != lastchecksum:
print(resp.json())
print("http://wayback/web/" + capturedate + "/" + url)
print(capturedate + " " + checksum)
lastchecksum = checksum
def create_watchlist(self, urls):
try:
response = self._post(op='/watchlist', json={
"urls": urls
})
return response
except HTTPError as e:
self._display_http_error(e)
print("end")