Commits on Source (4)
%% Cell type:markdown id: tags:

%% Cell type:code id: tags:

``` python
import requests
import json
```

%% Cell type:markdown id: tags:

Create a WebarchivSession object that provides convenience methods for easy access to the API with your API key

%% Cell type:code id: tags:

``` python
from webarchiv import WebarchivSession
apikey = '2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c'
w = WebarchivSession(apikey)
```

%% Cell type:raw id: tags:

Request archiving of a web page

%% Cell type:code id: tags:

``` python
-response = w.savePage("http://www.onb.ac.at")
+response = w.save_page("http://www.onb.ac.at")
 if response.status_code == 201:
     print(response.json())
 else:
     print("Error ", response.status_code)
```

%% Output

-{'nomination_id': 247, 'seed': 'http://www.onb.ac.at', 'nominationtype': 5}
+{'nomination_id': 374, 'seed': 'http://www.onb.ac.at', 'nominationtype': 5}

%% Cell type:code id: tags:

``` python
```
...
%% Cell type:markdown id: tags:
%% Cell type:code id: tags:
``` python
import requests
import json
```
%% Cell type:markdown id: tags:
Create a WebarchivSession object that provides convenience methods for easy access to the API with your API key
%% Cell type:code id: tags:
``` python
from webarchiv import WebarchivSession
apikey = '2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c'
w = WebarchivSession(apikey)
```
%% Cell type:code id: tags:
``` python
url = "http://sport.orf.at/l/stories/2003717/"
response = w.wayback_search(url, "20110101000000", "20120401000000")
```
%% Cell type:code id: tags:
``` python
if response.status_code != 200:
    print("Error ", response.status_code)
```
%% Cell type:code id: tags:
``` python
# Checksum an HTML fragment (CSS selector ".odd td", 3rd occurrence) for every
# capture and collect only the captures where the fragment's checksum changed.
lastchecksum = ''
captures = []
for hit in response.json()['hits']:
    capturedate = hit['c']
    resp = w.fragment_checksum_html(url, capturedate, ".odd td", 3)
    checksum = resp.json()['checksum']
    returncode = resp.json()['returncode']
    if returncode != 0:
        continue
    if checksum != lastchecksum:
        # print(resp.json())
        print("http://wayback/web/" + capturedate + "/" + url)
        captures.append({"url": url, "timestamp": capturedate})
        # print(capturedate + " " + checksum)
    lastchecksum = checksum
if len(captures) > 0:
    response = w.create_watchlist(captures)
    print("A watchlist with all captures mentioned above was generated. "
          "The code for this watchlist is " + response.json() + ".")
print("end")
```
%% Output
http://wayback/web/20110401202828/http://sport.orf.at/l/stories/2003717/
http://wayback/web/20110704202825/http://sport.orf.at/l/stories/2003717/
A watchlist with all captures mentioned above was generated. The code for this watchlist is Zp.
end
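The same change-detection loop could also be driven by the text-extractor helpers that this commit adds to the module (see the diff that follows). The sketch below is only an illustration: the regular expression and occurrence are placeholder values, and it assumes that `fragment_checksum_text_regex` returns the same `checksum`/`returncode` payload as `fragment_checksum_html`.

``` python
# Sketch only: the regex pattern and occurrence are placeholders, and the
# payload shape of the text endpoint is assumed to match the HTML variant.
wb = w.wayback_search(url, "20110101000000", "20120401000000")
lastchecksum = ''
regex_captures = []
for hit in wb.json()['hits']:
    capturedate = hit['c']
    resp = w.fragment_checksum_text_regex(url, capturedate, r"Endstand.*", 1)
    if resp is None or resp.json()['returncode'] != 0:
        continue  # skip captures the extractor could not process
    checksum = resp.json()['checksum']
    if checksum != lastchecksum:
        regex_captures.append({"url": url, "timestamp": capturedate})
    lastchecksum = checksum
if regex_captures:
    wl = w.create_watchlist(regex_captures)
    if wl is not None:
        print(wl.json())
```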
@@ -2,6 +2,7 @@ import sys
 import time
 import requests
 import hashlib
+import json
 from requests import HTTPError
 
 _datetime_format_string = '%Y%m%d%H%M%S'
@@ -11,6 +12,10 @@ EXTRACTOR_TEXT = 1
 EXTRACTOR_HTML = 2
 EXTRACTOR_BINARY = 3
 
+# Modes for TextExtractor
+POSITIONLEN_MODE = 1
+POSITION_MODE = 2
+REGEX_MODE = 3
 
 class SessionTimeoutError(Exception):
     pass
@@ -344,19 +349,13 @@ class WebarchivSession:
         return False
 
     def save_page(self, url):
-        self.connect()
-        r = requests.post(self.base_url.format('savepage'),
-                          data='''{{
-                             "apikey": "{api_key}",
-                             "t": "{token}",
-                             "url": "{url}"
-                          }}'''.format(api_key=self.api_key, token=self.token, url=url),
-                          headers={
-                              'content-type': 'application/json',
-                              'accept': 'application/ld+json'
-                          }
-                          )
-        return r
+        try:
+            response = self._post(op='/savepage', json={
+                "url": url
+            })
+            return response
+        except HTTPError as e:
+            self._display_http_error(e)
 
     def fragment_checksum_html(self, seed, capture, selector, occurrence):
         try:
@@ -372,40 +371,69 @@ class WebarchivSession:
         except HTTPError as e:
             self._display_http_error(e)
+    def fragment_checksum_binary(self, seed, capture):
+        try:
+            response = self._post(op='/fragment/checksum/binary', json={
+                "seed": seed,
+                "capture": capture,
+                "extractortype": EXTRACTOR_BINARY
+            })
+            response = self.status_query(response)
+            return self.wait_for_response(response)
+        except HTTPError as e:
+            self._display_http_error(e)
+
+    def fragment_checksum_text_positionlen(self, seed, capture, pos, len):
+        try:
+            response = self._post(op='/fragment/checksum/text', json={
+                "seed": seed,
+                "capture": capture,
+                "mode": POSITIONLEN_MODE,
+                "pos": pos,
+                "len": len,
+                "extractortype": EXTRACTOR_TEXT
+            })
+            response = self.status_query(response)
+            return self.wait_for_response(response)
+        except HTTPError as e:
+            self._display_http_error(e)
+
+    def fragment_checksum_text_position(self, seed, capture, pos):
+        try:
+            response = self._post(op='/fragment/checksum/text', json={
+                "seed": seed,
+                "capture": capture,
+                "mode": POSITION_MODE,
+                "pos": pos,
+                "extractortype": EXTRACTOR_TEXT
+            })
+            response = self.status_query(response)
+            return self.wait_for_response(response)
+        except HTTPError as e:
+            self._display_http_error(e)
+
+    def fragment_checksum_text_regex(self, seed, capture, regexpattern, occurrence):
+        try:
+            response = self._post(op='/fragment/checksum/text', json={
+                "seed": seed,
+                "capture": capture,
+                "mode": REGEX_MODE,
+                "regexpattern": regexpattern,
+                "occurrence": occurrence,
+                "extractortype": EXTRACTOR_TEXT
+            })
+            response = self.status_query(response)
+            return self.wait_for_response(response)
+        except HTTPError as e:
+            self._display_http_error(e)
+
+    def create_watchlist(self, urls):
+        try:
+            response = self._post(op='/watchlist', json={
+                "urls": urls
+            })
+            return response
+        except HTTPError as e:
+            self._display_http_error(e)
-
-if __name__ == '__main__':
-    # noinspection SpellCheckingInspection
-    w = WebarchivSession('2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c')
-    # response = w.wayback_search("http://www.onb.ac.at")
-    # response = w.wayback_search("http://frauenhetz.jetzt")
-    url = "http://sport.orf.at/l/stories/2003717/"
-    response = w.wayback_search("http://sport.orf.at/l/stories/2003717/", "20110101000000", "20120401000000")
-    # response = w.wayback_search("x")
-
-    if response.status_code != 200:
-        print("Error ", response.status_code)
-        exit(1)
-
-    print(response.json()['total'])
-    print(url)
-    lastchecksum = ''
-    for capture in response.json()['hits']:
-        capturedate = capture['c']
-        resp = w.fragment_checksum_html(url, capturedate, ".odd td", 3)
-        checksum = resp.json()['checksum']
-        returncode = resp.json()['returncode']
-
-        if returncode == 2:
-            continue
-
-        if checksum != lastchecksum:
-            print(resp.json())
-            print("http://wayback/web/" + capturedate + "/" + url)
-            print(capturedate + " " + checksum)
-        lastchecksum = checksum
-
-    print("end")