Commit 8207c2d1 authored by Andreas

new methods

parent 4ff4d740
......@@ -2,6 +2,7 @@ import sys
import time
import requests
import hashlib
import json
from requests import HTTPError
_datetime_format_string = '%Y%m%d%H%M%S'
......@@ -11,6 +12,10 @@ EXTRACTOR_TEXT = 1
# Extractor type codes. The fragment checksum requests send one of these in
# the "extractortype" field (e.g. EXTRACTOR_BINARY for the binary checksum).
EXTRACTOR_HTML = 2
EXTRACTOR_BINARY = 3
# Modes for TextExtractor, sent in the "mode" field of text checksum requests:
#   POSITIONLEN_MODE - request carries "pos" and "len"
#   POSITION_MODE    - request carries "pos" only
#   REGEX_MODE       - request carries "regexpattern" and "occurrence"
POSITIONLEN_MODE = 1
POSITION_MODE = 2
REGEX_MODE = 3
class SessionTimeoutError(Exception):
    """Error type for a timed-out session.

    NOTE(review): the raising sites are outside this excerpt; presumably
    this signals that the API session/token has expired - confirm.
    """
......@@ -344,19 +349,13 @@ class WebarchivSession:
return False
def save_page(self, url):
    """Ask the archive service to save ``url``.

    Posts ``{"url": url}`` to the ``/savepage`` operation through the
    shared ``_post`` helper, the same way the other request methods of
    this class do.

    :param url: address of the page to be saved.
    :return: whatever ``_post`` returns, or ``None`` when an ``HTTPError``
        was raised (the error is passed to ``_display_http_error``).
    """
    # The payload is handed over as a dict instead of being spliced into a
    # hand-written JSON string, so quotes or backslashes inside the URL
    # cannot break the request body.
    # NOTE(review): this relies on _post to add the API key / session token
    # and to serialise the dict, as the sibling methods already assume.
    try:
        return self._post(op='/savepage', json={"url": url})
    except HTTPError as e:
        self._display_http_error(e)
        return None
def fragment_checksum_html(self, seed, capture, selector, occurrence):
try:
......@@ -372,40 +371,69 @@ class WebarchivSession:
except HTTPError as e:
self._display_http_error(e)
def fragment_checksum_binary(self, seed, capture):
    """Request a checksum of a capture using the binary extractor.

    Posts ``seed`` and ``capture`` to ``/fragment/checksum/binary``, feeds
    the reply through ``status_query`` and returns the result of
    ``wait_for_response``.

    :param seed: presumably the URL of the archived page - confirm.
    :param capture: presumably the capture timestamp - confirm.
    :return: result of ``wait_for_response``, or ``None`` when an
        ``HTTPError`` was raised (it is passed to ``_display_http_error``).
    """
    payload = {
        "seed": seed,
        "capture": capture,
        "extractortype": EXTRACTOR_BINARY,
    }
    try:
        queued = self._post(op='/fragment/checksum/binary', json=payload)
        return self.wait_for_response(self.status_query(queued))
    except HTTPError as e:
        self._display_http_error(e)
        return None
# Ad-hoc manual check, executed only when the module is run as a script:
# searches the Wayback index for one URL within a date range and then walks
# over the returned captures.
if __name__ == '__main__':
# NOTE(review): the session is created from a hard-coded literal; if this is
# a real API key it should come from configuration or the environment - confirm.
# noinspection SpellCheckingInspection
w = WebarchivSession('2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c')
# response = w.wayback_search("http://www.onb.ac.at")
# response = w.wayback_search("http://frauenhetz.jetzt")
url = "http://sport.orf.at/l/stories/2003717/"
# Search captures of the URL between the two timestamps given as YYYYMMDDHHMMSS strings.
response = w.wayback_search("http://sport.orf.at/l/stories/2003717/", "20110101000000", "20120401000000")
# response = w.wayback_search("x")
# Abort on any non-200 reply.
if response.status_code != 200:
print("Error ", response.status_code)
exit(1)
print(response.json()['total'])
print(url)
# Checksum of the previously printed capture, used further down to report only changes.
lastchecksum = ''
for capture in response.json()['hits']:
# 'c' holds the capture identifier that is later passed on as the `capture` argument.
capturedate = capture['c']
def fragment_checksum_text_positionlen(self, seed, capture, pos, len):
    """Request a checksum of a text fragment given by position and length.

    Posts to ``/fragment/checksum/text`` in ``POSITIONLEN_MODE`` with the
    text extractor, feeds the reply through ``status_query`` and returns
    the result of ``wait_for_response``.

    :param seed: presumably the URL of the archived page - confirm.
    :param capture: presumably the capture timestamp - confirm.
    :param pos: value sent as ``"pos"``.
    :param len: value sent as ``"len"``; the name shadows the builtin but
        is part of the public signature, so it is kept.
    :return: result of ``wait_for_response``, or ``None`` when an
        ``HTTPError`` was raised (it is passed to ``_display_http_error``).
    """
    payload = {
        "seed": seed,
        "capture": capture,
        "mode": POSITIONLEN_MODE,
        "pos": pos,
        "len": len,
        "extractortype": EXTRACTOR_TEXT,
    }
    try:
        queued = self._post(op='/fragment/checksum/text', json=payload)
        return self.wait_for_response(self.status_query(queued))
    except HTTPError as e:
        self._display_http_error(e)
        return None
resp = w.fragment_checksum_html(url, capturedate, ".odd td", 3)
checksum = resp.json()['checksum']
returncode = resp.json()['returncode']
def fragment_checksum_text_position(self, seed, capture, pos):
    """Request a checksum of a text fragment given by position only.

    Posts to ``/fragment/checksum/text`` in ``POSITION_MODE`` with the text
    extractor, feeds the reply through ``status_query`` and returns the
    result of ``wait_for_response``.

    :param seed: presumably the URL of the archived page - confirm.
    :param capture: presumably the capture timestamp - confirm.
    :param pos: value sent as ``"pos"``.
    :return: result of ``wait_for_response``, or ``None`` when an
        ``HTTPError`` was raised (it is passed to ``_display_http_error``).
    """
    payload = {
        "seed": seed,
        "capture": capture,
        "mode": POSITION_MODE,
        "pos": pos,
        "extractortype": EXTRACTOR_TEXT,
    }
    try:
        queued = self._post(op='/fragment/checksum/text', json=payload)
        return self.wait_for_response(self.status_query(queued))
    except HTTPError as e:
        self._display_http_error(e)
        return None
if returncode == 2:
continue
def fragment_checksum_text_regex(self, seed, capture, regexpattern, occurrence):
    """Request a checksum of a text fragment selected by a regex.

    Posts to ``/fragment/checksum/text`` in ``REGEX_MODE`` with the text
    extractor, feeds the reply through ``status_query`` and returns the
    result of ``wait_for_response``.

    :param seed: presumably the URL of the archived page - confirm.
    :param capture: presumably the capture timestamp - confirm.
    :param regexpattern: value sent as ``"regexpattern"``.
    :param occurrence: value sent as ``"occurrence"``; presumably selects
        which match of the pattern is used - confirm.
    :return: result of ``wait_for_response``, or ``None`` when an
        ``HTTPError`` was raised (it is passed to ``_display_http_error``).
    """
    payload = {
        "seed": seed,
        "capture": capture,
        "mode": REGEX_MODE,
        "regexpattern": regexpattern,
        "occurrence": occurrence,
        "extractortype": EXTRACTOR_TEXT,
    }
    try:
        queued = self._post(op='/fragment/checksum/text', json=payload)
        return self.wait_for_response(self.status_query(queued))
    except HTTPError as e:
        self._display_http_error(e)
        return None
if checksum != lastchecksum:
print(resp.json())
print("http://wayback/web/" + capturedate + "/" + url)
print(capturedate + " " + checksum)
lastchecksum = checksum
def create_watchlist(self, urls):
    """Create a watchlist from ``urls``.

    Posts ``{"urls": urls}`` to the ``/watchlist`` operation.

    :param urls: value sent as ``"urls"``; presumably a list of URL
        strings - confirm.
    :return: whatever ``_post`` returns, or ``None`` when an ``HTTPError``
        was raised (it is passed to ``_display_http_error``).
    """
    payload = {"urls": urls}
    try:
        return self._post(op='/watchlist', json=payload)
    except HTTPError as e:
        self._display_http_error(e)
        return None
print("end")
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment