diff --git a/webarchiv.py b/webarchiv.py
index 0388094db403c976f71fee0e7deed9bd5fe3867b..0fb6bd17fe9a069904fb5985a6b5c341c3e09736 100644
--- a/webarchiv.py
+++ b/webarchiv.py
@@ -2,6 +2,7 @@ import sys
 import time
 import requests
 import hashlib
+import json
 from requests import HTTPError
 
 _datetime_format_string = '%Y%m%d%H%M%S'
@@ -11,6 +12,10 @@ EXTRACTOR_TEXT = 1
 EXTRACTOR_HTML = 2
 EXTRACTOR_BINARY = 3
 
+# Modes for TextExtractor
+POSITIONLEN_MODE = 1
+POSITION_MODE = 2
+REGEX_MODE = 3
 
 class SessionTimeoutError(Exception):
     pass
@@ -344,19 +349,13 @@ class WebarchivSession:
         return False
 
     def save_page(self, url):
-        self.connect()
-        r = requests.post(self.base_url.format('savepage'),
-                          data='''{{
-                          "apikey": "{api_key}",
-                          "t": "{token}",
-                          "url": "{url}"
-                          }}'''.format(api_key=self.api_key, token=self.token, url=url),
-                          headers={
-                              'content-type': 'application/json',
-                              'accept': 'application/ld+json'
-                          }
-                          )
-        return r
+        try:
+            response = self._post(op='/savepage', json={
+                "url": url
+            })
+            return response
+        except HTTPError as e:
+            self._display_http_error(e)
 
     def fragment_checksum_html(self, seed, capture, selector, occurrence):
         try:
@@ -372,40 +371,69 @@
         except HTTPError as e:
             self._display_http_error(e)
 
+    def fragment_checksum_binary(self, seed, capture):
+        try:
+            response = self._post(op='/fragment/checksum/binary', json={
+                "seed": seed,
+                "capture": capture,
+                "extractortype": EXTRACTOR_BINARY
+            })
+            response = self.status_query(response)
+            return self.wait_for_response(response)
+        except HTTPError as e:
+            self._display_http_error(e)
 
-if __name__ == '__main__':
-    # noinspection SpellCheckingInspection
-    w = WebarchivSession('2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c')
-# response = w.wayback_search("http://www.onb.ac.at")
-# response = w.wayback_search("http://frauenhetz.jetzt")
-    url = "http://sport.orf.at/l/stories/2003717/"
-    response = w.wayback_search("http://sport.orf.at/l/stories/2003717/", "20110101000000", "20120401000000")
-# response = w.wayback_search("x")
-
-    if response.status_code != 200:
-        print("Error ", response.status_code)
-        exit(1)
-
-    print(response.json()['total'])
-
-    print(url)
-
-    lastchecksum = ''
-    for capture in response.json()['hits']:
-        capturedate = capture['c']
 
+    def fragment_checksum_text_positionlen(self, seed, capture, pos, len):
+        try:
+            response = self._post(op='/fragment/checksum/text', json={
+                "seed": seed,
+                "capture": capture,
+                "mode": POSITIONLEN_MODE,
+                "pos": pos,
+                "len": len,
+                "extractortype": EXTRACTOR_TEXT
+            })
+            response = self.status_query(response)
+            return self.wait_for_response(response)
+        except HTTPError as e:
+            self._display_http_error(e)
 
-        resp = w.fragment_checksum_html(url, capturedate, ".odd td", 3)
-        checksum = resp.json()['checksum']
-        returncode = resp.json()['returncode']
 
+    def fragment_checksum_text_position(self, seed, capture, pos):
+        try:
+            response = self._post(op='/fragment/checksum/text', json={
+                "seed": seed,
+                "capture": capture,
+                "mode": POSITION_MODE,
+                "pos": pos,
+                "extractortype": EXTRACTOR_TEXT
+            })
+            response = self.status_query(response)
+            return self.wait_for_response(response)
+        except HTTPError as e:
+            self._display_http_error(e)
 
-        if returncode == 2:
-            continue
 
+    def fragment_checksum_text_regex(self, seed, capture, regexpattern, occurrence):
+        try:
+            response = self._post(op='/fragment/checksum/text', json={
+                "seed": seed,
+                "capture": capture,
+                "mode": REGEX_MODE,
+                "regexpattern": regexpattern,
+                "occurrence": occurrence,
+                "extractortype": EXTRACTOR_TEXT
+            })
+            response = self.status_query(response)
+            return self.wait_for_response(response)
+        except HTTPError as e:
+            self._display_http_error(e)
 
-        if checksum != lastchecksum:
-            print(resp.json())
-            print("http://wayback/web/" + capturedate + "/" + url)
-            print(capturedate + " " + checksum)
-            lastchecksum = checksum
 
+    def create_watchlist(self, urls):
+        try:
+            response = self._post(op='/watchlist', json={
+                "urls": urls
+            })
+            return response
+        except HTTPError as e:
+            self._display_http_error(e)
 
-    print("end")
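
Usage sketch (not part of the patch): the __main__ demo removed above drove fragment_checksum_html over Wayback captures, and the new checksum helpers added here can be called the same way. Assumptions: webarchiv.py is importable as webarchiv, 'YOUR_API_KEY' stands in for a real API key, and the regex pattern in the commented-out call is a placeholder.

# Minimal usage sketch under the assumptions stated above.
from webarchiv import WebarchivSession

w = WebarchivSession('YOUR_API_KEY')  # placeholder API key
url = "http://sport.orf.at/l/stories/2003717/"
response = w.wayback_search(url, "20110101000000", "20120401000000")
if response.status_code != 200:
    print("Error ", response.status_code)
    exit(1)

lastchecksum = ''
for capture in response.json()['hits']:
    capturedate = capture['c']

    # CSS-selector checksum, as in the removed demo ...
    resp = w.fragment_checksum_html(url, capturedate, ".odd td", 3)
    # ... or, with this patch, one of the text-extractor variants, e.g. by regex
    # (placeholder pattern):
    # resp = w.fragment_checksum_text_regex(url, capturedate, "[0-9]+ : [0-9]+", 1)

    # The helpers return None on HTTPError; returncode 2 is skipped as in the demo.
    if resp is None or resp.json()['returncode'] == 2:
        continue

    checksum = resp.json()['checksum']
    if checksum != lastchecksum:
        print("http://wayback/web/" + capturedate + "/" + url)
        print(capturedate + " " + checksum)
        lastchecksum = checksum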