diff --git a/webarchiv.py b/webarchiv.py index 0fb6bd17fe9a069904fb5985a6b5c341c3e09736..733171ea41f400e6fe22cf419815a477e0b55fbe 100644 --- a/webarchiv.py +++ b/webarchiv.py @@ -2,20 +2,31 @@ import sys import time import requests import hashlib -import json +from enum import IntEnum from requests import HTTPError _datetime_format_string = '%Y%m%d%H%M%S' -EXTRACTOR_TEXT = 1 -EXTRACTOR_HTML = 2 -EXTRACTOR_BINARY = 3 +class ExtractorMode(IntEnum): + TEXT = 1 + HTML = 2 + BINARY = 3 + + +class TextExtractorMode(IntEnum): + POSITION_LEN = 1 + POSITION = 2 + REGEX = 3 + + +class ReturnType(IntEnum): + ELASTIC = 1 + WAYBACKCALHEAT = 2 + WAYBACK = 3 + COUNT = 4 + FRAGMENT = 5 -# Modes for TextExtractor -POSITIONLEN_MODE = 1 -POSITION_MODE = 2 -REGEX_MODE = 3 class SessionTimeoutError(Exception): pass @@ -209,6 +220,7 @@ class WebarchivSession: :param query_string: String to search for :param url: Search only captures starting at this exact web address + :param pagesize: Number of results per result page :param from_: Optional earliest date bound for the search in the format YYYYMM. :param to_: Optional latest date bound for the search @@ -274,11 +286,11 @@ class WebarchivSession: context = j['context'] requestid = j['requestid'] type_ = resp.json()['type'] - if type_ == 1: + if type_ == ReturnType.ELASTIC: r = self._get(op='/search/status/fulltext', params={'requestid': requestid}) - elif type_ == 2: + elif type_ == ReturnType.WAYBACKCALHEAT: r = self._get(op='/search/status/wayback', params={'requestid': requestid}) - elif type_ == 5: + elif type_ == ReturnType.FRAGMENT: r = self._get(op='/fragment/checksum/status', params={'requestid': requestid}) else: raise NotImplementedError(f'Unknown status query type {type_} - Please update client.') @@ -290,8 +302,11 @@ class WebarchivSession: Start a domain name search in the Webarchive. :param query_string: String to search for - :param page_: The page number parameter works with the page size parameter to control the offset of the records returned in the results. Default value is 1 - :param pagesize_: The page size parameter works with the page number parameter to control the offset of the records returned in the results. It also controls how many results are returned with each request. Default value is 10 + :param page_: The page number parameter works with the page size parameter + to control the offset of the records returned in the results. Default value is 1 + :param pagesize_: The page size parameter works with the page number parameter + to control the offset of the records returned in the results. + It also controls how many results are returned with each request. Default value is 100 :return: result as json """ params = {'q': query_string} @@ -364,7 +379,7 @@ class WebarchivSession: "capture": capture, "selector": selector, "occurrence": occurrence, - "extractortype": EXTRACTOR_HTML + "extractortype": ExtractorMode.HTML }) response = self.status_query(response) return self.wait_for_response(response) @@ -376,7 +391,7 @@ class WebarchivSession: response = self._post(op='/fragment/checksum/binary', json={ "seed": seed, "capture": capture, - "extractortype": EXTRACTOR_BINARY + "extractortype": ExtractorMode.BINARY }) response = self.status_query(response) return self.wait_for_response(response) @@ -388,10 +403,10 @@ class WebarchivSession: response = self._post(op='/fragment/checksum/text', json={ "seed": seed, "capture": capture, - "mode": POSITIONLEN_MODE, + "mode": TextExtractorMode.POSITION_LEN, "pos": pos, "len": len, - "extractortype": EXTRACTOR_TEXT + "extractortype": ExtractorMode.TEXT }) response = self.status_query(response) return self.wait_for_response(response) @@ -403,9 +418,9 @@ class WebarchivSession: response = self._post(op='/fragment/checksum/text', json={ "seed": seed, "capture": capture, - "mode": POSITION_MODE, + "mode": TextExtractorMode.POSITION, "pos": pos, - "extractortype": EXTRACTOR_TEXT + "extractortype": ExtractorMode.TEXT }) response = self.status_query(response) return self.wait_for_response(response) @@ -417,17 +432,16 @@ class WebarchivSession: response = self._post(op='/fragment/checksum/text', json={ "seed": seed, "capture": capture, - "mode": REGEX_MODE, + "mode": TextExtractorMode.REGEX, "regexpattern": regexpattern, "occurrence": occurrence, - "extractortype": EXTRACTOR_TEXT + "extractortype": ExtractorMode.TEXT }) response = self.status_query(response) return self.wait_for_response(response) except HTTPError as e: self._display_http_error(e) - def create_watchlist(self, urls): try: response = self._post(op='/watchlist', json={ @@ -436,4 +450,3 @@ class WebarchivSession: return response except HTTPError as e: self._display_http_error(e) -