From b9c64ec9c705f582e453d0f4e7464bc74dfee551 Mon Sep 17 00:00:00 2001 From: Stefan Karner Date: Fri, 24 May 2019 16:22:23 +0200 Subject: [PATCH] Refactor: Use Enums for constants; reformat --- webarchiv.py | 59 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/webarchiv.py b/webarchiv.py index 0fb6bd1..733171e 100644 --- a/webarchiv.py +++ b/webarchiv.py @@ -2,20 +2,31 @@ import sys import time import requests import hashlib -import json +from enum import IntEnum from requests import HTTPError _datetime_format_string = '%Y%m%d%H%M%S' -EXTRACTOR_TEXT = 1 -EXTRACTOR_HTML = 2 -EXTRACTOR_BINARY = 3 +class ExtractorMode(IntEnum): + TEXT = 1 + HTML = 2 + BINARY = 3 + + +class TextExtractorMode(IntEnum): + POSITION_LEN = 1 + POSITION = 2 + REGEX = 3 + + +class ReturnType(IntEnum): + ELASTIC = 1 + WAYBACKCALHEAT = 2 + WAYBACK = 3 + COUNT = 4 + FRAGMENT = 5 -# Modes for TextExtractor -POSITIONLEN_MODE = 1 -POSITION_MODE = 2 -REGEX_MODE = 3 class SessionTimeoutError(Exception): pass @@ -209,6 +220,7 @@ class WebarchivSession: :param query_string: String to search for :param url: Search only captures starting at this exact web address + :param pagesize: Number of results per result page :param from_: Optional earliest date bound for the search in the format YYYYMM. :param to_: Optional latest date bound for the search @@ -274,11 +286,11 @@ class WebarchivSession: context = j['context'] requestid = j['requestid'] type_ = resp.json()['type'] - if type_ == 1: + if type_ == ReturnType.ELASTIC: r = self._get(op='/search/status/fulltext', params={'requestid': requestid}) - elif type_ == 2: + elif type_ == ReturnType.WAYBACKCALHEAT: r = self._get(op='/search/status/wayback', params={'requestid': requestid}) - elif type_ == 5: + elif type_ == ReturnType.FRAGMENT: r = self._get(op='/fragment/checksum/status', params={'requestid': requestid}) else: raise NotImplementedError(f'Unknown status query type {type_} - Please update client.') @@ -290,8 +302,11 @@ class WebarchivSession: Start a domain name search in the Webarchive. :param query_string: String to search for - :param page_: The page number parameter works with the page size parameter to control the offset of the records returned in the results. Default value is 1 - :param pagesize_: The page size parameter works with the page number parameter to control the offset of the records returned in the results. It also controls how many results are returned with each request. Default value is 10 + :param page_: The page number parameter works with the page size parameter + to control the offset of the records returned in the results. Default value is 1 + :param pagesize_: The page size parameter works with the page number parameter + to control the offset of the records returned in the results. + It also controls how many results are returned with each request. Default value is 100 :return: result as json """ params = {'q': query_string} @@ -364,7 +379,7 @@ class WebarchivSession: "capture": capture, "selector": selector, "occurrence": occurrence, - "extractortype": EXTRACTOR_HTML + "extractortype": ExtractorMode.HTML }) response = self.status_query(response) return self.wait_for_response(response) @@ -376,7 +391,7 @@ class WebarchivSession: response = self._post(op='/fragment/checksum/binary', json={ "seed": seed, "capture": capture, - "extractortype": EXTRACTOR_BINARY + "extractortype": ExtractorMode.BINARY }) response = self.status_query(response) return self.wait_for_response(response) @@ -388,10 +403,10 @@ class WebarchivSession: response = self._post(op='/fragment/checksum/text', json={ "seed": seed, "capture": capture, - "mode": POSITIONLEN_MODE, + "mode": TextExtractorMode.POSITION_LEN, "pos": pos, "len": len, - "extractortype": EXTRACTOR_TEXT + "extractortype": ExtractorMode.TEXT }) response = self.status_query(response) return self.wait_for_response(response) @@ -403,9 +418,9 @@ class WebarchivSession: response = self._post(op='/fragment/checksum/text', json={ "seed": seed, "capture": capture, - "mode": POSITION_MODE, + "mode": TextExtractorMode.POSITION, "pos": pos, - "extractortype": EXTRACTOR_TEXT + "extractortype": ExtractorMode.TEXT }) response = self.status_query(response) return self.wait_for_response(response) @@ -417,17 +432,16 @@ class WebarchivSession: response = self._post(op='/fragment/checksum/text', json={ "seed": seed, "capture": capture, - "mode": REGEX_MODE, + "mode": TextExtractorMode.REGEX, "regexpattern": regexpattern, "occurrence": occurrence, - "extractortype": EXTRACTOR_TEXT + "extractortype": ExtractorMode.TEXT }) response = self.status_query(response) return self.wait_for_response(response) except HTTPError as e: self._display_http_error(e) - def create_watchlist(self, urls): try: response = self._post(op='/watchlist', json={ @@ -436,4 +450,3 @@ class WebarchivSession: return response except HTTPError as e: self._display_http_error(e) - -- GitLab