Skip to content
Snippets Groups Projects
Commit b9c64ec9 authored by Stefan Karner
Browse files

Refactor: Use Enums for constants; reformat

parent 0ea7228d
No related branches found
No related tags found
1 merge request: !13 Refactor: Use Enums for constants; reformat
......@@ -2,20 +2,31 @@ import sys
import time
import requests
import hashlib
import json
from enum import IntEnum
from requests import HTTPError
_datetime_format_string = '%Y%m%d%H%M%S'
# Legacy integer codes for the extractor type; ExtractorMode defines the same values.
EXTRACTOR_TEXT = 1
EXTRACTOR_HTML = 2
EXTRACTOR_BINARY = 3
class ExtractorMode(IntEnum):
    """Extractor type codes sent as ``extractortype`` in fragment checksum requests.

    Being an IntEnum, each member compares equal to its plain integer value,
    so the codes match the former EXTRACTOR_* integer constants.
    """

    TEXT = 1
    HTML = 2
    BINARY = 3
class TextExtractorMode(IntEnum):
    """Selection modes sent as ``mode`` in ``/fragment/checksum/text`` requests.

    POSITION_LEN is sent together with ``pos`` and ``len``, POSITION with
    ``pos`` only, and REGEX with ``regexpattern`` and ``occurrence``.
    Members compare equal to their integer values.
    """

    POSITION_LEN = 1
    POSITION = 2
    REGEX = 3
class ReturnType(IntEnum):
    """Codes compared against the ``type`` field of a response in the status query.

    In the visible status-query code, ELASTIC selects ``/search/status/fulltext``,
    WAYBACKCALHEAT selects ``/search/status/wayback`` and FRAGMENT selects
    ``/fragment/checksum/status``; any other value raises NotImplementedError there.
    Members compare equal to their integer values, so a raw integer ``type``
    from the JSON response matches the corresponding member.
    """

    ELASTIC = 1
    WAYBACKCALHEAT = 2
    WAYBACK = 3
    COUNT = 4
    FRAGMENT = 5
# Modes for TextExtractor (legacy integer codes; TextExtractorMode defines the same values)
POSITIONLEN_MODE = 1
POSITION_MODE = 2
REGEX_MODE = 3
class SessionTimeoutError(Exception):
    """Exception type for a session timeout.

    NOTE(review): the raise sites are outside this excerpt; confirm the exact
    condition that triggers it against the rest of the module.
    """
......@@ -209,6 +220,7 @@ class WebarchivSession:
:param query_string: String to search for
:param url: Search only captures starting at this exact web address
:param pagesize: Number of results per result page
:param from_: Optional earliest date bound for the search
in the format YYYYMM.
:param to_: Optional latest date bound for the search
......@@ -274,11 +286,11 @@ class WebarchivSession:
context = j['context']
requestid = j['requestid']
type_ = resp.json()['type']
if type_ == 1:
if type_ == ReturnType.ELASTIC:
r = self._get(op='/search/status/fulltext', params={'requestid': requestid})
elif type_ == 2:
elif type_ == ReturnType.WAYBACKCALHEAT:
r = self._get(op='/search/status/wayback', params={'requestid': requestid})
elif type_ == 5:
elif type_ == ReturnType.FRAGMENT:
r = self._get(op='/fragment/checksum/status', params={'requestid': requestid})
else:
raise NotImplementedError(f'Unknown status query type {type_} - Please update client.')
......@@ -290,8 +302,11 @@ class WebarchivSession:
Start a domain name search in the Webarchive.
:param query_string: String to search for
:param page_: The page number parameter works with the page size parameter to control the offset of the records returned in the results. Default value is 1
:param pagesize_: The page size parameter works with the page number parameter to control the offset of the records returned in the results. It also controls how many results are returned with each request. Default value is 10
:param page_: The page number parameter works with the page size parameter
to control the offset of the records returned in the results. Default value is 1
:param pagesize_: The page size parameter works with the page number parameter
to control the offset of the records returned in the results.
It also controls how many results are returned with each request. Default value is 100
:return: result as json
"""
params = {'q': query_string}
......@@ -364,7 +379,7 @@ class WebarchivSession:
"capture": capture,
"selector": selector,
"occurrence": occurrence,
"extractortype": EXTRACTOR_HTML
"extractortype": ExtractorMode.HTML
})
response = self.status_query(response)
return self.wait_for_response(response)
......@@ -376,7 +391,7 @@ class WebarchivSession:
response = self._post(op='/fragment/checksum/binary', json={
"seed": seed,
"capture": capture,
"extractortype": EXTRACTOR_BINARY
"extractortype": ExtractorMode.BINARY
})
response = self.status_query(response)
return self.wait_for_response(response)
......@@ -388,10 +403,10 @@ class WebarchivSession:
response = self._post(op='/fragment/checksum/text', json={
"seed": seed,
"capture": capture,
"mode": POSITIONLEN_MODE,
"mode": TextExtractorMode.POSITION_LEN,
"pos": pos,
"len": len,
"extractortype": EXTRACTOR_TEXT
"extractortype": ExtractorMode.TEXT
})
response = self.status_query(response)
return self.wait_for_response(response)
......@@ -403,9 +418,9 @@ class WebarchivSession:
response = self._post(op='/fragment/checksum/text', json={
"seed": seed,
"capture": capture,
"mode": POSITION_MODE,
"mode": TextExtractorMode.POSITION,
"pos": pos,
"extractortype": EXTRACTOR_TEXT
"extractortype": ExtractorMode.TEXT
})
response = self.status_query(response)
return self.wait_for_response(response)
......@@ -417,17 +432,16 @@ class WebarchivSession:
response = self._post(op='/fragment/checksum/text', json={
"seed": seed,
"capture": capture,
"mode": REGEX_MODE,
"mode": TextExtractorMode.REGEX,
"regexpattern": regexpattern,
"occurrence": occurrence,
"extractortype": EXTRACTOR_TEXT
"extractortype": ExtractorMode.TEXT
})
response = self.status_query(response)
return self.wait_for_response(response)
except HTTPError as e:
self._display_http_error(e)
def create_watchlist(self, urls):
try:
response = self._post(op='/watchlist', json={
......@@ -436,4 +450,3 @@ class WebarchivSession:
return response
except HTTPError as e:
self._display_http_error(e)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment