onbpre · onbpre · onbpre · onbpre · 7db1c130 · 7db1c130
--- a/sample8.ipynb
+++ b/sample8.ipynb
 %% Cell type:markdown id: tags:


 %% Cell type:code id: tags:

 ``` python
 import requests
 import json
 ```

 %% Cell type:markdown id: tags:

 Create a WebarchivSession Object with convenience methods for easy access with your API-Key

 %% Cell type:code id: tags:

 ``` python
 from webarchiv import WebarchivSession

 apikey = '2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c'
 w = WebarchivSession(apikey)
 ```

 %% Cell type:raw id: tags:

 request archiving a Webpage

 %% Cell type:code id: tags:

 ``` python
-response = w.savePage("http://www.onb.ac.at")
+response = w.save_page("http://www.onb.ac.at")

 if response.status_code == 201:
    print(response.json())
 else:
    print("Error ", response.status_code)




 ```

 %% Output

-    {'nomination_id': 247, 'seed': 'http://www.onb.ac.at', 'nominationtype': 5}
+    {'nomination_id': 374, 'seed': 'http://www.onb.ac.at', 'nominationtype': 5}

 %% Cell type:code id: tags:

 ``` python
 ```

 %% Cell type:markdown id: tags:


 %% Cell type:code id: tags:

 ``` python
 import requests
 import json
 ```

 %% Cell type:markdown id: tags:

 Create a WebarchivSession Object with convenience methods for easy access with your API-Key

 %% Cell type:code id: tags:

 ``` python
 from webarchiv import WebarchivSession

 apikey = '2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c'
 w = WebarchivSession(apikey)
 ```

 %% Cell type:raw id: tags:

 request archiving a Webpage

 %% Cell type:code id: tags:

 ``` python
-response = w.savePage("http://www.onb.ac.at")
+response = w.save_page("http://www.onb.ac.at")

 if response.status_code == 201:
    print(response.json())
 else:
    print("Error ", response.status_code)




 ```

 %% Output

-    {'nomination_id': 247, 'seed': 'http://www.onb.ac.at', 'nominationtype': 5}
+    {'nomination_id': 374, 'seed': 'http://www.onb.ac.at', 'nominationtype': 5}

 %% Cell type:code id: tags:

 ``` python
 ```

--- a/sample9.ipynb
+++ b/sample9.ipynb
+%% Cell type:markdown id: tags:
+
+
+%% Cell type:code id: tags:
+
+``` python
+import requests
+import json
+```
+
+%% Cell type:markdown id: tags:
+
+Create a `WebarchivSession` object with your API key; it provides convenience methods for easy access to the web archive API.
+
+%% Cell type:code id: tags:
+
+``` python
+from webarchiv import WebarchivSession
+
+apikey = '2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c'
+w = WebarchivSession(apikey)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+url = "http://sport.orf.at/l/stories/2003717/"
+response = w.wayback_search("http://sport.orf.at/l/stories/2003717/", "20110101000000", "20120401000000")
+```
+
+%% Cell type:code id: tags:
+
+``` python
+if response.status_code != 200:
+    print("Error ", response.status_code)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+lastchecksum = ''
+captures = []
+for capture in response.json()['hits']:
+    capturedate = capture['c']
+
+    resp = w.fragment_checksum_html(url, capturedate, ".odd td", 3)
+    checksum = resp.json()['checksum']
+    returncode = resp.json()['returncode']
+
+    if returncode != 0:
+        continue
+
+    if checksum != lastchecksum:
+        #print(resp.json())
+        print("http://wayback/web/" + capturedate + "/" + url)
+        capture = {"url": url, "timestamp": capturedate}
+        captures.append(capture)
+        #print(capturedate + " " + checksum)
+
+    lastchecksum = checksum
+
+if len(captures) > 0:
+    response = w.create_watchlist(captures)
+    print ("A watchlist with all captures mentioned above was generated. The code for this watchlist is " +  response.json() + ". " )
+
+
+print("end")
+```
+
+%% Output
+
+    http://wayback/web/20110401202828/http://sport.orf.at/l/stories/2003717/
+    http://wayback/web/20110704202825/http://sport.orf.at/l/stories/2003717/
+    A watchlist with all captures mentioned above was generated. The code for this watchlist is Zp.
+    end
+%% Cell type:markdown id: tags:
+
+
+%% Cell type:code id: tags:
+
+``` python
+import requests
+import json
+```
+
+%% Cell type:markdown id: tags:
+
+Create a `WebarchivSession` object with your API key; it provides convenience methods for easy access to the web archive API.
+
+%% Cell type:code id: tags:
+
+``` python
+from webarchiv import WebarchivSession
+
+apikey = '2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c'
+w = WebarchivSession(apikey)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+url = "http://sport.orf.at/l/stories/2003717/"
+response = w.wayback_search("http://sport.orf.at/l/stories/2003717/", "20110101000000", "20120401000000")
+```
+
+%% Cell type:code id: tags:
+
+``` python
+if response.status_code != 200:
+    print("Error ", response.status_code)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+lastchecksum = ''
+captures = []
+for capture in response.json()['hits']:
+    capturedate = capture['c']
+
+    resp = w.fragment_checksum_html(url, capturedate, ".odd td", 3)
+    checksum = resp.json()['checksum']
+    returncode = resp.json()['returncode']
+
+    if returncode != 0:
+        continue
+
+    if checksum != lastchecksum:
+        #print(resp.json())
+        print("http://wayback/web/" + capturedate + "/" + url)
+        capture = {"url": url, "timestamp": capturedate}
+        captures.append(capture)
+        #print(capturedate + " " + checksum)
+
+    lastchecksum = checksum
+
+if len(captures) > 0:
+    response = w.create_watchlist(captures)
+    print ("A watchlist with all captures mentioned above was generated. The code for this watchlist is " +  response.json() + ". " )
+
+
+print("end")
+```
+
+%% Output
+
+    http://wayback/web/20110401202828/http://sport.orf.at/l/stories/2003717/
+    http://wayback/web/20110704202825/http://sport.orf.at/l/stories/2003717/
+    A watchlist with all captures mentioned above was generated. The code for this watchlist is Zp.
+    end
--- a/webarchiv.py
+++ b/webarchiv.py
@@ -2,6 +2,7 @@ import sys
 import time
 import requests
 import hashlib
+import json
 from requests import HTTPError

 _datetime_format_string = '%Y%m%d%H%M%S'
@@ -11,6 +12,10 @@ EXTRACTOR_TEXT = 1
 EXTRACTOR_HTML = 2
 EXTRACTOR_BINARY = 3

+# Modes for TextExtractor (sent as the "mode" field to /fragment/checksum/text)
+POSITIONLEN_MODE = 1
+POSITION_MODE = 2
+REGEX_MODE = 3

 class SessionTimeoutError(Exception):
    pass
@@ -344,19 +349,13 @@ class WebarchivSession:
            return False

    def save_page(self, url):
-        self.connect()
-        r = requests.post(self.base_url.format('savepage'),
-                          data='''{{
-                              "apikey": "{api_key}",
-                              "t": "{token}",
-                              "url": "{url}"
-                          }}'''.format(api_key=self.api_key, token=self.token, url=url),
-                          headers={
-                              'content-type': 'application/json',
-                              'accept': 'application/ld+json'
-                          }
-                          )
-        return r
+        try:
+            response = self._post(op='/savepage', json={
+                              "url": url
+                          })
+            return response
+        except HTTPError as e:
+            self._display_http_error(e)

    def fragment_checksum_html(self, seed, capture, selector, occurrence):
        try:
@@ -372,40 +371,69 @@ class WebarchivSession:
        except HTTPError as e:
            self._display_http_error(e)

+    def fragment_checksum_binary(self, seed, capture):
+        try:
+            response = self._post(op='/fragment/checksum/binary', json={
+                              "seed": seed,
+                              "capture": capture,
+                              "extractortype": EXTRACTOR_BINARY
+                          })
+            response = self.status_query(response)
+            return self.wait_for_response(response)
+        except HTTPError as e:
+            self._display_http_error(e)

-if __name__ == '__main__':
-    # noinspection SpellCheckingInspection
-    w = WebarchivSession('2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c')
-#    response = w.wayback_search("http://www.onb.ac.at")
-#    response = w.wayback_search("http://frauenhetz.jetzt")
-    url = "http://sport.orf.at/l/stories/2003717/"
-    response = w.wayback_search("http://sport.orf.at/l/stories/2003717/", "20110101000000", "20120401000000")
-#    response = w.wayback_search("x")
-
-    if response.status_code != 200:
-        print("Error ", response.status_code)
-        exit(1)
-
-    print(response.json()['total'])
-
-    print(url)
-
-    lastchecksum = ''
-    for capture in response.json()['hits']:
-        capturedate = capture['c']
+    def fragment_checksum_text_positionlen(self, seed, capture, pos, len):
+        try:
+            response = self._post(op='/fragment/checksum/text', json={
+                              "seed": seed,
+                              "capture": capture,
+                              "mode": POSITIONLEN_MODE,
+                              "pos": pos,
+                              "len": len,
+                              "extractortype": EXTRACTOR_TEXT
+                          })
+            response = self.status_query(response)
+            return self.wait_for_response(response)
+        except HTTPError as e:
+            self._display_http_error(e)

-        resp = w.fragment_checksum_html(url, capturedate, ".odd td", 3)
-        checksum = resp.json()['checksum']
-        returncode = resp.json()['returncode']
+    def fragment_checksum_text_position(self, seed, capture, pos):
+        try:
+            response = self._post(op='/fragment/checksum/text', json={
+                              "seed": seed,
+                              "capture": capture,
+                              "mode": POSITION_MODE,
+                              "pos": pos,
+                              "extractortype": EXTRACTOR_TEXT
+                          })
+            response = self.status_query(response)
+            return self.wait_for_response(response)
+        except HTTPError as e:
+            self._display_http_error(e)

-        if returncode == 2:
-            continue
+    def fragment_checksum_text_regex(self, seed, capture, regexpattern, occurrence):
+        try:
+            response = self._post(op='/fragment/checksum/text', json={
+                              "seed": seed,
+                              "capture": capture,
+                              "mode": REGEX_MODE,
+                              "regexpattern": regexpattern,
+                              "occurrence": occurrence,
+                              "extractortype": EXTRACTOR_TEXT
+                          })
+            response = self.status_query(response)
+            return self.wait_for_response(response)
+        except HTTPError as e:
+            self._display_http_error(e)

-        if checksum != lastchecksum:
-            print(resp.json())
-            print("http://wayback/web/" + capturedate + "/" + url)
-            print(capturedate + " " + checksum)

-        lastchecksum = checksum
+    def create_watchlist(self, urls):
+        try:
+            response = self._post(op='/watchlist', json={
+                              "urls": urls
+                          })
+            return response
+        except HTTPError as e:
+            self._display_http_error(e)

-    print("end")