From 4ff4d740a2e336f4ff4502ccedd53b466e11ba64 Mon Sep 17 00:00:00 2001 From: Andreas Date: Thu, 23 May 2019 16:40:59 +0200 Subject: [PATCH 1/4] save_page renaming --- sample8.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample8.ipynb b/sample8.ipynb index 7254f5d..e4b7029 100644 --- a/sample8.ipynb +++ b/sample8.ipynb @@ -55,7 +55,7 @@ } ], "source": [ - "response = w.savePage(\"http://www.onb.ac.at\")\n", + "response = w.save_page(\"http://www.onb.ac.at\")\n", "\n", "if response.status_code == 201:\n", " print(response.json())\n", -- GitLab From 8207c2d198a67eabcc5b4f1328a59d95019e5f9b Mon Sep 17 00:00:00 2001 From: Andreas Date: Thu, 23 May 2019 16:42:14 +0200 Subject: [PATCH 2/4] new methods --- webarchiv.py | 116 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 72 insertions(+), 44 deletions(-) diff --git a/webarchiv.py b/webarchiv.py index 0388094..0fb6bd1 100644 --- a/webarchiv.py +++ b/webarchiv.py @@ -2,6 +2,7 @@ import sys import time import requests import hashlib +import json from requests import HTTPError _datetime_format_string = '%Y%m%d%H%M%S' @@ -11,6 +12,10 @@ EXTRACTOR_TEXT = 1 EXTRACTOR_HTML = 2 EXTRACTOR_BINARY = 3 +# Modes for TextExtractor +POSITIONLEN_MODE = 1 +POSITION_MODE = 2 +REGEX_MODE = 3 class SessionTimeoutError(Exception): pass @@ -344,19 +349,13 @@ class WebarchivSession: return False def save_page(self, url): - self.connect() - r = requests.post(self.base_url.format('savepage'), - data='''{{ - "apikey": "{api_key}", - "t": "{token}", - "url": "{url}" - }}'''.format(api_key=self.api_key, token=self.token, url=url), - headers={ - 'content-type': 'application/json', - 'accept': 'application/ld+json' - } - ) - return r + try: + response = self._post(op='/savepage', json={ + "url": url + }) + return response + except HTTPError as e: + self._display_http_error(e) def fragment_checksum_html(self, seed, capture, selector, occurrence): try: @@ -372,40 +371,69 @@ class WebarchivSession: except HTTPError as e: self._display_http_error(e) + def fragment_checksum_binary(self, seed, capture): + try: + response = self._post(op='/fragment/checksum/binary', json={ + "seed": seed, + "capture": capture, + "extractortype": EXTRACTOR_BINARY + }) + response = self.status_query(response) + return self.wait_for_response(response) + except HTTPError as e: + self._display_http_error(e) -if __name__ == '__main__': - # noinspection SpellCheckingInspection - w = WebarchivSession('2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c') -# response = w.wayback_search("http://www.onb.ac.at") -# response = w.wayback_search("http://frauenhetz.jetzt") - url = "http://sport.orf.at/l/stories/2003717/" - response = w.wayback_search("http://sport.orf.at/l/stories/2003717/", "20110101000000", "20120401000000") -# response = w.wayback_search("x") - - if response.status_code != 200: - print("Error ", response.status_code) - exit(1) - - print(response.json()['total']) - - print(url) - - lastchecksum = '' - for capture in response.json()['hits']: - capturedate = capture['c'] + def fragment_checksum_text_positionlen(self, seed, capture, pos, len): + try: + response = self._post(op='/fragment/checksum/text', json={ + "seed": seed, + "capture": capture, + "mode": POSITIONLEN_MODE, + "pos": pos, + "len": len, + "extractortype": EXTRACTOR_TEXT + }) + response = self.status_query(response) + return self.wait_for_response(response) + except HTTPError as e: + self._display_http_error(e) - resp = w.fragment_checksum_html(url, capturedate, ".odd td", 3) - checksum = resp.json()['checksum'] - returncode = resp.json()['returncode'] + def fragment_checksum_text_position(self, seed, capture, pos): + try: + response = self._post(op='/fragment/checksum/text', json={ + "seed": seed, + "capture": capture, + "mode": POSITION_MODE, + "pos": pos, + "extractortype": EXTRACTOR_TEXT + }) + response = self.status_query(response) + return self.wait_for_response(response) + except HTTPError as e: + self._display_http_error(e) - if returncode == 2: - continue + def fragment_checksum_text_regex(self, seed, capture, regexpattern, occurrence): + try: + response = self._post(op='/fragment/checksum/text', json={ + "seed": seed, + "capture": capture, + "mode": REGEX_MODE, + "regexpattern": regexpattern, + "occurrence": occurrence, + "extractortype": EXTRACTOR_TEXT + }) + response = self.status_query(response) + return self.wait_for_response(response) + except HTTPError as e: + self._display_http_error(e) - if checksum != lastchecksum: - print(resp.json()) - print("http://wayback/web/" + capturedate + "/" + url) - print(capturedate + " " + checksum) - lastchecksum = checksum + def create_watchlist(self, urls): + try: + response = self._post(op='/watchlist', json={ + "urls": urls + }) + return response + except HTTPError as e: + self._display_http_error(e) - print("end") -- GitLab From 0133bb316011d228d42124abe88525851f38bfb8 Mon Sep 17 00:00:00 2001 From: Andreas Date: Thu, 23 May 2019 16:42:52 +0200 Subject: [PATCH 3/4] sample for html fragment checksumming and creating of a watchlist --- sample9.ipynb | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 sample9.ipynb diff --git a/sample9.ipynb b/sample9.ipynb new file mode 100644 index 0000000..fff08d2 --- /dev/null +++ b/sample9.ipynb @@ -0,0 +1,125 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a WebarchivSession Object with convenience methods for easy access with your API-Key " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from webarchiv import WebarchivSession\n", + "\n", + "apikey = '2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c'\n", + "w = WebarchivSession(apikey)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "url = \"http://sport.orf.at/l/stories/2003717/\"\n", + "response = w.wayback_search(\"http://sport.orf.at/l/stories/2003717/\", \"20110101000000\", \"20120401000000\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "if response.status_code != 200:\n", + " print(\"Error \", response.status_code)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "http://wayback/web/20110401202828/http://sport.orf.at/l/stories/2003717/\n", + "http://wayback/web/20110704202825/http://sport.orf.at/l/stories/2003717/\n", + "A watchlist with all captures mentioned above was generated. The code for this watchlist is Zp. \n", + "end\n" + ] + } + ], + "source": [ + "lastchecksum = ''\n", + "captures = []\n", + "for capture in response.json()['hits']:\n", + " capturedate = capture['c']\n", + "\n", + " resp = w.fragment_checksum_html(url, capturedate, \".odd td\", 3)\n", + " checksum = resp.json()['checksum']\n", + " returncode = resp.json()['returncode']\n", + "\n", + " if returncode != 0:\n", + " continue\n", + "\n", + " if checksum != lastchecksum:\n", + " #print(resp.json())\n", + " print(\"http://wayback/web/\" + capturedate + \"/\" + url)\n", + " capture = {\"url\": url, \"timestamp\": capturedate}\n", + " captures.append(capture)\n", + " #print(capturedate + \" \" + checksum)\n", + "\n", + " lastchecksum = checksum\n", + "\n", + "if len(captures) > 0:\n", + " response = w.create_watchlist(captures)\n", + " print (\"A watchlist with all captures mentioned above was generated. The code for this watchlist is \" + response.json() + \". \" )\n", + " \n", + " \n", + "print(\"end\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} -- GitLab From 7db1c1307c9633ca1223d96e4911cc887bca8b7e Mon Sep 17 00:00:00 2001 From: Andreas Date: Thu, 23 May 2019 16:43:32 +0200 Subject: [PATCH 4/4] sample fix --- sample8.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sample8.ipynb b/sample8.ipynb index e4b7029..51714d5 100644 --- a/sample8.ipynb +++ b/sample8.ipynb @@ -43,14 +43,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'nomination_id': 247, 'seed': 'http://www.onb.ac.at', 'nominationtype': 5}\n" + "{'nomination_id': 374, 'seed': 'http://www.onb.ac.at', 'nominationtype': 5}\n" ] } ], @@ -91,7 +91,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.6" } }, "nbformat": 4, -- GitLab