From 89f62c55aae8b8ad107635bb8748cbf9f4a77dea Mon Sep 17 00:00:00 2001 From: Andreas Date: Mon, 11 Feb 2019 16:53:45 +0100 Subject: [PATCH] Sample for using https://webarchiv.onb.ac.at/api.html#/search/fulltext/histogram --- requirements.txt | 3 +- sample6.ipynb | 152 +++++++++++++++++++++++++++++++++++++++++++++++ webarchiv.py | 33 +++++++++- 3 files changed, 184 insertions(+), 4 deletions(-) create mode 100644 sample6.ipynb diff --git a/requirements.txt b/requirements.txt index 484a974..740b633 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ requests>=2.0 -jupyter>=1.0 \ No newline at end of file +jupyter>=1.0 +plotly>=1.9.0 \ No newline at end of file diff --git a/sample6.ipynb b/sample6.ipynb new file mode 100644 index 0000000..651dcb0 --- /dev/null +++ b/sample6.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import plotly.offline as py\n", + "import plotly.graph_objs as go\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from webarchiv import WebarchivSession\n", + "apikey = 'Zz2tQls7fuaocX2pjrfc2npojqbGwXL2'\n", + "w = WebarchivSession(apikey)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Submit a fulltext search to get the number of captures returned, ordered by period and domain" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "response = w.histogram_search(\"situationselastisch\", 5)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The search always returns the full response. 
Checking for status_code 200 before extracting the response is always a good idea" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "if response.status_code != 200:\n", + " print(\"Something went wrong ...\")\n", + " exit(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now it is safe to extract the response" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot the number of captures per period that contain the word situationselastisch as a bar chart" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['2009-01-01', '2010-01-01', '2011-01-01', '2012-01-01', '2013-01-01', '2014-01-01', '2015-01-01', '2016-01-01', '2017-01-01']\n[4, 4, 23, 28, 65, 859, 799, 77, 21]\n" + ] + } + ], + "source": [ + "x = [];\n", + "y = [];\n", + "for period in response.json()['hits']:\n", + " x.append(period['period'])\n", + " y.append(period['total'])\n", + " \n", + "trace0 = go.Bar(\n", + " x=x,\n", + " y=y,\n", + " name='Anzahl Captures mit \"situationselastisch\"',\n", + " marker=dict(\n", + " color='rgb(49,130,189)'\n", + " )\n", + ")\n", + "\n", + "data = [trace0]\n", + "layout = go.Layout(\n", + " xaxis=dict(tickangle=-45),\n", + " barmode='group',\n", + ")\n", + "\n", + "fig = go.Figure(data=data, layout=layout)\n", + "py.iplot(fig, filename='angled-text-bar')\n", + "\n", + "\n", + "print(x)\n", + "print(y)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/webarchiv.py b/webarchiv.py index 4dd3bd6..922b439 100644 --- a/webarchiv.py +++ b/webarchiv.py @@ -24,8 +24,9 @@ class WebarchivSession: Protocol, domain and path prefix for the Webarchive API, with a single positional format string placeholder for the REST operation and parameters. - """ return 'https://webarchiv.onb.ac.at/api/{}' + """ + return 'http://192.168.1.202:8080/webarchive_web/api/{}' @property def _error_template(self): @@ -213,16 +214,42 @@ class WebarchivSession: self._display_http_error(e) print('Error:'.format(query_string)) + def histogram_search(self, query_string, interval_=3, from_=None, to_=None): + """ + Start a domain name search in the Webarchive. + The current status of running queries can be read via status_open_queries(). + + :param query_string: String to search for + :param page_: The page number parameter works with the page size parameter to control the offset of the records returned in the results. Default value is 1 + :param pagesize_: The page size parameter works with the page number parameter to control the offset of the records returned in the results. It also controls how many results are returned with each request. 
Default value is 10 + :return: result as json + """ + params = {'q': query_string} + if interval_: + params['interval'] = interval_ + if from_: + params['from'] = from_ + if to_: + params['to'] = to_ + + try: + response = self._get(op='/search/fulltext/histogram', params=params) + return self.waitForResponse(response) + + except HTTPError as e: + self._display_http_error(e) + print('Error:'.format(query_string)) + if __name__ == '__main__': # noinspection SpellCheckingInspection w = WebarchivSession('2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c') # response = w.wayback_search("http://www.onb.ac.at") #response = w.wayback_search("http://frauenhetz.jetzt") - response = w.fulltext_search("Nationalbibliothek Prunksaal Schwarzenegger") + response = w.histogram_search("Nationalbibliothek Prunksaal Schwarzenegger") # response = w.wayback_search("x") if response.status_code == 200: - print(response.json()['total'], " Captures") + print(response.json()) else: print("Error ", response.status_code) -- GitLab