diff --git a/requirements.txt b/requirements.txt index 484a974635e087f6c9fefb168ba491b6110756de..740b633bf20b621a70155dd9bf04b8db225fee7b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ requests>=2.0 -jupyter>=1.0 \ No newline at end of file +jupyter>=1.0 +plotly>=1.9.0 \ No newline at end of file diff --git a/sample6.ipynb b/sample6.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..651dcb02e24ce69395fa5fa5a5954fcfd2061774 --- /dev/null +++ b/sample6.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import plotly.offline as py\n", + "import plotly.graph_objs as go\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from webarchiv import WebarchivSession\n", + "apikey = 'Zz2tQls7fuaocX2pjrfc2npojqbGwXL2'\n", + "w = WebarchivSession(apikey)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Submit a fulltext search to get the number of captures returned, ordered by period and domain" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "response = w.histogram_search(\"situationselastisch\", 5)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The search always returns the full response. 
Checking for status_code 200 before extracting the response is always a good idea" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "if response.status_code != 200:\n", + " print(\"Something went wrong ...\")\n", + " exit(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now it is safe to extract the response" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the tophit of the domain with the largest number of hits which contains the words Nationalbibliothek, Prunksaal and Schwarzenegger" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['2009-01-01', '2010-01-01', '2011-01-01', '2012-01-01', '2013-01-01', '2014-01-01', '2015-01-01', '2016-01-01', '2017-01-01']\n[4, 4, 23, 28, 65, 859, 799, 77, 21]\n" + ] + } + ], + "source": [ + "x = [];\n", + "y = [];\n", + "for period in response.json()['hits']:\n", + " x.append(period['period'])\n", + " y.append(period['total'])\n", + " \n", + "trace0 = go.Bar(\n", + " x=x,\n", + " y=y,\n", + " name='Anzahl Captures mit \"situationselastisch\"',\n", + " marker=dict(\n", + " color='rgb(49,130,189)'\n", + " )\n", + ")\n", + "\n", + "data = [trace0]\n", + "layout = go.Layout(\n", + " xaxis=dict(tickangle=-45),\n", + " barmode='group',\n", + ")\n", + "\n", + "fig = go.Figure(data=data, layout=layout)\n", + "py.iplot(fig, filename='angled-text-bar')\n", + "\n", + "\n", + "print(x)\n", + "print(y)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
def histogram_search(self, query_string, interval_=3, from_=None, to_=None):
    """
    Start a fulltext histogram search in the Webarchive.

    Sends the query to the ``/search/fulltext/histogram`` operation and
    passes the initial response on to ``waitForResponse``.

    :param query_string: String to search for; sent as the ``q`` parameter
    :param interval_: Sent as the ``interval`` parameter; omitted from the request when falsy. Default value is 3
    :param from_: Sent as the ``from`` parameter; omitted when falsy (presumably the start of the period to cover - TODO confirm against the API)
    :param to_: Sent as the ``to`` parameter; omitted when falsy (presumably the end of the period to cover - TODO confirm against the API)
    :return: the value returned by ``waitForResponse`` (callers read ``status_code`` and ``json()`` from it), or None when an HTTPError was raised and reported
    """
    params = {'q': query_string}
    # Optional parameters are only added when the caller supplied a truthy value.
    if interval_:
        params['interval'] = interval_
    if from_:
        params['from'] = from_
    if to_:
        params['to'] = to_

    try:
        response = self._get(op='/search/fulltext/histogram', params=params)
        return self.waitForResponse(response)
    except HTTPError as e:
        self._display_http_error(e)
        # The previous format string had no placeholder, so the failing
        # query was silently dropped from the message.
        print('Error: {}'.format(query_string))
        return None


if __name__ == '__main__':
    # NOTE(review): the API key is committed in source; consider loading it
    # from configuration or the environment instead.
    # noinspection SpellCheckingInspection
    w = WebarchivSession('2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c')
    # response = w.wayback_search("http://www.onb.ac.at")
    # response = w.wayback_search("http://frauenhetz.jetzt")
    response = w.histogram_search("Nationalbibliothek Prunksaal Schwarzenegger")
    # response = w.wayback_search("x")
    if response is None:
        # histogram_search returns None after it has reported an HTTPError.
        print("Error: no response received")
    elif response.status_code == 200:
        print(response.json())
    else:
        print("Error ", response.status_code)