diff --git a/sample6.ipynb b/sample6.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..6b024593315de92c7f050fe4c7ff63b63e20d848 --- /dev/null +++ b/sample6.ipynb @@ -0,0 +1,122 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from webarchiv import WebarchivSession\n", + "apikey = '2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c'\n", + "w = WebarchivSession(apikey)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Submit a fulltext search to get the number of captures returned, ordered by period and domain" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "response = w.histogram_search(\"situationselastisch\", 5)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The search always returns the full response. 
Checking for status_code 200 before extracting the response is always a good idea" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "if response.status_code != 200:\n", + " print(\"Something went wrong ...\")\n", + " exit(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now it is safe to extract the response" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the tophit of the domain with the largest number of hits which contains the words Nationalbibliothek, Prunksaal and Schwarzenegger" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['2009-01-01', '2010-01-01', '2011-01-01', '2012-01-01', '2013-01-01', '2014-01-01', '2015-01-01', '2016-01-01', '2017-01-01']\n[4, 4, 23, 28, 65, 859, 799, 77, 21]\n" + ] + } + ], + "source": [ + "x = [];\n", + "y = [];\n", + "for period in response.json()['hits']:\n", + " x.append(period['period'])\n", + " y.append(period['total'])\n", + " \n", + "print(x)\n", + "print(y)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/webarchiv.py b/webarchiv.py index 4dd3bd6481402e577c4a3ed7da66102cd4ea4f05..c70054af62625eb9350ead8da4aa82e705572427 100644 --- a/webarchiv.py +++ b/webarchiv.py @@ -213,16 +213,42 @@ class WebarchivSession: self._display_http_error(e) print('Error:'.format(query_string)) + def 
# Method of ``WebarchivSession`` (the class header lies outside this excerpt).
def histogram_search(self, query_string, interval_=3, from_=None, to_=None):
    """
    Start a fulltext histogram search in the Webarchive.

    Sends a GET request to ``/search/fulltext/histogram`` and passes the
    initial response on to ``waitForResponse()``.

    :param query_string: String to search for (sent as ``q``)
    :param interval_: Histogram interval (sent as ``interval``); left out of the request when falsy. Default value is 3. NOTE(review): unit/meaning is defined by the API - confirm
    :param from_: Optional value sent as ``from``; left out when falsy. NOTE(review): presumably the start of the searched period, expected format not visible here - confirm
    :param to_: Optional value sent as ``to``; left out when falsy. NOTE(review): presumably the end of the searched period - confirm
    :return: whatever ``waitForResponse()`` returns, or None when an HTTPError was raised (the error is reported on stdout)
    """
    params = {'q': query_string}
    optional = (('interval', interval_), ('from', from_), ('to', to_))
    # Only forward the optional parameters that were actually supplied.
    params.update((key, value) for key, value in optional if value)

    try:
        response = self._get(op='/search/fulltext/histogram', params=params)
        return self.waitForResponse(response)
    except HTTPError as e:
        self._display_http_error(e)
        # Include the query so the failing search can be identified.
        print('Error: {}'.format(query_string))
        return None


if __name__ == '__main__':
    # NOTE(review): this API key is committed in source; it should be loaded
    # from configuration/environment and the published key rotated.
    # noinspection SpellCheckingInspection
    w = WebarchivSession('2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c')
    # response = w.wayback_search("http://www.onb.ac.at")
    # response = w.wayback_search("http://frauenhetz.jetzt")
    response = w.histogram_search("Nationalbibliothek Prunksaal Schwarzenegger")
    # response = w.wayback_search("x")

    if response is None:
        # histogram_search() returns None after it has reported an HTTPError,
        # so there is no status code to inspect.
        raise SystemExit(1)

    if response.status_code == 200:
        print(response.json())
    else:
        print("Error ", response.status_code)