Commit 47703c1f authored by Stefan Karner's avatar Stefan Karner

Merge branch 'master' into 'master'

histogram search prototype

See merge request !3
parents e2561047 17c7d67f
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from webarchiv import WebarchivSession\n",
"apikey = '2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c'\n",
"w = WebarchivSession(apikey)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Submit a fulltext search to get the number of captures returns ordered by period and domaina"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"response = w.histogram_search(\"situationselastisch\", 5)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The search always returns the full response. Checking for status_code 200 before extracting the response is always a good idea"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"if response.status_code != 200:\n",
" print(\"Something went wrong ...\")\n",
" exit(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now it is safe to extract the response"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the tophit of the domain with the largest number of hits which contains the words Nationalbliothek, Prunksaal and Schwarzenegger"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['2009-01-01', '2010-01-01', '2011-01-01', '2012-01-01', '2013-01-01', '2014-01-01', '2015-01-01', '2016-01-01', '2017-01-01']\n[4, 4, 23, 28, 65, 859, 799, 77, 21]\n"
]
}
],
"source": [
"x = [];\n",
"y = [];\n",
"for period in response.json()['hits']:\n",
" x.append(period['period'])\n",
" y.append(period['total'])\n",
" \n",
"print(x)\n",
"print(y)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
......@@ -213,16 +213,42 @@ class WebarchivSession:
self._display_http_error(e)
print('Error:'.format(query_string))
def histogram_search(self, query_string, interval_=3, from_=None, to_=None):
    """
    Start a fulltext histogram search in the Webarchive.

    The query is sent to the ``/search/fulltext/histogram`` endpoint and the
    answer is then resolved through ``waitForResponse()``.
    The current status of running queries can be read via status_open_queries().

    :param query_string: String to search for (sent as ``q``)
    :param interval_: Sent as the ``interval`` request parameter; omitted when falsy. Default value is 3. Presumably the size of one histogram period -- TODO confirm unit against the API documentation
    :param from_: Sent as the ``from`` request parameter; omitted when falsy. Presumably the start of the searched time range -- TODO confirm expected format
    :param to_: Sent as the ``to`` request parameter; omitted when falsy. Presumably the end of the searched time range -- TODO confirm expected format
    :return: the value returned by ``waitForResponse()``; ``None`` when an HTTPError was raised and reported
    """
    params = {'q': query_string}
    # Optional parameters are only transmitted when they carry a truthy value.
    for key, value in (('interval', interval_), ('from', from_), ('to', to_)):
        if value:
            params[key] = value

    try:
        response = self._get(op='/search/fulltext/histogram', params=params)
        return self.waitForResponse(response)
    except HTTPError as e:
        self._display_http_error(e)
        # The placeholder was missing before, so the failing query was never shown.
        print('Error: {}'.format(query_string))
if __name__ == '__main__':
    # Small manual smoke test: run one histogram search and print the result.
    import os

    # The API key is read from the environment so that no credential is
    # committed to the repository.
    api_key = os.environ.get('WEBARCHIV_API_KEY')
    if not api_key:
        raise SystemExit('Set the WEBARCHIV_API_KEY environment variable to your Webarchiv API key')

    w = WebarchivSession(api_key)
    response = w.histogram_search("Nationalbibliothek Prunksaal Schwarzenegger")

    if response is None:
        # histogram_search() returns None after it has reported an HTTPError.
        print("Error: no response received")
    elif response.status_code == 200:
        # Parse the body once and reuse it for both prints.
        payload = response.json()
        print(payload['total'], " Captures")
        print(payload)
    else:
        print("Error ", response.status_code)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment