Commit 89f62c55 authored by Andreas's avatar Andreas
parent 92be5697
requests>=2.0
jupyter>=1.0
\ No newline at end of file
jupyter>=1.0
plotly>=1.9.0
\ No newline at end of file
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import plotly.offline as py\n",
"import plotly.graph_objs as go\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from webarchiv import WebarchivSession\n",
"apikey = 'Zz2tQls7fuaocX2pjrfc2npojqbGwXL2'\n",
"w = WebarchivSession(apikey)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Submit a fulltext search to get the number of captures returns ordered by period and domaina"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"response = w.histogram_search(\"situationselastisch\", 5)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The search always returns the full response. Checking for status_code 200 before extracting the response is always a good idea"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"if response.status_code != 200:\n",
" print(\"Something went wrong ...\")\n",
" exit(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now it is safe to extract the response"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the tophit of the domain with the largest number of hits which contains the words Nationalbliothek, Prunksaal and Schwarzenegger"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['2009-01-01', '2010-01-01', '2011-01-01', '2012-01-01', '2013-01-01', '2014-01-01', '2015-01-01', '2016-01-01', '2017-01-01']\n[4, 4, 23, 28, 65, 859, 799, 77, 21]\n"
]
}
],
"source": [
"x = [];\n",
"y = [];\n",
"for period in response.json()['hits']:\n",
" x.append(period['period'])\n",
" y.append(period['total'])\n",
" \n",
"trace0 = go.Bar(\n",
" x=x,\n",
" y=y,\n",
" name='Anzahl Captures mit \"situationselastisch\"',\n",
" marker=dict(\n",
" color='rgb(49,130,189)'\n",
" )\n",
")\n",
"\n",
"data = [trace0]\n",
"layout = go.Layout(\n",
" xaxis=dict(tickangle=-45),\n",
" barmode='group',\n",
")\n",
"\n",
"fig = go.Figure(data=data, layout=layout)\n",
"py.iplot(fig, filename='angled-text-bar')\n",
"\n",
"\n",
"print(x)\n",
"print(y)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
......@@ -24,8 +24,9 @@ class WebarchivSession:
Protocol, domain and path prefix for the Webarchive API,
with a single positional format string placeholder
for the REST operation and parameters.
"""
return 'https://webarchiv.onb.ac.at/api/{}'
"""
return 'http://192.168.1.202:8080/webarchive_web/api/{}'
@property
def _error_template(self):
......@@ -213,16 +214,42 @@ class WebarchivSession:
self._display_http_error(e)
print('Error:'.format(query_string))
def histogram_search(self, query_string, interval_=3, from_=None, to_=None):
    """
    Start a fulltext histogram search in the Webarchive.

    Sends a GET request to the '/search/fulltext/histogram' operation and
    hands the initial response to waitForResponse().

    :param query_string: String to search for (sent as the 'q' parameter)
    :param interval_: Sent as the 'interval' parameter when truthy. Default value is 3.
        NOTE(review): the unit/meaning of the interval is defined by the API - confirm.
    :param from_: Optional value sent as the 'from' parameter; omitted when falsy
    :param to_: Optional value sent as the 'to' parameter; omitted when falsy
    :return: the value returned by waitForResponse() (the caller in this file
        reads status_code and json() from it), or None if an HTTPError was raised
    """
    params = {'q': query_string}
    # Optional parameters are only transmitted when the caller supplied a truthy value.
    optional = (('interval', interval_), ('from', from_), ('to', to_))
    for key, value in optional:
        if value:
            params[key] = value
    try:
        response = self._get(op='/search/fulltext/histogram', params=params)
        return self.waitForResponse(response)
    except HTTPError as e:
        self._display_http_error(e)
        # The placeholder was missing before, so the failing query was never shown.
        print('Error: {}'.format(query_string))
        return None
if __name__ == '__main__':
    # Demo entry point: run a single histogram search and print the outcome.
    import os
    import sys

    # The API key is read from the environment (WEBARCHIV_API_KEY) so that
    # no credential is committed to the repository.
    api_key = os.environ.get('WEBARCHIV_API_KEY')
    if not api_key:
        sys.exit('Please set the WEBARCHIV_API_KEY environment variable.')
    w = WebarchivSession(api_key)

    # Alternative queries kept for manual experimentation:
    # response = w.wayback_search("http://www.onb.ac.at")
    # response = w.wayback_search("http://frauenhetz.jetzt")
    # response = w.fulltext_search("Nationalbibliothek Prunksaal Schwarzenegger")
    # response = w.wayback_search("x")
    response = w.histogram_search("Nationalbibliothek Prunksaal Schwarzenegger")

    if response is None:
        # histogram_search() yields None after it has reported an HTTPError.
        print("Error: no response received")
    elif response.status_code == 200:
        # Parse the body once and reuse it for both prints.
        result = response.json()
        print(result['total'], " Captures")
        print(result)
    else:
        print("Error ", response.status_code)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment