Skip to content
GitLab
Explore
Sign in
aponb
webarchive-api
Compare revisions
b923bb69418f40546f40e06a6a1454a25acff8a8 to 7db1c1307c9633ca1223d96e4911cc887bca8b7e
Commits on Source (4)
save_page renaming
· 4ff4d740
onbpre
authored
May 23, 2019
4ff4d740
new methods
· 8207c2d1
onbpre
authored
May 23, 2019
8207c2d1
sample for html fragment checksumming and creating of a watchlist
· 0133bb31
onbpre
authored
May 23, 2019
0133bb31
sample fix
· 7db1c130
onbpre
authored
May 23, 2019
7db1c130
Hide whitespace changes
Inline
Side-by-side
sample8.ipynb
View file @
7db1c130
%% Cell type:markdown id: tags:
%% Cell type:code id: tags:
```
python
import
requests
import
json
```
%% Cell type:markdown id: tags:
Create a WebarchivSession object, which provides convenience methods for accessing the API with your API key.
%% Cell type:code id: tags:
```
python
from
webarchiv
import
WebarchivSession
apikey
=
'
2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c
'
w
=
WebarchivSession
(
apikey
)
```
%% Cell type:raw id: tags:
Request archiving of a web page.
%% Cell type:code id: tags:
```
python
response
=
w
.
save
P
age
(
"
http://www.onb.ac.at
"
)
response
=
w
.
save
_p
age
(
"
http://www.onb.ac.at
"
)
if
response
.
status_code
==
201
:
print
(
response
.
json
())
else
:
print
(
"
Error
"
,
response
.
status_code
)
```
%% Output
{'nomination_id':
247
, 'seed': 'http://www.onb.ac.at', 'nominationtype': 5}
{'nomination_id':
374
, 'seed': 'http://www.onb.ac.at', 'nominationtype': 5}
%% Cell type:code id: tags:
```
python
```
...
...
%% Cell type:markdown id: tags:
%% Cell type:code id: tags:
```
python
import
requests
import
json
```
%% Cell type:markdown id: tags:
Create a WebarchivSession object, which provides convenience methods for accessing the API with your API key.
%% Cell type:code id: tags:
```
python
from
webarchiv
import
WebarchivSession
apikey
=
'
2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c
'
w
=
WebarchivSession
(
apikey
)
```
%% Cell type:raw id: tags:
Request archiving of a web page.
%% Cell type:code id: tags:
```
python
response
=
w
.
save
P
age
(
"
http://www.onb.ac.at
"
)
response
=
w
.
save
_p
age
(
"
http://www.onb.ac.at
"
)
if
response
.
status_code
==
201
:
print
(
response
.
json
())
else
:
print
(
"
Error
"
,
response
.
status_code
)
```
%% Output
{'nomination_id':
247
, 'seed': 'http://www.onb.ac.at', 'nominationtype': 5}
{'nomination_id':
374
, 'seed': 'http://www.onb.ac.at', 'nominationtype': 5}
%% Cell type:code id: tags:
```
python
```
...
...
sample9.ipynb
0 → 100644
View file @
7db1c130
%% Cell type:markdown id: tags:
%% Cell type:code id: tags:
```
python
import
requests
import
json
```
%% Cell type:markdown id: tags:
Create a WebarchivSession object, which provides convenience methods for accessing the API with your API key.
%% Cell type:code id: tags:
```
python
from
webarchiv
import
WebarchivSession
apikey
=
'
2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c
'
w
=
WebarchivSession
(
apikey
)
```
%% Cell type:code id: tags:
```
python
url
=
"
http://sport.orf.at/l/stories/2003717/
"
response
=
w
.
wayback_search
(
"
http://sport.orf.at/l/stories/2003717/
"
,
"
20110101000000
"
,
"
20120401000000
"
)
```
%% Cell type:code id: tags:
```
python
if
response
.
status_code
!=
200
:
print
(
"
Error
"
,
response
.
status_code
)
```
%% Cell type:code id: tags:
```
python
# Walk the captures returned by the preceding wayback_search call and collect
# every capture whose ".odd td" fragment checksum differs from the checksum of
# the previously accepted capture. `w`, `response` and `url` come from the
# earlier notebook cells.
lastchecksum = ''
captures = []
for capture in response.json()['hits']:
    capturedate = capture['c']
    resp = w.fragment_checksum_html(url, capturedate, ".odd td", 3)
    # Decode the reply once instead of once per field.
    result = resp.json()
    if result['returncode'] != 0:
        # Check the return code before reading 'checksum' so that an
        # unsuccessful extraction is skipped without looking at its payload.
        continue
    checksum = result['checksum']
    if checksum != lastchecksum:
        print("http://wayback/web/" + capturedate + "/" + url)
        # Build the watchlist entry directly; the loop variable is no longer
        # rebound inside its own loop.
        captures.append({"url": url, "timestamp": capturedate})
        lastchecksum = checksum

if captures:
    response = w.create_watchlist(captures)
    # The recorded output shows response.json() being a plain string code.
    print("A watchlist with all captures mentioned above was generated. The code for this watchlist is " + response.json() + ".")
print("end")
```
%% Output
http://wayback/web/20110401202828/http://sport.orf.at/l/stories/2003717/
http://wayback/web/20110704202825/http://sport.orf.at/l/stories/2003717/
A watchlist with all captures mentioned above was generated. The code for this watchlist is Zp.
end
%% Cell type:markdown id: tags:
%% Cell type:code id: tags:
```
python
import
requests
import
json
```
%% Cell type:markdown id: tags:
Create a WebarchivSession object, which provides convenience methods for accessing the API with your API key.
%% Cell type:code id: tags:
```
python
from
webarchiv
import
WebarchivSession
apikey
=
'
2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c
'
w
=
WebarchivSession
(
apikey
)
```
%% Cell type:code id: tags:
```
python
url
=
"
http://sport.orf.at/l/stories/2003717/
"
response
=
w
.
wayback_search
(
"
http://sport.orf.at/l/stories/2003717/
"
,
"
20110101000000
"
,
"
20120401000000
"
)
```
%% Cell type:code id: tags:
```
python
if
response
.
status_code
!=
200
:
print
(
"
Error
"
,
response
.
status_code
)
```
%% Cell type:code id: tags:
```
python
# Walk the captures returned by the preceding wayback_search call and collect
# every capture whose ".odd td" fragment checksum differs from the checksum of
# the previously accepted capture. `w`, `response` and `url` come from the
# earlier notebook cells.
lastchecksum = ''
captures = []
for capture in response.json()['hits']:
    capturedate = capture['c']
    resp = w.fragment_checksum_html(url, capturedate, ".odd td", 3)
    # Decode the reply once instead of once per field.
    result = resp.json()
    if result['returncode'] != 0:
        # Check the return code before reading 'checksum' so that an
        # unsuccessful extraction is skipped without looking at its payload.
        continue
    checksum = result['checksum']
    if checksum != lastchecksum:
        print("http://wayback/web/" + capturedate + "/" + url)
        # Build the watchlist entry directly; the loop variable is no longer
        # rebound inside its own loop.
        captures.append({"url": url, "timestamp": capturedate})
        lastchecksum = checksum

if captures:
    response = w.create_watchlist(captures)
    # The recorded output shows response.json() being a plain string code.
    print("A watchlist with all captures mentioned above was generated. The code for this watchlist is " + response.json() + ".")
print("end")
```
%% Output
http://wayback/web/20110401202828/http://sport.orf.at/l/stories/2003717/
http://wayback/web/20110704202825/http://sport.orf.at/l/stories/2003717/
A watchlist with all captures mentioned above was generated. The code for this watchlist is Zp.
end
webarchiv.py
View file @
7db1c130
...
...
@@ -2,6 +2,7 @@ import sys
import
time
import
requests
import
hashlib
import
json
from
requests
import
HTTPError
_datetime_format_string
=
'
%Y%m%d%H%M%S
'
...
...
@@ -11,6 +12,10 @@ EXTRACTOR_TEXT = 1
EXTRACTOR_HTML
=
2
EXTRACTOR_BINARY
=
3
# Modes for TextExtractor
POSITIONLEN_MODE
=
1
POSITION_MODE
=
2
REGEX_MODE
=
3
class
SessionTimeoutError
(
Exception
):
pass
...
...
@@ -344,19 +349,13 @@ class WebarchivSession:
return
False
def save_page(self, url):
    """Ask the archive to save a single page.

    Sends ``{"url": url}`` to the ``/savepage`` operation through
    ``self._post``.

    In this span the earlier implementation (a hand-formatted JSON string
    sent with ``requests.post`` and followed by ``return r``) preceded the
    ``_post``-based one, which made the latter unreachable. Only the
    ``_post``-based implementation is kept.

    :param url: address of the page to archive.
    :return: the value returned by ``self._post``; ``None`` when an
        ``HTTPError`` was raised and handed to ``self._display_http_error``.
    """
    try:
        return self._post(op='/savepage', json={"url": url})
    except HTTPError as e:
        # Report through the shared helper; once it returns, the caller
        # receives None instead of a response object.
        self._display_http_error(e)
        return None
def
fragment_checksum_html
(
self
,
seed
,
capture
,
selector
,
occurrence
):
try
:
...
...
@@ -372,40 +371,69 @@ class WebarchivSession:
except
HTTPError
as
e
:
self
.
_display_http_error
(
e
)
def fragment_checksum_binary(self, seed, capture):
    """Request a checksum of a capture using the binary extractor.

    Posts ``seed``, ``capture`` and ``EXTRACTOR_BINARY`` to
    ``/fragment/checksum/binary``, passes the reply to ``status_query`` and
    returns whatever ``wait_for_response`` yields.

    :param seed: forwarded as the ``"seed"`` field of the request.
    :param capture: forwarded as the ``"capture"`` field of the request.
    :return: result of ``self.wait_for_response``; ``None`` when an
        ``HTTPError`` was raised and handed to ``self._display_http_error``.
    """
    payload = {
        "seed": seed,
        "capture": capture,
        "extractortype": EXTRACTOR_BINARY,
    }
    try:
        submitted = self._post(op='/fragment/checksum/binary', json=payload)
        pending = self.status_query(submitted)
        return self.wait_for_response(pending)
    except HTTPError as err:
        self._display_http_error(err)
    return None
if
__name__
==
'
__main__
'
:
# noinspection SpellCheckingInspection
w
=
WebarchivSession
(
'
2pm8i0hnmpcTK4Oj4CUeBoZd7vywrm4c
'
)
# response = w.wayback_search("http://www.onb.ac.at")
# response = w.wayback_search("http://frauenhetz.jetzt")
url
=
"
http://sport.orf.at/l/stories/2003717/
"
response
=
w
.
wayback_search
(
"
http://sport.orf.at/l/stories/2003717/
"
,
"
20110101000000
"
,
"
20120401000000
"
)
# response = w.wayback_search("x")
if
response
.
status_code
!=
200
:
print
(
"
Error
"
,
response
.
status_code
)
exit
(
1
)
print
(
response
.
json
()[
'
total
'
])
print
(
url
)
lastchecksum
=
''
for
capture
in
response
.
json
()[
'
hits
'
]:
capturedate
=
capture
[
'
c
'
]
def fragment_checksum_text_positionlen(self, seed, capture, pos, len):
    """Request a text-fragment checksum selected by position and length.

    Posts the arguments together with ``POSITIONLEN_MODE`` and
    ``EXTRACTOR_TEXT`` to ``/fragment/checksum/text``, passes the reply to
    ``status_query`` and returns whatever ``wait_for_response`` yields.

    :param seed: forwarded as the ``"seed"`` field of the request.
    :param capture: forwarded as the ``"capture"`` field of the request.
    :param pos: forwarded as the ``"pos"`` field of the request.
    :param len: forwarded as the ``"len"`` field of the request. The name
        shadows the builtin inside this method; it is kept because it is
        part of the public signature.
    :return: result of ``self.wait_for_response``; ``None`` when an
        ``HTTPError`` was raised and handed to ``self._display_http_error``.
    """
    payload = {
        "seed": seed,
        "capture": capture,
        "mode": POSITIONLEN_MODE,
        "pos": pos,
        "len": len,
        "extractortype": EXTRACTOR_TEXT,
    }
    try:
        submitted = self._post(op='/fragment/checksum/text', json=payload)
        pending = self.status_query(submitted)
        return self.wait_for_response(pending)
    except HTTPError as err:
        self._display_http_error(err)
    return None
resp
=
w
.
fragment_checksum_html
(
url
,
capturedate
,
"
.odd td
"
,
3
)
checksum
=
resp
.
json
()[
'
checksum
'
]
returncode
=
resp
.
json
()[
'
returncode
'
]
def fragment_checksum_text_position(self, seed, capture, pos):
    """Request a text-fragment checksum selected by position only.

    Posts the arguments together with ``POSITION_MODE`` and
    ``EXTRACTOR_TEXT`` to ``/fragment/checksum/text``, passes the reply to
    ``status_query`` and returns whatever ``wait_for_response`` yields.

    :param seed: forwarded as the ``"seed"`` field of the request.
    :param capture: forwarded as the ``"capture"`` field of the request.
    :param pos: forwarded as the ``"pos"`` field of the request.
    :return: result of ``self.wait_for_response``; ``None`` when an
        ``HTTPError`` was raised and handed to ``self._display_http_error``.
    """
    payload = {
        "seed": seed,
        "capture": capture,
        "mode": POSITION_MODE,
        "pos": pos,
        "extractortype": EXTRACTOR_TEXT,
    }
    try:
        submitted = self._post(op='/fragment/checksum/text', json=payload)
        pending = self.status_query(submitted)
        return self.wait_for_response(pending)
    except HTTPError as err:
        self._display_http_error(err)
    return None
if
returncode
==
2
:
continue
def fragment_checksum_text_regex(self, seed, capture, regexpattern, occurrence):
    """Request a text-fragment checksum selected by a regular expression.

    Posts the arguments together with ``REGEX_MODE`` and ``EXTRACTOR_TEXT``
    to ``/fragment/checksum/text``, passes the reply to ``status_query`` and
    returns whatever ``wait_for_response`` yields.

    :param seed: forwarded as the ``"seed"`` field of the request.
    :param capture: forwarded as the ``"capture"`` field of the request.
    :param regexpattern: forwarded as the ``"regexpattern"`` field.
    :param occurrence: forwarded as the ``"occurrence"`` field.
    :return: result of ``self.wait_for_response``; ``None`` when an
        ``HTTPError`` was raised and handed to ``self._display_http_error``.
    """
    payload = {
        "seed": seed,
        "capture": capture,
        "mode": REGEX_MODE,
        "regexpattern": regexpattern,
        "occurrence": occurrence,
        "extractortype": EXTRACTOR_TEXT,
    }
    try:
        submitted = self._post(op='/fragment/checksum/text', json=payload)
        pending = self.status_query(submitted)
        return self.wait_for_response(pending)
    except HTTPError as err:
        self._display_http_error(err)
    return None
if
checksum
!=
lastchecksum
:
print
(
resp
.
json
())
print
(
"
http://wayback/web/
"
+
capturedate
+
"
/
"
+
url
)
print
(
capturedate
+
"
"
+
checksum
)
lastchecksum
=
checksum
def create_watchlist(self, urls):
    """Create a watchlist from the given entries.

    Posts ``{"urls": urls}`` to the ``/watchlist`` operation through
    ``self._post``.

    :param urls: forwarded unchanged as the ``"urls"`` field of the request.
    :return: the value returned by ``self._post``; ``None`` when an
        ``HTTPError`` was raised and handed to ``self._display_http_error``.
    """
    payload = {"urls": urls}
    try:
        created = self._post(op='/watchlist', json=payload)
    except HTTPError as err:
        self._display_http_error(err)
        return None
    return created
print
(
"
end
"
)