diff --git a/3.3 - Text - Download OCR Text.ipynb b/3.3 - Text - Download OCR Text.ipynb index ccc08aa9879f4069df54a60399b1140d8d78231b..4f24bb4c456eef25073f5ff3157f3169611aabbf 100644 --- a/3.3 - Text - Download OCR Text.ipynb +++ b/3.3 - Text - Download OCR Text.ipynb @@ -2,7 +2,11 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, "source": [ "# 3.3 - Text - Download OCR Text\n", "\n", @@ -15,7 +19,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, "source": [ "In order to get to this text, we have to\n", "\n", @@ -27,14 +35,22 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, "source": [ "### Find a Newspaper Issue" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "" + } + }, "source": [ "Let's take a look at the [ONB Labs' historic newspapers](https://labs.onb.ac.at/en/dataset/anno/)" ] @@ -42,7 +58,11 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, "outputs": [], "source": [ "import pandas as pd\n", @@ -53,7 +73,11 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, "outputs": [ { "data": { @@ -101,180 +125,180 @@ " \n", "
\n", "Image courtesy of https://github.com/IIIF/training, CC-BY 4.0
An ALTO file organises its content into <Description>, <Styles> and <Layout> sections; the <Page> elements inside <Layout> hold the recognized text.

import requests
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import json
Set the SPARQL endpoint:
+anno_lod_endpoint = "https://lod.onb.ac.at/sparql/anno"
+
Methods to query the endpoint and build the dataframe:
+def get_sparql_result(service, query):
+ sparql = SPARQLWrapper(service)
+ sparql.setQuery(query)
+ sparql.setReturnFormat(JSON)
+ return sparql.query()
+
+def get_sparql_dataframe(service, query):
+ result = get_sparql_result(service, query)
+
+ processed_results = result.convert()
+ cols = processed_results['head']['vars']
+
+ out = []
+ for row in processed_results['results']['bindings']:
+ item = []
+ for c in cols:
+ item.append(row.get(c, {}).get('value'))
+ out.append(item)
+
+ return pd.DataFrame(out, columns=cols)
+
Select all newspapers and periodicals with the subject heading Statistik:
+query = '''
+PREFIX dc: <http://purl.org/dc/elements/1.1/>
+PREFIX edm: <http://www.europeana.eu/schemas/edm/>
+PREFIX dcterms: <http://purl.org/dc/terms/>
+SELECT ?title ?subjectURI ?manifest
+WHERE {?subjectURI dc:subject <http://d-nb.info/gnd/4056995-0> .
+ ?subjectURI dc:title ?title .
+ ?subjectURI edm:isShownBy ?firstpage .
+ ?subjectURI edm:rights <http://creativecommons.org/publicdomain/mark/1.0/> .
+ ?firstpage dcterms:isReferencedBy ?manifest
+}'''
+
Get the list of IIIF manifest URLs:
+df = get_sparql_dataframe(anno_lod_endpoint, query)
+manifests = list(df['manifest'])
+manifests
+
Function to create a SACHA Collection (https://iiif.onb.ac.at/api#_collectionspostjsonprocessor):
+def create_collection(description, list_of_manifest_ids_or_ids):
+ j = {
+ "description": description,
+ "elements": list_of_manifest_ids_or_ids
+ }
+ creation_link = 'https://iiif.onb.ac.at/presentation/collection'
+ result = requests.post(creation_link, json=j)
+ if result.status_code == 201:
+ print('SUCCESS: Create collection {}'.format(result.json()['url']))
+ print('View collection in Mirador: https://iiif.onb.ac.at/view/collection/mirador/' + result.json()['url'].split('/').pop())
+ elif result.status_code == 400:
+ print('ERROR: Request error creating collection')
+ print(result.text)
+ elif result.status_code == 500:
+ print('ERROR: Server error creating collection')
+ print(result.text)
+ else:
+ print('ERROR: General error creating collection, HTTP status = {}'.format(result.status_code))
+
Create the SACHA Collection:
+create_collection("newspaper with subject heading Statistik", manifests)
+
I want to download a bunch of small images, already scaled down for my CNN
+https://labs.onb.ac.at/en/dataset/akon/
+https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/akon_postcards_public_domain.csv.bz2
Let's say you have a bunch of old-timey scenery photographs, and you want to extract all images containing mountains, why not. And, because you can, you want an AI to do all the dirty work for you.
What does that have to do with this workshop?
+You can use the historic postcards from the ONB Labs as training data for your AI.
Disclaimer: The AI part is beyond the scope of this notebook, and would blow up the size of the venv considerably.
+If you want instructions on actually performing the training, take a look at
One way to do it: download a VGG16 network that's pre-trained on ImageNet, remove the last layer (the actual classifier), add your own output layer with 2 outputs ('mountain', 'no mountain') and train that one; a rough sketch follows below.
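Purely as an illustration (TensorFlow/Keras is not part of this workshop's venv), a minimal sketch of that transfer-learning setup could look like this; the ./images/mountain and ./images/nomountain folders it reads are the ones created further down in this notebook:

# Rough sketch only - assumes tensorflow/keras is installed, which this workshop does not require
import tensorflow as tf
from tensorflow.keras import layers, models

# VGG16 pre-trained on ImageNet, without its classifier head, frozen as a feature extractor
base = tf.keras.applications.VGG16(include_top=False, weights='imagenet',
                                   input_shape=(224, 224, 3), pooling='avg')
base.trainable = False

# Our own head: 2 classes, 'mountain' and 'nomountain'
model = models.Sequential([base, layers.Dense(2, activation='softmax')])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Labels are inferred from the subdirectory names; input preprocessing/augmentation omitted for brevity
train_ds = tf.keras.utils.image_dataset_from_directory('./images', image_size=(224, 224))
model.fit(train_ds, epochs=5)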
+Now back to the show.
What do we have to do?
+Download the metadata set from the ONB Labs
+import pandas as pd
+
+# Let pandas show all available columns
+pd.set_option('display.max_columns', 50)
+# Pandas can read data directly from web links, even compressed files
+meta = pd.read_csv('https://labs.onb.ac.at/gitlab/labs-team/' \
+ 'raw-metadata/raw/master/akon_postcards_public_domain.csv.bz2', compression='bz2')
+
meta.sample(6)
+
Ok, we have metadata. And look, there's a column mountain:
+meta.sample(5)[['akon_id', 'mountain']]
+
Later, we'll split the dataset in two using the data in this column.
The SACHA project provides an API for accessing digitized objects of the Austrian National Library via IIIF. The online documentation for the API is here: https://iiif.onb.ac.at/api.
We're especially interested in the manifest endpoint (https://iiif.onb.ac.at/api#_manifestrequestprocessor):
GET /presentation/{projectName}/{id}/manifest

The projectName is AKON ('AnsichtsKarten ONline'), the id is the akon_id.
def akon_id_to_manifest_link(akon_id):
+ return f'https://iiif.onb.ac.at/presentation/AKON/{akon_id}/manifest'
+
akon_id_to_manifest_link('AK024_176')
+
Let's test the link
+import requests
+
+r = requests.get(akon_id_to_manifest_link('AK024_176'))
+r.json()
+
The manifest link seems to work. Let's add manifest links for all postcards to the dataframe:
+meta['manifest_link'] = meta['akon_id'].apply(akon_id_to_manifest_link)
+
meta.sample(6)[['akon_id', 'manifest_link']]
+
Let's take a look at that manifest again:
+r = requests.get(akon_id_to_manifest_link('AK024_176'))
+r.json()
+
We need to collect all @ids from all resources from all images from all canvases.
That's tedious by hand. We'll use jsonpath-ng:
from jsonpath_ng import jsonpath, parse
+
+image_id_jp = parse('$.sequences[*].canvases[*].images[*].resource.@id')
+
[match.value for match in image_id_jp.find(r.json())]
+
All of this in one function:
image_id_jp = parse('$.sequences[*].canvases[*].images[*].resource.@id')

def image_links_for_manifest_link(manifest_link):
    r = requests.get(manifest_link)
    try:
        manifest_json = r.json()
    except ValueError:
        # default to an empty dict on missing/invalid JSON - makes batch processing easier in pandas
        manifest_json = {}
    image_links = [match.value for match in image_id_jp.find(manifest_json)]
    return image_links
+
Let's test it:
+random_akon_id = meta.sample().iloc[0]['akon_id']
+manifest_link = akon_id_to_manifest_link(random_akon_id)
+image_links_for_manifest_link(manifest_link)
+
Looking good.
Now let's add the image links to the dataframe...
+...actually, let's not do that now, because it takes a while (upwards of 10 minutes). Let's cheat instead, skip this step and load the resulting dataframe directly.
+# %%time
+# meta['image_links'] = meta['manifest_link'].apply(image_links_for_manifest_link)
+
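For reference, the cached file we're about to load could have been produced with something along these lines (only after actually running the slow apply above; index=False is just a choice here):

# Hypothetical: write the dataframe, including the new image_links column, back to a compressed CSV
meta.to_csv('postcards_with_image_links.csv.bz2', index=False, compression='bz2')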
import json
+
+def load_json(s):
+    try:
+        # The CSV stores each link list as a Python-style literal with single quotes
+        return json.loads(s.replace("'", '"'))
+    except ValueError:
+        return []
+
+meta = pd.read_csv('postcards_with_image_links.csv.bz2', compression='bz2', converters={
+ 'image_links': load_json
+})
+
meta.sample(10)
+
We'll split the dataframe into two: One with mountains, one without.
+nomountain = meta[meta['mountain'].isnull()]
+mountain = meta[~meta['mountain'].isnull()]
+
len(meta), len(nomountain), len(mountain)
+
Yeah, that adds up.
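If you'd rather not eyeball it, the same check as a one-liner:

assert len(mountain) + len(nomountain) == len(meta)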
Ok, so what's left to do?
+VGG16 and VGG19 expect 224x224 pixel RGB images.
Luckily, IIIF allows us to request images that are already resized to our specifications. That saves bandwidth, time and code complexity.
According to the standard, we can use the size parameter to resize the image to exactly the dimensions we need.
The links, before and after, would be:
+https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/full/0/native.jpg
https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/224,224/0/native.jpg
Let's try it:
+r = requests.get('https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/224,224/0/native.jpg')
+
from IPython.display import display, Image
+
display(Image(r.content))
+
That looks about right.
Download to file:

+import shutil
+
+def download_to_file(url, filename):
+ with requests.get(url, stream=True) as r:
+ with open(filename, 'wb') as fh:
+ shutil.copyfileobj(r.raw, fh)
+
+def sized_link(iiif_url, size='224,224'):
+    # In an IIIF Image API URL (.../{region}/{size}/{rotation}/{quality}.{format})
+    # the size is the third-to-last path segment
+    frags = iiif_url.split('/')
+    frags[-3] = size
+    return '/'.join(frags)
+
Test that:
+link = sized_link('https://iiif.onb.ac.at/images/AKON/AK024_176/176/full/full/0/native.jpg')
+download_to_file(link, 'testimg.jpg')
+
with open('testimg.jpg', 'rb') as fh:
+ display(Image(fh.read()))
+
Create directories:
+import os
+
+# makedirs with exist_ok avoids errors when re-running the notebook
+os.makedirs('./images/mountain', exist_ok=True)
+os.makedirs('./images/nomountain', exist_ok=True)
+
Now let's download!
+# For this demonstration we'll just take 10 images each
+for idx, row in mountain.sample(10).iterrows():
+ akon_id = row['akon_id']
+ for n, link in enumerate(row['image_links']):
+ small_image_link = sized_link(link)
+ file_name = f'./images/mountain/{akon_id}_{n}.jpg'
+ download_to_file(small_image_link, file_name)
+ print('.', end='')
+for idx, row in nomountain.sample(10).iterrows():
+ akon_id = row['akon_id']
+ for n, link in enumerate(row['image_links']):
+ small_image_link = sized_link(link)
+ file_name = f'./images/nomountain/{akon_id}_{n}.jpg'
+ download_to_file(small_image_link, file_name)
+ print('.', end='')
+
I need loads of text from old newspapers, preferably with loads of errors due to bad OCR.
In order to get to this text, we have to

1. find a newspaper issue,
2. request its IIIF manifest,
3. collect the links to the ALTO XML files from the manifest,
4. download the ALTO files, and
5. extract the plain text from the ALTO XML.
+Let's take a look at the ONB Labs' historic newspapers
+import pandas as pd
+
+meta = pd.read_csv('https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/anno_labs_issues.csv.bz2', compression='bz2')
+
meta.sample(10)
+
Let's go with the Ost-Deutsche Post issue from the 30th of July 1863
+manifest_id = 'ode18630730'
+
If we look at the SACHA API description, we see that the link for the IIIF manifest has to look like this:
+http://iiif.onb.ac.at/presentation/ANNO/ode18630730/manifest
import requests
+
r = requests.get(f'http://iiif.onb.ac.at/presentation/ANNO/{manifest_id}/manifest')
+
r.json()
+
There's a lot of information in there. We need the info blocks with links to ALTO-XML resources.
+Let's use jsonpath-ng for that.
+from jsonpath_ng import parse
+
def jp(http_response, parser):
+ return [match.value for match in parser.find(http_response.json())]
+
resource_parser = parse('$.sequences[*].canvases[*].otherContent[*].resources')
+
jp(r, resource_parser)
+
Not quite there yet.
+all_resources = parse('$.sequences[*].canvases[*].otherContent[*].resources[*].resource')
+
jp(r, all_resources)
+
Filter just the ones with format application/xml+alto, and from those keep only the @id:
ids = [d['@id'] for d in jp(r, all_resources) if d['format'] == 'application/xml+alto']
+
ids
+
alto_storage = {}
+
+for xml_link in ids:
+ r = requests.get(xml_link)
+ if r.ok:
+ alto_storage[xml_link] = r.text
+
alto_storage
+
r
+
r.ok
+
Uh oh.
+import io
+import alto_tools
+
+def alto_extract_text_lines(xml, xmlns):
+    text_lines = []
+    nsdict = {'alto': xmlns}
+    # Every TextLine holds String elements whose CONTENT attributes are the recognized words
+    for text_line in xml.iterfind('.//alto:TextLine', nsdict):
+        words = [string.attrib.get('CONTENT') for string in text_line.findall('alto:String', nsdict)]
+        text_lines.append(' '.join(words))
+    return '\n'.join(text_lines)
+
+def alto_to_text(raw_alto_text):
+    # alto_tools.alto_parse works on a file(-like) object, so wrap the raw XML string
+    alto, xml, xmlns = alto_tools.alto_parse(io.StringIO(raw_alto_text))
+    return alto_extract_text_lines(xml, xmlns)
+
alto_url = f'http://iiif.onb.ac.at/presentation/ANNO/{manifest_id}/resource/00000002.xml'
print(alto_to_text(requests.get(alto_url).text))
+
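If you want to keep the extracted text around, a small follow-up (assuming the alto_storage dict from above was filled successfully; the ./ocr_text folder name is arbitrary) could write one plain-text file per downloaded ALTO page:

import os

os.makedirs('./ocr_text', exist_ok=True)
for xml_link, raw_alto in alto_storage.items():
    # Derive a file name from the ALTO resource link, e.g. .../00000002.xml -> 00000002.txt
    page_name = xml_link.split('/')[-1].replace('.xml', '.txt')
    with open(f'./ocr_text/{page_name}', 'w', encoding='utf-8') as fh:
        fh.write(alto_to_text(raw_alto))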