Skip to content
Snippets Groups Projects
Commit f01e549e authored by onb1259's avatar onb1259
Browse files

added functionality to add iiif annotations to manifest

parent 556b4f9a
Branches
No related tags found
No related merge requests found
%% Cell type:markdown id:82ab6471 tags: %% Cell type:markdown id:82ab6471 tags:
# Extract figures # Extract figures
This notebook uses a YOLOv8 model trained on five annotated ABO books (1700 pages) to extract figures from them. This notebook uses a YOLOv8 model trained on five annotated ABO books (1700 pages) to extract figures from them.
After giving the IIIF manifest URL of the book from which you want to extract images, the book pages get downloaded, the figure recognition model gets applied and the resulting bounding boxes get extracted and can be downloaded as a zip file. After giving the IIIF manifest URL of the book from which you want to extract images, the book pages get downloaded, the figure recognition model gets applied and the resulting bounding boxes get extracted and can be downloaded as a zip file.
%% Cell type:code id:e1bcdac6 tags: %% Cell type:code id:e1bcdac6 tags:
``` python ``` python
import os, shutil import os, shutil
import json
from zipapp import zipfile from zipapp import zipfile
from PIL import Image from PIL import Image
from ultralytics import YOLO from ultralytics import YOLO
from IPython.display import display, FileLink, HTML from IPython.display import display, FileLink, HTML
from iiif_utils import get_imgurls_from_manifesturl, create_paths_from_iiifurls, download_images_multithreded from iiif_utils import get_imgurls_from_manifesturl, create_paths_from_iiifurls, download_images_multithreded, add_annotations, get_json_by_url
# Url of iiif manifest # Url of iiif manifest
input_manifest = 'https://iiif.onb.ac.at/presentation/ABO/+Z105572305/manifest' input_manifest = 'https://iiif.onb.ac.at/presentation/ABO/+Z105572305/manifest'
``` ```
%% Cell type:code id:65afd0ba tags: %% Cell type:code id:65afd0ba tags:
``` python ``` python
# Download the book pages # Download the book pages
IMAGES_DIR = 'images' IMAGES_DIR = 'images'
imgurls = get_imgurls_from_manifesturl(input_manifest) imgurls = get_imgurls_from_manifesturl(input_manifest)
imgpaths = create_paths_from_iiifurls(imgurls) imgpaths = create_paths_from_iiifurls(imgurls)
if os.path.isdir(IMAGES_DIR): if os.path.isdir(IMAGES_DIR):
shutil.rmtree(IMAGES_DIR) shutil.rmtree(IMAGES_DIR)
os.mkdir(IMAGES_DIR) os.mkdir(IMAGES_DIR)
imgpaths = [os.path.join(IMAGES_DIR, e) for e in imgpaths] imgpaths = [os.path.join(IMAGES_DIR, e) for e in imgpaths]
download_images_multithreded(imgurls, imgpaths, 10) download_images_multithreded(imgurls, imgpaths, 10)
print(f'Total number of downloaded pages: {len(os.listdir(IMAGES_DIR))}') print(f'Total number of downloaded pages: {len(os.listdir(IMAGES_DIR))}')
``` ```
%% Cell type:code id:c6b37e20 tags: %% Cell type:code id:c6b37e20 tags:
``` python ``` python
# Apply figure extraction model # Apply figure extraction model
RESULTS_DIR = 'extracted_figures' RESULTS_DIR = 'extracted_figures'
if os.path.isdir(RESULTS_DIR): if os.path.isdir(RESULTS_DIR):
shutil.rmtree(RESULTS_DIR) shutil.rmtree(RESULTS_DIR)
os.mkdir(RESULTS_DIR) os.mkdir(RESULTS_DIR)
model = YOLO('model_extract_figures.pt') model = YOLO('model_extract_figures.pt')
counter = 0 counter = 0
for path in imgpaths: list_of_bbs = [None] * len(imgpaths)
for j, path in enumerate(imgpaths):
result = model.predict(path, verbose=False) result = model.predict(path, verbose=False)
img = Image.open(path) img = Image.open(path)
for i, bb in enumerate(result[0].boxes.xyxy.tolist()): for i, bb in enumerate(result[0].boxes.xyxy.tolist()):
#print(f'Figure detected in: {path}') #print(f'Figure detected in: {path}')
counter += 1 counter += 1
bb = [int(e) for e in bb] bb = [int(e) for e in bb]
list_of_bbs[j] = bb
img_cropped = img.crop(bb) img_cropped = img.crop(bb)
path_cropped = os.path.join(RESULTS_DIR, os.path.basename(path.rsplit('.', 1)[0] + '_crop' + str(i) + '.' + path.rsplit('.', 1)[1])) path_cropped = os.path.join(RESULTS_DIR, os.path.basename(path.rsplit('.', 1)[0] + '_crop' + str(i) + '.' + path.rsplit('.', 1)[1]))
img_cropped.save(path_cropped) img_cropped.save(path_cropped)
print(f'Total number of extracted figures: {counter}') print(f'Total number of extracted figures: {counter}')
manifest_annotated = add_annotations(list_of_bbs, get_json_by_url(input_manifest))
with open(os.path.join(RESULTS_DIR, 'annotated_manifest.json'), 'w+') as f:
json.dump(manifest_annotated, f, indent=2)
``` ```
%% Cell type:code id:a997833e tags: %% Cell type:code id:a997833e tags:
``` python ``` python
# Create a zip file with extracted figures and show them # Create a zip file with extracted figures and show them
RESULT_ZIP = 'extracted_figures.zip' RESULT_ZIP = 'extracted_figures.zip'
with zipfile.ZipFile(RESULT_ZIP, 'w') as myzip: with zipfile.ZipFile(RESULT_ZIP, 'w') as myzip:
for f in os.listdir(RESULTS_DIR): for f in os.listdir(RESULTS_DIR):
myzip.write(os.path.join(RESULTS_DIR, f)) myzip.write(os.path.join(RESULTS_DIR, f))
zipped_extracted_figures = FileLink(RESULT_ZIP, result_html_prefix="Click here to download zipped extracted figures: ") zipped_extracted_figures = FileLink(RESULT_ZIP, result_html_prefix="Click here to download zipped extracted figures: ")
display(zipped_extracted_figures) display(zipped_extracted_figures)
def show_images(filenames, width, margin): def show_images(filenames, width, margin):
lis = [f'<img src="{name}" style="display:inline;margin:{margin}px" width="{width}"/>' for name in filenames] lis = [f'<img src="{name}" style="display:inline;margin:{margin}px" width="{width}"/>' for name in filenames]
html = ''.join(lis) html = ''.join(lis)
display(HTML(html)) display(HTML(html))
show_images([os.path.join(RESULTS_DIR, e) for e in sorted(os.listdir(RESULTS_DIR))], 150, 1) show_images([os.path.join(RESULTS_DIR, e) for e in sorted(os.listdir(RESULTS_DIR))], 150, 1)
``` ```
......
...@@ -56,14 +56,37 @@ def create_paths_from_iiifurls(img_urls, base_path=''): ...@@ -56,14 +56,37 @@ def create_paths_from_iiifurls(img_urls, base_path=''):
res = [os.path.join(base_path, el) for el in res] res = [os.path.join(base_path, el) for el in res]
return res return res
def _boxes_for_page(entry):
    """Normalize one per-page entry into a list of ``[x1, y1, x2, y2]`` boxes."""
    if not entry:
        return []
    # A nested sequence means several boxes were supplied for this page.
    if isinstance(entry[0], (list, tuple)):
        return [box for box in entry if box]
    return [entry]


def add_annotations(list_of_boundingboxes, manifest):
    """Attach detected-figure annotation lists to the canvases of a manifest.

    Args:
        list_of_boundingboxes: One entry per canvas, in canvas order. Each
            entry is ``None``/empty (no figure on that page), a single box
            ``[x1, y1, x2, y2]`` in corner coordinates, or a list of such
            boxes. If the list is shorter than the canvas list, the
            remaining canvases are left untouched.
        manifest: Parsed manifest dict; canvases are read from
            ``manifest['sequences'][0]['canvases']``.

    Returns:
        The same ``manifest`` object, modified in place: every canvas with
        at least one box gets an ``sc:AnnotationList`` appended to its
        ``otherContent`` list (created when missing).

    Raises:
        KeyError, IndexError: If the manifest lacks the expected
            ``sequences``/``canvases`` structure, or an annotated canvas has
            no ``images[0]['@id']``.
    """
    canvases = manifest['sequences'][0]['canvases']
    for canvas, entry in zip(canvases, list_of_boundingboxes):
        boxes = _boxes_for_page(entry)
        if not boxes:
            continue
        # Only look up the image id for pages that actually get annotated.
        image_id = canvas['images'][0]['@id']
        resources = []
        for x1, y1, x2, y2 in boxes:
            resources.append({
                "@type": "oa:Annotation",
                "motivation": "sc:painting",
                "resource": {
                    "@type": "oa:Choice",
                    "default": {
                        "@type": "oa:SpecificResource",
                        "selector": {
                            "@type": "oa:FragmentSelector",
                            # xywh needs width/height, not the second corner.
                            "value": f"xywh={x1},{y1},{x2 - x1},{y2 - y1}"
                        },
                        "style": "rect",
                        "label": "Detected figure"
                    }
                },
                "on": f"{image_id}"
            })
        annotation = {
            "@context": "http://iiif.io/api/presentation/2/context.json",
            "@type": "sc:AnnotationList",
            "@id": f"{image_id}/annotations",
            "resources": resources
        }
        # Canvases are not guaranteed to carry an 'otherContent' list yet.
        canvas.setdefault('otherContent', []).append(annotation)
    return manifest
# collection_url = 'https://iiif.onb.ac.at/presentation/collection/labs_botanical_illustrations'
# urls = get_imgurls_from_collectionurl(collection_url)
# paths = create_paths_from_iiifurls(urls, 'downloads')
# download_images_multithreded(urls, paths)
# collection_url = 'https://iiif.onb.ac.at/presentation/collection/apz_1841'
# manifest_url = 'https://iiif.onb.ac.at/presentation/ANNO/apz18411229/manifest/'
# urls = get_imgurls_from_manifesturl(manifest_url)
# paths = create_paths_from_iiifurls(urls, 'downloads')
# download_images_multithreded(urls, paths)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment