added functionality to add iiif annotations to manifest

f01e549e · onb1259 · 556b4f9a · f01e549e · f01e549e
Commit f01e549e authored 1 year ago by onb1259
--- a/extract_figures.ipynb
+++ b/extract_figures.ipynb
@@ -21,12 +21,13 @@
   "outputs": [],
   "source": [
    "import os, shutil\n",
+    "import json\n",
    "from zipapp import zipfile\n",
    "from PIL import Image\n",
    "from ultralytics import YOLO\n",
    "from IPython.display import display, FileLink, HTML\n",
    "\n",
-    "from iiif_utils import get_imgurls_from_manifesturl, create_paths_from_iiifurls, download_images_multithreded\n",
+    "from iiif_utils import get_imgurls_from_manifesturl, create_paths_from_iiifurls, download_images_multithreded, add_annotations, get_json_by_url\n",
    "\n",
    "# Url of iiif manifest\n",
    "input_manifest = 'https://iiif.onb.ac.at/presentation/ABO/+Z105572305/manifest'"
@@ -69,17 +70,23 @@
    "os.mkdir(RESULTS_DIR)\n",
    "model = YOLO('model_extract_figures.pt')\n",
    "counter = 0\n",
-    "for path in imgpaths:\n",
+    "list_of_bbs = [None] * len(imgpaths)\n",
+    "for j, path in enumerate(imgpaths):\n",
    "    result = model.predict(path, verbose=False)\n",
    "    img = Image.open(path)\n",
    "    for i, bb in enumerate(result[0].boxes.xyxy.tolist()):\n",
    "        #print(f'Figure detected in: {path}')\n",
    "        counter += 1\n",
    "        bb = [int(e) for e in bb]\n",
+    "        list_of_bbs[j] = bb\n",
    "        img_cropped = img.crop(bb)\n",
    "        path_cropped = os.path.join(RESULTS_DIR, os.path.basename(path.rsplit('.', 1)[0] + '_crop' + str(i) + '.' + path.rsplit('.', 1)[1]))\n",
    "        img_cropped.save(path_cropped)\n",
-    "print(f'Total number of extracted figures: {counter}')"
+    "print(f'Total number of extracted figures: {counter}')\n",
+    "\n",
+    "manifest_annotated = add_annotations(list_of_bbs, get_json_by_url(input_manifest))\n",
+    "with open(os.path.join(RESULTS_DIR, 'annotated_manifest.json'), 'w+') as f:\n",
+    "    json.dump(manifest_annotated, f, indent=2)"
   ]
  },
  {

 %% Cell type:markdown id:82ab6471 tags:
 # Extract figures
 This notebook uses a YOLOv8 model trained on five annotated ABO books (1700 pages) to extract figures from them.
 After giving the IIIF manifest URL of the book from which you want to extract images, the book pages get downloaded, the figure recognition model gets applied, and the resulting bounding boxes get extracted and can be downloaded as a zip file.
 %% Cell type:code id:e1bcdac6 tags:
 ``` python
 import os, shutil
+import json
 from zipapp import zipfile
 from PIL import Image
 from ultralytics import YOLO
 from IPython.display import display, FileLink, HTML
-from iiif_utils import get_imgurls_from_manifesturl, create_paths_from_iiifurls, download_images_multithreded
+from iiif_utils import get_imgurls_from_manifesturl, create_paths_from_iiifurls, download_images_multithreded, add_annotations, get_json_by_url
 # Url of iiif manifest
 input_manifest = 'https://iiif.onb.ac.at/presentation/ABO/+Z105572305/manifest'
 ```
 %% Cell type:code id:65afd0ba tags:
 ``` python
 # Download the book pages
 IMAGES_DIR = 'images'
 imgurls = get_imgurls_from_manifesturl(input_manifest)
 imgpaths = create_paths_from_iiifurls(imgurls)
 if os.path.isdir(IMAGES_DIR):
    shutil.rmtree(IMAGES_DIR)
 os.mkdir(IMAGES_DIR)
 imgpaths = [os.path.join(IMAGES_DIR, e) for e in imgpaths]
 download_images_multithreded(imgurls, imgpaths, 10)
 print(f'Total number of downloaded pages: {len(os.listdir(IMAGES_DIR))}')
 ```
 %% Cell type:code id:c6b37e20 tags:
 ``` python
 # Apply figure extraction model
 RESULTS_DIR = 'extracted_figures'
 if os.path.isdir(RESULTS_DIR):
    shutil.rmtree(RESULTS_DIR)
 os.mkdir(RESULTS_DIR)
 model = YOLO('model_extract_figures.pt')
 counter = 0
-for path in imgpaths:
+list_of_bbs = [None] * len(imgpaths)
+for j, path in enumerate(imgpaths):
    result = model.predict(path, verbose=False)
    img = Image.open(path)
    for i, bb in enumerate(result[0].boxes.xyxy.tolist()):
        #print(f'Figure detected in: {path}')
        counter += 1
        bb = [int(e) for e in bb]
+        list_of_bbs[j] = bb
        img_cropped = img.crop(bb)
        path_cropped = os.path.join(RESULTS_DIR, os.path.basename(path.rsplit('.', 1)[0] + '_crop' + str(i) + '.' + path.rsplit('.', 1)[1]))
        img_cropped.save(path_cropped)
 print(f'Total number of extracted figures: {counter}')
+manifest_annotated = add_annotations(list_of_bbs, get_json_by_url(input_manifest))
+with open(os.path.join(RESULTS_DIR, 'annotated_manifest.json'), 'w+') as f:
+    json.dump(manifest_annotated, f, indent=2)
 ```
 %% Cell type:code id:a997833e tags:
 ``` python
 # Create a zip file with extracted figures and show them
 RESULT_ZIP = 'extracted_figures.zip'
 with zipfile.ZipFile(RESULT_ZIP, 'w') as myzip:
    for f in os.listdir(RESULTS_DIR):
        myzip.write(os.path.join(RESULTS_DIR, f))
 zipped_extracted_figures = FileLink(RESULT_ZIP, result_html_prefix="Click here to download zipped extracted figures: ")
 display(zipped_extracted_figures)
 def show_images(filenames, width, margin):
    lis = [f'<img src="{name}" style="display:inline;margin:{margin}px" width="{width}"/>' for name in filenames]
    html = ''.join(lis)
    display(HTML(html))
 show_images([os.path.join(RESULTS_DIR, e) for e in sorted(os.listdir(RESULTS_DIR))], 150, 1)
 ```

--- a/iiif_utils.py
+++ b/iiif_utils.py
@@ -56,14 +56,37 @@ def create_paths_from_iiifurls(img_urls, base_path=''):
    res = [os.path.join(base_path, el) for el in res]
    return res
def _figure_annotation(image_id, box):
    """Build one ``oa:Annotation`` marking *box* on the image *image_id*.

    *box* holds corner coordinates ``(x_min, y_min, x_max, y_max)``; the
    fragment selector needs ``x,y,width,height``, so the extents are derived
    by subtracting the corners.
    """
    x_min, y_min, x_max, y_max = box[:4]
    return {
        "@type": "oa:Annotation",
        "motivation": "sc:painting",
        "resource": {
            "@type": "oa:Choice",
            "default": {
                "@type": "oa:SpecificResource",
                "selector": {
                    "@type": "oa:FragmentSelector",
                    "value": f"xywh={x_min},{y_min},{x_max - x_min},{y_max - y_min}",
                },
                "style": "rect",
                "label": "Detected figure",
            },
        },
        "on": f"{image_id}",
    }


def add_annotations(list_of_boundingboxes, manifest):
    """Attach detected-figure annotations to the canvases of a manifest.

    Canvases of ``manifest['sequences'][0]`` are paired positionally with the
    entries of *list_of_boundingboxes*. For every canvas whose entry is not
    empty, an ``sc:AnnotationList`` is appended to the canvas's
    ``otherContent`` list (the list is created if the canvas has none).

    Args:
        list_of_boundingboxes: One entry per canvas, in canvas order. An
            entry is ``None``/empty (canvas is skipped), a single box
            ``[x_min, y_min, x_max, y_max]``, or a list of such boxes.
            Canvases beyond the end of this list are left untouched.
        manifest: Parsed manifest dict; it is modified in place.

    Returns:
        The same *manifest* object, with annotations added.
    """
    canvases = manifest['sequences'][0]['canvases']
    # zip() stops at the shorter sequence, so a short box list simply leaves
    # the trailing canvases without annotations instead of raising.
    for canvas, entry in zip(canvases, list_of_boundingboxes):
        # No detection on this page.
        if not entry:
            continue
        # Accept a single box as well as several boxes for one page.
        boxes = entry if isinstance(entry[0], (list, tuple)) else [entry]
        image_id = canvas['images'][0]['@id']
        annotation_list = {
            "@context": "http://iiif.io/api/presentation/2/context.json",
            "@type": "sc:AnnotationList",
            "@id": f"{image_id}/annotations",
            "resources": [_figure_annotation(image_id, box) for box in boxes],
        }
        # Not every canvas carries an 'otherContent' key.
        canvas.setdefault('otherContent', []).append(annotation_list)
    return manifest
-# collection_url = 'https://iiif.onb.ac.at/presentation/collection/labs_botanical_illustrations'
-# urls = get_imgurls_from_collectionurl(collection_url)
-# paths = create_paths_from_iiifurls(urls, 'downloads')
-# download_images_multithreded(urls, paths)
-# collection_url = 'https://iiif.onb.ac.at/presentation/collection/apz_1841'
-# manifest_url = 'https://iiif.onb.ac.at/presentation/ANNO/apz18411229/manifest/'
-# urls = get_imgurls_from_manifesturl(manifest_url)
-# paths = create_paths_from_iiifurls(urls, 'downloads')
-# download_images_multithreded(urls, paths)