diff --git a/OCR_samples.ipynb b/OCR_samples.ipynb index 012c06e675b73f40d8becaa3247eea73a8c350af..2c69ba24aa999fb6f471ce629a6f7eed23455b87 100644 --- a/OCR_samples.ipynb +++ b/OCR_samples.ipynb @@ -29,14 +29,22 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "54580312-c90d-49ec-a26d-aaaad34d7409", + "execution_count": 1, + "id": "195c1f7d-9056-40f7-86cb-835d9a5052a9", "metadata": {}, "outputs": [], "source": [ "from preprocessing import *\n", - "import cv2 as cv\n", - "\n", + "import cv2 as cv" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "54580312-c90d-49ec-a26d-aaaad34d7409", + "metadata": {}, + "outputs": [], + "source": [ "img_paths = ['img/kfz18151101_00000001.jpg', 'img/kfz18700224_00000001.jpg']\n", "for path in img_paths:\n", " preprocessed_img = preprocess_pipeline(cv.imread(path), path, debug=False, debug_path='img/debug')\n", @@ -45,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 3, "id": "c47423fe-02aa-4d61-bdd5-2c589f57f5ee", "metadata": {}, "outputs": [ @@ -82,7 +90,7 @@ "" ] }, - "execution_count": 26, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -108,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 4, "id": "83847ba8-9c30-4499-9dbe-fc8552555194", "metadata": {}, "outputs": [ @@ -126,7 +134,8 @@ "paths = sorted(Path('img').glob('*.jpg'))\n", "\n", "for img in paths:\n", - " os.system(f\"tesseract -l deu+Fraktur+frk --psm 3 {img} {str(img).replace('img', 'data').replace('.jpg', '_tess')} alto txt\")" + " out_path = str(img).replace('img', 'data').replace('.jpg', '_tess')\n", + " os.system(f\"tesseract -l deu+Fraktur+frk --psm 3 {img} {out_path} alto txt\")" ] }, { @@ -139,7 +148,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 5, "id": "1ac51927-4cba-457a-a8e9-9fd9d7b52f65", "metadata": {}, "outputs": [ @@ -177,15 +186,16 @@ "import pandas as pd\n", "\n", "txtpaths = sorted(Path('data').glob('*1815*.txt'))\n", - "txts = [open(txt).read() for txt in txtpaths]\n", - "txt_df = pd.DataFrame([txts], columns=txtpaths)\n", + "strpaths = list(filter(lambda x: 'new' not in x, [str(p) for p in txtpaths]))\n", + "txts = [open(txt).read() for txt in strpaths]\n", + "txt_df = pd.DataFrame([txts], columns=strpaths)\n", "pd.set_option(\"display.max_colwidth\", None)\n", "display(HTML(txt_df.to_html().replace(\"\\\\n\",\"
\")))" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 6, "id": "b9542753-a848-421e-81f3-65b07f3818b4", "metadata": {}, "outputs": [ @@ -221,8 +231,9 @@ ], "source": [ "txtpaths = sorted(Path('data').glob('*1870*.txt'))\n", - "txts = [open(txt).read() for txt in txtpaths]\n", - "txt_df = pd.DataFrame([txts], columns=txtpaths)\n", + "strpaths = list(filter(lambda x: 'new' not in x, [str(p) for p in txtpaths]))\n", + "txts = [open(txt).read() for txt in strpaths]\n", + "txt_df = pd.DataFrame([txts], columns=strpaths)\n", "pd.set_option(\"display.max_colwidth\", None)\n", "display(HTML(txt_df.to_html().replace(\"\\\\n\",\"
\")))" ] @@ -244,7 +255,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 7, "id": "cc69ad26-523f-450a-aabd-3dd30ab748dd", "metadata": {}, "outputs": [ @@ -266,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 8, "id": "22f88237-22da-4fe1-8546-86ee4070f743", "metadata": {}, "outputs": [ diff --git a/ONB_IIIF_API.ipynb b/ONB_IIIF_API.ipynb index 6c56fbb40cbde9b20eb42484fddfb556c606b444..2360b92be39a9ec15a99b300be0edcd0e44ddc3d 100644 --- a/ONB_IIIF_API.ipynb +++ b/ONB_IIIF_API.ipynb @@ -7,13 +7,13 @@ "source": [ "# ONB's IIIF API (iiif.onb.ac.at)\n", "\n", - "Available at https://iiif.onb.ac.at/, used to access copyright-free images and metadata for digitized objects of the Austrian National Library from more than 140 years ago. Newspapers from ANNO are available as IIIF collections, using their 3-letter id. This id can be obtained either via the ANNO portal or via ONB Labs metadata at https://labs.onb.ac.at/datasets/anno.\n", + "Available at https://iiif.onb.ac.at/ (follows IIIF 2.1), used to access copyright-free images and metadata for digitized objects of the Austrian National Library from more than 140 years ago. Newspapers from ANNO are available as IIIF collections, using their 3-letter id. This id can be obtained either via the [ANNO portal](https://anno.onb.ac.at/) or via ONB Labs metadata at https://labs.onb.ac.at/datasets/anno.\n", "\n", "Metadata are served via IIIF Presentation API (see https://iiif.io/api/presentation/)\n", "\n", "Images are served via IIIF Image API (see https://iiif.io/api/image/)\n", "\n", - "Future development: ONB API for *all* digitized objects in development, alpha version accessible under https://api.onb.ac.at\n", + "Future development: ONB API for *all* digitized objects in development, alpha version accessible under https://api.onb.ac.at (follows IIIF 3.0)\n", "\n", "## Example: Klagenfurter Zeitung with ANNO id 'kfz'\n", "\n", @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "id": "e008f11b-bbbd-483e-a82a-b4f21220c134", "metadata": {}, "outputs": [ @@ -117,7 +117,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 2, "id": "bdb22c9b-3699-47b1-ba87-99501db09ccd", "metadata": {}, "outputs": [ @@ -207,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "a4917b5e-a710-4ebd-a24e-940bafe7690c", "metadata": {}, "outputs": [], @@ -248,7 +248,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "7838094f-145c-4b78-98d2-49da93d456c5", "metadata": {}, "outputs": [], @@ -260,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "id": "84572deb-3fd8-4cc0-a88c-e9bddee40b27", "metadata": {}, "outputs": [ @@ -282,7 +282,7 @@ "" ] }, - "execution_count": 3, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -305,7 +305,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "id": "ec71ace8-8217-494e-99e4-1d9f0a3bf2d8", "metadata": {}, "outputs": [], @@ -318,7 +318,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "id": "c60fe0f1-30e2-4c7c-9d5c-5f42c09d9ed2", "metadata": {}, "outputs": [ @@ -366,7 +366,6 @@ "source": [ "import os\n", "from lxml import etree\n", - "import glob\n", "\n", "ns = {\n", " 'alto': 'http://www.loc.gov/standards/alto/ns-v2#'\n", @@ -429,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "id": "a7d55e0a-57c9-4b32-b3a8-22673889ace0", "metadata": {}, "outputs": [], @@ -442,7 +441,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "id": "a43a9f90-bb24-4131-993b-d6e9065a00ca", "metadata": {}, "outputs": [ @@ -497,7 +496,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "fcabce41-a3fe-4e5d-aeb1-3c9022e8a62b", "metadata": {}, "outputs": [ @@ -519,7 +518,7 @@ "" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" }