import io
import urllib.request


def alto_extract_text_lines(xml, xmlns):
    """Return the text of an ALTO document, one output line per ``TextLine``.

    Args:
        xml: A parsed tree or element exposing ``iterfind`` (for example the
            second value returned by ``alto_tools.alto_parse``).
        xmlns: Namespace URI of the document; it is bound to the ``alto``
            prefix for the element lookups below.

    Returns:
        A single string. The ``CONTENT`` values of the ``String`` children of
        each ``TextLine`` are joined with one space, and the resulting lines
        are joined with ``'\\n'``. A ``TextLine`` without usable ``String``
        children contributes an empty line; a document without any
        ``TextLine`` yields ``''``.
    """
    nsdict = {'alto': xmlns}
    text_lines = []
    for text_line in xml.iterfind('.//alto:TextLine', nsdict):
        words = [
            string.attrib.get('CONTENT')
            for string in text_line.findall('alto:String', nsdict)
        ]
        # ``attrib.get`` returns None for a String without CONTENT, and
        # ``str.join`` raises TypeError on None, so drop those entries.
        text_lines.append(' '.join(word for word in words if word is not None))
    return '\n'.join(text_lines)


def alto_to_text(raw_alto_text):
    """Convert ALTO-XML to plain text.

    Args:
        raw_alto_text: One of
            * raw ALTO-XML markup as ``str`` or ``bytes`` (detected by a
              leading ``<``; a ``str`` is encoded as UTF-8 before parsing),
            * an ``http://`` / ``https://`` URL, which is downloaded here
              with ``urllib``,
            * anything else (local path, open file object, ...), which is
              handed to ``alto_tools.alto_parse`` unchanged.

    Returns:
        The text produced by ``alto_extract_text_lines`` for the document.

    Raises:
        OSError: If a URL cannot be fetched (``urllib.error.URLError`` is an
            ``OSError`` subclass) or the parser cannot read the source.
    """
    # Imported lazily so alto_extract_text_lines stays importable without the
    # project-local alto_tools module.
    import alto_tools

    source = raw_alto_text
    if isinstance(source, str):
        payload = source.encode('utf-8')
    elif isinstance(source, (bytes, bytearray)):
        payload = bytes(source)
    else:
        payload = None  # path-like or file object: passed through untouched

    if payload is not None:
        # Ignore a UTF-8 byte-order mark and leading whitespace when sniffing.
        head = payload.lstrip(b'\xef\xbb\xbf \t\r\n')
        if head.startswith(b'<'):
            # Markup, not a location: wrap it in a file object, because
            # alto_parse forwards its argument to a parse() call that would
            # otherwise treat the string as a file name.
            source = io.BytesIO(head)
        elif head[:8].lower().startswith((b'http://', b'https://')):
            # Fetch the document ourselves rather than letting the XML parser
            # load the URL; the parser failed on this notebook's http:// URL
            # with "failed to load HTTP resource".
            url = head.decode('utf-8').strip()
            with urllib.request.urlopen(url, timeout=30) as response:
                source = io.BytesIO(response.read())

    _, xml, xmlns = alto_tools.alto_parse(source)
    return alto_extract_text_lines(xml, xmlns)
\u001b[0;36malto_to_text\u001b[0;34m(raw_alto_text)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0malto_to_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_alto_text\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0malto\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxml\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxmlns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0malto_tools\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0malto_parse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_alto_text\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0malto_extract_text_lines\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxml\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxmlns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/labs/pydays19/alto_tools.py\u001b[0m in \u001b[0;36malto_parse\u001b[0;34m(alto)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;34m\"\"\" Convert ALTO xml file to element tree \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 19\u001b[0;31m \u001b[0mxml\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0metree\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0malto\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 20\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0metree\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mParseError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m sys.stdout.write('\\nERROR: Failed parsing \"%s\" - '\n", + 
import io
import urllib.request

# ALTO-XML resource used as the conversion example in this section.
ALTO_PAGE_URL = 'http://iiif.onb.ac.at/presentation/ANNO/apr18750223/resource/00000002.xml'

# Download the document with urllib and hand the parser a file object.
# Passing the URL string straight through made lxml raise
# "OSError: ... failed to load HTTP resource" (see the previously saved output).
# NOTE(review): presumably the parser's built-in HTTP client cannot handle this
# server's response (e.g. a redirect to HTTPS) - confirm against the live URL.
with urllib.request.urlopen(ALTO_PAGE_URL, timeout=30) as response:
    alto_page = io.BytesIO(response.read())

# print() rather than a bare expression so the line breaks render as text.
print(alto_to_text(alto_page))