From 007f037de54792f0a0ff189521eee86006c47e4d Mon Sep 17 00:00:00 2001 From: Simon Mayer <simon.mayer@onb.ac.at> Date: Thu, 12 Sep 2024 13:47:51 +0200 Subject: [PATCH] Add forced encoding for txt files while running the notebook on Windows machines --- OCR_samples.ipynb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/OCR_samples.ipynb b/OCR_samples.ipynb index 2c69ba2..7e7907e 100644 --- a/OCR_samples.ipynb +++ b/OCR_samples.ipynb @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "195c1f7d-9056-40f7-86cb-835d9a5052a9", "metadata": {}, "outputs": [], @@ -148,7 +148,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "1ac51927-4cba-457a-a8e9-9fd9d7b52f65", "metadata": {}, "outputs": [ @@ -187,7 +187,7 @@ "\n", "txtpaths = sorted(Path('data').glob('*1815*.txt'))\n", "strpaths = list(filter(lambda x: 'new' not in x, [str(p) for p in txtpaths]))\n", - "txts = [open(txt).read() for txt in strpaths]\n", + "txts = [open(txt, 'r', encoding='utf-8').read() for txt in strpaths]\n", "txt_df = pd.DataFrame([txts], columns=strpaths)\n", "pd.set_option(\"display.max_colwidth\", None)\n", "display(HTML(txt_df.to_html().replace(\"\\\\n\",\"<br>\")))" @@ -195,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "b9542753-a848-421e-81f3-65b07f3818b4", "metadata": {}, "outputs": [ @@ -232,7 +232,7 @@ "source": [ "txtpaths = sorted(Path('data').glob('*1870*.txt'))\n", "strpaths = list(filter(lambda x: 'new' not in x, [str(p) for p in txtpaths]))\n", - "txts = [open(txt).read() for txt in strpaths]\n", + "txts = [open(txt, 'r', encoding='utf-8').read() for txt in strpaths]\n", "txt_df = pd.DataFrame([txts], columns=strpaths)\n", "pd.set_option(\"display.max_colwidth\", None)\n", "display(HTML(txt_df.to_html().replace(\"\\\\n\",\"<br>\")))" @@ -277,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "id": "22f88237-22da-4fe1-8546-86ee4070f743", "metadata": {}, "outputs": [ @@ -311,7 +311,7 @@ ], "source": [ "txtpaths = sorted(Path('data').glob('*new.txt'))\n", - "txts = [open(txt).read() for txt in txtpaths]\n", + "txts = [open(txt, 'r', encoding='utf-8').read() for txt in txtpaths]\n", "txt_df = pd.DataFrame([txts], columns=txtpaths)\n", "pd.set_option(\"display.max_colwidth\", None)\n", "display(HTML(txt_df.to_html().replace(\"\\\\n\",\"<br>\")))" -- GitLab