diff --git a/requirements.txt b/requirements.txt index 3410fb4de8490a33402d535d121e720a9ada67ce..4ecb19fd7c9889e096f8ffff957f0472f357e6f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,12 @@ attrs==19.3.0 +backcall==0.2.0 +bleach==3.1.5 certifi==2020.6.20 chardet==3.0.4 +decorator==4.4.2 +defusedxml==0.6.0 +entrypoints==0.3 +et-xmlfile==1.0.1 h11==0.9.0 h2==3.2.0 hpack==3.0.0 @@ -10,21 +16,58 @@ httpx==0.13.3 hyperframe==5.2.0 idna==2.10 importlib-metadata==1.7.0 +ipykernel==5.3.2 +ipython==7.16.1 +ipython-genutils==0.2.0 +ipywidgets==7.5.1 +jdcal==1.4.1 +jedi==0.17.1 +Jinja2==2.11.2 +jsonschema==3.2.0 +jupyter==1.0.0 +jupyter-client==6.1.6 +jupyter-console==6.1.0 +jupyter-core==4.6.3 lxml==4.5.2 +MarkupSafe==1.1.1 +mistune==0.8.4 more-itertools==8.4.0 +nbconvert==5.6.1 +nbformat==5.0.7 +notebook==6.0.3 numpy==1.19.0 +openpyxl==3.0.4 packaging==20.4 pandas==1.0.5 +pandocfilters==1.4.2 +parso==0.7.0 +pexpect==4.8.0 +pickleshare==0.7.5 pluggy==0.13.1 +prometheus-client==0.8.0 +prompt-toolkit==3.0.5 +ptyprocess==0.6.0 py==1.9.0 +Pygments==2.6.1 pyparsing==2.4.7 +pyrsistent==0.16.0 pytest==5.4.3 pytest-asyncio==0.14.0 python-dateutil==2.8.1 pytz==2020.1 +pyzmq==19.0.1 +qtconsole==4.7.5 +QtPy==1.9.0 rfc3986==1.4.0 +Send2Trash==1.5.0 six==1.15.0 sniffio==1.1.0 +terminado==0.8.3 +testpath==0.4.4 +tornado==6.0.4 +traitlets==4.3.3 wcwidth==0.2.5 +webencodings==0.5.1 +widgetsnbextension==3.5.1 xlrd==1.2.0 zipp==3.1.0 diff --git a/travelogues_extraction/dataextractors/dataextractors/simple.py b/travelogues_extraction/dataextractors/dataextractors/simple.py index e1a716a4189659d20b98cf2540ea52a6f5747316..985761b0df11e2a9ecd4ba5013d11c1ee64a438d 100644 --- a/travelogues_extraction/dataextractors/dataextractors/simple.py +++ b/travelogues_extraction/dataextractors/dataextractors/simple.py @@ -97,13 +97,13 @@ class ErscheinungsjahrSortierform(AbstractXpathJoinDirectlyToColumn): class Kollation(AbstractXpathJoinDirectlyToColumn): column = 'Kollation' - xpath = lxmletree.XPath('./marc:datafield[@tag="300" and @ind1=" " and @ind2=" "]/marc:subfield[@code="a"]', namespaces=namespaces) + xpath = lxmletree.XPath('(./marc:datafield[@tag="300" and @ind1=" " and @ind2=" "]/marc:subfield[@code="a"])[1]', namespaces=namespaces) # È tutto class Illustrationen(AbstractXpathJoinDirectlyToColumn): - xpath = lxmletree.XPath('./marc:datafield[@tag="300"]/marc:subfield[@code="b"]', namespaces=namespaces) + xpath = lxmletree.XPath('./marc:datafield[@tag="300"]/marc:subfield[@code="b"][1]', namespaces=namespaces) regexes = [ regex.compile(r'(\d+)\s*?(?=[Ii]llustration)'), regex.compile(r'(\d+)\s*?(?=[Kk]arte)'), @@ -196,7 +196,11 @@ class Inhalt(AbstractXpathJoinDirectlyToColumn): class Marker(AbstractXpathJoinDirectlyToColumn): column = 'Marker' - xpath = lxmletree.XPath('./marc:datafield[@tag="980" and @ind1="0" and @ind2=" "]/marc:subfield[@code="a"]', namespaces=namespaces) + xpath = lxmletree.XPath( + ( + './marc:datafield[@tag="980" and @ind1="0" and @ind2=" "]' + '/marc:subfield[@code="a" and starts-with(text(), "Travelogue")]' + ), namespaces=namespaces) class Zusammenstellung(AbstractXpathJoinDirectlyToColumn): diff --git a/travelogues_extraction/script/script.py b/travelogues_extraction/script/script.py index 74c209a3577e74c26cc1e0a91328bd2b89a65e46..d412aee0fa4d4145b1b8fbb685eb31ddc2ea84bd 100644 --- a/travelogues_extraction/script/script.py +++ b/travelogues_extraction/script/script.py @@ -27,7 +27,7 @@ async def extract(input_file: str, output_folder: str) -> typing.NoReturn: + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) ) - converter = FromAlmaOutputToExcel(input_file, output_path.absolute(), slice(None, None, 1)) + converter = FromAlmaOutputToExcel(input_file, output_path.absolute(), slice(None, 2, 1)) await converter.runasync() await converter.close() converter.write()