From 7ec4164797f85a87942ddb6c969549e87f33d924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20R=C3=B6ggla?= Date: Fri, 17 Jul 2020 13:23:52 +0200 Subject: [PATCH] schlagworte column and abstract look up in parent element --- .../test_\303\274bergeordnet.py" | 21 ++-- .../dataextractors/abstract.py | 97 ++++++++++++++- .../dataextractors/\303\274bergeordnet.py" | 111 ++++++------------ 3 files changed, 143 insertions(+), 86 deletions(-) diff --git "a/test/dataextractors/test_\303\274bergeordnet.py" "b/test/dataextractors/test_\303\274bergeordnet.py" index c05550e..80419be 100644 --- "a/test/dataextractors/test_\303\274bergeordnet.py" +++ "b/test/dataextractors/test_\303\274bergeordnet.py" @@ -1,25 +1,24 @@ -import re as regex - import httpx import pandas as pd import pytest from travelogues_extraction.getrecords.session import RecordRetriever from travelogues_extraction.getrecords.acnumber_extractor import extract_ac_from_series -from travelogues_extraction.dataextractors.dataextractors.übergeordnet import Werktitel +from travelogues_extraction.dataextractors.dataextractors.übergeordnet import Werktitel, Schlagworte from travelogues_extraction.dataextractors.dataextractors.index import IndexSetter dummy_data = pd.read_excel('test/dummy_data/TravelogueD18_ALMAoutput_20200707.xlsx') @pytest.mark.asyncio -async def test_werktitel(): +async def test_werktitel_and_achlagworte(): reduced_series = dummy_data['Datensatznummer'][:25] session = httpx.AsyncClient() record_retriever = RecordRetriever(extract_ac_from_series(reduced_series), session=session) - target_dataframe = pd.DataFrame([]) + target_dataframe = pd.DataFrame([], columns=[IndexSetter.column, Werktitel.column, Schlagworte.column]) index_setter = IndexSetter(target_dataframe) werktitel = Werktitel(target_dataframe) + schlagworte = Schlagworte(target_dataframe) index = 0 max = 50 @@ -30,8 +29,9 @@ async def test_werktitel(): await index_setter.write(record) assert record.ac_number in target_dataframe.index await werktitel.write(record) + await schlagworte.write(record) - assert target_dataframe.shape == (index + 1, 2) + assert target_dataframe.shape == (index + 1, 3) if record.ac_number in empty: assert target_dataframe[target_dataframe.index == record.ac_number]['Werktitel'].isna().sum() == 1 @@ -55,9 +55,16 @@ async def test_werktitel(): assert elements[1] == '1793' assert elements[2].startswith('http') + schlagworte_val: str = target_dataframe.at[record.ac_number, schlagworte.column] + assert schlagworte_val.__class__ is str + schlagworte_list = schlagworte_val.split(schlagworte.join_string) + assert len(schlagworte_list) > 0 + assert all([schlagwort[0].isalpha() for schlagwort in schlagworte_list]) + - if record.ac_number == 'AC09975167': + if record.ac_number == 'AC09975167': # first known number with parent titel break + index += 1 if index == max: break diff --git a/travelogues_extraction/dataextractors/abstract.py b/travelogues_extraction/dataextractors/abstract.py index a0f6b54..5d9af0a 100644 --- a/travelogues_extraction/dataextractors/abstract.py +++ b/travelogues_extraction/dataextractors/abstract.py @@ -3,14 +3,16 @@ from dataclasses import dataclass import re as regex import typing +import httpx from pandas import DataFrame if typing.TYPE_CHECKING: import pandas as pd - from travelogues_extraction.getrecords.session import RecordRetriever from lxml import etree as lxmletree +from travelogues_extraction.getrecords.session import RecordRetriever + class AbstractDataExtractor(ABC): target_dataframe: DataFrame @@ -120,3 +122,96 @@ class AbstractMultifield(AbstractDataExtractor): self.target_dataframe.at[record.ac_number, self.column] = result return result + +class AbstractParentAsSecondCast(AbstractDataExtractor): + + column: str + + @classmethod + def get_columns_names_I_work_on(cls) -> list: + return [cls.column] + + parent_ac_xpath: lxmletree.XPath + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.parend_titles = {} + self.log = [] + + @abstractmethod + async def _write(self, record: 'RecordRetriever.Record') -> typing.Optional[str]: + """ + the same as the usual write method, just that this one, will be called by the write(), which handles the parent logic + :param record: + :return: + """ + pass + + async def write(self, record: 'RecordRetriever.Record') -> typing.Optional[str]: + async with httpx.AsyncClient() as session: + contents = await self._write(record) + if not (contents is None): + return + + # else: try it with parents! + ac_parent_elements = self.parent_ac_xpath(record.lxmlelement) + + if len(ac_parent_elements) == 0: + return # and cry + + ac_parent = ac_parent_elements[0].text.replace('(AT-OBV)', '') + # check if we have requested this ac number already + if ac_parent in self.parend_titles: + self.target_dataframe.at[record.ac_number, self.column] = self.parend_titles[ac_parent] + return # and smile + + # else: get the data + + parent_response = await session.get( + url='https://obv-at-oenb.alma.exlibrisgroup.com/view/sru/43ACC_ONB', params={ + 'startRecord': 1, + 'maximumRecords': 1, + 'query': f'alma.local_control_field_009={ac_parent}', + 'version': '1.2', + 'operation': 'searchRetrieve', + 'recordSchema': 'marcxml', + }, timeout=5 + ) + if parent_response.status_code != '200': + self.log.append({ + 'status_code': parent_response.status_code, + 'url': parent_response.url, + 'message': parent_response.text, + 'ac_child': record.ac_number, + 'ac_parent': ac_parent, + }) + + try: + xml = lxmletree.fromstring(parent_response.content) + except Exception as exception: + self.log.append({ + 'ac_child': record.ac_number, + 'ac_parent': ac_parent, + 'xml_error': exception.__str__(), + 'xml': parent_response.text + }) + + records = RecordRetriever.record_xpath(xml) + + if len(records) == 0: + self.log.append({ + 'issue': 'norecords', + 'ac_child': record.ac_number, + 'ac_parent': ac_parent + }) + return + + return await self._write( + RecordRetriever.Record(lxmlelement=records[0], + ac_number=record.ac_number) + ) + + + + + diff --git "a/travelogues_extraction/dataextractors/dataextractors/\303\274bergeordnet.py" "b/travelogues_extraction/dataextractors/dataextractors/\303\274bergeordnet.py" index 3b434fe..34cca63 100644 --- "a/travelogues_extraction/dataextractors/dataextractors/\303\274bergeordnet.py" +++ "b/travelogues_extraction/dataextractors/dataextractors/\303\274bergeordnet.py" @@ -1,14 +1,17 @@ -from typing import Dict, List, Any, TYPE_CHECKING +from typing import Dict, List, Any, Optional import httpx +import typing from lxml import etree as lxmletree from travelogues_extraction.dataextractors.namespaces import namespaces from travelogues_extraction.getrecords.session import RecordRetriever -from travelogues_extraction.dataextractors.abstract import AbstractDataExtractor +from travelogues_extraction.dataextractors.abstract import AbstractParentAsSecondCast, AbstractXpathJoinDirectlyToColumn -class Werktitel(AbstractDataExtractor): +class Werktitel(AbstractParentAsSecondCast): + + column: str = 'Werktitel' log: List[Dict[str, Any]] parend_titles: Dict[str, str] @@ -41,18 +44,9 @@ class Werktitel(AbstractDataExtractor): './marc:datafield[@tag="773" and @ind1="0" and @ind2="8"]/marc:subfield[@code="w"]', namespaces=namespaces) - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.parend_titles = {} - self.log = [] - - @classmethod - def get_columns_names_I_work_on(cls) -> list: - return ['Werktitel'] - - def get_contents(self, element: lxmletree._Element) -> list: + async def _write(self, record: 'RecordRetriever.Record') -> Optional[str]: contents = [] - werktitel_fields = [xpath(element) for xpath in self.werktitel_xpaths] + werktitel_fields = [xpath(record.lxmlelement) for xpath in self.werktitel_xpaths] for werktitel_field, subbfield_xpath in zip(werktitel_fields, self.subfield_xpaths): if len(werktitel_field) == 0: continue @@ -67,67 +61,28 @@ class Werktitel(AbstractDataExtractor): contents.append( self.lower_level_join_string.join([sub_content.text for sub_content in sub_contents])) - return contents - - async def write(self, record: 'RecordRetriever.Record'): - async with httpx.AsyncClient() as self.session: - contents = self.get_contents(record.lxmlelement) - if len(contents) != 0: - self.target_dataframe.at[record.ac_number, 'Werktitel'] = self.upper_level_join_string.join(contents) - return - - # else: try it with parents! - ac_parent_elements = self.parent_ac_xpath(record.lxmlelement) - - if len(ac_parent_elements) == 0: - return # and cry - - ac_parent = ac_parent_elements[0].text.replace('(AT-OBV)', '') - # check if we have requested this ac number already - if ac_parent in self.parend_titles: - self.target_dataframe.at[record.ac_number, 'Werktitel'] = self.parend_titles[ac_parent] - return # and smile - - # else: get the data - - parent_response = await self.session.get(url='https://obv-at-oenb.alma.exlibrisgroup.com/view/sru/43ACC_ONB', params={ - 'startRecord': 1, - 'maximumRecords': 1, - 'query': f'alma.local_control_field_009={ac_parent}', - 'version': '1.2', - 'operation': 'searchRetrieve', - 'recordSchema': 'marcxml', - } - ) - if parent_response.status_code != '200': - self.log.append({ - 'status_code': parent_response.status_code, - 'url': parent_response.url, - 'message': parent_response.text, - 'ac_child': record.ac_number, - 'ac_parent': ac_parent, - }) - - try: - xml = lxmletree.fromstring(parent_response.content) - except Exception as exception: - self.log.append({ - 'ac_child': record.ac_number, - 'ac_parent': ac_parent, - 'xml_error': exception.__str__(), - 'xml': parent_response.text - }) - - records = RecordRetriever.record_xpath(xml) - - if len(records) == 0: - self.log.append({ - 'issue': 'norecords', - 'ac_child': record.ac_number, - 'ac_parent': ac_parent - }) - return - - contents = self.get_contents(records[0]) - if len(contents) != 0: - self.target_dataframe.at[record.ac_number, 'Werktitel'] = self.upper_level_join_string.join(contents) + if len(contents) > 0: + content = self.upper_level_join_string.join(contents) + self.target_dataframe.at[record.ac_number, self.column] = content + return content + + +class Schlagworte(AbstractParentAsSecondCast): + + column: str = 'Schlagworte' + join_string = AbstractXpathJoinDirectlyToColumn.join_string # to lazy for multi inheritance + parent_ac_xpath = lxmletree.XPath( + ( + './marc:datafield[' + '(@tag="773" and @ind1="0" and @ind2="8")' + ' or ' + '(@tag="830" and @ind1=" " and @ind2="0")' + ']/marc:subfield[@code="w"]' + ), + namespaces=namespaces) + xpath = lxmletree.XPath('./marc:datafield[@tag="689"]/marc:subfield[@code="a"]', namespaces=namespaces) + + async def _write(self, record: 'RecordRetriever.Record') -> typing.Optional[str]: + return await AbstractXpathJoinDirectlyToColumn.write(self, record) + + -- GitLab