diff --git a/travelogues_extraction/dataextractors/abstract.py b/travelogues_extraction/dataextractors/abstract.py index 5d9af0a58cb0369dcb97230cdecb5abb9bdd93ab..42990c582782b562e9238994753e6d2d2d6e57a5 100644 --- a/travelogues_extraction/dataextractors/abstract.py +++ b/travelogues_extraction/dataextractors/abstract.py @@ -1,8 +1,10 @@ from abc import ABC, abstractmethod +import asyncio from dataclasses import dataclass import re as regex import typing +import httpcore import httpx from pandas import DataFrame @@ -126,6 +128,7 @@ class AbstractMultifield(AbstractDataExtractor): class AbstractParentAsSecondCast(AbstractDataExtractor): column: str + sleep: float @classmethod def get_columns_names_I_work_on(cls) -> list: @@ -137,6 +140,7 @@ class AbstractParentAsSecondCast(AbstractDataExtractor): super().__init__(*args, **kwargs) self.parend_titles = {} self.log = [] + self.sleep = 0.01 @abstractmethod async def _write(self, record: 'RecordRetriever.Record') -> typing.Optional[str]: @@ -167,24 +171,71 @@ class AbstractParentAsSecondCast(AbstractDataExtractor): # else: get the data - parent_response = await session.get( - url='https://obv-at-oenb.alma.exlibrisgroup.com/view/sru/43ACC_ONB', params={ - 'startRecord': 1, - 'maximumRecords': 1, - 'query': f'alma.local_control_field_009={ac_parent}', - 'version': '1.2', - 'operation': 'searchRetrieve', - 'recordSchema': 'marcxml', - }, timeout=5 - ) - if parent_response.status_code != '200': + try: + await asyncio.sleep(self.sleep) + parent_response = await session.get( + url='https://obv-at-oenb.alma.exlibrisgroup.com/view/sru/43ACC_ONB', params={ + 'startRecord': 1, + 'maximumRecords': 1, + 'query': f'alma.local_control_field_009={ac_parent}', + 'version': '1.2', + 'operation': 'searchRetrieve', + 'recordSchema': 'marcxml', + }, timeout=60 + ) + + except httpx.ConnectTimeout as timeout: + self.log.append({ + 'error': 'time_out', + 'message': str(timeout), + 'ac_child': record.ac_number, + 'ac_parent': ac_parent, + }) + return + + except Exception as error: + self.sleep *= 1.3 + try: + await asyncio.sleep(self.sleep) + parent_response = await session.get( + url='https://obv-at-oenb.alma.exlibrisgroup.com/view/sru/43ACC_ONB', params={ + 'startRecord': 1, + 'maximumRecords': 1, + 'query': f'alma.local_control_field_009={ac_parent}', + 'version': '1.2', + 'operation': 'searchRetrieve', + 'recordSchema': 'marcxml', + }, timeout=60 + ) + + except httpx.ConnectTimeout as timeout: + self.log.append({ + 'error': 'time_out', + 'message': str(timeout), + 'ac_child': record.ac_number, + 'ac_parent': ac_parent, + }) + return + + except Exception as error: + self.sleep *= 1.3 + self.log.append({ + 'error': 'time_out', + 'message': str(error), + 'ac_child': record.ac_number, + 'ac_parent': ac_parent, + }) + return + + if parent_response.status_code != 200: self.log.append({ 'status_code': parent_response.status_code, - 'url': parent_response.url, + 'url': parent_response.url.full_path, 'message': parent_response.text, 'ac_child': record.ac_number, 'ac_parent': ac_parent, }) + return try: xml = lxmletree.fromstring(parent_response.content) diff --git "a/travelogues_extraction/dataextractors/dataextractors/\303\274bergeordnet.py" "b/travelogues_extraction/dataextractors/dataextractors/\303\274bergeordnet.py" index 34cca63c7ee2672a8b8a7d5dfa88dd34efb40c2f..45fd2bc85fb7fe5a786a6b29f7f09693dcd3a2ab 100644 --- "a/travelogues_extraction/dataextractors/dataextractors/\303\274bergeordnet.py" +++ "b/travelogues_extraction/dataextractors/dataextractors/\303\274bergeordnet.py" @@ -11,6 +11,9 @@ from travelogues_extraction.dataextractors.abstract import AbstractParentAsSecon class Werktitel(AbstractParentAsSecondCast): + def __init__(self, *args, **kwargs): + AbstractParentAsSecondCast.__init__(self, *args, **kwargs) + column: str = 'Werktitel' log: List[Dict[str, Any]] @@ -69,6 +72,9 @@ class Werktitel(AbstractParentAsSecondCast): class Schlagworte(AbstractParentAsSecondCast): + def __init__(self, *args, **kwargs): + AbstractParentAsSecondCast.__init__(self, *args, **kwargs) + column: str = 'Schlagworte' join_string = AbstractXpathJoinDirectlyToColumn.join_string # to lazy for multi inheritance parent_ac_xpath = lxmletree.XPath(