import asyncio
from dataclasses import dataclass
import typing

import httpcore
from lxml import etree as lxmletree
import httpx

from travelogues_extraction.dataextractors.namespaces import namespaces


class RecordRetriever:
    """Fetch MARC-XML records one at a time from the SRU endpoint at ``url``.

    Each AC number is looked up with a ``searchRetrieve`` request that
    queries ``alma.local_control_field_009``.  Failures never raise out of
    the public methods; they are appended to ``log`` and reflected in
    ``all_ok`` instead.
    """

    # AC numbers to look up, in the order they are requested.
    ac_list: typing.List[str]
    # Caller-provided async HTTP client; this class never closes it.
    session: httpx.AsyncClient
    url = 'https://obv-at-oenb.alma.exlibrisgroup.com/view/sru/43ACC_ONB'
    # One dict per failed lookup (request error, bad status, parse error, no hit).
    log: list
    # True as long as every lookup so far produced a record.
    all_ok: bool
    # Delay in seconds before each request; grows by 30% after a failed attempt.
    sleep: float

    record_xpath = lxmletree.XPath(
        r'/srw:searchRetrieveResponse/srw:records/srw:record[1]/srw:recordData/marc:record[1]',
        namespaces=namespaces
    )

    def __init__(self, ac_numbers: typing.List[str], session: httpx.AsyncClient):
        """Store the AC numbers and the HTTP session and reset the bookkeeping."""
        self.ac_list = ac_numbers
        self.log = []
        self.all_ok = True  # so far
        self.session = session
        self.sleep = 0.01

    @dataclass
    class Record:
        """A retrieved MARC record together with the AC number it was found by."""

        ac_number: str
        lxmlelement: lxmletree._Element

    async def generate_records(self) -> typing.AsyncGenerator[Record, None]:
        """Yield a ``Record`` for every AC number that could be retrieved.

        AC numbers whose lookup fails are skipped; the reason is in ``log``.
        """
        for ac_number in self.ac_list:
            record = await self.get_record_from_ac_number(ac_number)
            if record is not None:
                yield record

    async def _request_record(self, ac_number: str) -> httpx.Response:
        """Wait ``self.sleep`` seconds, then send the search request for *ac_number*."""
        await asyncio.sleep(self.sleep)
        return await self.session.get(
            url=self.url,
            params={
                'startRecord': 1,
                'maximumRecords': 1,
                'query': f'alma.local_control_field_009={ac_number}',
                'version': '1.2',
                'operation': 'searchRetrieve',
                'recordSchema': 'marcxml',
            },
            timeout=60,
        )

    def _log_request_failure(self, ac_number: str, error: Exception) -> None:
        """Record a failed request in ``log`` and mark the run as not all-ok."""
        self.log.append({
            'error': 'time_out',
            'message': str(error),
            'ac_number': ac_number,
        })
        # The record is lost, so the overall result is incomplete, exactly
        # as for the status-code, parse and no-hit failures.
        self.all_ok = False

    async def get_record_from_ac_number(self, ac_number: str) -> typing.Optional[Record]:
        """Look up a single AC number and return its record.

        A connect timeout gives up immediately.  Any other exception on the
        first attempt increases the delay and triggers exactly one retry.

        Returns:
            The ``Record`` for *ac_number*, or ``None`` when the request
            failed, the status code was not 200, the body was not parseable
            XML, or the response contained no matching record.  Every
            ``None`` result adds an entry to ``log`` and clears ``all_ok``.
        """
        try:
            response = await self._request_record(ac_number)
        except httpx.ConnectTimeout as timeout:
            self._log_request_failure(ac_number, timeout)
            return None
        except Exception:
            # Back off and try once more before giving up on this number.
            self.sleep *= 1.3
            try:
                response = await self._request_record(ac_number)
            except httpx.ConnectTimeout as timeout:
                self._log_request_failure(ac_number, timeout)
                return None
            except Exception as error:
                self.sleep *= 1.3
                self._log_request_failure(ac_number, error)
                return None

        sub_log = {
            'ac_number': ac_number,
            'status_code': response.status_code,
            'url': response.url.full_path,
        }

        if response.status_code != 200:
            sub_log['response_error_message'] = response.text
            self.log.append(sub_log)
            self.all_ok = False
            return None

        try:
            xml = lxmletree.fromstring(response.content)
        except Exception as e:
            sub_log['lxml_error'] = e
            sub_log['content'] = response.content
            self.log.append(sub_log)
            self.all_ok = False
            return None

        records: typing.List[lxmletree._Element] = self.record_xpath(xml)
        if not records:
            sub_log['no_records_found'] = True
            self.log.append(sub_log)
            self.all_ok = False
            return None

        return self.Record(ac_number=ac_number, lxmlelement=records[0])