# session.py
# blame: philip.roeggla
import asyncio
# blame: phylogram
from dataclasses import dataclass
import typing

# blame: philip.roeggla
import httpcore
# blame: phylogram
from lxml import etree as lxmletree
import httpx

from travelogues_extraction.dataextractors.namespaces import namespaces


class RecordRetriever:
    """Fetch MARC-XML records, one AC number at a time, from the SRU endpoint.

    Every AC number in ``ac_list`` is looked up with a ``searchRetrieve``
    request against ``url`` whose query is
    ``alma.local_control_field_009=<ac_number>``.

    Request errors, non-200 responses, unparsable bodies and empty result
    sets are not raised to the caller; each one is appended to ``log`` as a
    dict and ``all_ok`` is switched to ``False``.
    """

    ac_list: typing.List[str]
    session: httpx.AsyncClient

    url = 'https://obv-at-oenb.alma.exlibrisgroup.com/view/sru/43ACC_ONB'

    log: list
    all_ok: bool
    # Delay (seconds) awaited before every request; grows after failed requests.
    sleep: float

    # Retry policy for a single AC number: total number of request attempts,
    # the factor ``sleep`` is multiplied by after a failed attempt, and the
    # per-request timeout in seconds handed to httpx.
    _max_attempts = 2
    _backoff_factor = 1.3
    _request_timeout = 60

    # Selects the first MARC record of the first SRU record in a response.
    record_xpath = lxmletree.XPath(
        r'/srw:searchRetrieveResponse/srw:records/srw:record[1]/srw:recordData/marc:record[1]',
        namespaces=namespaces
    )

    def __init__(self, ac_numbers: typing.List[str], session: httpx.AsyncClient):
        """Store the AC numbers to look up and the client used for requests.

        :param ac_numbers: AC numbers to retrieve, in the order to request them.
        :param session: httpx async client used for every request; it is not
            closed by this class.
        """
        self.ac_list = ac_numbers
        self.log = []
        self.all_ok = True  # so far
        self.session = session
        self.sleep = 0.01

    @dataclass
    class Record:
        """A retrieved MARC record together with the AC number it was found by."""

        ac_number: str
        lxmlelement: lxmletree._Element

    async def generate_records(self) -> typing.AsyncGenerator[Record, None]:
        """Yield a ``Record`` for every AC number that could be retrieved.

        AC numbers are requested sequentially in ``ac_list`` order. Numbers
        whose lookup failed are skipped; the reason is found in ``log``.
        """
        for ac_number in self.ac_list:
            record = await self.get_record_from_ac_number(ac_number)
            if record is not None:
                yield record

    async def get_record_from_ac_number(self, ac_number: str) -> typing.Optional[Record]:
        """Request, parse and return the record for ``ac_number``.

        :param ac_number: value searched for in ``alma.local_control_field_009``.
        :return: the ``Record``, or ``None`` when the request failed, the
            status code was not 200, the body was not parsable XML, or the
            response held no matching record. Each of these cases appends an
            entry to ``log`` and sets ``all_ok`` to ``False``.
        """
        response = await self._request_record(ac_number)
        if response is None:
            return None

        sub_log = {
            'ac_number': ac_number,
            'status_code': response.status_code,
            # NOTE(review): ``URL.full_path`` may not exist in newer httpx
            # releases (``raw_path`` appears to be its replacement) - confirm
            # against the httpx version this project pins.
            'url': response.url.full_path,
        }

        if response.status_code != 200:
            sub_log['response_error_message'] = response.text
            return self._fail(sub_log)

        try:
            xml = lxmletree.fromstring(response.content)
        except Exception as e:
            # Stored as text, like the request-error messages, so that the
            # log stays a plain data structure.
            sub_log['lxml_error'] = str(e)
            sub_log['content'] = response.content
            return self._fail(sub_log)

        records: typing.List[lxmletree._Element] = self.record_xpath(xml)

        if not records:
            sub_log['no_records_found'] = True
            return self._fail(sub_log)

        return self.Record(ac_number=ac_number, lxmlelement=records[0])

    async def _request_record(self, ac_number: str) -> typing.Optional[httpx.Response]:
        """Send the SRU request for ``ac_number``, retrying after generic errors.

        A connect timeout is logged and gives up immediately. Any other
        exception multiplies ``sleep`` by ``_backoff_factor`` and is retried
        until ``_max_attempts`` requests were made; the last error is then
        logged.

        :return: the HTTP response, or ``None`` when no response was obtained.
        """
        params = {
            'startRecord': 1,
            'maximumRecords': 1,
            'query': f'alma.local_control_field_009={ac_number}',
            'version': '1.2',
            'operation': 'searchRetrieve',
            'recordSchema': 'marcxml',
        }

        last_error: typing.Optional[Exception] = None
        for _ in range(self._max_attempts):
            # Throttle: wait the current delay before every request.
            await asyncio.sleep(self.sleep)
            try:
                return await self.session.get(
                    url=self.url, params=params, timeout=self._request_timeout
                )
            except httpx.ConnectTimeout as timeout:
                self._log_request_failure(ac_number, timeout)
                return None
            except Exception as error:
                # Slow down all following requests before trying again.
                self.sleep *= self._backoff_factor
                last_error = error

        self._log_request_failure(ac_number, last_error)
        return None

    def _log_request_failure(self, ac_number: str, error: typing.Optional[Exception]) -> None:
        """Record a request that produced no response and clear ``all_ok``."""
        self.log.append({
            # The 'time_out' label is kept for every request failure so that
            # existing log consumers keep working; 'exception_type' tells
            # real timeouts apart from other errors.
            'error': 'time_out',
            'message': str(error),
            'exception_type': type(error).__name__,
            'ac_number': ac_number,
        })
        self.all_ok = False

    def _fail(self, sub_log: dict) -> None:
        """Append ``sub_log`` to ``log``, clear ``all_ok`` and return ``None``."""
        self.log.append(sub_log)
        self.all_ok = False
        return None