import asyncio
import logging
import time
import typing
from typing import List

import httpx
from httpx import AsyncClient
import pandas as pd

from travelogues_extraction.getrecords.session import RecordRetriever
from travelogues_extraction.getrecords.acnumber_extractor import extract_ac_from_series
from travelogues_extraction.dataextractors.dataextractors.index import *
from travelogues_extraction.dataextractors.dataextractors.combinedsubfields import *
from travelogues_extraction.dataextractors.dataextractors.übergeordnet import *
from travelogues_extraction.dataextractors.dataextractors.simple import *

if typing.TYPE_CHECKING:
    from travelogues_extraction.dataextractors.abstract import AbstractDataExtractor

# Defined after the star imports so none of them can shadow it.
logger = logging.getLogger(__name__)


class FromAlmaOutputToExcel:
    """Fetch the records listed in an Alma Excel export and write extracted fields to Excel.

    The AC numbers are read from the ``Datensatznummer`` column of the input
    file. For every AC number a record is retrieved through ``RecordRetriever``
    and handed to each extractor in ``extractors``, all of which write into one
    shared output dataframe indexed by AC number.

    Typical usage: construct, ``await runasync()``, ``write()``, ``await close()``.
    """

    # Quoted: AbstractDataExtractor is only imported under TYPE_CHECKING, and
    # class-level annotations are evaluated when the class body runs.
    extractors_objects: List['AbstractDataExtractor']
    client: AsyncClient
    record_retriever: RecordRetriever
    log: pd.DataFrame
    ac_numbers: typing.List[str]
    target_output_string: str
    target_output_dataframe: pd.DataFrame
    n: int

    # Extractor classes, instantiated once per run in this order.
    extractors: typing.List[typing.Type['AbstractDataExtractor']] = [
        IndexSetter,
        MMSID,
        VolltextAndBarcode,
        VerfasserGND,
        Werktitel,
        ReihentitelBandzählung,
        HaupttitelTitelzusatzVerantwortlichkeitsangabe,
        BandzählungTitelDesBandes,
        Ausgabe,
        Verlagsort,
        VerlagsOrtNormiertGND,
        DruckOrtNormiertGND,
        VerlegerDrucker,
        VerlegerNormiertGNDID,
        DruckerGNDID,
        Erscheinungsjahr,
        ErscheinungsjahrSortierform,
        Kollation,
        Illustrationen,
        Format,
        Anmerkungen,
        Sprache,
        OriginalSprache,
        BemerkungenZurSprache,
        Standardnummer,
        WeitereVerfasserGNDID,
        HerausgeberGNDID,
        ÜbersetzerGNDID,
        BeiträgerGNDID,
        WeitereBeteiligteGNDID,
        IllustratorenGNDID,
        WidmenderGNDID,
        WidmungsempfängerGNDID,
        ArtDesInhalts,
        Inhalt,
        WerkeInRelation
    ]

    def __init__(self, alma_output: str, target_output: str, slice: slice):
        """Load the AC numbers and prepare the output frame, log frame and extractors.

        :param alma_output: path of the Excel file to read; only its
            ``Datensatznummer`` column is loaded.
        :param target_output: path handed to ``DataFrame.to_excel`` by ``write``.
        :param slice: slice applied to the ``Datensatznummer`` column before the
            AC numbers are extracted. (The name shadows the builtin but is kept
            because it is part of the public signature.)
        """
        self.target_output_string = target_output

        input_data = pd.read_excel(alma_output, usecols=['Datensatznummer'])
        self.ac_numbers = extract_ac_from_series(input_data['Datensatznummer'][slice])

        # One column per name reported by each extractor, one row per AC number.
        self.target_output_dataframe = pd.DataFrame(
            [],
            columns=[
                column
                for extractor in self.extractors
                for column in extractor.get_columns_names_I_work_on()
            ],
            index=self.ac_numbers,
        )
        # Per record: the time it was retrieved and the counter value at that moment.
        self.log = pd.DataFrame(
            [],
            index=self.target_output_dataframe.index,
            columns=['time', 'n'],
        )
        # Every extractor instance receives the same output dataframe.
        self.extractors_objects = [
            extractor(self.target_output_dataframe) for extractor in self.extractors
        ]
        self.n = 0
        self.client = httpx.AsyncClient()
        self.record_retriever = RecordRetriever(self.ac_numbers, self.client)

    async def _extract_from_ac_number(self, ac_number: str):
        """Retrieve the record for ``ac_number``, log it and run every extractor on it."""
        record = await self.record_retriever.get_record_from_ac_number(ac_number)
        self.log.at[record.ac_number, 'time'] = time.time_ns()
        self.log.at[record.ac_number, 'n'] = self.n
        for extractor in self.extractors_objects:
            await extractor.write(record)
        # NOTE(review): the counter only advances after all extractors have run,
        # so records retrieved while another one is still being written may log
        # the same ``n``. Confirm whether a unique sequence number was intended.
        self.n += 1

    async def runasync(self):
        """Process all AC numbers concurrently and wait until every task has finished.

        A failing record does not abort the run: its exception is logged
        together with the AC number and the remaining records are still
        processed. Returns ``None``.
        """
        tasks = {
            asyncio.create_task(self._extract_from_ac_number(ac_number)): ac_number
            for ac_number in self.ac_numbers
        }
        # asyncio.wait raises ValueError when given an empty collection.
        if not tasks:
            return

        await asyncio.wait(list(tasks))

        # asyncio.wait never re-raises task exceptions; retrieve them here so
        # failures are reported instead of being dropped silently.
        for task, ac_number in tasks.items():
            if task.cancelled():
                continue
            error = task.exception()
            if error is not None:
                logger.error(
                    'Extraction failed for %s', ac_number, exc_info=error
                )

    def write(self):
        """Write the output dataframe to the Excel file given at construction."""
        self.target_output_dataframe.to_excel(self.target_output_string)

    async def close(self):
        """Close the underlying HTTP client."""
        await self.client.aclose()