Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import asyncio
import time
import typing
from typing import List
import httpx
from httpx import AsyncClient
from travelogues_extraction.getrecords.session import RecordRetriever
if typing.TYPE_CHECKING:
from travelogues_extraction.dataextractors.abstract import AbstractDataExtractor
import pandas as pd
from travelogues_extraction.getrecords.acnumber_extractor import extract_ac_from_series
from travelogues_extraction.dataextractors.dataextractors.index import *
from travelogues_extraction.dataextractors.dataextractors.combinedsubfields import *
from travelogues_extraction.dataextractors.dataextractors.übergeordnet import *
from travelogues_extraction.dataextractors.dataextractors.simple import *
class FromAlmaOutputToExcel:
    """Turn an Alma Excel export into an Excel sheet of extracted fields.

    The constructor reads the ``Datensatznummer`` column of the Alma export,
    converts it to AC numbers and prepares an empty output frame whose columns
    are collected from every class listed in ``extractors``.  ``runasync``
    then fetches one record per AC number and hands it to each extractor
    instance; ``write`` stores the output frame as Excel and ``close`` shuts
    down the HTTP client.
    """

    # Quoted because ``AbstractDataExtractor`` is imported under
    # ``typing.TYPE_CHECKING`` at the top of this file; class-level
    # annotations are evaluated when the class body runs.
    extractors_objects: List['AbstractDataExtractor']
    client: AsyncClient
    record_retriever: RecordRetriever
    # One row per AC number; 'time' holds time.time_ns() and 'n' the value of
    # the running counter at the moment the record arrived.
    log: pd.DataFrame
    ac_numbers: typing.List[str]
    target_output_string: str
    target_output_dataframe: pd.DataFrame
    # Running counter of processed records.
    n: int

    # Extractor classes in the order their columns appear in the output frame.
    extractors: typing.List[typing.Type['AbstractDataExtractor']] = [
        IndexSetter, MMSID,
        VolltextAndBarcode, VerfasserGND, Werktitel, ReihentitelBandzählung, HaupttitelTitelzusatzVerantwortlichkeitsangabe,
        BandzählungTitelDesBandes, Ausgabe, Verlagsort, VerlagsOrtNormiertGND, DruckOrtNormiertGND, VerlegerDrucker,
        VerlegerNormiertGNDID, DruckerGNDID, Erscheinungsjahr, ErscheinungsjahrSortierform, Kollation, Illustrationen,
        Format, Anmerkungen, Sprache, OriginalSprache, BemerkungenZurSprache, Standardnummer, WeitereVerfasserGNDID,
        HerausgeberGNDID, ÜbersetzerGNDID, BeiträgerGNDID, WeitereBeteiligteGNDID, IllustratorenGNDID, WidmenderGNDID,
        WidmungsempfängerGNDID, ArtDesInhalts, Inhalt, WerkeInRelation, Schlagworte
    ]

    def __init__(self, alma_output: str, target_output: str, slice: slice):
        """Read the Alma export and prepare output frame, log and extractors.

        :param alma_output: path of the Excel file handed to
            ``pandas.read_excel``; only its ``Datensatznummer`` column is read.
        :param target_output: path that ``write`` later passes to
            ``DataFrame.to_excel``.
        :param slice: selection applied to the ``Datensatznummer`` column
            before the AC numbers are extracted.  The name shadows the builtin
            but is kept so existing keyword callers continue to work.
        """
        self.target_output_string = target_output

        input_data = pd.read_excel(alma_output, usecols=['Datensatznummer'])
        self.ac_numbers = extract_ac_from_series(input_data['Datensatznummer'][slice])

        # The output columns are the concatenation of what every extractor
        # class reports via get_columns_names_I_work_on().
        self.target_output_dataframe = pd.DataFrame(
            [],
            columns=[
                column
                for extractor in self.extractors
                for column in extractor.get_columns_names_I_work_on()
            ],
            index=self.ac_numbers,
        )
        self.log = pd.DataFrame([], index=self.target_output_dataframe.index, columns=['time', 'n'])

        # Every extractor instance receives the same output frame.
        self.extractors_objects = [
            extractor(self.target_output_dataframe)
            for extractor in self.extractors
        ]
        self.n = 0
        self.client = httpx.AsyncClient()
        self.record_retriever = RecordRetriever(self.ac_numbers, self.client)

    async def _extract_from_ac_number(self, ac_number: str):
        """Fetch the record for *ac_number*, log its arrival and run all extractors on it."""
        record = await self.record_retriever.get_record_from_ac_number(ac_number)
        # The log is keyed by the AC number carried by the returned record.
        self.log.at[record.ac_number, 'time'] = time.time_ns()
        self.log.at[record.ac_number, 'n'] = self.n
        for extractor in self.extractors_objects:
            await extractor.write(record)
        # NOTE(review): counted once per record; the flattened source does not
        # show whether this line originally sat inside the loop - confirm.
        self.n += 1

    async def runasync(self):
        """Process every AC number concurrently and wait until all tasks finish.

        Returns immediately when there are no AC numbers, because
        ``asyncio.wait`` raises ``ValueError`` for an empty collection.
        """
        tasks = [
            asyncio.create_task(self._extract_from_ac_number(ac_number))
            for ac_number in self.ac_numbers
        ]
        if not tasks:
            return
        # asyncio.wait does not re-raise exceptions of the awaited tasks; a
        # failing record therefore does not abort the remaining ones.
        await asyncio.wait(tasks)

    def write(self):
        """Write the output frame to the Excel file given as ``target_output``."""
        self.target_output_dataframe.to_excel(self.target_output_string)

    async def close(self):
        """Close the HTTP client created in ``__init__``."""
        await self.client.aclose()