import httpx
import pandas as pd
import pytest

from travelogues_extraction.getrecords.session import RecordRetriever
from travelogues_extraction.getrecords.acnumber_extractor import extract_ac_from_series
from travelogues_extraction.dataextractors.dataextractors.übergeordnet import Werktitel, Schlagworte
from travelogues_extraction.dataextractors.dataextractors.index import IndexSetter

# Shared spreadsheet fixture, loaded once when this module is imported.
# The path is relative, so it is resolved against the current working
# directory of the test run. test_werktitel_and_achlagworte reads the
# 'Datensatznummer' column from it.
dummy_data = pd.read_excel('test/dummy_data/TravelogueD18_ALMAoutput_20200707.xlsx')


@pytest.mark.asyncio
async def test_werktitel_and_achlagworte():
    """Check what ``Werktitel`` and ``Schlagworte`` write for retrieved records.

    The first 25 entries of the ``'Datensatznummer'`` column of ``dummy_data``
    are passed through ``extract_ac_from_series`` and handed to a
    ``RecordRetriever``. For every record it yields, the index setter and both
    extractors write into one shared data frame, and the test asserts:

    * the frame grows by exactly one row per record and keeps three columns;
    * records listed in ``empty`` get a missing ``'Werktitel'`` value;
    * every other record gets a ``str`` value, and two known records split
      into exactly three parts (title, year, link);
    * the ``Schlagworte`` value is a ``str`` whose parts all start with a
      letter.

    The loop stops at record ``'AC09975167'`` or after ``max_records``
    records, whichever comes first.
    """
    reduced_series = dummy_data['Datensatznummer'][:25]
    session = httpx.AsyncClient()
    try:
        record_retriever = RecordRetriever(extract_ac_from_series(reduced_series), session=session)
        target_dataframe = pd.DataFrame([], columns=[IndexSetter.column, Werktitel.column, Schlagworte.column])
        index_setter = IndexSetter(target_dataframe)
        werktitel = Werktitel(target_dataframe)
        schlagworte = Schlagworte(target_dataframe)

        processed = 0
        # Safety cap on the number of records handled by this test.
        max_records = 50

        # AC numbers for which no Werktitel is expected to be written.
        empty = {'AC09682453', 'AC09731407', 'AC09792500', 'AC09705420'}

        # Records with a known three-part Werktitel, mapped to the expected
        # year part. 'AC09975167' is the first record whose title comes from
        # the parent book.
        expected_years = {'AC03114611': '1788', 'AC09975167': '1793'}

        async for record in record_retriever.generate_records():
            await index_setter.write(record)
            assert record.ac_number in target_dataframe.index
            await werktitel.write(record)
            await schlagworte.write(record)

            # `processed` counts the records finished before this one.
            assert target_dataframe.shape == (processed + 1, 3)

            if record.ac_number in empty:
                row = target_dataframe[target_dataframe.index == record.ac_number]
                assert row['Werktitel'].isna().sum() == 1
            else:
                value = target_dataframe.at[record.ac_number, 'Werktitel']
                assert isinstance(value, str)
                elements = value.split(werktitel.lower_level_join_string)
                if record.ac_number in expected_years:
                    assert len(elements) == 3
                    assert all(isinstance(element, str) for element in elements)
                    assert elements[0][0].isalpha()
                    assert elements[1] == expected_years[record.ac_number]
                    assert elements[2].startswith('http')

            schlagworte_val: str = target_dataframe.at[record.ac_number, schlagworte.column]
            assert isinstance(schlagworte_val, str)
            schlagworte_list = schlagworte_val.split(schlagworte.join_string)
            assert len(schlagworte_list) > 0
            assert all(schlagwort[0].isalpha() for schlagwort in schlagworte_list)

            if record.ac_number == 'AC09975167':  # first known number with parent title
                break

            processed += 1
            if processed == max_records:
                break
    finally:
        # Release the HTTP connections even when an assertion above fails.
        await session.aclose()

    assert target_dataframe['Werktitel'].isna().sum() == len(empty)

@pytest.mark.asyncio
async def test_schlagworte_fallback():
    """Check that ``Schlagworte.write`` yields a value for record 'AC11064890'.

    A single record is fetched by its AC number and written into an initially
    empty data frame. The write must return something other than ``None`` and
    leave the frame with exactly one row and one column.

    NOTE(review): the test name suggests this record exercises a fallback
    path inside ``Schlagworte`` -- confirm against the extractor.
    """
    session = httpx.AsyncClient()
    try:
        rr = RecordRetriever([], session=session)
        df = pd.DataFrame()
        schlag = Schlagworte(df)
        record = await rr.get_record_from_ac_number('AC11064890')
        result = await schlag.write(record)
    finally:
        # Release the HTTP connections even when retrieval or writing fails.
        await session.aclose()

    assert result is not None
    assert df.shape == (1, 1)