import httpx
import pandas as pd
import pytest
import typing

from travelogues_extraction.getrecords.session import RecordRetriever
from travelogues_extraction.getrecords.acnumber_extractor import extract_ac_from_series
from travelogues_extraction.dataextractors.dataextractors.combinedsubfields import VerfasserGND, ReihentitelBandzählung, \
    HaupttitelTitelzusatzVerantwortlichkeitsangabe, BandzählungTitelDesBandes, VerlagsOrtNormiertGND, DruckOrtNormiertGND, \
    VerlegerNormiertGNDID, DruckerGNDID, WeitereVerfasserGNDID, HerausgeberGNDID, ÜbersetzerGNDID, BeiträgerGNDID, \
    WeitereBeteiligteGNDID, IllustratorenGNDID, WidmenderGNDID

from travelogues_extraction.dataextractors.dataextractors.index import IndexSetter

# Reference spreadsheet, loaded once at import time; the tests below read its
# 'Datensatznummer' column to pick the records they work on.
# The path is relative, so it only resolves when the working directory
# contains the `test/` folder.
dummy_data = pd.read_excel('test/dummy_data/TravelogueD18_ALMAoutput_20200707.xlsx')


def _is_nan(value) -> bool:
    """Return True when *value* is a float NaN, i.e. the marker the assertions below expect for an empty cell."""
    return isinstance(value, float) and pd.isna(value)


def _split_levels(value: str, extractor) -> typing.List[typing.List[str]]:
    """Split a combined cell value on the extractor's primary join string, then each part on the secondary one."""
    return [
        part.split(extractor.join_string_secondary_level)
        for part in value.split(extractor.join_string_primary_level)
    ]


@pytest.mark.asyncio
async def test_multiple_classes():
    """
    Exercise many similar extractors against the same records in one pass,
    so the records only have to be requested once.

    The first 25 'Datensatznummer' entries are handed to the retriever, but
    only the first ``stop`` (7) records are checked; the expected values below
    are listed in the order the records arrive.

    todo: order the checks better for readability
    :return:
    """

    reduced_series = dummy_data['Datensatznummer'][:25]
    # VerlegerNormiertGNDID.column is not pre-declared here; its cell is still read back below.
    target_dataframe = pd.DataFrame([], columns=[
        IndexSetter.column, VerfasserGND.column, ReihentitelBandzählung.column,
        HaupttitelTitelzusatzVerantwortlichkeitsangabe.column, BandzählungTitelDesBandes.column,
        VerlagsOrtNormiertGND.column, DruckOrtNormiertGND.column, DruckerGNDID.column, HerausgeberGNDID.column,
        ÜbersetzerGNDID.column, IllustratorenGNDID.column, WidmenderGNDID.column
    ])
    index_setter = IndexSetter(target_dataframe)
    verfasser = VerfasserGND(target_dataframe)
    reihentitelbandzählung = ReihentitelBandzählung(target_dataframe)
    haupttitel_etc = HaupttitelTitelzusatzVerantwortlichkeitsangabe(target_dataframe)
    bandzählung_etc = BandzählungTitelDesBandes(target_dataframe)
    verlag_etc = VerlagsOrtNormiertGND(target_dataframe)
    drucker_etc = DruckOrtNormiertGND(target_dataframe)
    verleger_normiert = VerlegerNormiertGNDID(target_dataframe)
    druckerGNDID = DruckerGNDID(target_dataframe)
    herausgeberGNDID = HerausgeberGNDID(target_dataframe)
    übersetzerGNDID = ÜbersetzerGNDID(target_dataframe)
    illustratorenGNDID = IllustratorenGNDID(target_dataframe)
    widmenderGNDID = WidmenderGNDID(target_dataframe)

    # Every extractor writes each record once; IndexSetter goes first
    # (presumably it creates the row the others fill in -- TODO confirm).
    extractors = (
        index_setter, verfasser, reihentitelbandzählung, haupttitel_etc, bandzählung_etc,
        verlag_etc, drucker_etc, verleger_normiert, druckerGNDID, herausgeberGNDID,
        übersetzerGNDID, illustratorenGNDID, widmenderGNDID,
    )

    stop = 7
    gnd_prefix = 'http://d-nb.info/gnd/'

    # Expected values, indexed by the position of the record in the stream.
    verlag_ort_korrekt = [
        ['Hamburg', 'Berlin'], ['Wien'], ['Nürnberg', 'Leipzig'], ['Göttingen'], ['Hamburg'], ['Göttingen'], ['Wien'],
    ]
    # Expected printing place per AC number; records not listed must have an empty cell.
    drucker_ort_korrekt = {
        'AC03114611': 'Berlin',
        'AC03826205': 'Wien',
        'AC09682453': 'Göttingen',
        'AC09836279': 'Göttingen',
        'AC03115986': 'Wien'
    }

    verleger_normiert_korrekt = [
        'Hoffmann, Benjamin Gottlob',
        'Schrämbl, Franz Anton',
        'Adam Gottlieb Schneider-Weigelsche Kunst- und Buchhandlung',
        'Vandenhoeck, Abraham', 'Bohn, Carl Ernst',
        'Vandenhoeck, Anna',
        'Schrämbl, Franz Anton',
        'Universitäts-Buchhandlung',
    ]

    # None marks a record whose printer cell must be empty.
    druckerGNDID_korrekt_list = [
        'Langhoff, Johann Georg', 'Schrämbl, Franz Anton', None, 'Vandenhoeck, Abraham', None,
        'Vandenhoeck, Anna', 'Schrämbl, Franz Anton',
    ]

    session = httpx.AsyncClient()
    try:
        record_retriever = RecordRetriever(extract_ac_from_series(reduced_series), session=session)
        index = 0

        async for record in record_retriever.generate_records():
            for extractor in extractors:
                await extractor.write(record)

            ac_number = record.ac_number
            assert target_dataframe.shape[0] == index + 1

            # --- Verfasser: one known record has no author at all.
            verfasser_val = target_dataframe.at[ac_number, VerfasserGND.column]
            if ac_number == 'AC09682453':
                assert _is_nan(verfasser_val)
            else:
                assert isinstance(verfasser_val, str)

            # --- Reihentitel / Haupttitel
            reihentitelbandzählung_val = target_dataframe.at[ac_number, reihentitelbandzählung.column]
            haupttitel_etc_val = target_dataframe.at[ac_number, haupttitel_etc.column]

            assert isinstance(reihentitelbandzählung_val, str)
            assert isinstance(haupttitel_etc_val, str)
            assert reihentitelbandzählung_val[0].isalpha()
            assert haupttitel_etc_val[0].isalpha() or haupttitel_etc_val[0] == '<'
            reihe_band_list = reihentitelbandzählung_val.split(reihentitelbandzählung.join_string_secondary_level)
            haupttitel_etc_list = haupttitel_etc_val.split(haupttitel_etc.join_string_secondary_level)
            assert len(reihe_band_list) == 2
            if ac_number == 'AC07705435':
                assert len(haupttitel_etc_list) == 1
            else:
                assert len(haupttitel_etc_list) == 2

            if index < 2:
                assert reihe_band_list[1] == 'Erster Band'
            elif index == 2:
                assert reihe_band_list[1] == 'Erstes Bändchen'

            assert reihe_band_list[0][0].isalpha()

            if index == 0:
                assert haupttitel_etc_list[0].startswith('Nachrichten von den Pelew')
                assert haupttitel_etc_list[1].startswith('Aus den Tagebüchern')

            if index != 0 and ac_number != 'AC09682453':  # I know the first verfasser gnd is damaged
                # Column position 1 is VerfasserGND.column in the frame built above.
                primary_elements = target_dataframe.iat[index, 1].split(verfasser.join_string_primary_level)
                assert len(primary_elements) == 1
                secondary_elements = [
                    element.split(verfasser.join_string_secondary_level) for element in primary_elements
                ]
                assert all(len(element) == 2 for element in secondary_elements)
                first_name, first_gnd = secondary_elements[0]
                assert first_name[0].isalpha()
                assert first_gnd.startswith('http')

            # --- Bandzählung: only the seventh record carries a value.
            bandzählung_val = target_dataframe.at[ac_number, bandzählung_etc.column]
            if index == 6:
                assert isinstance(bandzählung_val, str)
                assert bandzählung_val == 'Erste Abtheilung'
            else:
                assert _is_nan(bandzählung_val)

            # --- Verlagsort
            verlag_etc_val = target_dataframe.at[ac_number, verlag_etc.column]
            assert isinstance(verlag_etc_val, str)
            verlag_orte = _split_levels(verlag_etc_val, verlag_etc)
            assert [verlag_ort[0] for verlag_ort in verlag_orte] == verlag_ort_korrekt[index]
            assert all(verlag_ort[1].startswith(gnd_prefix) for verlag_ort in verlag_orte)

            # --- Druckort
            drucker_etc_val = target_dataframe.at[ac_number, drucker_etc.column]
            if ac_number in drucker_ort_korrekt:
                assert isinstance(drucker_etc_val, str)
                drucker_etc_list = _split_levels(drucker_etc_val, drucker_etc)
                assert len(drucker_etc_list) == 1
                assert drucker_etc_list[0][0] == drucker_ort_korrekt[ac_number]
                assert drucker_etc_list[0][1].startswith(gnd_prefix)
            else:
                assert _is_nan(drucker_etc_val)

            # --- Verleger
            verleger_normiert_val = target_dataframe.at[ac_number, verleger_normiert.column]
            assert isinstance(verleger_normiert_val, str)
            verleger_normiert_list = _split_levels(verleger_normiert_val, verleger_normiert)
            if ac_number != 'AC09682453':
                assert all(entry[1].startswith(gnd_prefix) for entry in verleger_normiert_list)
            assert verleger_normiert_list[0][0] == verleger_normiert_korrekt[index]

            # --- Drucker
            druckerGNDID_val = target_dataframe.at[ac_number, druckerGNDID.column]
            druckerGNDID_korrekt_val = druckerGNDID_korrekt_list[index]
            if druckerGNDID_korrekt_val is None:
                assert _is_nan(druckerGNDID_val)
            else:
                assert isinstance(druckerGNDID_val, str)
                druckerGNDID_list = _split_levels(druckerGNDID_val, druckerGNDID)
                assert druckerGNDID_list[0][0] == druckerGNDID_korrekt_val
                if ac_number != 'AC09682453':
                    assert all(entry[1].startswith(gnd_prefix) for entry in druckerGNDID_list)

            # --- Herausgeber: only the second record has one.
            herausgeber_val = target_dataframe.at[ac_number, herausgeberGNDID.column]
            if index == 1:
                values = _split_levels(herausgeber_val, herausgeberGNDID)
                assert len(values) == 1
                assert len(values[0]) == 2
                assert values[0][0][0].isalpha()
                assert values[0][1].startswith(gnd_prefix)
            else:
                assert _is_nan(herausgeber_val)

            # --- Übersetzer: only the first two records have one.
            übersetzerGNDID_val = target_dataframe.at[ac_number, ÜbersetzerGNDID.column]
            if index > 1:
                assert _is_nan(übersetzerGNDID_val)
            else:
                correct_übersetzer = 'Forster, Johann Reinhold' if index == 1 else 'Forster, Georg'
                übersetzerGNDID_list = _split_levels(übersetzerGNDID_val, übersetzerGNDID)
                assert len(übersetzerGNDID_list) == 1
                assert len(übersetzerGNDID_list[0]) == 2
                assert übersetzerGNDID_list[0][0] == correct_übersetzer
                assert übersetzerGNDID_list[0][1].startswith(gnd_prefix)

            # --- Illustratoren: checked for the first two records only.
            if index == 0:
                illval = target_dataframe.at[ac_number, illustratorenGNDID.column]
                illlist = _split_levels(illval, illustratorenGNDID)
                assert len(illlist) == 2
                assert len(illlist[0]) == 2
                assert illlist[0][0][0].isalpha()
                assert illlist[0][1].startswith(gnd_prefix)
            elif index == 1:
                assert _is_nan(target_dataframe.at[ac_number, illustratorenGNDID.column])

            # --- Widmender: checked for the first two records only.
            widmenderGNDID_val: typing.Union[str, float] = target_dataframe.at[ac_number, widmenderGNDID.column]
            if index == 0:
                assert isinstance(widmenderGNDID_val, str)
                widmenderGNDID_list = _split_levels(widmenderGNDID_val, widmenderGNDID)
                assert len(widmenderGNDID_list) == 1
                assert len(widmenderGNDID_list[0]) == 2
                assert widmenderGNDID_list[0][0][0].isalpha()
                assert widmenderGNDID_list[0][1].startswith(gnd_prefix)
            elif index == 1:
                assert _is_nan(widmenderGNDID_val)

            index += 1
            if index == stop:
                break
    finally:
        # Release the HTTP client even when an assertion fails or the loop breaks early.
        await session.aclose()


@pytest.mark.asyncio
async def test_verfasser():
    """
    Check WeitereVerfasserGNDID on two records: AC09836279, whose cell must be
    empty, and AC07705435, which must list five further authors, the second of
    them without a GND link.
    """
    ac_numbers = dummy_data['Datensatznummer']
    wanted = ac_numbers.str.contains('AC07705435') | ac_numbers.str.contains('AC09836279')
    reduced_series = dummy_data[wanted]['Datensatznummer']

    target_dataframe = pd.DataFrame([])
    indexer = IndexSetter(target_dataframe)
    weitereverfasser = WeitereVerfasserGNDID(target_dataframe)

    session = httpx.AsyncClient()
    try:
        record_retriever = RecordRetriever(extract_ac_from_series(reduced_series), session=session)
        async for record in record_retriever.generate_records():
            await indexer.write(record)
            await weitereverfasser.write(record)
    finally:
        # Release the HTTP client even if retrieval or writing fails.
        await session.aclose()

    assert target_dataframe.shape == (2, 2)

    empty_cell = target_dataframe.at['AC09836279', weitereverfasser.column]
    assert isinstance(empty_cell, float) and pd.isna(empty_cell)

    val: str = target_dataframe.at['AC07705435', weitereverfasser.column]
    verfasser_list = [
        entry.split(weitereverfasser.join_string_secondary_level)
        for entry in val.split(weitereverfasser.join_string_primary_level)
    ]
    assert len(verfasser_list) == 5
    assert len(verfasser_list[0]) == 2
    assert len(verfasser_list[1]) == 1
    assert all(v[0][0].isalpha() for v in verfasser_list)
    assert all(len(v) in (1, 2) for v in verfasser_list)
    assert all(v[1].startswith('http://d-nb.info/gnd/') for v in verfasser_list if len(v) == 2)

    # Reference value for AC07705435 (presumably copied from the source spreadsheet):
    # Chappe D'Auteroche, Jean; https://d-nb.info/gnd/118872648 .- Miller, Charles .- Mason, Franziscus; https://d-nb.info/gnd/102833761 .- Cassini, Jean-Dominique <<de>>; https://d-nb.info/gnd/116470887 .- Aiton, William; https://d-nb.info/gnd/100005063

# AC09836279
# AC07705435
# Mason, Franziscus; https://d-nb.info/gnd/102833761


@pytest.mark.asyncio
async def test_beiträger():
    """
    Check BeiträgerGNDID on two records: AC09836279, whose cell must be empty,
    and AC07705435, which must hold exactly one "name / link" pair.
    """
    ac_numbers = dummy_data['Datensatznummer']
    wanted = ac_numbers.str.contains('AC09836279') | ac_numbers.str.contains('AC07705435')
    reduced_series = dummy_data[wanted]['Datensatznummer']

    target_dataframe = pd.DataFrame([])
    indexer = IndexSetter(target_dataframe)
    beiträger = BeiträgerGNDID(target_dataframe)

    session = httpx.AsyncClient()
    try:
        record_retriever = RecordRetriever(extract_ac_from_series(reduced_series), session=session)
        async for record in record_retriever.generate_records():
            await indexer.write(record)
            await beiträger.write(record)
    finally:
        # Release the HTTP client even if retrieval or writing fails.
        await session.aclose()

    empty_cell = target_dataframe.at['AC09836279', beiträger.column]
    assert isinstance(empty_cell, float) and pd.isna(empty_cell)

    val = target_dataframe.at['AC07705435', beiträger.column]
    entries = [
        entry.split(beiträger.join_string_secondary_level)
        for entry in val.split(beiträger.join_string_primary_level)
    ]
    assert len(entries) == 1
    assert len(entries[0]) == 2
    name, link = entries[0]
    assert link.startswith('http')
    assert name[:2].isalpha()


@pytest.mark.asyncio
async def test_weitere_beiligte():
    """
    Check WeitereBeteiligteGNDID on two records.

    Expected cell content:
    AC09974769  (empty)
    AC09975164  Hunter, John; https://d-nb.info/gnd/123076285
    :return:
    """
    ac_numbers = dummy_data['Datensatznummer']
    wanted = ac_numbers.str.contains('AC09975164') | ac_numbers.str.contains('AC09974769')
    reduced_series = dummy_data[wanted]['Datensatznummer']

    target_dataframe = pd.DataFrame([])
    indexer = IndexSetter(target_dataframe)
    weitere = WeitereBeteiligteGNDID(target_dataframe)

    session = httpx.AsyncClient()
    try:
        record_retriever = RecordRetriever(extract_ac_from_series(reduced_series), session=session)
        async for record in record_retriever.generate_records():
            await indexer.write(record)
            await weitere.write(record)
    finally:
        # Release the HTTP client even if retrieval or writing fails.
        await session.aclose()

    empty_cell = target_dataframe.at['AC09974769', weitere.column]
    assert isinstance(empty_cell, float) and pd.isna(empty_cell)

    val = target_dataframe.at['AC09975164', weitere.column]
    entries = [
        entry.split(weitere.join_string_secondary_level)
        for entry in val.split(weitere.join_string_primary_level)
    ]
    assert len(entries) == 1
    assert len(entries[0]) == 2
    name, link = entries[0]
    assert link.startswith('http')
    assert name[:2].isalpha()

