In [None]:
!pip install -r requirements.txt

In [9]:
import ast
import json
import pathlib
import re

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from PIL import Image
from tqdm.notebook import tqdm

In [10]:
def get_content(url, max_retries=3, timeout=30):
    """Download ``url`` and return the raw response body.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET request.
    max_retries : int, optional
        Maximum number of attempts. An attempt counts as failed when the
        request times out or the server answers with a status other than 200.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get``. Without it a stalled
        connection blocks forever and the timeout retry path is never taken.

    Returns
    -------
    bytes or None
        The response body of the first attempt that returned status 200, or
        ``None`` when every attempt failed. Callers must handle ``None``.

    Raises
    ------
    SystemExit
        On any other ``requests`` error (including too many redirects); these
        are treated as fatal instead of being retried.
    """
    for _ in range(max_retries):
        try:
            resp = requests.get(url, timeout=timeout)
        except requests.exceptions.Timeout:
            # Transient: use up one attempt and try again.
            continue
        except requests.exceptions.RequestException as e:
            # TooManyRedirects is a RequestException subclass, so it is covered here.
            raise SystemExit(e)
        if resp.status_code == 200:
            return resp.content
    return None

def get_category(entry):
    """Return ``entry`` unchanged if it is 'A', 'B' or 'C'; otherwise return 'N'."""
    known_categories = ('A', 'B', 'C')
    return entry if entry in known_categories else 'N'

def display_image_from_series(df, num_images=16):
    """Plot the first ``num_images`` images referenced by ``df`` in an 8-column grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``filename`` and ``years``; each image is read
        from ``img/ABO/<years>/no_category/<filename>``.
    num_images : int, optional
        Upper bound on the number of images drawn. If ``df`` has fewer rows,
        only those rows are drawn and the grid is sized accordingly.

    Returns
    -------
    None
        The figure is rendered through matplotlib. Each panel title is the
        0-based position of the row within ``df`` (not its index label).
    """
    cols = 8
    selection = df[['filename', 'years']].head(max(num_images, 0))
    if selection.empty:
        # Nothing to draw; avoid creating a zero-height figure.
        return

    rows = int(np.ceil(len(selection) / cols))
    fig = plt.figure(figsize=(int(cols * 2.5), int(rows * 2.5)))

    for position, (filename, year) in enumerate(selection.values):
        ax = fig.add_subplot(rows, cols, position + 1)
        ax.axis('off')
        ax.set_title(position)
        # imshow copies the pixel data, so the file can be closed right after;
        # this avoids keeping one open handle per displayed image.
        with Image.open(f'img/ABO/{year}/no_category/{filename}') as image:
            ax.imshow(image)

In [None]:
# Read one prediction table per period from data/predictions/.
(
    pred_1501_1549,
    pred_1550_1599,
    pred_1600_1650,
    pred_1651_1699,
    pred_1700_1738,
) = (
    pd.read_csv(f'data/predictions/pred_{period}.csv')
    for period in ('1501_1549', '1550_1599', '1600_1650', '1651_1699', '1700_1738')
)

In [None]:
# Label every per-period table with its period string in a 'years' column,
# then stack all tables into one frame and store it as CSV.
all_frames = [pred_1501_1549, pred_1550_1599, pred_1600_1650, pred_1651_1699, pred_1700_1738]
period_labels = ['1501_1549', '1550_1599', '1600_1650', '1651_1699', '1700_1738']
for frame, period in zip(all_frames, period_labels):
    frame['years'] = period

# The original row labels of each table are kept (no ignore_index), so index
# values repeat across periods.
all_pred = pd.concat(all_frames)
all_pred.to_csv('data/predictions/all_pred_1501_1738.csv')
print(len(all_pred))

In [None]:
# The 'probability' column arrives from CSV as a JSON-encoded list per row.
# Decode only values that are still strings so that re-running this cell does
# not fail on entries that have already been parsed.
all_pred['probability'] = all_pred['probability'].apply(
    lambda value: json.loads(value) if isinstance(value, str) else value
)

# Spread the list into one column per class: position 0 -> p_A, 1 -> p_B,
# 2 -> p_C, 3 -> p_N.
for position, label in enumerate(('A', 'B', 'C', 'N')):
    all_pred[f'p_{label}'] = all_pred['probability'].apply(
        lambda probs, k=position: probs[k]
    )

In [None]:
def create_GT_addition(GT_df, candidate_df, num_additions):
    """Build new ground-truth rows from prediction candidates.

    Candidates are visited in order. A candidate is skipped when its barcode
    occurs (as a literal substring) in any existing ``GT_df['Strichcode']``
    entry. Collection stops once ``num_additions`` rows have been gathered.

    Parameters
    ----------
    GT_df : pandas.DataFrame
        Existing ground truth; must contain ``'Unnamed: 0'`` (the running
        index is continued from the value in the last row) and
        ``'Strichcode'``.
    candidate_df : pandas.DataFrame
        Candidate rows, accessed by position: column 0 is a file name of the
        form ``<barcode>_<page>.jpg`` and column 1 is stored as ``'Variante'``.
        NOTE(review): column order is assumed, not checked - confirm against
        the prediction CSV layout.
    num_additions : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        One row per accepted candidate with the ground-truth columns
        (``'Image URLs'`` holds a one-element list with the IIIF URL).
        The frame is empty when no candidate is accepted.
    """
    return_lis = []
    running_index = GT_df.iloc[-1]['Unnamed: 0']
    # Hoisted out of the loop: the known barcodes never change while iterating.
    # Non-string entries (e.g. NaN) are ignored, as they can never match.
    known_barcodes = [code for code in GT_df['Strichcode'] if isinstance(code, str)]

    for i, row in enumerate(candidate_df.values):
        if len(return_lis) == num_additions:
            print('Finishing at index', i)
            break
        bc = row[0].split('_')[0]
        # Literal substring test: barcodes may contain regex metacharacters
        # such as '+' or '.', which must not be interpreted as a pattern.
        if any(bc in known for known in known_barcodes):
            continue
        page = row[0].split('_')[1].replace('.jpg', '')
        running_index += 1
        return_lis.append({
            'Unnamed: 0': running_index,
            'Strichcode': bc,
            'Link': '',
            'Variante': row[1],
            'Farbe': '',
            'Erhaltungsgrad': '',
            'Bem.': '',
            'Seite': int(page),
            'Image URLs': [f'https://iiif.onb.ac.at/images/ABO/{bc}/{page}/full/full/0/native.jpg']
        })
    return pd.DataFrame(return_lis)

In [None]:
# Extend ground truth v2 with high-confidence predictions (> 0.95) and save it as v3.
GT_df = pd.read_csv('data/groundtruth/BE_GT_v2.csv')

# Up to 79 new rows from class-A candidates and up to 150 from class-C candidates.
GT_addition_A = create_GT_addition(GT_df, all_pred[all_pred['p_A'].gt(0.95)], 79)
GT_addition_C = create_GT_addition(GT_df, all_pred[all_pred['p_C'].gt(0.95)], 150)

# The old running-index column is dropped; to_csv writes the fresh 0..n-1 index instead.
new_GT = (
    pd.concat([GT_df, GT_addition_A, GT_addition_C], ignore_index=True)
    .drop(columns='Unnamed: 0')
)
new_GT.to_csv('data/groundtruth/BE_GT_v3.csv')

In [None]:
# Visual check of the first 85 rows whose class-A probability exceeds 0.95.
class_A_mask = all_pred['p_A'].gt(0.95)
display_image_from_series(all_pred[class_A_mask], num_images=85)

## Download new GT with square cutout and reduced resolution

In [8]:
# Download every ground-truth image as a 256x256 square into
# img/GT_square/<category>/<barcode>_<page>.jpg. Files that already exist are
# left untouched, so the cell can be re-run to resume an interrupted download.
img_dir = pathlib.Path('img')
GT_path = img_dir.joinpath('GT_square')
GT_path.mkdir(parents=True, exist_ok=True)

GT_csv = pd.read_csv('data/groundtruth/BE_GT_v3.csv')
for category, URL_lis, barcode in zip(tqdm(GT_csv['Variante']), GT_csv['Image URLs'], GT_csv['Strichcode']):
    # The column stores the repr of a Python list of URLs. Parse it as a
    # literal instead of eval(), which would execute arbitrary code from the CSV.
    url_lis = ast.literal_eval(URL_lis)
    GT_cat_path = GT_path.joinpath(get_category(category))
    GT_cat_path.mkdir(exist_ok=True)

    for url in url_lis:
        page_number = re.findall('/(.{8,12})/full/full', url)[0]
        filename = f'{barcode}_{page_number}.jpg'

        if 'REPO' not in url:
            filepath = GT_cat_path.joinpath(filename)
            if filepath.exists():
                continue
            # Request the square, 256 px wide rendition instead of the full image.
            img_content = get_content(url.replace('full/full', 'square/256,'))
            if img_content is None:
                # Do not create an empty file: it would block a later retry.
                tqdm.write(f'Skipping {url}: download failed')
                continue
            filepath.write_bytes(img_content)
        else:
            # URLs containing 'REPO' are fetched unchanged (no square/256
            # rewrite), so crop and resize locally.
            filepath = GT_cat_path.joinpath(filename.replace('.jpg.', '.'))
            if filepath.exists():
                continue
            img_content = get_content(url)
            if img_content is None:
                tqdm.write(f'Skipping {url}: download failed')
                continue
            img = cv2.imdecode(np.frombuffer(img_content, dtype='uint8'), cv2.IMREAD_COLOR)
            if img is None:
                tqdm.write(f'Skipping {url}: response is not a decodable image')
                continue
            img_height, img_width = img.shape[:2]
            # Centered square crop along the longer side; works for portrait
            # and landscape images alike.
            side = min(img_height, img_width)
            y = (img_height - side) // 2
            x = (img_width - side) // 2
            square_img_content = img[y:y + side, x:x + side]
            resized_img = cv2.resize(square_img_content, (256, 256))
            cv2.imwrite(filepath.as_posix(), resized_img)

  0%|          | 0/804 [00:00<?, ?it/s]