Commit e12a74e4 authored by csteindl's avatar csteindl
Browse files

Binder

parent 008f3147
......@@ -56,7 +56,7 @@
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('historic_postcards_color_swatches.csv.bz2', compression='bz2')"
"df = pd.read_csv('https://labs.onb.ac.at/gitlab/labs-team/color-swatches-data/-/raw/master/historic_postcards_color_swatches.csv.bz2?inline=false', compression='bz2')"
]
},
{
......@@ -812,677 +812,6 @@
"source": [
"And done!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below is a compact, fast-forward version of what was done above. There is no need to repeat all of those steps."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Compact (with other data source)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Data"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"colors_hsv_clip = pd.read_csv('akon_with_hsv_clip50_color_swatches.csv.bz2', compression='bz2')\n",
"# low_memory=False makes pandas infer each column's type from the whole file rather than per chunk,\n",
"# which avoids the DtypeWarning about mixed types that the default chunked parsing raised for this file.\n",
"raw_data = pd.read_csv('akon_postcards_public_domain_1925.csv.bz2', compression='bz2', low_memory=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## View Data Format"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>akon_id</th>\n",
" <th>image_link</th>\n",
" <th>hex_colors</th>\n",
" <th>html</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>11914</th>\n",
" <td>11914</td>\n",
" <td>AK003_285</td>\n",
" <td>https://iiif.onb.ac.at/images/AKON/AK003_285/2...</td>\n",
" <td>['#050300', '#eee2c9', '#b6af9e', '#fdf7da', '...</td>\n",
" <td>&lt;a href=\"https://iiif.onb.ac.at/images/AKON/AK...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 akon_id \\\n",
"11914 11914 AK003_285 \n",
"\n",
" image_link \\\n",
"11914 https://iiif.onb.ac.at/images/AKON/AK003_285/2... \n",
"\n",
" hex_colors \\\n",
"11914 ['#050300', '#eee2c9', '#b6af9e', '#fdf7da', '... \n",
"\n",
" html \n",
"11914 <a href=\"https://iiif.onb.ac.at/images/AKON/AK... "
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"colors_hsv_clip.sample()"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>akon_id</th>\n",
" <th>id</th>\n",
" <th>altitude</th>\n",
" <th>building</th>\n",
" <th>city</th>\n",
" <th>color</th>\n",
" <th>comment</th>\n",
" <th>mountain</th>\n",
" <th>other</th>\n",
" <th>...</th>\n",
" <th>feature_class</th>\n",
" <th>feature_code</th>\n",
" <th>geoname_id</th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" <th>name</th>\n",
" <th>country_id</th>\n",
" <th>admin_name_1</th>\n",
" <th>admin_code_1</th>\n",
" <th>geo</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>23435</th>\n",
" <td>23435</td>\n",
" <td>AK042_533</td>\n",
" <td>25265</td>\n",
" <td>434.0</td>\n",
" <td>NaN</td>\n",
" <td>Frohnleiten</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>P</td>\n",
" <td>PPLA3</td>\n",
" <td>2779202.0</td>\n",
" <td>47.26667</td>\n",
" <td>15.31667</td>\n",
" <td>Frohnleiten</td>\n",
" <td>AT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>47.26667, 15.31667</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1 rows × 30 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 akon_id id altitude building city color \\\n",
"23435 23435 AK042_533 25265 434.0 NaN Frohnleiten False \n",
"\n",
" comment mountain other ... feature_class feature_code \\\n",
"23435 NaN NaN NaN ... P PPLA3 \n",
"\n",
" geoname_id latitude longitude name country_id admin_name_1 \\\n",
"23435 2779202.0 47.26667 15.31667 Frohnleiten AT NaN \n",
"\n",
" admin_code_1 geo \n",
"23435 NaN 47.26667, 15.31667 \n",
"\n",
"[1 rows x 30 columns]"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_data.sample()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Combine Data"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"# Join the colour swatches with the postcard metadata on their shared akon_id key,\n",
"# keeping only the listed columns from each frame.\n",
"combined_data = (\n",
"    colors_hsv_clip[['akon_id', 'hex_colors', 'image_link']]\n",
"    .merge(raw_data[['akon_id', 'name', 'date']], on='akon_id')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>akon_id</th>\n",
" <th>hex_colors</th>\n",
" <th>image_link</th>\n",
" <th>name</th>\n",
" <th>date</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>23217</th>\n",
" <td>AK041_595</td>\n",
" <td>['#ada896', '#fcf6d5', '#767467', '#484739', '...</td>\n",
" <td>https://iiif.onb.ac.at/images/AKON/AK041_595/5...</td>\n",
" <td>Ötscher</td>\n",
" <td>1909</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" akon_id hex_colors \\\n",
"23217 AK041_595 ['#ada896', '#fcf6d5', '#767467', '#484739', '... \n",
"\n",
" image_link name date \n",
"23217 https://iiif.onb.ac.at/images/AKON/AK041_595/5... Ötscher 1909 "
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"combined_data.sample()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Flatten hex_colors"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"\n",
"# Each hex_colors value is the string form of a Python list (e.g. \"['#050300', '#eee2c9']\"),\n",
"# so parse it as a Python literal instead of rewriting quotes to make it look like JSON.\n",
"combined_data['hex_colors_list'] = combined_data['hex_colors'].apply(ast.literal_eval)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>akon_id</th>\n",
" <th>hex_colors</th>\n",
" <th>image_link</th>\n",
" <th>name</th>\n",
" <th>date</th>\n",
" <th>hex_colors_list</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>15996</th>\n",
" <td>AK014_589</td>\n",
" <td>['#020100', '#fbfae8', '#88887e', '#64645a', '...</td>\n",
" <td>https://iiif.onb.ac.at/images/AKON/AK014_589/5...</td>\n",
" <td>Maria Taferl</td>\n",
" <td>1909</td>\n",
" <td>[#020100, #fbfae8, #88887e, #64645a, #4d4f49, ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" akon_id hex_colors \\\n",
"15996 AK014_589 ['#020100', '#fbfae8', '#88887e', '#64645a', '... \n",
"\n",
" image_link name date \\\n",
"15996 https://iiif.onb.ac.at/images/AKON/AK014_589/5... Maria Taferl 1909 \n",
"\n",
" hex_colors_list \n",
"15996 [#020100, #fbfae8, #88887e, #64645a, #4d4f49, ... "
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"combined_data.sample()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sanitize and Reorder"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"combined_data = combined_data.drop(columns=['hex_colors']).copy()"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>akon_id</th>\n",
" <th>image_link</th>\n",
" <th>name</th>\n",
" <th>date</th>\n",
" <th>hex_colors_list</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>19590</th>\n",
" <td>AK028_177</td>\n",
" <td>https://iiif.onb.ac.at/images/AKON/AK028_177/1...</td>\n",
" <td>Frohnleiten</td>\n",
" <td>1906</td>\n",
" <td>[#020100, #a8a599, #7b7a6f, #fbf9e5, #4b4b40, ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" akon_id image_link \\\n",
"19590 AK028_177 https://iiif.onb.ac.at/images/AKON/AK028_177/1... \n",
"\n",
" name date hex_colors_list \n",
"19590 Frohnleiten 1906 [#020100, #a8a599, #7b7a6f, #fbf9e5, #4b4b40, ... "
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"combined_data.sample()"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"combined_data = combined_data.rename(columns={'hex_colors_list': 'hex_colors'})"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>akon_id</th>\n",
" <th>image_link</th>\n",
" <th>name</th>\n",
" <th>date</th>\n",
" <th>hex_colors</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>33304</th>\n",
" <td>AK087_042</td>\n",
" <td>https://iiif.onb.ac.at/images/AKON/AK087_042/0...</td>\n",
" <td>Abcoude</td>\n",
" <td>vor 1905</td>\n",
" <td>[#f8eacd, #aca391, #5b5747, #6e6a5e, #525148, ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" akon_id image_link name \\\n",
"33304 AK087_042 https://iiif.onb.ac.at/images/AKON/AK087_042/0... Abcoude \n",
"\n",
" date hex_colors \n",
"33304 vor 1905 [#f8eacd, #aca391, #5b5747, #6e6a5e, #525148, ... "
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"combined_data.sample()"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>akon_id</th>\n",
" <th>hex_colors</th>\n",
" <th>image_link</th>\n",
" <th>name</th>\n",
" <th>date</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>25575</th>\n",
" <td>AK031_287</td>\n",
" <td>[#444626, #caccbc, #4a4d41, #48504f, #5b7073, ...</td>\n",
" <td>https://iiif.onb.ac.at/images/AKON/AK031_287/2...</td>\n",
" <td>Ebensee</td>\n",
" <td>1907</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" akon_id hex_colors \\\n",
"25575 AK031_287 [#444626, #caccbc, #4a4d41, #48504f, #5b7073, ... \n",
"\n",
" image_link name date \n",
"25575 https://iiif.onb.ac.at/images/AKON/AK031_287/2... Ebensee 1907 "
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"combined_data = combined_data[['akon_id', 'hex_colors', 'image_link', 'name', 'date']]\n",
"combined_data.sample()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sample and Write"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"combined_data.iloc[:100].to_json('swatches_100.json', orient='values')"
]
},
{
"cell_type": "code",