# Spaces: Sleeping  (status banner of the hosting page, captured with the listing; not part of the program)
import importlib
import os
import shutil

import gradio as gr
import pandas as pd

import utils_assessment
import utils_data_extraction

# Re-execute the helper modules so that edits made to them are picked up by an
# already-running interpreter (presumably a leftover from notebook development).
importlib.reload(utils_data_extraction)
importlib.reload(utils_assessment)
"""### Function to load data
Data is loaded from a Roamler Excel file, from a sheet called "Output".
- A subset of the Excel columns is taken as reference data, and saved as `outputs/<Excel file name>/data_extraction/reference_data.csv`
- A folder for storing photos is created at `outputs/<Excel file name>/photos`
An `n_rows` parameter can be passed to load a random sample of the rows instead of the whole sheet.
"""
def load_roamler_excel_file(filepath, n_rows=3):
    """Load a Roamler Excel export and prepare its working directory.

    Reads the ``Output`` sheet of ``filepath``, keeps the reference columns,
    writes them to ``outputs/<file name>/data_extraction/reference_data.csv``
    and creates ``outputs/<file name>/photos``. Any extraction CSV files that
    already exist in the working directory are loaded as well.

    Args:
        filepath: Path of the uploaded Excel file. A falsy value (``None`` or
            ``""``, e.g. when the upload widget has been cleared) yields empty
            results instead of raising.
        n_rows: Number of rows to sample (seeded, so the sample is
            reproducible). ``None`` keeps every row. Values larger than the
            sheet are clamped to the number of available rows.

    Returns:
        A 6-tuple ``(df_products, OUTPUT_DIR, df_brand_data,
        df_product_name_data, df_ingredients_data,
        df_nutritional_values_data)``.

    Raises:
        KeyError: If one of the expected reference columns is missing.
    """
    if not filepath:
        # Nothing to load: mirror the module's initial state (empty tables,
        # empty output directory) rather than failing inside os.path.basename.
        empty_frames = [pd.DataFrame() for _ in range(5)]
        return (empty_frames[0], "", *empty_frames[1:])

    OUTPUT_DIR = 'outputs/' + os.path.basename(filepath)
    DATA_EXTRACTION_DIR = OUTPUT_DIR + '/data_extraction'
    # exist_ok avoids the check-then-create race; this also creates OUTPUT_DIR.
    os.makedirs(DATA_EXTRACTION_DIR, exist_ok=True)

    df_review = pd.read_excel(filepath, sheet_name='Output')
    if n_rows is not None:
        # DataFrame.sample raises when asked for more rows than exist.
        df_review = df_review.sample(n=min(n_rows, len(df_review)), random_state=42)

    reference_columns = [
        'ID', 'Front photo', 'Nutritionals photo', 'Ingredients photo', 'EAN photo',
        'Brand', 'Product name', 'Legal name', 'Barcode',
        'Energy kJ', 'Energy kcal', 'Fat', 'Saturated fat', 'Carbohydrates',
        'Sugars', 'Fibers', 'Proteins', 'Salt', 'Ingredients',
        'Nutriscore', 'Allergens',
        'Quantity per unit',
    ]
    df_products = df_review[reference_columns].copy()
    df_products.to_csv(f'{OUTPUT_DIR}/data_extraction/reference_data.csv', index=False)

    PHOTO_DIR = OUTPUT_DIR + '/photos'
    os.makedirs(PHOTO_DIR, exist_ok=True)

    (df_brand_data, df_product_name_data,
     df_ingredients_data, df_nutritional_values_data) = load_df_from_folder(OUTPUT_DIR)
    return (df_products, OUTPUT_DIR, df_brand_data, df_product_name_data,
            df_ingredients_data, df_nutritional_values_data)
def load_df_from_folder(OUTPUT_DIR):
    """Load the four extraction result tables stored under ``OUTPUT_DIR``.

    Looks for ``brand.csv``, ``product_name.csv``, ``ingredients.csv`` and
    ``nutritional_values.csv`` in ``{OUTPUT_DIR}/data_extraction``. A file
    that does not exist is replaced by an empty table with the columns
    ``ID``, ``Extracted_Text``, ``Price`` and ``Processing time``.

    Args:
        OUTPUT_DIR: Working directory of the dataset.

    Returns:
        A 4-tuple ``(df_brand_data, df_product_name_data,
        df_ingredients_data, df_nutritional_values_data)``.
    """
    empty_columns = ['ID', 'Extracted_Text', 'Price', 'Processing time']
    csv_names = ('brand.csv', 'product_name.csv', 'ingredients.csv', 'nutritional_values.csv')

    frames = []
    for csv_name in csv_names:
        csv_path = f'{OUTPUT_DIR}/data_extraction/{csv_name}'
        if os.path.exists(csv_path):
            frames.append(pd.read_csv(csv_path))
        else:
            # A fresh frame per table, so callers never share one object.
            frames.append(pd.DataFrame(columns=empty_columns))

    df_brand_data, df_product_name_data, df_ingredients_data, df_nutritional_values_data = frames
    return df_brand_data, df_product_name_data, df_ingredients_data, df_nutritional_values_data
def load_csv_files(archive, OUTPUT_DIR):
    """Import previously extracted CSV files into the working directory.

    Every uploaded file whose base name is one of ``brand.csv``,
    ``product_name.csv``, ``ingredients.csv`` or ``nutritional_values.csv``
    is copied into ``{OUTPUT_DIR}/data_extraction``; other files are ignored.
    The four tables are then reloaded from that folder.

    Args:
        archive: Iterable of uploaded file paths, or ``None`` when the upload
            widget is empty (treated as "no files").
        OUTPUT_DIR: Working directory of the dataset.

    Returns:
        A 4-tuple ``(df_brand_data, df_product_name_data,
        df_ingredients_data, df_nutritional_values_data)``.
    """
    accepted_files = {'brand.csv', 'product_name.csv', 'ingredients.csv', 'nutritional_values.csv'}
    # `archive or []` keeps a cleared upload from raising on iteration.
    for file in archive or []:
        filename = os.path.basename(file)
        print(filename)
        if filename in accepted_files:
            shutil.copy(file, f'{OUTPUT_DIR}/data_extraction')
    (df_brand_data, df_product_name_data,
     df_ingredients_data, df_nutritional_values_data) = load_df_from_folder(OUTPUT_DIR)
    return df_brand_data, df_product_name_data, df_ingredients_data, df_nutritional_values_data
"""### Function to save data
This function is called when the user clicks on the "Generate data archive" button.
It creates a zip archive of the contents of the f'{OUTPUT_DIR}/data_extraction' folder, and returns a download button pointing to the archive.
"""
def generate_archive(OUTPUT_DIR):
    """Zip the extraction folder and expose it through a download button.

    The archive is written next to the working directory as
    ``{OUTPUT_DIR}.zip`` and contains the contents of
    ``{OUTPUT_DIR}/data_extraction``.

    Args:
        OUTPUT_DIR: Working directory of the dataset.

    Returns:
        A visible ``gr.DownloadButton`` whose value is the archive path.
    """
    # make_archive appends the ".zip" suffix to the base name itself.
    archive_name = str(OUTPUT_DIR)
    shutil.make_archive(archive_name, 'zip', f'{OUTPUT_DIR}/data_extraction')
    return gr.DownloadButton(
        label=f"Download {archive_name}.zip",
        value=f'{archive_name}.zip',
        visible=True,
    )
"""### Gradio UI"""
def toggle_row_visibility(show):
    """Return a Gradio update that shows a component when ``show`` is truthy.

    Args:
        show: Any value; only its truthiness is used.

    Returns:
        ``gr.update(visible=True)`` for a truthy ``show``, otherwise
        ``gr.update(visible=False)``.
    """
    return gr.update(visible=bool(show))
# Default extraction language. NOTE: this name is rebound to the language
# Dropdown component when the UI is built further down in this file.
language = 'French'

# Custom CSS to set max height for the rows
custom_css = """
.dataframe-wrap {
    max-height: 300px; /* Set the desired height */
    overflow-y: scroll;
}
"""

# Initial state shown before any file is uploaded: no output directory and
# empty tables. Each name gets its own DataFrame so that no two tables alias
# the same mutable object.
OUTPUT_DIR_value = ""
dummy_data = pd.DataFrame()
df_brand_data = pd.DataFrame()
df_product_name_data = pd.DataFrame()
df_ingredients_data = pd.DataFrame()
df_nutritional_values_data = pd.DataFrame()
# Disabled: preload a dataset at start-up instead of waiting for an upload.
#dummy_data, OUTPUT_DIR_value, df_brand_data, df_product_name_data, df_ingredients_data, df_nutritional_values_data = load_roamler_excel_file("FDL-Datasets3/FR - Review.xlsm", n_rows=3)
# NOTE(review): this listing had lost its indentation; the nesting of the
# layout containers below is a reconstruction and should be confirmed against
# the running UI.
with gr.Blocks(css=custom_css) as fdl_data_extraction_ui:
    gr.HTML("<div align='center'><h1>Euroconsumers Food Data Lake</h1></div>")
    gr.HTML("<div align='center'><h2>Data extraction</h2></div>")

    # State holding the working directory returned by load_roamler_excel_file.
    OUTPUT_DIR = gr.State(value=OUTPUT_DIR_value)

    with gr.Row():
        with gr.Column():
            gr.HTML("<h2>Upload Roamler Excel file</h2>")
            load_roamler_excel_file_input = gr.File(label="Upload Roamler Excel file", type="filepath")

    # Hidden until a file has been uploaded (see toggle_row_visibility wiring).
    with gr.Row(visible=False) as dataset_block:
        with gr.Column():
            gr.HTML("<h2>Dataset summary</h2>")
            # Display summary of the dataset - ID, Reference_brand, Reference_product_name, mean_accuracy_score
            with gr.Row(elem_classes="dataframe-wrap"):
                dataframe_component = gr.DataFrame(value=dummy_data, interactive=False)

    # Hidden until a file has been uploaded (see toggle_row_visibility wiring).
    with gr.Row(visible=False) as product_detail_block:
        with gr.Column():
            # Section for product details
            gr.HTML("<h1>Data extraction</h1>")
            load_csv_files_input = gr.Files(label="Upload extracted data from CSV files")
            # Rebinds the module-level `language` default to the Dropdown.
            language = gr.Dropdown(label="Select language",
                                   choices=["French", "Dutch", "Spanish", "Italian", "Portuguese"],
                                   value="French")

            gr.HTML("<h3>Brand</h3>")
            extract_brand_button = gr.Button("Extract brand")
            df_brand = gr.Dataframe(label="Brand data", scale=2,
                                    column_widths=["10%", "60%", "15%", "15%"],
                                    wrap=True, value=df_brand_data)

            gr.HTML("<h3>Product name</h3>")
            extract_product_name_button = gr.Button("Extract product_name")
            df_product_name = gr.Dataframe(label="Product name data", scale=2,
                                           column_widths=["10%", "60%", "15%", "15%"],
                                           wrap=True, value=df_product_name_data)

            gr.HTML("<h3>Ingredients</h3>")
            extract_ingredients_button = gr.Button("Extract ingredients")
            df_ingredients = gr.Dataframe(label="Ingredients data", scale=2,
                                          column_widths=["10%", "60%", "15%", "15%"],
                                          wrap=True, value=df_ingredients_data)

            gr.HTML("<h3>Nutritional values</h3>")
            extract_nutritional_values_button = gr.Button("Extract nutritional values")
            df_nutritional_values = gr.Dataframe(label="Nutritional data", scale=2,
                                                 column_widths=["10%", "60%", "15%", "15%"],
                                                 wrap=True, value=df_nutritional_values_data)

            # Download
            gr.HTML("<h1>Data download</h1>")
            generate_merged_file_button = gr.Button("Generate merged file")
            generate_archive_button = gr.Button("Generate data archive")
            download_button = gr.DownloadButton("Download archive", visible=False)

    ### Control functions
    # When the uploaded file changes, load it and refresh the summary table,
    # the output-directory state and the four extraction tables.
    load_roamler_excel_file_input.change(load_roamler_excel_file,
                                         inputs=load_roamler_excel_file_input,
                                         outputs=[dataframe_component, OUTPUT_DIR,
                                                  df_brand, df_product_name, df_ingredients, df_nutritional_values])
    # Toggle visibility of the dataset block and of the product detail block
    load_roamler_excel_file_input.change(toggle_row_visibility, inputs=load_roamler_excel_file_input, outputs=dataset_block)
    load_roamler_excel_file_input.change(toggle_row_visibility, inputs=load_roamler_excel_file_input, outputs=product_detail_block)
    # Import previously extracted CSV files and refresh the four tables
    load_csv_files_input.change(load_csv_files,
                                inputs=[load_csv_files_input, OUTPUT_DIR],
                                outputs=[df_brand, df_product_name, df_ingredients, df_nutritional_values])

    # Data extraction
    extract_brand_button.click(utils_data_extraction.extract_brand,
                               inputs=[OUTPUT_DIR, dataframe_component, language],
                               outputs=df_brand)
    extract_product_name_button.click(utils_data_extraction.extract_product_name,
                                      inputs=[OUTPUT_DIR, dataframe_component, language],
                                      outputs=df_product_name)
    extract_ingredients_button.click(utils_data_extraction.extract_ingredients,
                                     inputs=[OUTPUT_DIR, dataframe_component, language],
                                     outputs=df_ingredients)
    extract_nutritional_values_button.click(utils_data_extraction.extract_nutritional_values,
                                            inputs=[OUTPUT_DIR, dataframe_component, language],
                                            outputs=df_nutritional_values)

    # Data download
    generate_merged_file_button.click(utils_assessment.merge_and_save_data, inputs=OUTPUT_DIR)
    generate_archive_button.click(generate_archive, inputs=OUTPUT_DIR, outputs=download_button)

fdl_data_extraction_ui.launch(debug=True)