Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- app.py +224 -0
- utils_assessment.py +241 -0
- utils_data_extraction.py +323 -0
app.py
ADDED
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import pandas as pd
|
3 |
+
import os
|
4 |
+
import shutil
|
5 |
+
|
6 |
+
import gradio as gr
|
7 |
+
|
8 |
+
import utils_data_extraction
|
9 |
+
import utils_assessment
|
10 |
+
|
11 |
+
# NOTE(review): reload() is a notebook/dev-iteration artifact; it is a
# harmless no-op on first import but can be removed for production.
import importlib
importlib.reload(utils_data_extraction)
importlib.reload(utils_assessment)
|
14 |
+
|
15 |
+
"""### Function to load data
|
16 |
+
|
17 |
+
Data is loaded from a Roamler Excel file, from a sheet called "output".
|
18 |
+
|
19 |
+
- A subset of the Excel file is taken as reference data, and saved in the `outputs` directory as reference_data.csv
|
20 |
+
- A folder for storing photos is created
|
21 |
+
|
22 |
+
A n_rows parameter can be passed to load a subset of the data.
|
23 |
+
"""
|
24 |
+
|
25 |
+
def load_roamler_excel_file(filepath, n_rows=3):
    """Load a Roamler Excel export and prepare the working directories.

    Reads the 'Output' sheet of *filepath*, keeps the columns needed for
    data extraction, saves them as reference_data.csv inside a per-file
    output directory, creates the photo cache directory, and reloads any
    previously extracted feature CSVs.

    Args:
        filepath: Path to the Roamler .xlsm/.xlsx file.
        n_rows: If not None, randomly sample this many rows (seeded so the
            same upload always yields the same subset); None keeps all rows.

    Returns:
        Tuple (df_products, OUTPUT_DIR, df_brand_data, df_product_name_data,
        df_ingredients_data, df_nutritional_values_data).
    """
    OUTPUT_DIR = 'outputs/' + os.path.basename(filepath)
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists / os.makedirs pairs.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    os.makedirs(f'{OUTPUT_DIR}/data_extraction', exist_ok=True)

    df_review = pd.read_excel(filepath, sheet_name='Output')
    if n_rows is not None:
        # Fixed seed: repeated uploads of the same file give the same sample.
        df_review = df_review.sample(n=n_rows, random_state=42)

    df_products = df_review[['ID', 'Front photo', 'Nutritionals photo', 'Ingredients photo', 'EAN photo',
                             'Brand', 'Product name', 'Legal name', 'Barcode',
                             'Energy kJ', 'Energy kcal', 'Fat', 'Saturated fat', 'Carbohydrates', 'Sugars', 'Fibers', 'Proteins', 'Salt', 'Ingredients',
                             'Nutriscore', 'Allergens',
                             'Quantity per unit']].copy()

    df_products.to_csv(f'{OUTPUT_DIR}/data_extraction/reference_data.csv', index=False)

    # Cache directory for downloaded product photos.
    os.makedirs(f'{OUTPUT_DIR}/photos', exist_ok=True)

    df_brand_data, df_product_name_data, df_ingredients_data, df_nutritional_values_data = load_df_from_folder(OUTPUT_DIR)

    return df_products, OUTPUT_DIR, df_brand_data, df_product_name_data, df_ingredients_data, df_nutritional_values_data
|
54 |
+
|
55 |
+
def load_df_from_folder(OUTPUT_DIR):
    """Load previously extracted feature CSVs from OUTPUT_DIR/data_extraction.

    For each known extraction output, returns its DataFrame when the file
    exists, otherwise an empty DataFrame with the expected columns.

    Returns:
        Tuple of DataFrames: (brand, product_name, ingredients,
        nutritional_values), in that order.
    """
    columns = ['ID', 'Extracted_Text', 'Price', 'Processing time']

    def _load(feature_name):
        # Missing file simply means the feature was never extracted yet.
        path = f'{OUTPUT_DIR}/data_extraction/{feature_name}.csv'
        if os.path.exists(path):
            return pd.read_csv(path)
        return pd.DataFrame(columns=columns)

    return (_load('brand'),
            _load('product_name'),
            _load('ingredients'),
            _load('nutritional_values'))
|
74 |
+
|
75 |
+
def load_csv_files(archive, OUTPUT_DIR):
    """Copy user-uploaded extraction CSVs into the output folder, then reload.

    Only files whose basename matches a known extraction output are copied;
    everything else in the upload is ignored.
    """
    accepted_files = ['brand.csv', 'product_name.csv', 'ingredients.csv', 'nutritional_values.csv']

    for uploaded_path in archive:
        basename = os.path.basename(uploaded_path)
        print(basename)
        if basename in accepted_files:
            shutil.copy(uploaded_path, f'{OUTPUT_DIR}/data_extraction')

    # Reload all four frames so the UI reflects the new files.
    return load_df_from_folder(OUTPUT_DIR)
|
86 |
+
|
87 |
+
"""### Function to save data
|
88 |
+
|
89 |
+
This function is called when the user clicks on the "Generate data archive" button.
|
90 |
+
|
91 |
+
It creates a zip of all CSV files of the f'{OUTPUT_DIR}/data_extraction' folder, and return a download button to the archive.
|
92 |
+
"""
|
93 |
+
|
94 |
+
def generate_archive(OUTPUT_DIR):
    """Zip the data_extraction folder and expose it via a download button.

    Creates <OUTPUT_DIR>.zip next to the output directory and returns a
    now-visible gr.DownloadButton pointing at it.
    """
    archive_name = f'{OUTPUT_DIR}'
    source_dir = f'{OUTPUT_DIR}/data_extraction'
    shutil.make_archive(archive_name, 'zip', source_dir)

    return gr.DownloadButton(
        label=f"Download {archive_name}.zip",
        value=f'{archive_name}.zip',
        visible=True,
    )
|
101 |
+
|
102 |
+
"""### Gradio UI"""
|
103 |
+
|
104 |
+
def toggle_row_visibility(show):
    """Return a gr.update making the bound row visible iff *show* is truthy."""
    return gr.update(visible=True) if show else gr.update(visible=False)
|
109 |
+
|
110 |
+
# Module-level default; shadowed below by the gr.Dropdown of the same name,
# which is the value the extraction callbacks actually receive.
language = 'French'

# Custom CSS to set max height for the rows
custom_css = """
.dataframe-wrap {
    max-height: 300px; /* Set the desired height */
    overflow-y: scroll;
}
"""

OUTPUT_DIR_value = ""
# Empty placeholder frames so components can render before any upload.
dummy_data = df_brand_data = df_product_name_data = df_ingredients_data = df_nutritional_values_data = pd.DataFrame()
#dummy_data, OUTPUT_DIR_value, df_brand_data, df_product_name_data, df_ingredients_data, df_nutritional_values_data = load_roamler_excel_file("FDL-Datasets3/FR - Review.xlsm", n_rows=3)

with gr.Blocks(css=custom_css) as fdl_data_extraction_ui:

    gr.HTML("<div align='center'><h1>Euroconsumers Food Data Lake</h1>")
    gr.HTML("<div align='center'><h2>Data extraction</h2>")

    # Per-session state: output directory of the currently uploaded file,
    # shared by every callback below.
    OUTPUT_DIR = gr.State(value=OUTPUT_DIR_value)

    with gr.Row():
        with gr.Column():
            gr.HTML("<h2>Upload Roamler Excel file</h2>")
            load_roamler_excel_file_input = gr.File(label="Upload Roamler Excel file", type="filepath")

    # Hidden until a file is uploaded (toggled by toggle_row_visibility).
    with gr.Row(visible=False) as dataset_block:
        with gr.Column():
            gr.HTML("<h2>Dataset summary</h2>")

            # Display summary of the dataset - ID, Reference_brand, Reference_product_name, mean_accuracy_score
            with gr.Row(elem_classes="dataframe-wrap"):
                dataframe_component = gr.DataFrame(value=dummy_data, interactive=False)

    with gr.Row(visible=False) as product_detail_block:
        with gr.Column():
            # Section for product details
            gr.HTML("<h1>Data extraction</h1>")

            load_csv_files_input = gr.Files(label="Upload extracted data from CSV files")

            # Rebinds the module-level name to the component on purpose.
            language = gr.Dropdown(label="Select language", choices=["French", "Dutch", "Spanish", "Italian", "Portuguese"], value="French")

            gr.HTML("<h3>Brand</h3>")
            extract_brand_button = gr.Button("Extract brand")
            df_brand = gr.Dataframe(label="Brand data", scale=2,
                                    column_widths=["10%", "60%", "15%", "15%"],
                                    wrap=True, value=df_brand_data)

            gr.HTML("<h3>Product name</h3>")
            extract_product_name_button = gr.Button("Extract product_name")
            df_product_name = gr.Dataframe(label="Product name data", scale=2,
                                           column_widths=["10%", "60%", "15%", "15%"],
                                           wrap=True, value=df_product_name_data)

            gr.HTML("<h3>Ingredients</h3>")
            extract_ingredients_button = gr.Button("Extract ingredients")
            df_ingredients = gr.Dataframe(label="Ingredients data", scale=2,
                                          column_widths=["10%", "60%", "15%", "15%"],
                                          wrap=True, value=df_ingredients_data)

            gr.HTML("<h3>Nutritional values</h3>")
            extract_nutritional_values_button = gr.Button("Extract nutritional values")
            df_nutritional_values = gr.Dataframe(label="Nutritional data", scale=2,
                                                 column_widths=["10%", "60%", "15%", "15%"],
                                                 wrap=True, value=df_nutritional_values_data)

            # Download
            gr.HTML("<h1>Data download</h1>")

            generate_merged_file_button = gr.Button("Generate merged file")
            generate_archive_button = gr.Button("Generate data archive")
            download_button = gr.DownloadButton("Download archive", visible=False)

    ### Control functions

    # Linking the select_dataset change event to update both the gradio DataFrame and product_ids dropdown
    load_roamler_excel_file_input.change(load_roamler_excel_file,
                                         inputs=load_roamler_excel_file_input,
                                         outputs=[dataframe_component, OUTPUT_DIR,
                                                  df_brand, df_product_name, df_ingredients, df_nutritional_values])

    # Toggle visibility of the dataset block
    load_roamler_excel_file_input.change(toggle_row_visibility, inputs=load_roamler_excel_file_input, outputs=dataset_block)
    load_roamler_excel_file_input.change(toggle_row_visibility, inputs=load_roamler_excel_file_input, outputs=product_detail_block)

    load_csv_files_input.change(load_csv_files,
                                inputs=[load_csv_files_input, OUTPUT_DIR],
                                outputs=[df_brand, df_product_name, df_ingredients, df_nutritional_values])

    # Data extraction
    extract_brand_button.click(utils_data_extraction.extract_brand,
                               inputs=[OUTPUT_DIR, dataframe_component, language],
                               outputs=df_brand)

    extract_product_name_button.click(utils_data_extraction.extract_product_name,
                                      inputs=[OUTPUT_DIR, dataframe_component, language],
                                      outputs=df_product_name)

    extract_ingredients_button.click(utils_data_extraction.extract_ingredients,
                                     inputs=[OUTPUT_DIR, dataframe_component, language],
                                     outputs=df_ingredients)

    extract_nutritional_values_button.click(utils_data_extraction.extract_nutritional_values,
                                            inputs=[OUTPUT_DIR, dataframe_component, language],
                                            outputs=df_nutritional_values)

    # NOTE(review): the merged frame is not wired to any output component;
    # only the CSV side effect is user-visible — confirm this is intended.
    generate_merged_file_button.click(utils_assessment.merge_and_save_data, inputs=OUTPUT_DIR)

    generate_archive_button.click(generate_archive, inputs=OUTPUT_DIR, outputs=download_button)

fdl_data_extraction_ui.launch(debug=True)
|
222 |
+
|
223 |
+
|
224 |
+
|
utils_assessment.py
ADDED
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
from rouge_score import rouge_scorer
|
4 |
+
|
5 |
+
import Levenshtein
|
6 |
+
|
7 |
+
import pandas as pd
|
8 |
+
import numpy as np
|
9 |
+
|
10 |
+
# Configuration for every feature that can be assessed.  Per feature:
#   name                          -> CSV base name under data_extraction/
#   output_column                 -> column name in reference_data.csv
#   scoring_function_name         -> grading function, resolved by name
#   post_processing_function_name -> cleanup applied to predictions first
#   k_folds                       -> number of evaluation folds
# (Useless f-string prefixes on plain literals removed.)
feature_assessment_entries = {

    'brand': {
        'name': 'brand',
        'output_column': 'Brand',
        'scoring_function_name': 'grade_exact_match',
        'post_processing_function_name': 'post_processing_none',
        # 'post_processing_function_name' : 'post_processing_brand',
        'k_folds': 3,
    },

    'product_name': {
        'name': 'product_name',
        'output_column': 'Product name',
        'scoring_function_name': 'grade_levenshtein_match',
        # 'scoring_function_name' : 'grade_exact_match',
        'post_processing_function_name': 'post_processing_none',
        'k_folds': 3,
    },

    'ingredients': {
        'name': 'ingredients',
        'output_column': 'Ingredients',
        'scoring_function_name': 'grade_rouge_score',
        # 'scoring_function_name' : 'grade_levenshtein_match',
        'post_processing_function_name': 'post_processing_none',
        # 'post_processing_function_name' : 'post_processing_ingredients',
        'k_folds': 3,
    },

    'energy_kj': {
        'name': 'energy_kj',
        'output_column': 'Energy kJ',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_none',
        'k_folds': 3,
    },

    'energy_kcal': {
        'name': 'energy_kcal',
        'output_column': 'Energy kcal',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_none',
        'k_folds': 3,
    },

}

# The remaining nutritional features share identical settings; generate them
# instead of repeating the same dict six more times.  Insertion order (and
# therefore merged.csv column order) matches the original literal.
feature_assessment_entries.update({
    key: {
        'name': key,
        'output_column': column,
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    }
    for key, column in [
        ('fat', 'Fat'),
        ('saturated_fat', 'Saturated fat'),
        ('carbohydrates', 'Carbohydrates'),
        ('sugars', 'Sugars'),
        ('fibers', 'Fibers'),
        ('proteins', 'Proteins'),
        ('salt', 'Salt'),
    ]
})
|
113 |
+
|
114 |
+
|
115 |
+
def post_processing_none(string):
    """Identity post-processing: return the prediction unchanged."""
    return string
|
117 |
+
|
118 |
+
|
119 |
+
def post_processing_ingredients(string):
    """Extract the ingredients text from an LLM answer.

    Takes the content of the first <ingredients>...</ingredients> tag
    (falling back to the raw string when no tag is present), then strips a
    leading Dutch or English "ingredients:" label, matching the label
    case-insensitively while preserving the remaining text's casing.
    """
    matches = re.findall(r"<ingredients>(.*?)</ingredients>", string, re.DOTALL)

    if not matches:
        # Model ignored the requested output format: keep the raw answer.
        output = string
    else:
        output = matches[0].strip()

    # Dutch spellings first (both are the same length), then English; the
    # slice length is derived from the matched prefix instead of a second
    # hard-coded literal.
    for prefix in ("ingrediënten: ", "ingredienten: "):
        if output.lower().startswith(prefix):
            output = output[len(prefix):]
            break
    if output.lower().startswith("ingredients: "):
        output = output[len("ingredients: "):]

    return output
|
134 |
+
|
135 |
+
|
136 |
+
def post_processing_brand(brand):
    """Canonicalize known brand shorthand to its full name.

    Unknown brands are returned unchanged.
    """
    canonical_names = {
        "boni": "Boni Selection",
        "rana": "Giovanni Rana",
        "the market": "Carrefour The Market",
        "extra": "Carrefour Extra",
    }
    return canonical_names.get(brand.lower(), brand)
|
147 |
+
|
148 |
+
|
149 |
+
def post_processing_nutritionals(predicted_value):
    """Extract the first numeric token from a predicted nutritional value.

    Returns the number as a string (e.g. "18.5" from "18.5 g"), or np.nan
    when the prediction contains no number at all.  Replaces the original
    bare-except/IndexError control flow with an explicit emptiness check.
    """
    # Note: the sign only binds to the decimal branch, as in the original.
    matches = re.findall(r"[-+]?\d*\.\d+|\d+", str(predicted_value))
    if not matches:
        # No digits anywhere in the answer -> treat as missing.
        return np.nan
    return matches[0]
|
156 |
+
|
157 |
+
def grade_levenshtein_match(predicted_value, reference_value):
    """Levenshtein similarity in [0, 1], case- and edge-whitespace-insensitive."""
    normalized_prediction = predicted_value.lower().strip()
    normalized_reference = reference_value.lower().strip()
    return Levenshtein.ratio(normalized_prediction, normalized_reference)
|
160 |
+
|
161 |
+
|
162 |
+
def grade_exact_match(predicted_value, reference_value):
    """Return 1 when prediction and reference match after normalization, else 0.

    Both sides are lower-cased, stripped, and have internal whitespace runs
    collapsed to single spaces.  The original collapsed whitespace only on
    the reference side, so e.g. "foo  bar" could never match "foo bar";
    both sides now get identical treatment (and the redundant second
    lower()/strip() pass is gone).
    """
    def _normalize(value):
        return re.sub(r'\s+', ' ', value.lower().strip())

    return int(_normalize(predicted_value) == _normalize(reference_value))
|
170 |
+
|
171 |
+
|
172 |
+
def grade_rouge_score(predicted_value, reference_value):
    """ROUGE-2 F-measure between prediction and reference."""
    rouge2_result = rouge_scorer.RougeScorer(['rouge2']).score(predicted_value, reference_value)
    return rouge2_result['rouge2'].fmeasure
|
177 |
+
|
178 |
+
|
179 |
+
def grade_numerical(predicted_value, reference_value):
    """Compare two values numerically.

    Returns 1 on equality (two NaNs count as equal), 0 on a numeric
    mismatch, and -1 when either value cannot be parsed as a float
    (e.g. a 'Failed' LLM response or None).
    """
    try:
        predicted = float(predicted_value)
        reference = float(reference_value)
    except (TypeError, ValueError):
        # Narrowed from the original bare except: these are the only
        # exceptions float() raises on bad input.
        return -1

    if np.isnan(predicted) and np.isnan(reference):
        # Both missing counts as agreement.
        return 1
    return int(predicted == reference)
|
189 |
+
|
190 |
+
|
191 |
+
def create_eval_data(OUTPUT_DIR, feature_assessment_entry):
    """Build the evaluation DataFrame for one feature.

    Joins the feature's extraction CSV with the reference data, applies the
    configured post-processing to predictions, scores each row with the
    configured grading function, and assigns a reproducible fold number.

    Args:
        OUTPUT_DIR: Folder holding reference_data.csv and <feature>.csv.
        feature_assessment_entry: One entry of feature_assessment_entries.

    Returns:
        DataFrame with columns ID, Reference, Predicted, Price,
        Processing time, accuracy_score, fold.
    """
    df_product_id = pd.read_csv(f"{OUTPUT_DIR}/reference_data.csv")

    df_features = pd.read_csv(f"{OUTPUT_DIR}/{feature_assessment_entry['name']}.csv")

    df_features = df_features.merge(df_product_id, on='ID', how='left')
    df_eval_data = df_features[
        ['ID', feature_assessment_entry['output_column'], 'Extracted_Text', 'Price', 'Processing time']].copy()
    df_eval_data.rename(columns={feature_assessment_entry['output_column']: 'Reference',
                                 'Extracted_Text': 'Predicted'}, inplace=True)

    # Resolve the configured callables by name from this module's namespace
    # instead of eval() — same lookup, no arbitrary-code-execution surface.
    post_process = globals()[feature_assessment_entry['post_processing_function_name']]
    scoring_function = globals()[feature_assessment_entry['scoring_function_name']]

    df_eval_data['Predicted'] = df_eval_data['Predicted'].apply(post_process)

    df_eval_data['accuracy_score'] = df_eval_data.apply(
        lambda row: scoring_function(row['Predicted'], row['Reference']), axis=1)

    df_eval_data['accuracy_score'] = round(df_eval_data['accuracy_score'], 2)

    # Reproducible pseudo-random fold assignment for cross-validation.
    np.random.seed(42)
    df_eval_data['fold'] = np.random.randint(
        0, feature_assessment_entry['k_folds'], size=len(df_eval_data))

    return df_eval_data
|
217 |
+
|
218 |
+
|
219 |
+
def merge_and_save_data(OUTPUT_DIR):
    """Concatenate the per-feature evaluation frames into merged.csv.

    Starts from the photo/ID columns of the reference data, appends
    Reference/Predicted/accuracy_score columns for every configured
    feature, writes data_extraction/merged.csv, and returns the merged
    DataFrame.
    """
    df_ref_data = pd.read_csv(f"{OUTPUT_DIR}/data_extraction/reference_data.csv")

    frames = [df_ref_data[['ID', 'Front photo', 'Nutritionals photo', 'Ingredients photo', 'EAN photo']]]

    for feature_name, entry in feature_assessment_entries.items():
        df_eval_data = create_eval_data(f'{OUTPUT_DIR}/data_extraction', entry)

        # Keep only the assessment columns, suffixed with the feature name.
        df_eval_data = df_eval_data[['Reference', 'Predicted', 'accuracy_score']].rename(
            columns={
                'Reference': 'Reference_' + feature_name,
                'Predicted': 'Predicted_' + feature_name,
                'accuracy_score': 'accuracy_score_' + feature_name,
            })

        frames.append(df_eval_data)

    data_merged = pd.concat(frames, axis=1)

    data_merged.to_csv(f"{OUTPUT_DIR}/data_extraction/merged.csv")

    return data_merged
|
241 |
+
|
utils_data_extraction.py
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import pandas as pd
|
3 |
+
import os
|
4 |
+
import time
|
5 |
+
import gradio as gr
|
6 |
+
import json
|
7 |
+
|
8 |
+
import google.generativeai as genai
|
9 |
+
|
10 |
+
from dotenv import load_dotenv
|
11 |
+
# Load secrets from a local .env file (no-op when the file is absent).
load_dotenv()
# NOTE(review): this is None when the variable is unset; genai calls would
# then fail at request time — confirm deployment always provides the key.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
############## Photos ##############
|
18 |
+
|
19 |
+
|
20 |
+
def download_file(url, save_path):
    """Download *url* to *save_path*, logging (not raising) on failure.

    Best-effort by design: a non-200 status or any request error is printed
    and the function returns normally, so one bad photo URL does not abort
    the whole extraction batch.
    """
    try:
        # Timeout added: without it a single dead URL can hang the batch
        # indefinitely (requests has no default timeout).
        response = requests.get(url, timeout=60)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Save the payload in binary mode.
            with open(save_path, 'wb') as file:
                file.write(response.content)
        else:
            print(f"Failed to download image. Status code: {response.status_code}")
    except Exception as e:
        print(f"An error occurred: {e}")
|
34 |
+
|
35 |
+
def upload_file(photo_path):
    """Upload a local photo to the Gemini Files API and return its handle."""
    return genai.upload_file(photo_path)
|
40 |
+
|
41 |
+
###### Data extraction
|
42 |
+
|
43 |
+
## Helper function to initialize model
|
44 |
+
|
45 |
+
# USD price per token for each supported model (separate input/output rates),
# used by call_llm_gemini to estimate per-request cost.
price_token={'gemini-1.5-pro-002': {'input': 1.25 / 1000000, 'output': 5 / 1000000}
}

# Disable every Gemini safety filter category for this content.
# NOTE(review): presumably product-label photos were being mis-flagged;
# confirm BLOCK_NONE is acceptable for this deployment.
gemini_safety_settings = [
    {
        "category": "HARM_CATEGORY_DANGEROUS",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_HARASSMENT",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_HATE_SPEECH",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
        "threshold": "BLOCK_NONE",
    },
]
|
70 |
+
|
71 |
+
|
72 |
+
def load_gemini_model(model_name):
    """Instantiate a Gemini model plus plain-text and JSON generation configs.

    Both configs are deterministic (temperature 0, single candidate) and
    differ only in the response MIME type; the shared settings are built
    once instead of being duplicated in two literals.

    Returns:
        (gemini_model, generation_config, generation_config_json)
    """
    # Shared deterministic settings; only the response format differs.
    common_settings = dict(candidate_count=1, max_output_tokens=4000, temperature=0)

    generation_config = genai.types.GenerationConfig(
        response_mime_type="text/plain", **common_settings)

    generation_config_json = genai.types.GenerationConfig(
        response_mime_type="application/json", **common_settings)

    system_prompt = ["You are a helpful assistant."]
    gemini_model = genai.GenerativeModel(model_name, system_instruction=system_prompt,
                                         safety_settings=gemini_safety_settings)

    return gemini_model, generation_config, generation_config_json
|
95 |
+
|
96 |
+
|
97 |
+
|
98 |
+
##### Call LLM
|
99 |
+
|
100 |
+
|
101 |
+
def call_llm_gemini(model_instance, model, messages, generation_config):
    """Send *messages* to Gemini and return (text, in_tokens, out_tokens, price).

    When the response has no usable text (e.g. the generation was blocked
    by safety filters or produced no candidates) the sentinel string
    'Failed' is returned instead of raising.

    Args:
        model_instance: A genai.GenerativeModel.
        model: Model name used to look up per-token prices in price_token.
        messages: Content list (e.g. [photo, prompt]).
        generation_config: genai GenerationConfig to use for this call.
    """
    response = model_instance.generate_content(messages,
                                               generation_config=generation_config)

    try:
        response_content = response.text.strip()
    except (ValueError, AttributeError):
        # Narrowed from a bare except: response.text raises ValueError when
        # generation was blocked / empty; keep the 'Failed' marker contract.
        response_content = 'Failed'

    # Cost estimate from token counts and the per-model price table.
    nb_input_tokens = model_instance.count_tokens(messages).total_tokens
    nb_output_tokens = model_instance.count_tokens(response_content).total_tokens
    price = nb_input_tokens * price_token[model]['input'] + nb_output_tokens * price_token[model]['output']
    print(f"input tokens: {nb_input_tokens}; output tokens: {nb_output_tokens}, price: {price}")

    return response_content, nb_input_tokens, nb_output_tokens, price
|
117 |
+
|
118 |
+
|
119 |
+
|
120 |
+
##### Prompts
|
121 |
+
|
122 |
+
def get_prompt_brand(language):
    """Prompt asking for the product brand.

    *language* is accepted for signature parity with the other prompt
    builders but the brand prompt itself is language-independent.
    """
    return "What is the brand of this product? Answer with the brand name and nothing else."
|
127 |
+
|
128 |
+
def get_prompt_product_name(language):
    """Prompt asking for the product name in the given language."""
    return (f"What is the {language} product name of this product? "
            f"Answer in {language} with the product name and nothing else.")
|
133 |
+
|
134 |
+
def get_prompt_ingredients(language):
    """Build the prompt asking Gemini to transcribe the ingredients list
    (in the requested language only) from a product-label photo, wrapped
    in <ingredients> tags that post-processing strips later."""
    prompt=f"""
You will be given an image of a product label or packaging. Your task is to extract the ingredients list from this image, focusing specifically on the {language} language version. Here's how to approach this task:

1. Analyze the provided image

2. Locate the ingredients list on the product label or packaging.

3. Identify the {language} language section of the ingredients list.

4. Extract only the {language} ingredients list. Ignore any ingredients lists in other languages, even if they are present in the image.

5. If there are multiple {language} ingredient lists (e.g., for different flavors or varieties), extract all of them and clearly separate them.

6. Do not include any additional information such as allergen warnings, nutritional information, or preparation instructions, even if they are in {language}.

7. If you cannot find a {language} ingredients list in the image, state that no {language} ingredients list was found.

8. If the image is unclear, state that the image quality is insufficient to extract the ingredients list accurately.

Provide your output in the following format:

<ingredients>
[Insert the extracted {language} ingredients list here, exactly as it appears in the image]
</ingredients>

Remember, include only the text of the {language} ingredients list, nothing else. Do not translate or interpret the ingredients; simply transcribe them as they appear in {language}.
"""

    return prompt
|
164 |
+
|
165 |
+
|
166 |
+
def get_prompt_nutritional_info():
    """Build the prompt asking Gemini for per-100g nutritional values as a
    bare JSON object (plain string — the braces below are literal, which is
    why this is deliberately not an f-string)."""
    prompt = """Extract the following nutritional information from the product image and present it **only** in JSON format, providing only the values per 100g: Energy kJ, Energy kcal, Fat, Saturated fat, Carbohydrates, Sugars, Fibers, Proteins, Salt.

If you can't extract the nutritional information from the image, you need to say why it's the case.

The response should contain **only** the following JSON:

{
"Energy kJ": 1500,
"Energy kcal": 360,
"Fat": 18,
"Saturated fat": 7,
"Carbohydrates": 40,
"Sugars": 25,
"Fibers": 3,
"Proteins": 8,
"Salt": 0.5
}

No additional text or explanation should be included.
"""

    return prompt
|
190 |
+
|
191 |
+
|
192 |
+
##### Extract data functions
|
193 |
+
|
194 |
+
def extract_text_from_picture_baseline(OUTPUT_DIR,
                                       df_product_id,
                                       prompt,
                                       type_photo,
                                       generation_config,
                                       max_entry=None,
                                       progress=None
                                       ):
    """Run one extraction prompt over every product photo of a given type.

    For each of the first *max_entry* rows of *df_product_id* (all rows when
    None): download the photo referenced by the *type_photo* column, upload
    it to Gemini, send it together with *prompt*, and collect the answer.
    Rows whose processing fails are logged and skipped.

    Relies on the module-level `gemini_model` / `model` created at import.

    Returns:
        DataFrame with columns ID, Extracted_Text, Price, Processing time.
    """
    outputs = []

    if max_entry is None:
        max_entry = len(df_product_id)

    # progress.tqdm renders a progress bar inside the Gradio UI.
    row_indices = progress.tqdm(range(max_entry)) if progress is not None else range(max_entry)

    for i in row_indices:

        start_time = time.time()
        # iloc (was .loc): the frame may carry a non-sequential index —
        # app.py samples the Excel sheet — so positional access is required.
        product = df_product_id.iloc[i]
        product_id = product['ID']

        photo_path = f'{OUTPUT_DIR}/photos/{product_id}_{type_photo}.jpg'

        download_file(url=product[type_photo], save_path=photo_path)

        photo = upload_file(photo_path)

        messages = [photo, prompt]

        try:
            response_content, _, _, price = call_llm_gemini(gemini_model, model, messages, generation_config)

            print(response_content)
            processing_time = time.time() - start_time

            outputs.append([product_id, response_content, round(price, 4), round(processing_time, 2)])

        except Exception as e:
            # Skip this row but keep the batch going; include the cause
            # instead of swallowing it silently (was a bare except).
            print(f"Error for ID: {product_id} - {e}")

    df_output = pd.DataFrame(outputs, columns=['ID', 'Extracted_Text', 'Price', 'Processing time'])

    return df_output
|
236 |
+
|
237 |
+
|
238 |
+
def extract_brand(OUTPUT_DIR, df_product_id, language, progress=gr.Progress()):
    """Extract the brand from each front photo and persist brand.csv."""
    df_output = extract_text_from_picture_baseline(
        OUTPUT_DIR,
        df_product_id,
        get_prompt_brand(language),
        type_photo="Front photo",
        generation_config=generation_config,
        max_entry=None,
        progress=progress,
    )

    df_output.to_csv(f'{OUTPUT_DIR}/data_extraction/brand.csv', index=False)

    return df_output
|
250 |
+
|
251 |
+
|
252 |
+
def extract_product_name(OUTPUT_DIR, df_product_id, language, progress=gr.Progress()):
    """Extract the product name from each front photo and persist product_name.csv."""
    df_output = extract_text_from_picture_baseline(
        OUTPUT_DIR,
        df_product_id,
        get_prompt_product_name(language),
        type_photo="Front photo",
        generation_config=generation_config,
        max_entry=None,
        progress=progress,
    )

    df_output.to_csv(f'{OUTPUT_DIR}/data_extraction/product_name.csv', index=False)

    return df_output
|
263 |
+
|
264 |
+
|
265 |
+
def extract_ingredients(OUTPUT_DIR, df_product_id, language, progress=gr.Progress()):
    """Extract the ingredients list from each ingredients photo and persist ingredients.csv."""
    df_output = extract_text_from_picture_baseline(
        OUTPUT_DIR,
        df_product_id,
        get_prompt_ingredients(language),
        type_photo="Ingredients photo",
        generation_config=generation_config,
        max_entry=None,
        progress=progress,
    )

    df_output.to_csv(f'{OUTPUT_DIR}/data_extraction/ingredients.csv', index=False)

    return df_output
|
276 |
+
|
277 |
+
|
278 |
+
def convert_json_string_to_dict(json_string, record_id):
    """Parse an LLM JSON answer into a dict.

    On an empty or unparsable string, logs the problem (with *record_id*
    for traceability) and returns every expected nutritional key set to -1.
    """
    default_keys = ['Energy kJ', 'Energy kcal', 'Fat', 'Saturated fat', 'Carbohydrates', 'Sugars', 'Fibers', 'Proteins',
                    'Salt']

    clean_string = json_string

    if not clean_string:
        print(f"ID: {record_id} - La chaîne est vide ou invalide : '{json_string}'")
        return dict.fromkeys(default_keys, -1)

    try:
        return json.loads(clean_string)
    except json.JSONDecodeError:
        print(f"ID: {record_id} - Erreur lors du décodage du JSON : '{json_string}'")
        return dict.fromkeys(default_keys, -1)
|
293 |
+
|
294 |
+
def extract_nutritional_values(OUTPUT_DIR, df_product_id, language, progress=gr.Progress()):
    """Extract nutritional values as JSON and split them into per-key CSVs.

    Saves the raw answers to nutritional_values.csv, then writes one CSV per
    nutritional key (e.g. energy_kj.csv) for downstream assessment.
    *language* is unused by the JSON prompt but kept for signature parity
    with the other extract_* callbacks.
    """
    df_output = extract_text_from_picture_baseline(OUTPUT_DIR, df_product_id,
                                                   get_prompt_nutritional_info(),
                                                   type_photo="Nutritionals photo",
                                                   generation_config=generation_config_json,
                                                   max_entry=None,
                                                   progress=progress)

    df_output.to_csv(f'{OUTPUT_DIR}/data_extraction/nutritional_values.csv', index=False)

    if df_output.empty:
        # Every row failed: .iloc[0] below would raise IndexError, and
        # there is nothing to split into per-key files anyway.
        return df_output

    df_output['Extracted_Text_Json'] = df_output.apply(
        lambda row: convert_json_string_to_dict(row['Extracted_Text'], row['ID']), axis=1)

    # Keys of the first parsed dict serve as the reference schema.
    keys = list(df_output['Extracted_Text_Json'].iloc[0].keys())

    for key in keys:
        df_key = df_output[['ID', 'Price', 'Processing time']].copy()
        df_key['Extracted_Text'] = df_output['Extracted_Text_Json'].apply(lambda x: x.get(key, None))
        df_key.to_csv(f"{OUTPUT_DIR}/data_extraction/{key.replace(' ', '_').lower()}.csv", index=False)

    df_output = df_output[['ID', 'Extracted_Text', 'Price', 'Processing time']]

    return df_output
|
319 |
+
|
320 |
+
|
321 |
+
# Module-level model selection and shared Gemini handles, created once at
# import time and reused by every extraction call above.
model = 'gemini-1.5-pro-002'

gemini_model, generation_config, generation_config_json = load_gemini_model(model)
|