"""Post-processing and scoring helpers for evaluating extracted product features.

Each entry of ``feature_assessment_entries`` names the reference column, the
scoring function and the post-processing function used for one feature.
``create_eval_data`` builds a per-feature evaluation frame from CSV files and
``merge_and_save_data`` combines all features into a single ``merged.csv``.
"""
import re

import Levenshtein
import numpy as np
import pandas as pd
from rouge_score import rouge_scorer

# Per-feature evaluation settings. The function names are resolved to the
# module-level callables of the same name by ``create_eval_data``.
feature_assessment_entries = {
    'brand': {
        'name': 'brand',
        'output_column': 'Brand',
        'scoring_function_name': 'grade_exact_match',
        'post_processing_function_name': 'post_processing_none',
        # 'post_processing_function_name' : 'post_processing_brand',
        'k_folds': 3,
    },
    'product_name': {
        'name': 'product_name',
        'output_column': 'Product name',
        'scoring_function_name': 'grade_levenshtein_match',
        # 'scoring_function_name' : 'grade_exact_match',
        'post_processing_function_name': 'post_processing_none',
        'k_folds': 3,
    },
    'ingredients': {
        'name': 'ingredients',
        'output_column': 'Ingredients',
        'scoring_function_name': 'grade_rouge_score',
        # 'scoring_function_name' : 'grade_levenshtein_match',
        'post_processing_function_name': 'post_processing_none',
        # 'post_processing_function_name' : 'post_processing_ingredients',
        'k_folds': 3,
    },
    'energy_kj': {
        'name': 'energy_kj',
        'output_column': 'Energy kJ',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_none',
        'k_folds': 3,
    },
    'energy_kcal': {
        'name': 'energy_kcal',
        'output_column': 'Energy kcal',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_none',
        'k_folds': 3,
    },
    'fat': {
        'name': 'fat',
        'output_column': 'Fat',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },
    'saturated_fat': {
        'name': 'saturated_fat',
        'output_column': 'Saturated fat',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },
    'carbohydrates': {
        'name': 'carbohydrates',
        'output_column': 'Carbohydrates',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },
    'sugars': {
        'name': 'sugars',
        'output_column': 'Sugars',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },
    'fibers': {
        'name': 'fibers',
        'output_column': 'Fibers',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },
    'proteins': {
        'name': 'proteins',
        'output_column': 'Proteins',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },
    'salt': {
        'name': 'salt',
        'output_column': 'Salt',
        'scoring_function_name': 'grade_numerical',
        'post_processing_function_name': 'post_processing_nutritionals',
        'k_folds': 3,
    },
}

# NOTE(review): this pattern matches the empty string at every position, so its
# first capture is always ''. It looks as if delimiters around the group were
# lost at some point - confirm the intended pattern before relying on it.
_INGREDIENTS_PATTERN = r"(.*?)"

# Dutch label prefixes. The correctly spelled "ingrediënten" is accepted next to
# the literal that was already in the file (kept for backward compatibility).
_DUTCH_INGREDIENT_PREFIXES = ("ingrediënten: ", "ingrediƫnten: ", "ingredienten: ")
_ENGLISH_INGREDIENT_PREFIX = "ingredients: "

# Short brand names (lower-cased) mapped to their full brand name.
_BRAND_ALIASES = {
    "boni": "Boni Selection",
    "rana": "Giovanni Rana",
    "the market": "Carrefour The Market",
    "extra": "Carrefour Extra",
}

# First number in a string: a decimal with optional sign, or a bare integer.
_FIRST_NUMBER_PATTERN = re.compile(r"[-+]?\d*\.\d+|\d+")


def post_processing_none(string):
    """Return *string* unchanged (no-op post-processing)."""
    return string


def _strip_leading_label(text, prefixes):
    """Drop the first of *prefixes* that *text* starts with (case-insensitive)."""
    lowered = text.lower()
    for prefix in prefixes:
        if lowered.startswith(prefix):
            return text[len(prefix):]
    return text


def post_processing_ingredients(string):
    """Extract the ingredient text from *string* and strip a leading label.

    The first capture of ``_INGREDIENTS_PATTERN`` is used when it is non-empty;
    otherwise the input itself is used. A leading Dutch label and then a
    leading English ``"ingredients: "`` label are removed, in that order.

    Args:
        string: Raw extracted ingredient text.

    Returns:
        The ingredient text without the leading label.
    """
    matches = re.findall(_INGREDIENTS_PATTERN, string, re.DOTALL)
    captured = matches[0].strip() if matches else ""
    # An empty capture carries no information; previously it made this function
    # return '' for every input, so fall back to the full input instead.
    output = captured if captured else string

    output = _strip_leading_label(output, _DUTCH_INGREDIENT_PREFIXES)
    output = _strip_leading_label(output, (_ENGLISH_INGREDIENT_PREFIX,))
    return output


def post_processing_brand(brand):
    """Expand a known short brand name to its full name.

    The lookup is case-insensitive; unknown brands are returned unchanged.
    """
    return _BRAND_ALIASES.get(brand.lower(), brand)


def post_processing_nutritionals(predicted_value):
    """Return the first number found in *predicted_value* as a string.

    Args:
        predicted_value: Any value; it is converted with ``str`` before the
            search.

    Returns:
        The matched number text, or ``np.nan`` when no number is present.
    """
    match = _FIRST_NUMBER_PATTERN.search(str(predicted_value))
    if match is None:
        return np.nan
    return match.group(0)


def grade_levenshtein_match(predicted_value, reference_value):
    """Return the Levenshtein ratio of the two lower-cased, stripped strings."""
    return Levenshtein.ratio(
        predicted_value.lower().strip(),
        reference_value.lower().strip(),
    )


def _normalise_for_exact_match(value):
    """Lower-case, strip and collapse whitespace runs to single spaces."""
    return re.sub(r'\s+', ' ', value.lower().strip())


def grade_exact_match(predicted_value, reference_value):
    """Return 1 when both strings are equal after normalisation, else 0.

    Both values are lower-cased, stripped and have runs of whitespace
    collapsed. Previously only the reference had its whitespace collapsed, so
    the comparison depended on which side contained the extra whitespace.
    """
    return int(
        _normalise_for_exact_match(predicted_value)
        == _normalise_for_exact_match(reference_value)
    )


def grade_rouge_score(predicted_value, reference_value):
    """Return the ROUGE-2 F-measure between prediction and reference."""
    scorer = rouge_scorer.RougeScorer(['rouge2'])
    # RougeScorer.score takes (target, prediction). The F-measure is symmetric
    # in the two texts, so this ordering gives the same score as the reverse.
    return scorer.score(reference_value, predicted_value)['rouge2'].fmeasure


def grade_numerical(predicted_value, reference_value):
    """Compare two values numerically.

    Returns:
        1 when both values are NaN or both convert to the same float,
        0 when they differ, and -1 when either value cannot be converted
        to a float.
    """
    try:
        predicted = float(predicted_value)
        reference = float(reference_value)
    except (TypeError, ValueError, OverflowError):
        return -1

    if np.isnan(predicted) and np.isnan(reference):
        return 1
    return int(predicted == reference)


def _resolve_function(function_name):
    """Return the module-level callable called *function_name*.

    Replaces ``eval`` on configuration strings: only a plain name lookup is
    performed, so arbitrary expressions are never executed.

    Raises:
        NameError: If no callable with that name exists in this module.
    """
    candidate = globals().get(function_name)
    if not callable(candidate):
        raise NameError(f"name {function_name!r} is not defined")
    return candidate


def create_eval_data(OUTPUT_DIR, feature_assessment_entry):
    """Build the evaluation frame for one feature.

    Reads ``reference_data.csv`` and ``<name>.csv`` from *OUTPUT_DIR*, joins
    them on ``ID``, post-processes the prediction, scores it against the
    reference and assigns each row a random fold.

    Args:
        OUTPUT_DIR: Directory containing the CSV files.
        feature_assessment_entry: One value of ``feature_assessment_entries``.

    Returns:
        A DataFrame with the columns ``ID``, ``Reference``, ``Predicted``,
        ``Price``, ``Processing time``, ``accuracy_score`` and ``fold``.

    Raises:
        NameError: If a configured function name cannot be resolved.
    """
    output_column = feature_assessment_entry['output_column']

    df_product_id = pd.read_csv(f"{OUTPUT_DIR}/reference_data.csv")
    df_features = pd.read_csv(f"{OUTPUT_DIR}/{feature_assessment_entry['name']}.csv")
    df_features = df_features.merge(df_product_id, on='ID', how='left')

    selected_columns = ['ID', output_column, 'Extracted_Text', 'Price', 'Processing time']
    df_eval_data = df_features[selected_columns].copy()
    df_eval_data = df_eval_data.rename(
        columns={output_column: 'Reference', 'Extracted_Text': 'Predicted'})

    # Resolve the configured functions once instead of once per row.
    post_process = _resolve_function(
        feature_assessment_entry['post_processing_function_name'])
    score = _resolve_function(feature_assessment_entry['scoring_function_name'])

    df_eval_data['Predicted'] = df_eval_data.apply(
        lambda row: post_process(row['Predicted']), axis=1)
    df_eval_data['accuracy_score'] = df_eval_data.apply(
        lambda row: score(row['Predicted'], row['Reference']), axis=1)
    df_eval_data['accuracy_score'] = df_eval_data['accuracy_score'].round(2)

    # Fixed seed so that fold assignment is reproducible between runs.
    # Note that this reseeds NumPy's global random state.
    np.random.seed(42)
    df_eval_data['fold'] = np.random.randint(
        0, feature_assessment_entry['k_folds'], size=len(df_eval_data))
    return df_eval_data


def merge_and_save_data(OUTPUT_DIR):
    """Combine the evaluation data of all features and write ``merged.csv``.

    Reads from and writes to ``<OUTPUT_DIR>/data_extraction``.

    Args:
        OUTPUT_DIR: Base output directory.

    Returns:
        The merged DataFrame that was written to disk.
    """
    data_dir = f'{OUTPUT_DIR}/data_extraction'
    df_ref_data = pd.read_csv(f"{data_dir}/reference_data.csv")

    frames = [df_ref_data[['ID', 'Front photo', 'Nutritionals photo',
                           'Ingredients photo', 'EAN photo']]]
    for feature_name, entry in feature_assessment_entries.items():
        df_eval_data = create_eval_data(data_dir, entry)
        # Non-inplace rename: renaming a column slice in place can trigger
        # pandas' SettingWithCopyWarning.
        frames.append(
            df_eval_data[['Reference', 'Predicted', 'accuracy_score']].rename(
                columns={
                    'Reference': 'Reference_' + feature_name,
                    'Predicted': 'Predicted_' + feature_name,
                    'accuracy_score': 'accuracy_score_' + feature_name,
                }))

    # NOTE(review): frames are aligned on the row index, not on 'ID'; this
    # assumes every feature CSV lists the products in the same order as
    # reference_data.csv - confirm.
    data_merged = pd.concat(frames, axis=1)
    data_merged.to_csv(f"{data_dir}/merged.csv")
    return data_merged