import math
import os
from collections import defaultdict

import megfile
import numpy as np
import pandas as pd

# Edit-task groups evaluated in GEdit; each group has its own score CSV.
GROUPS = [
    "background_change",
    "color_alter",
    "material_alter",
    "motion_change",
    "ps_human",
    "style_change",
    "subject-add",
    "subject-remove",
    "subject-replace",
    "text_change",
    "tone_transfer",
]


def analyze_scores(save_path_dir, evaluate_group, language):
    """Aggregate per-group GPT scores for the models in `evaluate_group`.

    Returns six dicts keyed by model name and group name: semantics, quality,
    and overall averages, each computed over all samples in the requested
    language and over the subset flagged with `intersection_exist`.
    """
    group_scores_semantics = defaultdict(dict)
    group_scores_quality = defaultdict(dict)
    group_scores_overall = defaultdict(dict)
    group_scores_semantics_intersection = defaultdict(dict)
    group_scores_quality_intersection = defaultdict(dict)
    group_scores_overall_intersection = defaultdict(dict)

    model_name = evaluate_group[0]
    for group_name in GROUPS:
        csv_path = os.path.join(save_path_dir, f"{model_name}_{group_name}_gpt_score.csv")
        with megfile.smart_open(csv_path) as csv_file:
            df = pd.read_csv(csv_file)

        filtered_semantics_scores = []
        filtered_quality_scores = []
        filtered_overall_scores = []
        filtered_semantics_scores_intersection = []
        filtered_quality_scores_intersection = []
        filtered_overall_scores_intersection = []

        for _, row in df.iterrows():
            # Keep only rows whose instruction is in the requested language.
            if row['instruction_language'] != language:
                continue

            semantics_score = row['sementics_score']  # column name is spelled this way in the score CSVs
            quality_score = row['quality_score']
            # Overall score is the geometric mean of semantics and quality.
            overall_score = math.sqrt(semantics_score * quality_score)

            filtered_semantics_scores.append(semantics_score)
            filtered_quality_scores.append(quality_score)
            filtered_overall_scores.append(overall_score)
            if row['intersection_exist']:
                filtered_semantics_scores_intersection.append(semantics_score)
                filtered_quality_scores_intersection.append(quality_score)
                filtered_overall_scores_intersection.append(overall_score)

        group_scores_semantics[model_name][group_name] = np.mean(filtered_semantics_scores)
        group_scores_quality[model_name][group_name] = np.mean(filtered_quality_scores)
        group_scores_overall[model_name][group_name] = np.mean(filtered_overall_scores)

        group_scores_semantics_intersection[model_name][group_name] = np.mean(filtered_semantics_scores_intersection)
        group_scores_quality_intersection[model_name][group_name] = np.mean(filtered_quality_scores_intersection)
        group_scores_overall_intersection[model_name][group_name] = np.mean(filtered_overall_scores_intersection)

    # Average each metric across the groups and store it under an "avg_*" key
    # alongside the per-group entries.
    metric_avg_pairs = [
        (group_scores_semantics, "avg_semantics"),
        (group_scores_semantics_intersection, "avg_semantics"),
        (group_scores_quality, "avg_quality"),
        (group_scores_quality_intersection, "avg_quality"),
        (group_scores_overall, "avg_overall"),
        (group_scores_overall_intersection, "avg_overall"),
    ]
    for scores, avg_key in metric_avg_pairs:
        for name in evaluate_group:
            scores[name][avg_key] = np.mean([scores[name][group] for group in GROUPS])

    return (
        group_scores_semantics,
        group_scores_quality,
        group_scores_overall,
        group_scores_semantics_intersection,
        group_scores_quality_intersection,
        group_scores_overall_intersection,
    )


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="UniWorld")
    parser.add_argument("--save_path", type=str, default="/mnt/data/lb/Remake/UniWorld/eval_output/stage3_ema/Gedit")
    parser.add_argument("--backbone", type=str, default="gpt4o", choices=["gpt4o", "qwen25vl"])
    parser.add_argument("--language", type=str, default="en", choices=["en", "zh"])
    args = parser.parse_args()

    evaluate_group = [args.model_name]
    save_path_new = os.path.join(args.save_path, args.backbone, "eval_results_new")

    print("\nOverall:")
    for model_name in evaluate_group:
        (
            group_scores_semantics,
            group_scores_quality,
            group_scores_overall,
            group_scores_semantics_intersection,
            group_scores_quality_intersection,
            group_scores_overall_intersection,
        ) = analyze_scores(save_path_new, [model_name], language=args.language)

        # Per-group scores, printed as: semantics, quality, overall.
        for group_name in GROUPS:
            print(
                f"{group_name}: "
                f"{group_scores_semantics[model_name][group_name]:.3f}, "
                f"{group_scores_quality[model_name][group_name]:.3f}, "
                f"{group_scores_overall[model_name][group_name]:.3f}"
            )
        print(
            f"Average: "
            f"{group_scores_semantics[model_name]['avg_semantics']:.3f}, "
            f"{group_scores_quality[model_name]['avg_quality']:.3f}, "
            f"{group_scores_overall[model_name]['avg_overall']:.3f}"
        )

        print("\nIntersection:")
        for group_name in GROUPS:
            print(
                f"{group_name}: "
                f"{group_scores_semantics_intersection[model_name][group_name]:.3f}, "
                f"{group_scores_quality_intersection[model_name][group_name]:.3f}, "
                f"{group_scores_overall_intersection[model_name][group_name]:.3f}"
            )
        print(
            f"Average Intersection: "
            f"{group_scores_semantics_intersection[model_name]['avg_semantics']:.3f}, "
            f"{group_scores_quality_intersection[model_name]['avg_quality']:.3f}, "
            f"{group_scores_overall_intersection[model_name]['avg_overall']:.3f}"
        )
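
# Example invocation (file name and paths below are illustrative, not part of
# the original script). The script expects the per-group score files at
#   <save_path>/<backbone>/eval_results_new/<model_name>_<group>_gpt_score.csv
#
#   python gedit_score_analysis.py \
#       --model_name UniWorld \
#       --save_path /path/to/eval_output/Gedit \
#       --backbone gpt4o \
#       --language en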