"""Evaluate hypothetical-summary retrieval with Mean Reciprocal Rank (MRR).

For a reproducible random sample of books, the script:

1. generates hypothetical summaries from the book title,
2. asks the similarity module to rank the corpus against those summaries,
3. records the reciprocal of the 1-based position at which the sampled
   book's own title shows up in that ranking.

It then prints the MRR and a five-number summary of the reciprocal ranks.
"""
import random

import numpy as np
from tqdm import tqdm

from z_utils import get_dataframe
from z_similarity import computes_similarity_w_hypothetical
from z_hypothetical_summary import generate_summaries

# CONST
random.seed(53)  # fixed seed so the same rows are sampled on every run
CLEAN_DF_UNIQUE_TITLES = "unique_titles_books_summary.csv"
N_SAMPLES_EVAL = 2
# Forwarded to generate_summaries as top_k / top_p
# (presumably text-generation sampling parameters -- confirm in z_hypothetical_summary).
TOP_K = 50
TOP_P = 0.85


def _reciprocal_rank(ranked_titles, target_title) -> float:
    """Return 1 / (1-based position of the first match of *target_title*).

    Returns 0.0 when *target_title* does not occur in *ranked_titles*, which
    is the usual MRR convention for an item that was not retrieved.
    """
    hits = np.flatnonzero(np.asarray(ranked_titles, dtype=object) == target_title)
    if hits.size == 0:
        return 0.0
    # Positions are 0-based, ranks are 1-based, hence the +1.
    return 1.0 / (int(hits[0]) + 1)


books_df = get_dataframe(CLEAN_DF_UNIQUE_TITLES)

# Never ask random.sample for more rows than exist, and fail with a clear
# message (instead of a ZeroDivisionError further down) when there is
# nothing to evaluate.
n_books = books_df.shape[0]
n_samples = min(N_SAMPLES_EVAL, n_books)
if n_samples <= 0:
    raise ValueError(
        f"Nothing to evaluate: N_SAMPLES_EVAL={N_SAMPLES_EVAL}, "
        f"rows in {CLEAN_DF_UNIQUE_TITLES}={n_books}"
    )

# sampling row id
random_values: list[int] = random.sample(range(n_books), n_samples)
reciprocal_ranks: list[float] = []

# The context manager closes the progress bar even if an iteration raises.
with tqdm(total=n_samples) as pbar:
    for idx in random_values:
        # Sample a book
        book = books_df.iloc[idx]

        # Generate hypothetical summaries
        fake_summaries = generate_summaries(
            book_title=book["book_name"],
            n_samples=5,
            top_k=TOP_K,
            top_p=TOP_P,
        )

        # Compute similarity; only the ranking is needed here.
        _similarity, ranks = computes_similarity_w_hypothetical(
            hypothetical_summaries=fake_summaries
        )

        # `ranks` is used as positional row indices into books_df
        # (presumably ordered best match first -- confirm in z_similarity).
        # Only the title column is reordered; no full DataFrame copy is needed.
        ranked_titles = books_df["book_name"].iloc[ranks].to_numpy()

        # Update list
        reciprocal_ranks.append(_reciprocal_rank(ranked_titles, book["book_name"]))
        pbar.update(1)

print(f"USING Paramerters: TOP_K={TOP_K} TOP_P={TOP_P}")
print("MRR: ", sum(reciprocal_ranks) / len(reciprocal_ranks))

# Calculate five-number summary
values = reciprocal_ranks
minimum = np.min(values)
q1 = np.percentile(values, 25)  # First quartile
median = np.median(values)
q3 = np.percentile(values, 75)  # Third quartile
maximum = np.max(values)

# Print the five-number summary
print("Five-Number Summary:")
print(f"Min: {minimum}")
print(f"Q1 : {q1}")
print(f"Med: {median}")
print(f"Q3 : {q3}")
print(f"Max: {maximum}")