File size: 1,811 Bytes
dc4b86a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import random 
from z_utils import get_dataframe
from z_similarity import computes_similarity_w_hypothetical
from z_hypothetical_summary import generate_summaries
from tqdm import tqdm
import numpy as np

# Constants
CLEAN_DF_UNIQUE_TITLES = "unique_titles_books_summary.csv"
N_SAMPLES_EVAL = 2  # number of books drawn for the evaluation
TOP_K = 50  # passed to generate_summaries as top_k
TOP_P = 0.85  # passed to generate_summaries as top_p

# Fixed seed so that every run evaluates the same sample of books.
random.seed(53)

books_df = get_dataframe(CLEAN_DF_UNIQUE_TITLES)

# Draw N_SAMPLES_EVAL distinct row positions (used with .iloc below).
# random.sample raises ValueError if the dataframe has fewer rows than that.
random_values: list[int] = random.sample(range(len(books_df)), N_SAMPLES_EVAL)

# One reciprocal rank (1 / rank, hence a float) per evaluated book.
reciprocal_ranks: list[float] = []

pbar = tqdm(total=N_SAMPLES_EVAL)

# For every sampled book: generate hypothetical summaries from its title,
# rank the catalogue against them, and record the reciprocal of the position
# at which the true book shows up.
try:
    for idx in random_values:
        # Sample a book
        book = books_df.iloc[idx]
        target_title = book["book_name"]

        # Generate hypothetical summaries from the title alone
        fake_summaries = generate_summaries(book_title=target_title, n_samples=5, top_k=TOP_K, top_p=TOP_P)

        # Compute similarity (`similarity` itself is not used by this evaluation).
        # NOTE(review): `ranks` is presumably row positions ordered from most to
        # least similar -- confirm against z_similarity.
        similarity, ranks = computes_similarity_w_hypothetical(hypothetical_summaries=fake_summaries)

        # Titles in ranked order, relabelled 0..n-1 so the label is the position.
        ranked_titles = books_df.iloc[ranks]["book_name"].reset_index(drop=True)
        hit_positions = ranked_titles[ranked_titles == target_title].index

        if len(hit_positions) > 0:
            # Positions start at 0, hence offsetting by 1 to get the rank.
            rank = hit_positions[0] + 1
            reciprocal_ranks.append(1 / rank)
        else:
            # The true book is absent from the ranked results: score it 0
            # instead of aborting the whole run with an IndexError.
            reciprocal_ranks.append(0.0)

        pbar.update(1)
finally:
    # Release the progress bar even if an iteration raised.
    pbar.close()

# Report the sampling parameters together with the mean reciprocal rank.
mrr = sum(reciprocal_ranks)/len(reciprocal_ranks)
print(f"USING Paramerters: TOP_K={TOP_K}  TOP_P={TOP_P}")
print("MRR: ", mrr)

# Five-number summary of the reciprocal ranks: extremes, quartiles, median.
values = reciprocal_ranks
minimum, maximum = np.min(values), np.max(values)
q1 = np.percentile(values, 25)  # lower quartile
q3 = np.percentile(values, 75)  # upper quartile
median = np.median(values)

# Labels are padded to three characters so the colons line up in the output.
print("Five-Number Summary:")
for label, stat in (
    ("Min", minimum),
    ("Q1 ", q1),
    ("Med", median),
    ("Q3 ", q3),
    ("Max", maximum),
):
    print(f"{label}: {stat}")