import duckdb
from sentence_transformers import SentenceTransformer
import pandas as pd
import re

def duckdb_vss_local(
    model: SentenceTransformer,
    duckdb_connection: duckdb.DuckDBPyConnection,
    query: str,
    k: int = 1000,
    brevity_penalty: float = 0.0,
    min_length: int = 131,
    reward_for_literal: float = 0.0,
    first_term_reward: float = 20.0,
    partial_match_factor: float = 0.5,
    table_name: str = "maestro_vector_table",
    embedding_column: str = "vec",
):
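    """Embed `query` with `model` and run a cosine-distance vector search
    over `table_name` in DuckDB, returning the top `k` rows as a DataFrame.
    Optionally re-ranks the results with a brevity penalty and/or a
    literal-match reward (see penalize_short_summaries and reward_literals
    below).
    """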

    query_vector = model.encode(query)
    embedding_dim = model.get_sentence_embedding_dimension()

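    # DuckDB's array_cosine_distance works on fixed-size FLOAT arrays, so both
    # the stored embedding column and the query vector are cast to float[dim]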
    sql = f"""
        SELECT 
            *,
            array_cosine_distance(
                {embedding_column}::float[{embedding_dim}], 
                {query_vector.tolist()}::float[{embedding_dim}]
            ) as distance
        FROM {table_name}
        ORDER BY distance
        LIMIT {k}
    """
    result = duckdb_connection.sql(sql).to_df()
    # Set the "debug" flags below to keep the intermediate scoring columns:
    if brevity_penalty > 0:
        result = penalize_short_summaries(result, factor=brevity_penalty, distance_column='distance',
                                          summary_column='longBusinessSummary', min_length=min_length, debug=False)
    if reward_for_literal > 0:
        result = reward_literals(result, query, factor=reward_for_literal,
                                 partial_match_factor=partial_match_factor, first_term_reward=first_term_reward,
                                 distance_column='distance', summary_column='longBusinessSummary', debug=False)

    return result

def penalize_short_summaries(
    df: pd.DataFrame,
    factor: float = 0.1,
    distance_column: str = 'distance',
    summary_column: str = 'longBusinessSummary',
    debug: bool = True,
    min_length: int = 131
    ) -> pd.DataFrame:
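    """Penalize short summaries: rows whose summary is shorter than
    `min_length` are pushed to the maximum distance in the result set; the
    rest receive a penalty proportional to how far below the average summary
    length they fall, capped at that maximum distance.
    """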

    result_df = df.copy()
    result_df['summary_length'] = result_df[summary_column].apply(
        lambda x: len(str(x)) if pd.notna(x) else 0
    )
    avg_length = max(1.0, result_df['summary_length'].mean())
    max_dist = result_df[distance_column].max()

    result_df['percent_shorter'] = result_df['summary_length'].apply(
        lambda x: max(0, (avg_length - x) / avg_length)
    )

    result_df['orig_distance'] = result_df[distance_column]

    # Assign the maximum distance to summaries shorter than min_length,
    # and apply a proportional penalty to everything else
    result_df[distance_column] = result_df.apply(
        lambda row: max_dist if row['summary_length'] < min_length else 
                   min(max_dist, row[distance_column] + (row['percent_shorter'] * factor)),
        axis=1
    )
    
    if not debug:
        result_df = result_df.drop(['orig_distance', 'summary_length', 'percent_shorter'], axis=1)

    result_df = result_df.sort_values(by=distance_column, ascending=True) 
    return result_df

def reward_literals(
    df: pd.DataFrame,
    query: str,
    factor: float = 0.1,
    partial_match_factor: float = 0.5,
    first_term_reward: float = 20.0,
    distance_column: str = 'distance',
    summary_column: str = 'longBusinessSummary',
    debug: bool = True
    ) -> pd.DataFrame:
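    """Reward literal occurrences of `query` in the summary by subtracting
    `factor` per match from the distance. Exact whole-word matches count
    fully, partial matches are scaled by `partial_match_factor`, and a flat
    `first_term_reward` applies when the summary's first word is the query.
    """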

    result_df = df.copy()
    query_lower = query.lower().strip()
    
    def count_phrase_occurrences(summary):
        if pd.isna(summary):
            return 0
        summary_lower = str(summary).lower()
        # Extract the first word of the summary and strip punctuation from it.
        # E.g. "Grifols, S.A. operates as a plasma therapeutic company..." yields "grifols", not "grifols,"
        words = summary_lower.split()
        first_word = words[0] if words else ""
        first_term = re.sub(r'[^\w\s]', '', first_word)
        # Check whether the first word matches the query (typically the company name)
        _first_term_reward = first_term_reward if first_term == query_lower else 0

        # Count exact matches (whole words only)
        exact_pattern = r'\b' + re.escape(query_lower) + r'\b'
        exact_count = len(re.findall(exact_pattern, summary_lower))

        # Count partial matches, depending on the query type
        if ' ' in query_lower:  # Multi-word query
            # For phrases, count raw substring occurrences in the text
            partial_pattern = re.escape(query_lower)
            partial_count = len(re.findall(partial_pattern, summary_lower))
        else:
            # For single-word queries, also match the query embedded inside longer words
            partial_pattern = r'\b\w*' + re.escape(query_lower) + r'\w*\b'
            partial_count = len(re.findall(partial_pattern, summary_lower))

        # Subtract exact matches from partial ones to avoid double counting
        partial_count = partial_count - exact_count

        # Partial matches are down-weighted by partial_match_factor:
        return _first_term_reward + exact_count + (partial_count * partial_match_factor)
        
    result_df['term_occurrences'] = result_df[summary_column].apply(count_phrase_occurrences)
    result_df['orig_distance'] = result_df[distance_column]
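    # Subtract the accumulated reward from the distance, never going below 0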
    result_df[distance_column] = result_df.apply(
        lambda row: max(0, row[distance_column] - (row['term_occurrences'] * factor)),
        axis=1
    )
    if not debug:
        result_df = result_df.drop(['orig_distance', 'term_occurrences'], axis=1)
    result_df = result_df.sort_values(by=distance_column, ascending=True)

    return result_df
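

# Example usage (a minimal sketch): this assumes a DuckDB database whose
# maestro_vector_table contains a `vec` embedding column and a
# `longBusinessSummary` text column, embedded with the same model used at
# query time. The model name and database path are illustrative assumptions.
if __name__ == "__main__":
    model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model; use the one that built the table
    con = duckdb.connect("maestro.duckdb")           # assumed database path
    hits = duckdb_vss_local(
        model,
        con,
        query="Grifols",
        k=25,
        brevity_penalty=0.2,
        reward_for_literal=0.05,
    )
    print(hits.head(10))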