search by company name
Browse files
- app.py +4 -0
- json/semantic_search_params.json +2 -0
- src/semantic_search.py +20 -6
app.py
CHANGED
@@ -115,7 +115,9 @@ y_cdf, _ = dh_app.configura_distr_prob(shape, loc, scale, max_dist, precision_cd
|
|
115 |
# Parámetros de la de búsqueda VSS:
|
116 |
k = semantic_search_params["k"]
|
117 |
brevity_penalty = semantic_search_params["brevity_penalty"]
|
|
|
118 |
reward_for_literal = semantic_search_params["reward_for_literal"]
|
|
|
119 |
partial_match_factor = semantic_search_params["partial_match_factor"]
|
120 |
print(f"VSS params: k={k}, brevity_penalty={brevity_penalty}, reward_for_literal={reward_for_literal}, partial_match_factor={partial_match_factor}")
|
121 |
|
@@ -187,7 +189,9 @@ def search_theme(theme: str, page: int, *filtros_values) -> Tuple[pd.DataFrame,
|
|
187 |
query=query,
|
188 |
k=k,
|
189 |
brevity_penalty=brevity_penalty,
|
|
|
190 |
reward_for_literal=reward_for_literal,
|
|
|
191 |
partial_match_factor=partial_match_factor,
|
192 |
table_name="vector_table",
|
193 |
embedding_column="embeddings"
|
|
|
115 |
# Parámetros de la de búsqueda VSS:
|
116 |
k = semantic_search_params["k"]
|
117 |
brevity_penalty = semantic_search_params["brevity_penalty"]
|
118 |
+
min_length = semantic_search_params["min_length"]
|
119 |
reward_for_literal = semantic_search_params["reward_for_literal"]
|
120 |
+
first_term_reward = semantic_search_params["first_term_reward"]
|
121 |
partial_match_factor = semantic_search_params["partial_match_factor"]
|
122 |
print(f"VSS params: k={k}, brevity_penalty={brevity_penalty}, reward_for_literal={reward_for_literal}, partial_match_factor={partial_match_factor}")
|
123 |
|
|
|
189 |
query=query,
|
190 |
k=k,
|
191 |
brevity_penalty=brevity_penalty,
|
192 |
+
min_length = min_length,
|
193 |
reward_for_literal=reward_for_literal,
|
194 |
+
first_term_reward=first_term_reward,
|
195 |
partial_match_factor=partial_match_factor,
|
196 |
table_name="vector_table",
|
197 |
embedding_column="embeddings"
|
json/semantic_search_params.json
CHANGED
@@ -2,7 +2,9 @@
|
|
2 |
"semantic_search_params": {
|
3 |
"k": 2000,
|
4 |
"brevity_penalty": 0.1,
|
|
|
5 |
"reward_for_literal": 0.03,
|
|
|
6 |
"partial_match_factor": 0.8
|
7 |
}
|
8 |
}
|
|
|
2 |
"semantic_search_params": {
|
3 |
"k": 2000,
|
4 |
"brevity_penalty": 0.1,
|
5 |
+
"min_length": 131,
|
6 |
"reward_for_literal": 0.03,
|
7 |
+
"first_term_reward": 20,
|
8 |
"partial_match_factor": 0.8
|
9 |
}
|
10 |
}
|
src/semantic_search.py
CHANGED
@@ -9,7 +9,9 @@ def duckdb_vss_local(
|
|
9 |
query: str,
|
10 |
k: int = 1000,
|
11 |
brevity_penalty: float = 0.0,
|
|
|
12 |
reward_for_literal: float = 0.0,
|
|
|
13 |
partial_match_factor: float = 0.5,
|
14 |
table_name: str = "maestro_vector_table",
|
15 |
embedding_column: str = "vec",
|
@@ -33,10 +35,10 @@ def duckdb_vss_local(
|
|
33 |
# Utilizar los parámetros "debug" para mostrar columnas intermedias:
|
34 |
if brevity_penalty > 0:
|
35 |
result = penalize_short_summaries(result, factor = brevity_penalty, distance_column = 'distance',
|
36 |
-
summary_column = 'longBusinessSummary', debug = False)
|
37 |
if reward_for_literal > 0:
|
38 |
result = reward_literals(result, query, factor = reward_for_literal,
|
39 |
-
partial_match_factor= partial_match_factor, distance_column = 'distance',
|
40 |
summary_column = 'longBusinessSummary', debug = False)
|
41 |
|
42 |
return result
|
@@ -46,7 +48,8 @@ def penalize_short_summaries(
|
|
46 |
factor: float = 0.1,
|
47 |
distance_column: str = 'distance',
|
48 |
summary_column: str = 'longBusinessSummary',
|
49 |
-
debug: bool = True
|
|
|
50 |
) -> pd.DataFrame:
|
51 |
|
52 |
result_df = df.copy()
|
@@ -59,10 +62,14 @@ def penalize_short_summaries(
|
|
59 |
result_df['percent_shorter'] = result_df['summary_length'].apply(
|
60 |
lambda x: max(0, (avg_length - x) / avg_length)
|
61 |
)
|
|
|
62 |
result_df['orig_distance'] = result_df[distance_column]
|
63 |
-
|
|
|
|
|
64 |
result_df[distance_column] = result_df.apply(
|
65 |
-
lambda row:
|
|
|
66 |
axis=1
|
67 |
)
|
68 |
|
@@ -77,6 +84,7 @@ def reward_literals(
|
|
77 |
query: str,
|
78 |
factor: float = 0.1,
|
79 |
partial_match_factor: float = 0.5,
|
|
|
80 |
distance_column: str = 'distance',
|
81 |
summary_column: str = 'longBusinessSummary',
|
82 |
debug: bool = True
|
@@ -89,6 +97,12 @@ def reward_literals(
|
|
89 |
if pd.isna(summary):
|
90 |
return 0
|
91 |
summary_lower = str(summary).lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
|
93 |
# Cuenta coincidencias exactas (palabras completas)
|
94 |
exact_pattern = r'\b' + re.escape(query_lower) + r'\b'
|
@@ -108,7 +122,7 @@ def reward_literals(
|
|
108 |
partial_count = partial_count - exact_count
|
109 |
|
110 |
# Penalizamos las coincidencias parciales:
|
111 |
-
return exact_count + (partial_count * partial_match_factor)
|
112 |
|
113 |
result_df['term_occurrences'] = result_df[summary_column].apply(count_phrase_occurrences)
|
114 |
result_df['orig_distance'] = result_df[distance_column]
|
|
|
9 |
query: str,
|
10 |
k: int = 1000,
|
11 |
brevity_penalty: float = 0.0,
|
12 |
+
min_length: int = 131,
|
13 |
reward_for_literal: float = 0.0,
|
14 |
+
first_term_reward: float = 20.0,
|
15 |
partial_match_factor: float = 0.5,
|
16 |
table_name: str = "maestro_vector_table",
|
17 |
embedding_column: str = "vec",
|
|
|
35 |
# Utilizar los parámetros "debug" para mostrar columnas intermedias:
|
36 |
if brevity_penalty > 0:
|
37 |
result = penalize_short_summaries(result, factor = brevity_penalty, distance_column = 'distance',
|
38 |
+
summary_column = 'longBusinessSummary', min_length=min_length, debug = False)
|
39 |
if reward_for_literal > 0:
|
40 |
result = reward_literals(result, query, factor = reward_for_literal,
|
41 |
+
partial_match_factor= partial_match_factor, first_term_reward=first_term_reward, distance_column = 'distance',
|
42 |
summary_column = 'longBusinessSummary', debug = False)
|
43 |
|
44 |
return result
|
|
|
48 |
factor: float = 0.1,
|
49 |
distance_column: str = 'distance',
|
50 |
summary_column: str = 'longBusinessSummary',
|
51 |
+
debug: bool = True,
|
52 |
+
min_length: int = 131
|
53 |
) -> pd.DataFrame:
|
54 |
|
55 |
result_df = df.copy()
|
|
|
62 |
result_df['percent_shorter'] = result_df['summary_length'].apply(
|
63 |
lambda x: max(0, (avg_length - x) / avg_length)
|
64 |
)
|
65 |
+
|
66 |
result_df['orig_distance'] = result_df[distance_column]
|
67 |
+
|
68 |
+
# Asignar distancia máxima para resúmenes más cortos que min_length
|
69 |
+
# y aplicar penalización proporcional para el resto
|
70 |
result_df[distance_column] = result_df.apply(
|
71 |
+
lambda row: max_dist if row['summary_length'] < min_length else
|
72 |
+
min(max_dist, row[distance_column] + (row['percent_shorter'] * factor)),
|
73 |
axis=1
|
74 |
)
|
75 |
|
|
|
84 |
query: str,
|
85 |
factor: float = 0.1,
|
86 |
partial_match_factor: float = 0.5,
|
87 |
+
first_term_reward: float = 20.0,
|
88 |
distance_column: str = 'distance',
|
89 |
summary_column: str = 'longBusinessSummary',
|
90 |
debug: bool = True
|
|
|
97 |
if pd.isna(summary):
|
98 |
return 0
|
99 |
summary_lower = str(summary).lower()
|
100 |
+
# Extraemos la primera palabra del resumen y la limpiamos de caracteres especiales
|
101 |
+
# Por ejemplo: "Grifols, S.A. operates as a plasma therapeutic company..." -> Extrae "Grifols", no "Grifols,"
|
102 |
+
first_word = summary_lower.split()[0] if summary_lower.strip() and len(summary_lower.split()) > 0 else ""
|
103 |
+
first_term = re.sub(r'[^\w\s]', '', first_word.lower())
|
104 |
+
# Comprobamos si la primera palabra coincide con la consulta (típicamente el nombre de la empresa)
|
105 |
+
_first_term_reward = first_term_reward if first_term == query_lower else 0
|
106 |
|
107 |
# Cuenta coincidencias exactas (palabras completas)
|
108 |
exact_pattern = r'\b' + re.escape(query_lower) + r'\b'
|
|
|
122 |
partial_count = partial_count - exact_count
|
123 |
|
124 |
# Penalizamos las coincidencias parciales:
|
125 |
+
return _first_term_reward + exact_count + (partial_count * partial_match_factor)
|
126 |
|
127 |
result_df['term_occurrences'] = result_df[summary_column].apply(count_phrase_occurrences)
|
128 |
result_df['orig_distance'] = result_df[distance_column]
|