reddgr commited on
Commit
0d07c63
·
1 Parent(s): 91cb4eb

search by company name

Browse files
app.py CHANGED
@@ -115,7 +115,9 @@ y_cdf, _ = dh_app.configura_distr_prob(shape, loc, scale, max_dist, precision_cd
115
  # Parámetros de la búsqueda VSS:
116
  k = semantic_search_params["k"]
117
  brevity_penalty = semantic_search_params["brevity_penalty"]
 
118
  reward_for_literal = semantic_search_params["reward_for_literal"]
 
119
  partial_match_factor = semantic_search_params["partial_match_factor"]
120
  print(f"VSS params: k={k}, brevity_penalty={brevity_penalty}, reward_for_literal={reward_for_literal}, partial_match_factor={partial_match_factor}")
121
 
@@ -187,7 +189,9 @@ def search_theme(theme: str, page: int, *filtros_values) -> Tuple[pd.DataFrame,
187
  query=query,
188
  k=k,
189
  brevity_penalty=brevity_penalty,
 
190
  reward_for_literal=reward_for_literal,
 
191
  partial_match_factor=partial_match_factor,
192
  table_name="vector_table",
193
  embedding_column="embeddings"
 
115
  # Parámetros de la búsqueda VSS:
116
  k = semantic_search_params["k"]
117
  brevity_penalty = semantic_search_params["brevity_penalty"]
118
+ min_length = semantic_search_params["min_length"]
119
  reward_for_literal = semantic_search_params["reward_for_literal"]
120
+ first_term_reward = semantic_search_params["first_term_reward"]
121
  partial_match_factor = semantic_search_params["partial_match_factor"]
122
  print(f"VSS params: k={k}, brevity_penalty={brevity_penalty}, reward_for_literal={reward_for_literal}, partial_match_factor={partial_match_factor}")
123
 
 
189
  query=query,
190
  k=k,
191
  brevity_penalty=brevity_penalty,
192
+ min_length = min_length,
193
  reward_for_literal=reward_for_literal,
194
+ first_term_reward=first_term_reward,
195
  partial_match_factor=partial_match_factor,
196
  table_name="vector_table",
197
  embedding_column="embeddings"
json/semantic_search_params.json CHANGED
@@ -2,7 +2,9 @@
2
  "semantic_search_params": {
3
  "k": 2000,
4
  "brevity_penalty": 0.1,
 
5
  "reward_for_literal": 0.03,
 
6
  "partial_match_factor": 0.8
7
  }
8
  }
 
2
  "semantic_search_params": {
3
  "k": 2000,
4
  "brevity_penalty": 0.1,
5
+ "min_length": 131,
6
  "reward_for_literal": 0.03,
7
+ "first_term_reward": 20,
8
  "partial_match_factor": 0.8
9
  }
10
  }
src/semantic_search.py CHANGED
@@ -9,7 +9,9 @@ def duckdb_vss_local(
9
  query: str,
10
  k: int = 1000,
11
  brevity_penalty: float = 0.0,
 
12
  reward_for_literal: float = 0.0,
 
13
  partial_match_factor: float = 0.5,
14
  table_name: str = "maestro_vector_table",
15
  embedding_column: str = "vec",
@@ -33,10 +35,10 @@ def duckdb_vss_local(
33
  # Utilizar los parámetros "debug" para mostrar columnas intermedias:
34
  if brevity_penalty > 0:
35
  result = penalize_short_summaries(result, factor = brevity_penalty, distance_column = 'distance',
36
- summary_column = 'longBusinessSummary', debug = False)
37
  if reward_for_literal > 0:
38
  result = reward_literals(result, query, factor = reward_for_literal,
39
- partial_match_factor= partial_match_factor, distance_column = 'distance',
40
  summary_column = 'longBusinessSummary', debug = False)
41
 
42
  return result
@@ -46,7 +48,8 @@ def penalize_short_summaries(
46
  factor: float = 0.1,
47
  distance_column: str = 'distance',
48
  summary_column: str = 'longBusinessSummary',
49
- debug: bool = True
 
50
  ) -> pd.DataFrame:
51
 
52
  result_df = df.copy()
@@ -59,10 +62,14 @@ def penalize_short_summaries(
59
  result_df['percent_shorter'] = result_df['summary_length'].apply(
60
  lambda x: max(0, (avg_length - x) / avg_length)
61
  )
 
62
  result_df['orig_distance'] = result_df[distance_column]
63
- # Penalizamos en función del porcentaje en el que el resumen es más corto que la media (multiplicado por el factor)
 
 
64
  result_df[distance_column] = result_df.apply(
65
- lambda row: min(max_dist, row[distance_column] + (row['percent_shorter'] * factor)),
 
66
  axis=1
67
  )
68
 
@@ -77,6 +84,7 @@ def reward_literals(
77
  query: str,
78
  factor: float = 0.1,
79
  partial_match_factor: float = 0.5,
 
80
  distance_column: str = 'distance',
81
  summary_column: str = 'longBusinessSummary',
82
  debug: bool = True
@@ -89,6 +97,12 @@ def reward_literals(
89
  if pd.isna(summary):
90
  return 0
91
  summary_lower = str(summary).lower()
 
 
 
 
 
 
92
 
93
  # Cuenta coincidencias exactas (palabras completas)
94
  exact_pattern = r'\b' + re.escape(query_lower) + r'\b'
@@ -108,7 +122,7 @@ def reward_literals(
108
  partial_count = partial_count - exact_count
109
 
110
  # Penalizamos las coincidencias parciales:
111
- return exact_count + (partial_count * partial_match_factor)
112
 
113
  result_df['term_occurrences'] = result_df[summary_column].apply(count_phrase_occurrences)
114
  result_df['orig_distance'] = result_df[distance_column]
 
9
  query: str,
10
  k: int = 1000,
11
  brevity_penalty: float = 0.0,
12
+ min_length: int = 131,
13
  reward_for_literal: float = 0.0,
14
+ first_term_reward: float = 20.0,
15
  partial_match_factor: float = 0.5,
16
  table_name: str = "maestro_vector_table",
17
  embedding_column: str = "vec",
 
35
  # Utilizar los parámetros "debug" para mostrar columnas intermedias:
36
  if brevity_penalty > 0:
37
  result = penalize_short_summaries(result, factor = brevity_penalty, distance_column = 'distance',
38
+ summary_column = 'longBusinessSummary', min_length=min_length, debug = False)
39
  if reward_for_literal > 0:
40
  result = reward_literals(result, query, factor = reward_for_literal,
41
+ partial_match_factor= partial_match_factor, first_term_reward=first_term_reward, distance_column = 'distance',
42
  summary_column = 'longBusinessSummary', debug = False)
43
 
44
  return result
 
48
  factor: float = 0.1,
49
  distance_column: str = 'distance',
50
  summary_column: str = 'longBusinessSummary',
51
+ debug: bool = True,
52
+ min_length: int = 131
53
  ) -> pd.DataFrame:
54
 
55
  result_df = df.copy()
 
62
  result_df['percent_shorter'] = result_df['summary_length'].apply(
63
  lambda x: max(0, (avg_length - x) / avg_length)
64
  )
65
+
66
  result_df['orig_distance'] = result_df[distance_column]
67
+
68
+ # Asignar distancia máxima para resúmenes más cortos que min_length
69
+ # y aplicar penalización proporcional para el resto
70
  result_df[distance_column] = result_df.apply(
71
+ lambda row: max_dist if row['summary_length'] < min_length else
72
+ min(max_dist, row[distance_column] + (row['percent_shorter'] * factor)),
73
  axis=1
74
  )
75
 
 
84
  query: str,
85
  factor: float = 0.1,
86
  partial_match_factor: float = 0.5,
87
+ first_term_reward: float = 20.0,
88
  distance_column: str = 'distance',
89
  summary_column: str = 'longBusinessSummary',
90
  debug: bool = True
 
97
  if pd.isna(summary):
98
  return 0
99
  summary_lower = str(summary).lower()
100
+ # Extraemos la primera palabra del resumen y la limpiamos de caracteres especiales
101
+ # Por ejemplo: "Grifols, S.A. operates as a plasma therapeutic company..." -> Extrae "Grifols", no "Grifols,"
102
+ first_word = summary_lower.split()[0] if summary_lower.strip() and len(summary_lower.split()) > 0 else ""
103
+ first_term = re.sub(r'[^\w\s]', '', first_word.lower())
104
+ # Comprobamos si la primera palabra coincide con la consulta (típicamente el nombre de la empresa)
105
+ _first_term_reward = first_term_reward if first_term == query_lower else 0
106
 
107
  # Cuenta coincidencias exactas (palabras completas)
108
  exact_pattern = r'\b' + re.escape(query_lower) + r'\b'
 
122
  partial_count = partial_count - exact_count
123
 
124
  # Penalizamos las coincidencias parciales:
125
+ return _first_term_reward + exact_count + (partial_count * partial_match_factor)
126
 
127
  result_df['term_occurrences'] = result_df[summary_column].apply(count_phrase_occurrences)
128
  result_df['orig_distance'] = result_df[distance_column]