antagonico committed on
Commit
5c96576
1 Parent(s): 11a4a60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -15
app.py CHANGED
@@ -16,26 +16,46 @@ collection = chroma_client.create_collection(name="my_collection")
16
  def generate_hash(text):
17
  return hashlib.md5(text.encode('utf-8')).hexdigest()
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  # Función para obtener embeddings del modelo
20
  def get_embeddings(sentences):
 
21
  embeddings = []
 
22
  for sentence in sentences:
23
- sentence_hash = generate_hash(sentence)
24
- # Verificar si el embedding ya está en la base de datos
25
- results = collection.query(query_texts=[sentence], n_results=1)
26
- if results and isinstance(results, list) and results[0].get('embedding'):
27
- embeddings.append(np.array(results[0]['embedding']))
28
  else:
29
- # Si no está en la base de datos, calcula el embedding y lo almacena
30
- embedding = model.encode(sentence, show_progress_bar=False)
31
- collection.add(
32
- embeddings=[embedding.tolist()],
33
- documents=[sentence],
34
- metadatas=[{"source": "my_source"}],
35
- ids=[sentence_hash] # Usa el hash como ID
36
- )
37
- embeddings.append(embedding)
38
- return np.array(embeddings)
 
39
 
40
  # Función para comparar las sentencias
41
  def calculate_similarity(args):
@@ -69,3 +89,5 @@ iface = gr.Interface(
69
 
70
  # Inicia la interfaz de Gradio
71
  iface.launch()
 
 
 
16
  def generate_hash(text):
17
  return hashlib.md5(text.encode('utf-8')).hexdigest()
18
 
19
+ # Función para obtener embeddings del modelo
20
+ import sqlite3
21
+ import gradio as gr
22
+ from sentence_transformers import SentenceTransformer
23
+ from sklearn.metrics.pairwise import cosine_similarity
24
+ import numpy as np
25
+ import multiprocessing
26
+
27
+ # Inicializa la base de datos y crea la tabla si no existe
28
+ conn = sqlite3.connect('embeddings.db')
29
+ c = conn.cursor()
30
+ c.execute('''CREATE TABLE IF NOT EXISTS embeddings
31
+ (sentence TEXT PRIMARY KEY, embedding BLOB)''')
32
+ conn.commit()
33
+
34
+ # Carga el modelo
35
+ model = SentenceTransformer('Maite89/Roberta_finetuning_semantic_similarity_stsb_multi_mt')
36
+
37
  # Función para obtener embeddings del modelo
38
  def get_embeddings(sentences):
39
+ # Intenta recuperar los embeddings de la base de datos
40
  embeddings = []
41
+ new_sentences = []
42
  for sentence in sentences:
43
+ c.execute('SELECT embedding FROM embeddings WHERE sentence=?', (sentence,))
44
+ result = c.fetchone()
45
+ if result:
46
+ embeddings.append(np.frombuffer(result[0], dtype=np.float32))
 
47
  else:
48
+ new_sentences.append(sentence)
49
+
50
+ # Si hay nuevas sentencias, obtén los embeddings y almacénalos en la base de datos
51
+ if new_sentences:
52
+ new_embeddings = model.encode(new_sentences, show_progress_bar=False)
53
+ embeddings.extend(new_embeddings)
54
+ c.executemany('INSERT INTO embeddings VALUES (?,?)',
55
+ [(sent, emb.tobytes()) for sent, emb in zip(new_sentences, new_embeddings)])
56
+ conn.commit()
57
+
58
+ return embeddings
59
 
60
  # Función para comparar las sentencias
61
  def calculate_similarity(args):
 
89
 
90
  # Inicia la interfaz de Gradio
91
  iface.launch()
92
+ conn.close()
93
+