Spaces: Sleeping
Update chatpdf.py
Browse files
- chatpdf.py +11 -26
chatpdf.py
CHANGED
@@ -139,7 +139,7 @@ class ChatPDF:
|
|
139 |
self,
|
140 |
similarity_model: SimilarityABC = None,
|
141 |
generate_model_type: str = "auto",
|
142 |
-
generate_model_name_or_path: str = "
|
143 |
lora_model_name_or_path: str = None,
|
144 |
corpus_files: Union[str, List[str]] = None,
|
145 |
save_corpus_emb_dir: str = "corpus_embs/",
|
@@ -188,7 +188,7 @@ class ChatPDF:
|
|
188 |
if similarity_model is not None:
|
189 |
self.sim_model = similarity_model
|
190 |
else:
|
191 |
-
m1 = BertSimilarity(model_name_or_path="
|
192 |
m2 = BM25Similarity()
|
193 |
default_sim_model = EnsembleSimilarity(similarities=[m1, m2], weights=[0.5, 0.5], c=2)
|
194 |
self.sim_model = default_sim_model
|
@@ -420,28 +420,13 @@ class ChatPDF:
|
|
420 |
"""
|
421 |
reference_results = []
|
422 |
sim_contents = self.sim_model.most_similar(query, topn=self.similarity_top_k)
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
for
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
# Extraer el valor necesario si corpus_id es un diccionario
|
431 |
-
corpus_id = next(iter(corpus_id.keys())) # Tomar la primera clave como ejemplo
|
432 |
-
if corpus_id in self.sim_model.corpus:
|
433 |
-
hit_chunk = self.sim_model.corpus[corpus_id]
|
434 |
-
reference_results.append(hit_chunk)
|
435 |
-
|
436 |
-
elif isinstance(sim_contents, dict):
|
437 |
-
for query_id, id_score_dict in sim_contents.items():
|
438 |
-
for corpus_id, s in id_score_dict.items():
|
439 |
-
if corpus_id in self.sim_model.corpus:
|
440 |
-
hit_chunk = self.sim_model.corpus[corpus_id]
|
441 |
-
reference_results.append(hit_chunk)
|
442 |
-
else:
|
443 |
-
logger.error(f"Unexpected type for sim_contents: {type(sim_contents)}")
|
444 |
-
|
445 |
|
446 |
if reference_results:
|
447 |
if self.rerank_model is not None:
|
@@ -579,9 +564,9 @@ class ChatPDF:
|
|
579 |
|
580 |
if __name__ == "__main__":
|
581 |
parser = argparse.ArgumentParser()
|
582 |
-
parser.add_argument("--sim_model_name", type=str, default="
|
583 |
parser.add_argument("--gen_model_type", type=str, default="auto")
|
584 |
-
parser.add_argument("--gen_model_name", type=str, default="
|
585 |
parser.add_argument("--lora_model", type=str, default=None)
|
586 |
parser.add_argument("--rerank_model_name", type=str, default="")
|
587 |
parser.add_argument("--corpus_files", type=str, default="Acuerdo009.pdf")
|
|
|
139 |
self,
|
140 |
similarity_model: SimilarityABC = None,
|
141 |
generate_model_type: str = "auto",
|
142 |
+
generate_model_name_or_path: str = "LenguajeNaturalAI/leniachat-qwen2-1.5B-v0",
|
143 |
lora_model_name_or_path: str = None,
|
144 |
corpus_files: Union[str, List[str]] = None,
|
145 |
save_corpus_emb_dir: str = "corpus_embs/",
|
|
|
188 |
if similarity_model is not None:
|
189 |
self.sim_model = similarity_model
|
190 |
else:
|
191 |
+
m1 = BertSimilarity(model_name_or_path="jaimevera1107/all-MiniLM-L6-v2-similarity-es", device=self.device)
|
192 |
m2 = BM25Similarity()
|
193 |
default_sim_model = EnsembleSimilarity(similarities=[m1, m2], weights=[0.5, 0.5], c=2)
|
194 |
self.sim_model = default_sim_model
|
|
|
420 |
"""
|
421 |
reference_results = []
|
422 |
sim_contents = self.sim_model.most_similar(query, topn=self.similarity_top_k)
|
423 |
+
# Get reference results from corpus
|
424 |
+
hit_chunk_dict = dict()
|
425 |
+
for query_id, id_score_dict in sim_contents.items():
|
426 |
+
for corpus_id, s in id_score_dict.items():
|
427 |
+
hit_chunk = self.sim_model.corpus[corpus_id]
|
428 |
+
reference_results.append(hit_chunk)
|
429 |
+
hit_chunk_dict[corpus_id] = hit_chunk
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
430 |
|
431 |
if reference_results:
|
432 |
if self.rerank_model is not None:
|
|
|
564 |
|
565 |
if __name__ == "__main__":
|
566 |
parser = argparse.ArgumentParser()
|
567 |
+
parser.add_argument("--sim_model_name", type=str, default="jaimevera1107/all-MiniLM-L6-v2-similarity-es")
|
568 |
parser.add_argument("--gen_model_type", type=str, default="auto")
|
569 |
+
parser.add_argument("--gen_model_name", type=str, default="LenguajeNaturalAI/leniachat-qwen2-1.5B-v0")
|
570 |
parser.add_argument("--lora_model", type=str, default=None)
|
571 |
parser.add_argument("--rerank_model_name", type=str, default="")
|
572 |
parser.add_argument("--corpus_files", type=str, default="Acuerdo009.pdf")
|