ZoniaChatbot committed on
Commit
ea1f5e9
·
verified ·
1 Parent(s): 85f038b

Update chatpdf.py

Browse files
Files changed (1) hide show
  1. chatpdf.py +11 -26
chatpdf.py CHANGED
@@ -139,7 +139,7 @@ class ChatPDF:
139
  self,
140
  similarity_model: SimilarityABC = None,
141
  generate_model_type: str = "auto",
142
- generate_model_name_or_path: str = "Qwen/Qwen2-0.5B-Instruct",
143
  lora_model_name_or_path: str = None,
144
  corpus_files: Union[str, List[str]] = None,
145
  save_corpus_emb_dir: str = "corpus_embs/",
@@ -188,7 +188,7 @@ class ChatPDF:
188
  if similarity_model is not None:
189
  self.sim_model = similarity_model
190
  else:
191
- m1 = BertSimilarity(model_name_or_path="shibing624/text2vec-base-multilingual", device=self.device)
192
  m2 = BM25Similarity()
193
  default_sim_model = EnsembleSimilarity(similarities=[m1, m2], weights=[0.5, 0.5], c=2)
194
  self.sim_model = default_sim_model
@@ -420,28 +420,13 @@ class ChatPDF:
420
  """
421
  reference_results = []
422
  sim_contents = self.sim_model.most_similar(query, topn=self.similarity_top_k)
423
-
424
- # Verificar si sim_contents es una lista o un diccionario
425
- if isinstance(sim_contents, list):
426
- for item in sim_contents:
427
- # Ajustar según la estructura real de item
428
- corpus_id = item[0] if isinstance(item, (list, tuple)) else item # Asegurarse de que corpus_id sea el valor correcto
429
- if isinstance(corpus_id, dict):
430
- # Extraer el valor necesario si corpus_id es un diccionario
431
- corpus_id = next(iter(corpus_id.keys())) # Tomar la primera clave como ejemplo
432
- if corpus_id in self.sim_model.corpus:
433
- hit_chunk = self.sim_model.corpus[corpus_id]
434
- reference_results.append(hit_chunk)
435
-
436
- elif isinstance(sim_contents, dict):
437
- for query_id, id_score_dict in sim_contents.items():
438
- for corpus_id, s in id_score_dict.items():
439
- if corpus_id in self.sim_model.corpus:
440
- hit_chunk = self.sim_model.corpus[corpus_id]
441
- reference_results.append(hit_chunk)
442
- else:
443
- logger.error(f"Unexpected type for sim_contents: {type(sim_contents)}")
444
-
445
 
446
  if reference_results:
447
  if self.rerank_model is not None:
@@ -579,9 +564,9 @@ class ChatPDF:
579
 
580
  if __name__ == "__main__":
581
  parser = argparse.ArgumentParser()
582
- parser.add_argument("--sim_model_name", type=str, default="shibing624/text2vec-base-multilingual")
583
  parser.add_argument("--gen_model_type", type=str, default="auto")
584
- parser.add_argument("--gen_model_name", type=str, default="Qwen/Qwen2-0.5B-Instruct")
585
  parser.add_argument("--lora_model", type=str, default=None)
586
  parser.add_argument("--rerank_model_name", type=str, default="")
587
  parser.add_argument("--corpus_files", type=str, default="Acuerdo009.pdf")
 
139
  self,
140
  similarity_model: SimilarityABC = None,
141
  generate_model_type: str = "auto",
142
+ generate_model_name_or_path: str = "LenguajeNaturalAI/leniachat-qwen2-1.5B-v0",
143
  lora_model_name_or_path: str = None,
144
  corpus_files: Union[str, List[str]] = None,
145
  save_corpus_emb_dir: str = "corpus_embs/",
 
188
  if similarity_model is not None:
189
  self.sim_model = similarity_model
190
  else:
191
+ m1 = BertSimilarity(model_name_or_path="jaimevera1107/all-MiniLM-L6-v2-similarity-es", device=self.device)
192
  m2 = BM25Similarity()
193
  default_sim_model = EnsembleSimilarity(similarities=[m1, m2], weights=[0.5, 0.5], c=2)
194
  self.sim_model = default_sim_model
 
420
  """
421
  reference_results = []
422
  sim_contents = self.sim_model.most_similar(query, topn=self.similarity_top_k)
423
+ # Get reference results from corpus
424
+ hit_chunk_dict = dict()
425
+ for query_id, id_score_dict in sim_contents.items():
426
+ for corpus_id, s in id_score_dict.items():
427
+ hit_chunk = self.sim_model.corpus[corpus_id]
428
+ reference_results.append(hit_chunk)
429
+ hit_chunk_dict[corpus_id] = hit_chunk
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
430
 
431
  if reference_results:
432
  if self.rerank_model is not None:
 
564
 
565
  if __name__ == "__main__":
566
  parser = argparse.ArgumentParser()
567
+ parser.add_argument("--sim_model_name", type=str, default="jaimevera1107/all-MiniLM-L6-v2-similarity-es")
568
  parser.add_argument("--gen_model_type", type=str, default="auto")
569
+ parser.add_argument("--gen_model_name", type=str, default="LenguajeNaturalAI/leniachat-qwen2-1.5B-v0")
570
  parser.add_argument("--lora_model", type=str, default=None)
571
  parser.add_argument("--rerank_model_name", type=str, default="")
572
  parser.add_argument("--corpus_files", type=str, default="Acuerdo009.pdf")