ZoniaChatbot committed on
Commit
678969c
·
verified ·
1 Parent(s): ec18a4a

Update chatpdf.py

Browse files
Files changed (1) hide show
  1. chatpdf.py +8 -6
chatpdf.py CHANGED
@@ -127,7 +127,7 @@ class Rag:
127
  self,
128
  similarity_model: SimilarityABC = None,
129
  generate_model_type: str = "auto",
130
- generate_model_name_or_path: str = "LenguajeNaturalAI/leniachat-qwen2-1.5B-v0",
131
  lora_model_name_or_path: str = None,
132
  corpus_files: Union[str, List[str]] = None,
133
  save_corpus_emb_dir: str = "corpus_embs/",
@@ -176,7 +176,7 @@ class Rag:
176
  if similarity_model is not None:
177
  self.sim_model = similarity_model
178
  else:
179
- m1 = BertSimilarity(model_name_or_path="shibing624/text2vec-base-multilingual", device=self.device)
180
  m2 = BM25Similarity()
181
  default_sim_model = EnsembleSimilarity(similarities=[m1, m2], weights=[0.5, 0.5], c=2)
182
  self.sim_model = default_sim_model
@@ -193,7 +193,7 @@ class Rag:
193
  self.add_corpus(corpus_files)
194
  self.save_corpus_emb_dir = save_corpus_emb_dir
195
  if rerank_model_name_or_path is None:
196
- rerank_model_name_or_path = "BAAI/bge-reranker-base"
197
  if rerank_model_name_or_path:
198
  self.rerank_tokenizer = AutoTokenizer.from_pretrained(rerank_model_name_or_path)
199
  self.rerank_model = AutoModelForSequenceClassification.from_pretrained(rerank_model_name_or_path)
@@ -341,6 +341,8 @@ class Rag:
341
  raw_text = [text.strip() for text in page_text.splitlines() if text.strip()]
342
  new_text = ''
343
  for text in raw_text:
 
 
344
  new_text += text
345
  if text[-1] in ['.', '!', '?', '。', '!', '?', '…', ';', ';', ':', ':', '”', '’', ')', '】', '》', '」',
346
  '』', '〕', '〉', '》', '〗', '〞', '〟', '»', '"', "'", ')', ']', '}']:
@@ -521,12 +523,12 @@ class Rag:
521
 
522
  if __name__ == "__main__":
523
  parser = argparse.ArgumentParser()
524
- parser.add_argument("--sim_model_name", type=str, default="shibing624/text2vec-base-multilingual")
525
  parser.add_argument("--gen_model_type", type=str, default="auto")
526
- parser.add_argument("--gen_model_name", type=str, default="LenguajeNaturalAI/leniachat-qwen2-1.5B-v0")
527
  parser.add_argument("--lora_model", type=str, default=None)
528
  parser.add_argument("--rerank_model_name", type=str, default="")
529
- parser.add_argument("--corpus_files", type=str, default="data/sample.pdf")
530
  parser.add_argument("--device", type=str, default=None)
531
  parser.add_argument("--int4", action='store_true', help="use int4 quantization")
532
  parser.add_argument("--int8", action='store_true', help="use int8 quantization")
 
127
  self,
128
  similarity_model: SimilarityABC = None,
129
  generate_model_type: str = "auto",
130
+ generate_model_name_or_path: str = "Qwen/Qwen2-0.5B-Instruct",
131
  lora_model_name_or_path: str = None,
132
  corpus_files: Union[str, List[str]] = None,
133
  save_corpus_emb_dir: str = "corpus_embs/",
 
176
  if similarity_model is not None:
177
  self.sim_model = similarity_model
178
  else:
179
+ m1 = BertSimilarity(model_name_or_path="hiiamsid/sentence_similarity_spanish_es", device=self.device)
180
  m2 = BM25Similarity()
181
  default_sim_model = EnsembleSimilarity(similarities=[m1, m2], weights=[0.5, 0.5], c=2)
182
  self.sim_model = default_sim_model
 
193
  self.add_corpus(corpus_files)
194
  self.save_corpus_emb_dir = save_corpus_emb_dir
195
  if rerank_model_name_or_path is None:
196
+ rerank_model_name_or_path = "BAAI/bge-reranker-large"
197
  if rerank_model_name_or_path:
198
  self.rerank_tokenizer = AutoTokenizer.from_pretrained(rerank_model_name_or_path)
199
  self.rerank_model = AutoModelForSequenceClassification.from_pretrained(rerank_model_name_or_path)
 
341
  raw_text = [text.strip() for text in page_text.splitlines() if text.strip()]
342
  new_text = ''
343
  for text in raw_text:
344
+ if new_text:
345
+ new_text += ' '
346
  new_text += text
347
  if text[-1] in ['.', '!', '?', '。', '!', '?', '…', ';', ';', ':', ':', '”', '’', ')', '】', '》', '」',
348
  '』', '〕', '〉', '》', '〗', '〞', '〟', '»', '"', "'", ')', ']', '}']:
 
523
 
524
  if __name__ == "__main__":
525
  parser = argparse.ArgumentParser()
526
+ parser.add_argument("--sim_model_name", type=str, default="hiiamsid/sentence_similarity_spanish_es")
527
  parser.add_argument("--gen_model_type", type=str, default="auto")
528
+ parser.add_argument("--gen_model_name", type=str, default="Qwen/Qwen2-0.5B-Instruct")
529
  parser.add_argument("--lora_model", type=str, default=None)
530
  parser.add_argument("--rerank_model_name", type=str, default="")
531
+ parser.add_argument("--corpus_files", type=str, default="Acuerdo009.pdf")
532
  parser.add_argument("--device", type=str, default=None)
533
  parser.add_argument("--int4", action='store_true', help="use int4 quantization")
534
  parser.add_argument("--int8", action='store_true', help="use int8 quantization")