Spaces:
Sleeping
Sleeping
Update chatpdf.py
Browse files- chatpdf.py +8 -6
chatpdf.py
CHANGED
@@ -127,7 +127,7 @@ class Rag:
|
|
127 |
self,
|
128 |
similarity_model: SimilarityABC = None,
|
129 |
generate_model_type: str = "auto",
|
130 |
-
generate_model_name_or_path: str = "
|
131 |
lora_model_name_or_path: str = None,
|
132 |
corpus_files: Union[str, List[str]] = None,
|
133 |
save_corpus_emb_dir: str = "corpus_embs/",
|
@@ -176,7 +176,7 @@ class Rag:
|
|
176 |
if similarity_model is not None:
|
177 |
self.sim_model = similarity_model
|
178 |
else:
|
179 |
-
m1 = BertSimilarity(model_name_or_path="
|
180 |
m2 = BM25Similarity()
|
181 |
default_sim_model = EnsembleSimilarity(similarities=[m1, m2], weights=[0.5, 0.5], c=2)
|
182 |
self.sim_model = default_sim_model
|
@@ -193,7 +193,7 @@ class Rag:
|
|
193 |
self.add_corpus(corpus_files)
|
194 |
self.save_corpus_emb_dir = save_corpus_emb_dir
|
195 |
if rerank_model_name_or_path is None:
|
196 |
-
rerank_model_name_or_path = "BAAI/bge-reranker-
|
197 |
if rerank_model_name_or_path:
|
198 |
self.rerank_tokenizer = AutoTokenizer.from_pretrained(rerank_model_name_or_path)
|
199 |
self.rerank_model = AutoModelForSequenceClassification.from_pretrained(rerank_model_name_or_path)
|
@@ -341,6 +341,8 @@ class Rag:
|
|
341 |
raw_text = [text.strip() for text in page_text.splitlines() if text.strip()]
|
342 |
new_text = ''
|
343 |
for text in raw_text:
|
|
|
|
|
344 |
new_text += text
|
345 |
if text[-1] in ['.', '!', '?', '。', '!', '?', '…', ';', ';', ':', ':', '”', '’', ')', '】', '》', '」',
|
346 |
'』', '〕', '〉', '》', '〗', '〞', '〟', '»', '"', "'", ')', ']', '}']:
|
@@ -521,12 +523,12 @@ class Rag:
|
|
521 |
|
522 |
if __name__ == "__main__":
|
523 |
parser = argparse.ArgumentParser()
|
524 |
-
parser.add_argument("--sim_model_name", type=str, default="
|
525 |
parser.add_argument("--gen_model_type", type=str, default="auto")
|
526 |
-
parser.add_argument("--gen_model_name", type=str, default="
|
527 |
parser.add_argument("--lora_model", type=str, default=None)
|
528 |
parser.add_argument("--rerank_model_name", type=str, default="")
|
529 |
-
parser.add_argument("--corpus_files", type=str, default="
|
530 |
parser.add_argument("--device", type=str, default=None)
|
531 |
parser.add_argument("--int4", action='store_true', help="use int4 quantization")
|
532 |
parser.add_argument("--int8", action='store_true', help="use int8 quantization")
|
|
|
127 |
self,
|
128 |
similarity_model: SimilarityABC = None,
|
129 |
generate_model_type: str = "auto",
|
130 |
+
generate_model_name_or_path: str = "Qwen/Qwen2-0.5B-Instruct",
|
131 |
lora_model_name_or_path: str = None,
|
132 |
corpus_files: Union[str, List[str]] = None,
|
133 |
save_corpus_emb_dir: str = "corpus_embs/",
|
|
|
176 |
if similarity_model is not None:
|
177 |
self.sim_model = similarity_model
|
178 |
else:
|
179 |
+
m1 = BertSimilarity(model_name_or_path="hiiamsid/sentence_similarity_spanish_es", device=self.device)
|
180 |
m2 = BM25Similarity()
|
181 |
default_sim_model = EnsembleSimilarity(similarities=[m1, m2], weights=[0.5, 0.5], c=2)
|
182 |
self.sim_model = default_sim_model
|
|
|
193 |
self.add_corpus(corpus_files)
|
194 |
self.save_corpus_emb_dir = save_corpus_emb_dir
|
195 |
if rerank_model_name_or_path is None:
|
196 |
+
rerank_model_name_or_path = "BAAI/bge-reranker-large"
|
197 |
if rerank_model_name_or_path:
|
198 |
self.rerank_tokenizer = AutoTokenizer.from_pretrained(rerank_model_name_or_path)
|
199 |
self.rerank_model = AutoModelForSequenceClassification.from_pretrained(rerank_model_name_or_path)
|
|
|
341 |
raw_text = [text.strip() for text in page_text.splitlines() if text.strip()]
|
342 |
new_text = ''
|
343 |
for text in raw_text:
|
344 |
+
if new_text:
|
345 |
+
new_text += ' '
|
346 |
new_text += text
|
347 |
if text[-1] in ['.', '!', '?', '。', '!', '?', '…', ';', ';', ':', ':', '”', '’', ')', '】', '》', '」',
|
348 |
'』', '〕', '〉', '》', '〗', '〞', '〟', '»', '"', "'", ')', ']', '}']:
|
|
|
523 |
|
524 |
if __name__ == "__main__":
|
525 |
parser = argparse.ArgumentParser()
|
526 |
+
parser.add_argument("--sim_model_name", type=str, default="hiiamsid/sentence_similarity_spanish_es")
|
527 |
parser.add_argument("--gen_model_type", type=str, default="auto")
|
528 |
+
parser.add_argument("--gen_model_name", type=str, default="Qwen/Qwen2-0.5B-Instruct")
|
529 |
parser.add_argument("--lora_model", type=str, default=None)
|
530 |
parser.add_argument("--rerank_model_name", type=str, default="")
|
531 |
+
parser.add_argument("--corpus_files", type=str, default="Acuerdo009.pdf")
|
532 |
parser.add_argument("--device", type=str, default=None)
|
533 |
parser.add_argument("--int4", action='store_true', help="use int4 quantization")
|
534 |
parser.add_argument("--int8", action='store_true', help="use int8 quantization")
|