Update app.py
Browse files
app.py
CHANGED
@@ -61,7 +61,7 @@ def indian_to_english(sentence):
|
|
61 |
|
62 |
|
63 |
llm_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
64 |
-
|
65 |
|
66 |
# default_persist_directory = './chroma_HF/'
|
67 |
list_llm = ["mistralai/Mistral-7B-Instruct-v0.2", "mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.1", \
|
@@ -82,11 +82,14 @@ def load_doc(list_file_path, chunk_size, chunk_overlap):
|
|
82 |
for loader in loaders:
|
83 |
pages.extend(loader.load())
|
84 |
# text_splitter = RecursiveCharacterTextSplitter(chunk_size = 600, chunk_overlap = 50)
|
85 |
-
text_splitter = RecursiveCharacterTextSplitter(
|
86 |
-
|
87 |
-
|
|
|
|
|
88 |
doc_splits = text_splitter.split_documents(pages)
|
89 |
return doc_splits
|
|
|
90 |
|
91 |
|
92 |
# Create vector database
|
@@ -247,9 +250,9 @@ def demo():
|
|
247 |
db_btn = gr.Radio(["ChromaDB"], label="Vector database type", value = "ChromaDB", type="index", info="Choose your vector database",visible=False)
|
248 |
with gr.Accordion("Advanced options - Document text splitter", open=False, visible=False):
|
249 |
with gr.Row():
|
250 |
-
slider_chunk_size = gr.Slider(value=
|
251 |
with gr.Row():
|
252 |
-
slider_chunk_overlap = gr.Slider(value=
|
253 |
|
254 |
with gr.Accordion("Advanced options - LLM model", open=False, visible=False):
|
255 |
with gr.Row():
|
|
|
61 |
|
62 |
|
63 |
llm_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
64 |
+
tokenizer_name = "thenlper/gte-small"
|
65 |
|
66 |
# default_persist_directory = './chroma_HF/'
|
67 |
list_llm = ["mistralai/Mistral-7B-Instruct-v0.2", "mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.1", \
|
|
|
82 |
for loader in loaders:
|
83 |
pages.extend(loader.load())
|
84 |
# text_splitter = RecursiveCharacterTextSplitter(chunk_size = 600, chunk_overlap = 50)
|
85 |
+
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
|
86 |
+
AutoTokenizer.from_pretrained(tokenizer_name),
|
87 |
+
chunk_size=chunk_size,
|
88 |
+
chunk_overlap=chunk_overlap,
|
89 |
+
strip_whitespace=True)
|
90 |
doc_splits = text_splitter.split_documents(pages)
|
91 |
return doc_splits
|
92 |
+
|
93 |
|
94 |
|
95 |
# Create vector database
|
|
|
250 |
db_btn = gr.Radio(["ChromaDB"], label="Vector database type", value = "ChromaDB", type="index", info="Choose your vector database",visible=False)
|
251 |
with gr.Accordion("Advanced options - Document text splitter", open=False, visible=False):
|
252 |
with gr.Row():
|
253 |
+
slider_chunk_size = gr.Slider(value=512, label="Chunk size", info="Chunk size", interactive=False, visible=False)
|
254 |
with gr.Row():
|
255 |
+
slider_chunk_overlap = gr.Slider(value=128, label="Chunk overlap", info="Chunk overlap", interactive=False, visible=False)
|
256 |
|
257 |
with gr.Accordion("Advanced options - LLM model", open=False, visible=False):
|
258 |
with gr.Row():
|