vishwask committed on
Commit
4d023c6
·
verified ·
1 Parent(s): f04f4de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -6
app.py CHANGED
@@ -61,7 +61,7 @@ def indian_to_english(sentence):
61
 
62
 
63
  llm_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"
64
-
65
 
66
  # default_persist_directory = './chroma_HF/'
67
  list_llm = ["mistralai/Mistral-7B-Instruct-v0.2", "mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.1", \
@@ -82,11 +82,14 @@ def load_doc(list_file_path, chunk_size, chunk_overlap):
82
  for loader in loaders:
83
  pages.extend(loader.load())
84
  # text_splitter = RecursiveCharacterTextSplitter(chunk_size = 600, chunk_overlap = 50)
85
- text_splitter = RecursiveCharacterTextSplitter(
86
- chunk_size = chunk_size,
87
- chunk_overlap = chunk_overlap)
 
 
88
  doc_splits = text_splitter.split_documents(pages)
89
  return doc_splits
 
90
 
91
 
92
  # Create vector database
@@ -247,9 +250,9 @@ def demo():
247
  db_btn = gr.Radio(["ChromaDB"], label="Vector database type", value = "ChromaDB", type="index", info="Choose your vector database",visible=False)
248
  with gr.Accordion("Advanced options - Document text splitter", open=False, visible=False):
249
  with gr.Row():
250
- slider_chunk_size = gr.Slider(value=20000, label="Chunk size", info="Chunk size", interactive=False, visible=False)
251
  with gr.Row():
252
- slider_chunk_overlap = gr.Slider(value=2000, label="Chunk overlap", info="Chunk overlap", interactive=False, visible=False)
253
 
254
  with gr.Accordion("Advanced options - LLM model", open=False, visible=False):
255
  with gr.Row():
 
61
 
62
 
63
  llm_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"
64
+ tokenizer_name = "thenlper/gte-small"
65
 
66
  # default_persist_directory = './chroma_HF/'
67
  list_llm = ["mistralai/Mistral-7B-Instruct-v0.2", "mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.1", \
 
82
  for loader in loaders:
83
  pages.extend(loader.load())
84
  # text_splitter = RecursiveCharacterTextSplitter(chunk_size = 600, chunk_overlap = 50)
85
+ text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
86
+ AutoTokenizer.from_pretrained(tokenizer_name),
87
+ chunk_size=chunk_size,
88
+ chunk_overlap=chunk_overlap,
89
+ strip_whitespace=True)
90
  doc_splits = text_splitter.split_documents(pages)
91
  return doc_splits
92
+
93
 
94
 
95
  # Create vector database
 
250
  db_btn = gr.Radio(["ChromaDB"], label="Vector database type", value = "ChromaDB", type="index", info="Choose your vector database",visible=False)
251
  with gr.Accordion("Advanced options - Document text splitter", open=False, visible=False):
252
  with gr.Row():
253
+ slider_chunk_size = gr.Slider(value=512, label="Chunk size", info="Chunk size", interactive=False, visible=False)
254
  with gr.Row():
255
+ slider_chunk_overlap = gr.Slider(value=128, label="Chunk overlap", info="Chunk overlap", interactive=False, visible=False)
256
 
257
  with gr.Accordion("Advanced options - LLM model", open=False, visible=False):
258
  with gr.Row():