Stéphanie Kamgnia Wonkap committed
Commit: 7469d7c
1 Parent(s): 3e70cf2

fixing app.py

Files changed (2):
  1. app.py +3 -3
  2. src/data_preparation.py +17 -13
app.py CHANGED
@@ -61,16 +61,16 @@ def main():
             " ",
             "",]
         st.session_state.docs_processed = split_documents(
-            512, # We choose a chunk size adapted to our model
+            400, # We choose a chunk size adapted to our model
             st.session_state.raw_document_base,
             #tokenizer_name=EMBEDDING_MODEL_NAME,
             separator=st.session_state.MARKDOWN_SEPARATORS
         )
-        st.session_state.embedding_model=NVIDIAEmbeddings()
+        st.session_state.embedding_model=NVIDIAEmbeddings(model="NV-Embed-QA", truncate="END")
         st.session_state.KNOWLEDGE_VECTOR_DATABASE= init_vectorDB_from_doc(st.session_state.docs_processed,
                                                                            st.session_state.embedding_model)
         if (user_query) and (st.button("Get Answer")):
-            num_doc_before_rerank=15
+            num_doc_before_rerank=5
             st.session_state.retriever= st.session_state.KNOWLEDGE_VECTOR_DATABASE.as_retriever(search_type="similarity",
                                                                                                 search_kwargs={"k": num_doc_before_rerank})
 
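For context: this commit shrinks the chunk size from 512 to 400, pins the NVIDIA embedding model explicitly instead of using the client default, and cuts the candidate pool fetched before reranking from 15 to 5. A minimal sketch of the resulting setup is below; it assumes the vector store behind init_vectorDB_from_doc is FAISS (the helper's body is not shown in this diff) and that NVIDIA_API_KEY is set in the environment.

# Sketch of the new embedding/retrieval configuration, not the repo's exact code.
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings

# truncate="END" asks the endpoint to clip over-long passages instead of
# rejecting them; a safety net on top of the smaller 400-character chunks.
embedding_model = NVIDIAEmbeddings(model="NV-Embed-QA", truncate="END")

docs_processed = [Document(page_content="NVIDIA NIM serves embedding models.")]
vector_db = FAISS.from_documents(docs_processed, embedding_model)

# Fetch only 5 candidates for the reranker (previously 15).
num_doc_before_rerank = 5
retriever = vector_db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": num_doc_before_rerank},
)
docs = retriever.invoke("Which embedding model does the app use?")

NV-Embed-QA accepts at most 512 tokens per passage, which is presumably why the chunk size was lowered and truncation enabled together.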
src/data_preparation.py CHANGED
@@ -12,31 +12,35 @@ from typing import List, Optional
 #from langchain import HuggingFacePipeline
 #from langchain.chains import RetrievalQA
 
-EMBEDDING_MODEL_NAME = "OrdalieTech/Solon-embeddings-large-0.1"
+#EMBEDDING_MODEL_NAME = "OrdalieTech/Solon-embeddings-large-0.1"
 
 
 def split_documents(
     chunk_size: int,
     knowledge_base: List[LangchainDocument],
-    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
+    #tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
     separator:List[str]=None,
 ) -> List[LangchainDocument]:
     """
     Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
     """
-    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
-        AutoTokenizer.from_pretrained(tokenizer_name),
-        chunk_size=chunk_size,
-        chunk_overlap=int(chunk_size / 10),
-        add_start_index=True,
-        strip_whitespace=True,
-        separators=separator,
-    )
+    #text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+    #    AutoTokenizer.from_pretrained(tokenizer_name),
+    #    chunk_size=chunk_size,
+    #    chunk_overlap=int(chunk_size / 10),
+    #    add_start_index=True,
+    #    strip_whitespace=True,
+    #    separators=separator,
+    #)
+    text_splitter= RecursiveCharacterTextSplitter( chunk_size=chunk_size,
+        chunk_overlap=int(chunk_size / 10),
+        strip_whitespace=True,
+        separators=separator)
 
     docs_processed = []
-    for doc in knowledge_base:
-        docs_processed += text_splitter.split_documents([doc])
-
+    #for doc in knowledge_base:
+    #    docs_processed += text_splitter.split_documents([doc])
+    docs_processed=text_splitter.split_documents(knowledge_base)
     # Remove duplicates
     unique_texts = {}
     docs_processed_unique = []
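This change swaps token-aware splitting (which loaded the Solon embedding model's tokenizer via AutoTokenizer) for plain character counting, so chunk_size is now measured in characters rather than tokens, and the per-document loop is replaced by a single split_documents() call over the whole list. A minimal sketch of the new behavior follows; the separator list here is an assumption, since only the tail of the app's MARKDOWN_SEPARATORS is visible in the diff above.

# Sketch of the character-based splitting now done inside split_documents.
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

MARKDOWN_SEPARATORS = ["\n\n", "\n", " ", ""]  # assumed; only the tail is shown in the diff

chunk_size = 400  # now counted in characters, not tokens
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=int(chunk_size / 10),  # 40-character overlap between chunks
    strip_whitespace=True,
    separators=MARKDOWN_SEPARATORS,
)

knowledge_base = [Document(page_content="word " * 300)]
# One call handles the whole list, replacing the removed per-document loop.
docs_processed = text_splitter.split_documents(knowledge_base)
assert all(len(d.page_content) <= chunk_size for d in docs_processed)

A 400-character chunk is comfortably under NV-Embed-QA's 512-token input limit, so the truncate="END" set in app.py should rarely need to fire. Note the docstring still says "tokens"; after this commit it effectively means characters.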