fracapuano committed
Commit cda0f94 · 1 Parent(s): 51a7497

add: multi-chunksize splitter for better semantic precision

Files changed (1)
  1. qa/utils.py +18 -18
qa/utils.py CHANGED
@@ -137,22 +137,22 @@ def text_to_docs(pages: Union[Text, Tuple[Text]], **kwargs) -> List[HashDocument
 
     # Split pages into chunks
     doc_chunks = []
-    # Get the text splitter
-    text_splitter = get_text_splitter()
-
-    for doc in page_docs:
-        # this splits the page into chunks
-        chunks = text_splitter.split_text(doc.page_content)
-        for i, chunk in enumerate(chunks):
-            # Create a new document for each individual chunk
-            new_doc = HashDocument(
-                page_content=chunk,
-                metadata={"file_name": doc.metadata["file_name"], "page": doc.metadata["page"], "chunk": i}
-            )
-            # Add sources to metadata for retrieval later on
-            new_doc.metadata["source"] = \
-                f"{new_doc.metadata['file_name']}/Page-{new_doc.metadata['page']}/Chunk-{new_doc.metadata['chunk']}"
-            doc_chunks.append(new_doc)
+    for ntokens in [50,250,500,750]:
+        # Get the text splitter
+        text_splitter = get_text_splitter(chunk_size=ntokens, chunk_overlap=ntokens//10)
+        for doc in page_docs:
+            # this splits the page into chunks
+            chunks = text_splitter.split_text(doc.page_content)
+            for i, chunk in enumerate(chunks):
+                # Create a new document for each individual chunk
+                new_doc = HashDocument(
+                    page_content=chunk,
+                    metadata={"file_name": doc.metadata["file_name"], "page": doc.metadata["page"], "chunk": i}
+                )
+                # Add sources to metadata for retrieval later on
+                new_doc.metadata["source"] = \
+                    f"{new_doc.metadata['file_name']}/Page-{new_doc.metadata['page']}/Chunk-{new_doc.metadata['chunk']}/Chunksize-{ntokens}"
+                doc_chunks.append(new_doc)
 
     return doc_chunks
 
@@ -193,8 +193,8 @@ def get_answer(
     chain = load_qa_with_sources_chain(
         ChatOpenAI(temperature=0, openai_api_key=st.session_state.get("OPENAI_API_KEY"), model=model, streaming=stream_answer),
         chain_type="stuff",
-        prompt=STUFF_PROMPT
-        # verbose=True,
+        prompt=STUFF_PROMPT,
+        verbose=True,
         # chain_type_kwargs={
         #     "verbose": True,
         #     "prompt": query,
 