fracapuano committed
Commit · cda0f94 · 1 Parent(s): 51a7497
add: multi-chunksize splitter for better semantic precision
qa/utils.py +18 -18
qa/utils.py CHANGED
@@ -137,22 +137,22 @@ def text_to_docs(pages: Union[Text, Tuple[Text]], **kwargs) -> List[HashDocument
 
     # Split pages into chunks
     doc_chunks = []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    for ntokens in [50, 250, 500, 750]:
+        # Get the text splitter
+        text_splitter = get_text_splitter(chunk_size=ntokens, chunk_overlap=ntokens // 10)
+        for doc in page_docs:
+            # This splits the page into chunks
+            chunks = text_splitter.split_text(doc.page_content)
+            for i, chunk in enumerate(chunks):
+                # Create a new document for each individual chunk
+                new_doc = HashDocument(
+                    page_content=chunk,
+                    metadata={"file_name": doc.metadata["file_name"], "page": doc.metadata["page"], "chunk": i}
+                )
+                # Add sources to metadata for retrieval later on
+                new_doc.metadata["source"] = \
+                    f"{new_doc.metadata['file_name']}/Page-{new_doc.metadata['page']}/Chunk-{new_doc.metadata['chunk']}/Chunksize-{ntokens}"
+                doc_chunks.append(new_doc)
 
     return doc_chunks
 
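The new loop indexes every page at four granularities (50, 250, 500, and 750 tokens, each with 10% overlap), so retrieval can match both short, precise spans and chunks carrying wider context; the Chunksize-{ntokens} segment appended to each chunk's "source" string keeps the overlapping granularities distinguishable when sources are cited. A minimal standalone sketch of the same idea, assuming get_text_splitter (whose body is not part of this diff) wraps LangChain's RecursiveCharacterTextSplitter.from_tiktoken_encoder so that chunk_size is counted in tokens rather than characters:

from langchain.text_splitter import RecursiveCharacterTextSplitter

def get_text_splitter(chunk_size: int, chunk_overlap: int) -> RecursiveCharacterTextSplitter:
    # Hypothetical stand-in for the helper called in the diff; the real
    # implementation in qa/utils.py is not shown on this page.
    return RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

page_text = "Lorem ipsum dolor sit amet. " * 200  # stand-in for one PDF page

chunks = []
# Split the same page at each granularity and pool the results,
# mirroring the committed loop.
for ntokens in [50, 250, 500, 750]:
    splitter = get_text_splitter(chunk_size=ntokens, chunk_overlap=ntokens // 10)
    chunks.extend(splitter.split_text(page_text))

print(f"{len(chunks)} chunks across 4 granularities")

The trade-off is index size: every page is stored four times over, in exchange for letting the retriever pick whichever granularity best fits a given question.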
@@ -193,8 +193,8 @@ def get_answer(
     chain = load_qa_with_sources_chain(
         ChatOpenAI(temperature=0, openai_api_key=st.session_state.get("OPENAI_API_KEY"), model=model, streaming=stream_answer),
         chain_type="stuff",
-        prompt=STUFF_PROMPT
-
+        prompt=STUFF_PROMPT,
+        verbose=True,
         # chain_type_kwargs={
         #     "verbose": True,
         #     "prompt": query,
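With the "stuff" chain type, prompt and verbose can be passed straight to load_qa_with_sources_chain rather than through chain_type_kwargs, which is what the commented-out block below them was attempting. A hedged, self-contained sketch of the same construction; the PromptTemplate here is a hypothetical stand-in for the repo's STUFF_PROMPT (defined elsewhere, not in this diff), and the API key and model name are placeholders:

from langchain.chains import load_qa_with_sources_chain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate

# Hypothetical stand-in for STUFF_PROMPT; a qa_with_sources "stuff"
# prompt must expose the {summaries} and {question} input variables.
STUFF_PROMPT = PromptTemplate(
    template=(
        "Answer the question using only the sources below, and cite them.\n\n"
        "{summaries}\n\nQuestion: {question}\nAnswer:"
    ),
    input_variables=["summaries", "question"],
)

chain = load_qa_with_sources_chain(
    ChatOpenAI(temperature=0, openai_api_key="sk-...", model="gpt-3.5-turbo"),
    chain_type="stuff",   # paste all retrieved chunks into a single prompt
    prompt=STUFF_PROMPT,
    verbose=True,         # forwarded directly, no chain_type_kwargs needed
)

The "stuff" strategy is the simplest of LangChain's combine-documents chains: it concatenates every retrieved chunk into {summaries} in one LLM call, which fits this commit's small chunk sizes as long as the total stays within the model's context window.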