Kevin Hu
committed on
Commit
·
930d161
1
Parent(s):
9e42ef1
be better chunks before graphrag (#1811)
Browse files

### What problem does this PR solve?
#1594
### Type of change
- [x] Refactoring
- rag/app/naive.py +2 -3
rag/app/naive.py
CHANGED
@@ -273,14 +273,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
273 |
raise NotImplementedError(
|
274 |
"file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
|
275 |
|
276 |
-
if kwargs.get("section_only", False):
|
277 |
-
return [t for t, _ in sections]
|
278 |
-
|
279 |
st = timer()
|
280 |
chunks = naive_merge(
|
281 |
sections, int(parser_config.get(
|
282 |
"chunk_token_num", 128)), parser_config.get(
|
283 |
"delimiter", "\n!?。;!?"))
|
|
|
|
|
284 |
|
285 |
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
|
286 |
cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
|
|
|
273 |
raise NotImplementedError(
|
274 |
"file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
|
275 |
|
|
|
|
|
|
|
276 |
st = timer()
|
277 |
chunks = naive_merge(
|
278 |
sections, int(parser_config.get(
|
279 |
"chunk_token_num", 128)), parser_config.get(
|
280 |
"delimiter", "\n!?。;!?"))
|
281 |
+
if kwargs.get("section_only", False):
|
282 |
+
return chunks
|
283 |
|
284 |
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
|
285 |
cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
|