Kevin Hu committed on
Commit
930d161
·
1 Parent(s): 9e42ef1

be better chunks before graphrag (#1811)

Browse files

### What problem does this PR solve?

#1594

### Type of change

- [x] Refactoring

Files changed (1) hide show
  1. rag/app/naive.py +2 -3
rag/app/naive.py CHANGED
@@ -273,14 +273,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
273
  raise NotImplementedError(
274
  "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
275
 
276
- if kwargs.get("section_only", False):
277
- return [t for t, _ in sections]
278
-
279
  st = timer()
280
  chunks = naive_merge(
281
  sections, int(parser_config.get(
282
  "chunk_token_num", 128)), parser_config.get(
283
  "delimiter", "\n!?。;!?"))
 
 
284
 
285
  res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
286
  cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
 
273
  raise NotImplementedError(
274
  "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
275
 
 
 
 
276
  st = timer()
277
  chunks = naive_merge(
278
  sections, int(parser_config.get(
279
  "chunk_token_num", 128)), parser_config.get(
280
  "delimiter", "\n!?。;!?"))
281
+ if kwargs.get("section_only", False):
282
+ return chunks
283
 
284
  res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
285
  cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))