Kevin Hu committed on
Commit
ef2a724
·
1 Parent(s): 8f1a7d6

add sql to naive parser (#1908)

Browse files

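### What problem does this PR solve?

Adds `.sql` to the file extensions handled by the naive text parser, and lets the text parser split on a configurable set of delimiter characters instead of only on newlines.
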
### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)

api/utils/file_utils.py CHANGED
@@ -156,7 +156,7 @@ def filename_type(filename):
156
  return FileType.PDF.value
157
 
158
  if re.match(
159
- r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
160
  return FileType.DOC.value
161
 
162
  if re.match(
 
156
  return FileType.PDF.value
157
 
158
  if re.match(
159
+ r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
160
  return FileType.DOC.value
161
 
162
  if re.match(
deepdoc/parser/txt_parser.py CHANGED
@@ -12,6 +12,7 @@
12
  #
13
 
14
  from rag.nlp import find_codec,num_tokens_from_string
 
15
 
16
  class RAGFlowTxtParser:
17
  def __call__(self, fnm, binary=None, chunk_token_num=128):
@@ -29,14 +30,17 @@ class RAGFlowTxtParser:
29
  return self.parser_txt(txt, chunk_token_num)
30
 
31
  @classmethod
32
- def parser_txt(cls, txt, chunk_token_num=128):
33
  if type(txt) != str:
34
  raise TypeError("txt type should be str!")
35
  sections = []
36
- for sec in txt.split("\n"):
 
 
 
37
  if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
38
- sections.append((sec[: int(len(sec) / 2)], ""))
39
- sections.append((sec[int(len(sec) / 2) :], ""))
40
  else:
41
- sections.append((sec, ""))
42
  return sections
 
12
  #
13
 
14
  from rag.nlp import find_codec,num_tokens_from_string
15
+ import re
16
 
17
  class RAGFlowTxtParser:
18
  def __call__(self, fnm, binary=None, chunk_token_num=128):
 
30
  return self.parser_txt(txt, chunk_token_num)
31
 
32
  @classmethod
33
+ def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
34
  if type(txt) != str:
35
  raise TypeError("txt type should be str!")
36
  sections = []
37
+ for sec in re.split(r"[%s]+"%delimiter, txt):
38
+ if sections and sec in delimiter:
39
+ sections[-1][0] += sec
40
+ continue
41
  if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
42
+ sections.append([sec[: int(len(sec) / 2)], ""])
43
+ sections.append([sec[int(len(sec) / 2) :], ""])
44
  else:
45
+ sections.append([sec, ""])
46
  return sections
rag/app/naive.py CHANGED
@@ -224,9 +224,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
224
  excel_parser = ExcelParser()
225
  sections = [(l, "") for l in excel_parser.html(binary) if l]
226
 
227
- elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
228
  callback(0.1, "Start to parse.")
229
- sections = TxtParser()(filename,binary,parser_config.get("chunk_token_num", 128))
 
 
230
  callback(0.8, "Finish parsing.")
231
 
232
  elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
 
224
  excel_parser = ExcelParser()
225
  sections = [(l, "") for l in excel_parser.html(binary) if l]
226
 
227
+ elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
228
  callback(0.1, "Start to parse.")
229
+ sections = TxtParser()(filename,binary,
230
+ parser_config.get("chunk_token_num", 128),
231
+ parser_config.get("delimiter", "\n!?;。;!?"))
232
  callback(0.8, "Finish parsing.")
233
 
234
  elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):