charles0519 committed on
Commit
b741e5b
·
unverified ·
1 Parent(s): 3f820aa

feature: 优化上传索引文件时空格分割 (#427)

Browse files

* feature: 优化上传索引文件时空格分割

* feature: 优化上传索引文件时空格分割

Files changed (2) hide show
  1. .gitignore +1 -0
  2. modules/llama_func.py +32 -32
.gitignore CHANGED
@@ -136,3 +136,4 @@ dmypy.json
136
  api_key.txt
137
 
138
  auth.json
 
 
136
  api_key.txt
137
 
138
  auth.json
139
+ .idea
modules/llama_func.py CHANGED
@@ -13,7 +13,6 @@ from llama_index import (
13
  from langchain.llms import OpenAI
14
  import colorama
15
 
16
-
17
  from modules.presets import *
18
  from modules.utils import *
19
 
@@ -30,37 +29,38 @@ def get_documents(file_src):
30
  logging.debug("Loading PDF...")
31
  CJKPDFReader = download_loader("CJKPDFReader")
32
  loader = CJKPDFReader()
33
- documents += loader.load_data(file=file.name)
34
  elif os.path.splitext(file.name)[1] == ".docx":
35
  logging.debug("Loading DOCX...")
36
  DocxReader = download_loader("DocxReader")
37
  loader = DocxReader()
38
- documents += loader.load_data(file=file.name)
39
  elif os.path.splitext(file.name)[1] == ".epub":
40
  logging.debug("Loading EPUB...")
41
  EpubReader = download_loader("EpubReader")
42
  loader = EpubReader()
43
- documents += loader.load_data(file=file.name)
44
  else:
45
  logging.debug("Loading text file...")
46
  with open(file.name, "r", encoding="utf-8") as f:
47
- text = add_space(f.read())
48
- documents += [Document(text)]
 
49
  index_name = sha1sum(index_name)
50
  return documents, index_name
51
 
52
 
53
  def construct_index(
54
- api_key,
55
- file_src,
56
- max_input_size=4096,
57
- num_outputs=1,
58
- max_chunk_overlap=20,
59
- chunk_size_limit=600,
60
- embedding_limit=None,
61
- separator=" ",
62
- num_children=10,
63
- max_keywords_per_chunk=10,
64
  ):
65
  os.environ["OPENAI_API_KEY"] = api_key
66
  chunk_size_limit = None if chunk_size_limit == 0 else chunk_size_limit
@@ -97,12 +97,12 @@ def construct_index(
97
 
98
 
99
  def chat_ai(
100
- api_key,
101
- index,
102
- question,
103
- context,
104
- chatbot,
105
- reply_language,
106
  ):
107
  os.environ["OPENAI_API_KEY"] = api_key
108
 
@@ -133,15 +133,15 @@ def chat_ai(
133
 
134
 
135
  def ask_ai(
136
- api_key,
137
- index,
138
- question,
139
- prompt_tmpl,
140
- refine_tmpl,
141
- sim_k=1,
142
- temprature=0,
143
- prefix_messages=[],
144
- reply_language="中文",
145
  ):
146
  os.environ["OPENAI_API_KEY"] = api_key
147
 
@@ -174,7 +174,7 @@ def ask_ai(
174
  for index, node in enumerate(response.source_nodes):
175
  brief = node.source_text[:25].replace("\n", "")
176
  nodes.append(
177
- f"<details><summary>[{index+1}]\t{brief}...</summary><p>{node.source_text}</p></details>"
178
  )
179
  new_response = ret_text + "\n----------\n" + "\n\n".join(nodes)
180
  logging.info(
 
13
  from langchain.llms import OpenAI
14
  import colorama
15
 
 
16
  from modules.presets import *
17
  from modules.utils import *
18
 
 
29
  logging.debug("Loading PDF...")
30
  CJKPDFReader = download_loader("CJKPDFReader")
31
  loader = CJKPDFReader()
32
+ text_raw = loader.load_data(file=file.name)[0].text
33
  elif os.path.splitext(file.name)[1] == ".docx":
34
  logging.debug("Loading DOCX...")
35
  DocxReader = download_loader("DocxReader")
36
  loader = DocxReader()
37
+ text_raw = loader.load_data(file=file.name)[0].text
38
  elif os.path.splitext(file.name)[1] == ".epub":
39
  logging.debug("Loading EPUB...")
40
  EpubReader = download_loader("EpubReader")
41
  loader = EpubReader()
42
+ text_raw = loader.load_data(file=file.name)[0].text
43
  else:
44
  logging.debug("Loading text file...")
45
  with open(file.name, "r", encoding="utf-8") as f:
46
+ text_raw = f.read()
47
+ text = add_space(text_raw)
48
+ documents += [Document(text)]
49
  index_name = sha1sum(index_name)
50
  return documents, index_name
51
 
52
 
53
  def construct_index(
54
+ api_key,
55
+ file_src,
56
+ max_input_size=4096,
57
+ num_outputs=1,
58
+ max_chunk_overlap=20,
59
+ chunk_size_limit=600,
60
+ embedding_limit=None,
61
+ separator=" ",
62
+ num_children=10,
63
+ max_keywords_per_chunk=10,
64
  ):
65
  os.environ["OPENAI_API_KEY"] = api_key
66
  chunk_size_limit = None if chunk_size_limit == 0 else chunk_size_limit
 
97
 
98
 
99
  def chat_ai(
100
+ api_key,
101
+ index,
102
+ question,
103
+ context,
104
+ chatbot,
105
+ reply_language,
106
  ):
107
  os.environ["OPENAI_API_KEY"] = api_key
108
 
 
133
 
134
 
135
  def ask_ai(
136
+ api_key,
137
+ index,
138
+ question,
139
+ prompt_tmpl,
140
+ refine_tmpl,
141
+ sim_k=1,
142
+ temprature=0,
143
+ prefix_messages=[],
144
+ reply_language="中文",
145
  ):
146
  os.environ["OPENAI_API_KEY"] = api_key
147
 
 
174
  for index, node in enumerate(response.source_nodes):
175
  brief = node.source_text[:25].replace("\n", "")
176
  nodes.append(
177
+ f"<details><summary>[{index + 1}]\t{brief}...</summary><p>{node.source_text}</p></details>"
178
  )
179
  new_response = ret_text + "\n----------\n" + "\n\n".join(nodes)
180
  logging.info(