Tuchuanhuhuhu committed on
Commit
daf68a9
·
1 Parent(s): fe0c8bd

修复索引逻辑 #789

Browse files
Files changed (1) hide show
  1. modules/index_func.py +3 -7
modules/index_func.py CHANGED
@@ -51,7 +51,7 @@ def get_documents(file_src):
51
  pdfReader = PyPDF2.PdfReader(pdfFileObj)
52
  for page in tqdm(pdfReader.pages):
53
  pdftext += page.extract_text()
54
- texts = Document(page_content=pdftext, metadata={"source": filepath})
55
  elif file_type == ".docx":
56
  logging.debug("Loading Word...")
57
  from langchain.document_loaders import UnstructuredWordDocumentLoader
@@ -71,8 +71,7 @@ def get_documents(file_src):
71
  logging.debug("Loading Excel...")
72
  text_list = excel_to_string(filepath)
73
  for elem in text_list:
74
- documents.append(Document(page_content=elem, metadata={"source": filepath}))
75
- continue
76
  else:
77
  logging.debug("Loading text file...")
78
  from langchain.document_loaders import TextLoader
@@ -83,10 +82,7 @@ def get_documents(file_src):
83
  logging.error(f"Error loading file: {filename}")
84
  traceback.print_exc()
85
 
86
- try:
87
- texts = text_splitter.split_documents(texts)
88
- except AttributeError:
89
- texts = text_splitter.split_documents([texts])
90
  documents.extend(texts)
91
  logging.debug("Documents loaded.")
92
  return documents
 
51
  pdfReader = PyPDF2.PdfReader(pdfFileObj)
52
  for page in tqdm(pdfReader.pages):
53
  pdftext += page.extract_text()
54
+ texts = [Document(page_content=pdftext, metadata={"source": filepath})]
55
  elif file_type == ".docx":
56
  logging.debug("Loading Word...")
57
  from langchain.document_loaders import UnstructuredWordDocumentLoader
 
71
  logging.debug("Loading Excel...")
72
  text_list = excel_to_string(filepath)
73
  for elem in text_list:
74
+ texts.append(Document(page_content=elem, metadata={"source": filepath}))
 
75
  else:
76
  logging.debug("Loading text file...")
77
  from langchain.document_loaders import TextLoader
 
82
  logging.error(f"Error loading file: {filename}")
83
  traceback.print_exc()
84
 
85
+ texts = text_splitter.split_documents(texts)
 
 
 
86
  documents.extend(texts)
87
  logging.debug("Documents loaded.")
88
  return documents