Spaces:
Sleeping
Sleeping
Tuchuanhuhuhu
committed on
Commit
·
daf68a9
1
Parent(s):
fe0c8bd
修复索引逻辑 #789
Browse files
- modules/index_func.py +3 -7
modules/index_func.py
CHANGED
@@ -51,7 +51,7 @@ def get_documents(file_src):
|
|
51 |
pdfReader = PyPDF2.PdfReader(pdfFileObj)
|
52 |
for page in tqdm(pdfReader.pages):
|
53 |
pdftext += page.extract_text()
|
54 |
-
texts = Document(page_content=pdftext, metadata={"source": filepath})
|
55 |
elif file_type == ".docx":
|
56 |
logging.debug("Loading Word...")
|
57 |
from langchain.document_loaders import UnstructuredWordDocumentLoader
|
@@ -71,8 +71,7 @@ def get_documents(file_src):
|
|
71 |
logging.debug("Loading Excel...")
|
72 |
text_list = excel_to_string(filepath)
|
73 |
for elem in text_list:
|
74 |
-
|
75 |
-
continue
|
76 |
else:
|
77 |
logging.debug("Loading text file...")
|
78 |
from langchain.document_loaders import TextLoader
|
@@ -83,10 +82,7 @@ def get_documents(file_src):
|
|
83 |
logging.error(f"Error loading file: {filename}")
|
84 |
traceback.print_exc()
|
85 |
|
86 |
-
|
87 |
-
texts = text_splitter.split_documents(texts)
|
88 |
-
except AttributeError:
|
89 |
-
texts = text_splitter.split_documents([texts])
|
90 |
documents.extend(texts)
|
91 |
logging.debug("Documents loaded.")
|
92 |
return documents
|
|
|
51 |
pdfReader = PyPDF2.PdfReader(pdfFileObj)
|
52 |
for page in tqdm(pdfReader.pages):
|
53 |
pdftext += page.extract_text()
|
54 |
+
texts = [Document(page_content=pdftext, metadata={"source": filepath})]
|
55 |
elif file_type == ".docx":
|
56 |
logging.debug("Loading Word...")
|
57 |
from langchain.document_loaders import UnstructuredWordDocumentLoader
|
|
|
71 |
logging.debug("Loading Excel...")
|
72 |
text_list = excel_to_string(filepath)
|
73 |
for elem in text_list:
|
74 |
+
texts.append(Document(page_content=elem, metadata={"source": filepath}))
|
|
|
75 |
else:
|
76 |
logging.debug("Loading text file...")
|
77 |
from langchain.document_loaders import TextLoader
|
|
|
82 |
logging.error(f"Error loading file: {filename}")
|
83 |
traceback.print_exc()
|
84 |
|
85 |
+
texts = text_splitter.split_documents(texts)
|
|
|
|
|
|
|
86 |
documents.extend(texts)
|
87 |
logging.debug("Documents loaded.")
|
88 |
return documents
|