Spaces:
Sleeping
Sleeping
Commit
·
0a8d09f
1
Parent(s):
806e2f5
update
Browse files
- src/demo/asg_retriever.py +2 -0
- src/demo/asg_splitter.py +3 -0
- src/demo/views.py +3 -1
src/demo/asg_retriever.py
CHANGED
@@ -164,6 +164,8 @@ def process_pdf(file_path: str, survey_id: str, embedder: HuggingFaceEmbeddings,
|
|
164 |
# Load and split the PDF
|
165 |
split_start_time = time.time()
|
166 |
splitters = TextSplitting().mineru_recursive_splitter(file_path, survey_id, mode)
|
|
|
|
|
167 |
|
168 |
documents_list = [document.page_content for document in splitters]
|
169 |
for i in range(len(documents_list)):
|
|
|
164 |
# Load and split the PDF
|
165 |
split_start_time = time.time()
|
166 |
splitters = TextSplitting().mineru_recursive_splitter(file_path, survey_id, mode)
|
167 |
+
if not splitters:
|
168 |
+
raise ValueError(f"Failed to load or split PDF: {file_path}")
|
169 |
|
170 |
documents_list = [document.page_content for document in splitters]
|
171 |
for i in range(len(documents_list)):
|
src/demo/asg_splitter.py
CHANGED
@@ -4,6 +4,9 @@ from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
4 |
class TextSplitting:
|
5 |
def mineru_recursive_splitter(self, file_path, survey_id, mode):
|
6 |
docs = DocumentLoading().load_pdf(file_path, survey_id, mode)
|
|
|
|
|
|
|
7 |
text_splitter = RecursiveCharacterTextSplitter(
|
8 |
chunk_size=400,
|
9 |
chunk_overlap=30,
|
|
|
4 |
class TextSplitting:
|
5 |
def mineru_recursive_splitter(self, file_path, survey_id, mode):
|
6 |
docs = DocumentLoading().load_pdf(file_path, survey_id, mode)
|
7 |
+
if docs is None:
|
8 |
+
# 若加载失败,则返回空列表,调用方需处理
|
9 |
+
return []
|
10 |
text_splitter = RecursiveCharacterTextSplitter(
|
11 |
chunk_size=400,
|
12 |
chunk_overlap=30,
|
src/demo/views.py
CHANGED
@@ -313,7 +313,9 @@ def PosRank_get_top5_ngrams(input_pd):
|
|
313 |
|
314 |
def process_file(file_name, survey_id, mode):
|
315 |
global embedder
|
316 |
-
|
|
|
|
|
317 |
collection_name = result[0]
|
318 |
name = result[-1]
|
319 |
return collection_name, name
|
|
|
313 |
|
314 |
def process_file(file_name, survey_id, mode):
|
315 |
global embedder
|
316 |
+
# 如果 file_name 不是绝对路径,则拼接 MEDIA_ROOT
|
317 |
+
abs_file_path = file_name if os.path.isabs(file_name) else os.path.join(settings.MEDIA_ROOT, file_name)
|
318 |
+
result = process_pdf(abs_file_path, survey_id, embedder, mode)
|
319 |
collection_name = result[0]
|
320 |
name = result[-1]
|
321 |
return collection_name, name
|