technicolor committed on
Commit
0a8d09f
·
1 Parent(s): 806e2f5
src/demo/asg_retriever.py CHANGED
@@ -164,6 +164,8 @@ def process_pdf(file_path: str, survey_id: str, embedder: HuggingFaceEmbeddings,
164
  # Load and split the PDF
165
  split_start_time = time.time()
166
  splitters = TextSplitting().mineru_recursive_splitter(file_path, survey_id, mode)
 
 
167
 
168
  documents_list = [document.page_content for document in splitters]
169
  for i in range(len(documents_list)):
 
164
  # Load and split the PDF
165
  split_start_time = time.time()
166
  splitters = TextSplitting().mineru_recursive_splitter(file_path, survey_id, mode)
167
+ if not splitters:
168
+ raise ValueError(f"Failed to load or split PDF: {file_path}")
169
 
170
  documents_list = [document.page_content for document in splitters]
171
  for i in range(len(documents_list)):
src/demo/asg_splitter.py CHANGED
@@ -4,6 +4,9 @@ from langchain_text_splitters import RecursiveCharacterTextSplitter
4
  class TextSplitting:
5
  def mineru_recursive_splitter(self, file_path, survey_id, mode):
6
  docs = DocumentLoading().load_pdf(file_path, survey_id, mode)
 
 
 
7
  text_splitter = RecursiveCharacterTextSplitter(
8
  chunk_size=400,
9
  chunk_overlap=30,
 
4
  class TextSplitting:
5
  def mineru_recursive_splitter(self, file_path, survey_id, mode):
6
  docs = DocumentLoading().load_pdf(file_path, survey_id, mode)
7
+ if docs is None:
8
+ # 若加载失败,则返回空列表,调用方需处理
9
+ return []
10
  text_splitter = RecursiveCharacterTextSplitter(
11
  chunk_size=400,
12
  chunk_overlap=30,
src/demo/views.py CHANGED
@@ -313,7 +313,9 @@ def PosRank_get_top5_ngrams(input_pd):
313
 
314
  def process_file(file_name, survey_id, mode):
315
  global embedder
316
- result = process_pdf(file_name, survey_id, embedder, mode)
 
 
317
  collection_name = result[0]
318
  name = result[-1]
319
  return collection_name, name
 
313
 
314
  def process_file(file_name, survey_id, mode):
315
  global embedder
316
+ # 如果 file_name 不是绝对路径,则拼接 MEDIA_ROOT
317
+ abs_file_path = file_name if os.path.isabs(file_name) else os.path.join(settings.MEDIA_ROOT, file_name)
318
+ result = process_pdf(abs_file_path, survey_id, embedder, mode)
319
  collection_name = result[0]
320
  name = result[-1]
321
  return collection_name, name