technicolor committed on
Commit
ba0c780
·
1 Parent(s): 0a8d09f
Files changed (2) hide show
  1. src/demo/asg_loader.py +8 -1
  2. src/demo/views.py +5 -0
src/demo/asg_loader.py CHANGED
@@ -161,6 +161,13 @@ class DocumentLoading:
161
  return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
162
 
163
  def load_pdf(self, pdf_file, survey_id, mode):
 
 
 
 
 
 
 
164
  base_name = os.path.splitext(os.path.basename(pdf_file))[0]
165
  target_dir = os.path.join(get_path('md', survey_id), base_name)
166
  md_file_path = os.path.join(target_dir, mode, f"{base_name}.md")
@@ -170,7 +177,7 @@ class DocumentLoading:
170
  print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
171
  return self.process_md_file(md_file_path, survey_id)
172
 
173
- command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m", mode]
174
  try:
175
  subprocess.run(command, check=True)
176
  # 检查是否生成了 Markdown 文件
 
161
  return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
162
 
163
  def load_pdf(self, pdf_file, survey_id, mode):
164
+ # 确保 mode 合法
165
+ valid_modes = ['auto', 'txt', 'ocr']
166
+ mineru_mode = mode if mode in valid_modes else 'auto'
167
+ if mode not in valid_modes:
168
+ print(f"Warning: unsupported mineru method '{mode}', defaulting to 'auto'.")
169
+ # 同时修正用于生成路径的 mode
170
+ mode = mineru_mode
171
  base_name = os.path.splitext(os.path.basename(pdf_file))[0]
172
  target_dir = os.path.join(get_path('md', survey_id), base_name)
173
  md_file_path = os.path.join(target_dir, mode, f"{base_name}.md")
 
177
  print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
178
  return self.process_md_file(md_file_path, survey_id)
179
 
180
+ command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m", mineru_mode]
181
  try:
182
  subprocess.run(command, check=True)
183
  # 检查是否生成了 Markdown 文件
src/demo/views.py CHANGED
@@ -313,6 +313,11 @@ def PosRank_get_top5_ngrams(input_pd):
313
 
314
  def process_file(file_name, survey_id, mode):
315
  global embedder
 
 
 
 
 
316
  # 如果 file_name 不是绝对路径,则拼接 MEDIA_ROOT
317
  abs_file_path = file_name if os.path.isabs(file_name) else os.path.join(settings.MEDIA_ROOT, file_name)
318
  result = process_pdf(abs_file_path, survey_id, embedder, mode)
 
313
 
314
  def process_file(file_name, survey_id, mode):
315
  global embedder
316
+ # 校验 mode,仅允许 auto/txt/ocr
317
+ valid_modes = ['auto', 'txt', 'ocr']
318
+ if mode not in valid_modes:
319
+ print(f"Warning: unsupported mode '{mode}', defaulting to 'auto'.")
320
+ mode = 'auto'
321
  # 如果 file_name 不是绝对路径,则拼接 MEDIA_ROOT
322
  abs_file_path = file_name if os.path.isabs(file_name) else os.path.join(settings.MEDIA_ROOT, file_name)
323
  result = process_pdf(abs_file_path, survey_id, embedder, mode)