technicolor committed on
Commit
70cc6ab
·
1 Parent(s): ba0c780
Files changed (2) hide show
  1. src/demo/asg_loader.py +39 -12
  2. src/demo/path_utils.py +16 -6
src/demo/asg_loader.py CHANGED
@@ -161,23 +161,47 @@ class DocumentLoading:
161
  return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
162
 
163
  def load_pdf(self, pdf_file, survey_id, mode):
164
- # 确保 mode 合法
165
- valid_modes = ['auto', 'txt', 'ocr']
166
- mineru_mode = mode if mode in valid_modes else 'auto'
167
- if mode not in valid_modes:
168
- print(f"Warning: unsupported mineru method '{mode}', defaulting to 'auto'.")
169
- # 同时修正用于生成路径的 mode
170
- mode = mineru_mode
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  base_name = os.path.splitext(os.path.basename(pdf_file))[0]
172
  target_dir = os.path.join(get_path('md', survey_id), base_name)
173
- md_file_path = os.path.join(target_dir, mode, f"{base_name}.md")
 
174
  print("The md file path is: ", md_file_path)
175
 
176
  if os.path.exists(md_file_path):
177
  print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
178
- return self.process_md_file(md_file_path, survey_id)
179
-
180
- command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m", mineru_mode]
 
 
 
181
  try:
182
  subprocess.run(command, check=True)
183
  # 检查是否生成了 Markdown 文件
@@ -187,7 +211,10 @@ class DocumentLoading:
187
  return None
188
  else:
189
  print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
190
- return self.process_md_file(md_file_path, survey_id)
 
 
 
191
  except subprocess.CalledProcessError as e:
192
  print(f"An error occurred during conversion: {e}")
193
  # 如果发生错误且文件夹已生成,则删除文件夹
 
161
  return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
162
 
163
  def load_pdf(self, pdf_file, survey_id, mode):
164
+ """
165
+ Parameters
166
+ ----------
167
+ pdf_file : str
168
+ 绝对路径 PDF 文件
169
+ survey_id : str
170
+ 当前 survey ID,用于组织输出目录
171
+ mode : str
172
+ 前端传递的模式,用于控制提取 intro 还是全文,
173
+ 可能为 intro / full / auto / txt / ocr。
174
+
175
+ 设计:
176
+ • mineru 只支持 auto / txt / ocr,这里统一用 'auto'(或保留传入的合法值),
177
+ 与前端 intro/full 概念解耦。
178
+ • read_type 控制返回介绍还是全文:
179
+ - mode == 'intro' → 只返回 introduction
180
+ - 其它 → 返回全文(abstract+intro+main)
181
+ """
182
+
183
+ valid_mineru_methods = ['auto', 'txt', 'ocr']
184
+ if mode in valid_mineru_methods:
185
+ mineru_method = mode
186
+ read_type = 'full'
187
+ else:
188
+ mineru_method = 'auto' # 默认的 mineru 解析方式
189
+ read_type = 'intro' if mode == 'intro' else 'full'
190
+
191
  base_name = os.path.splitext(os.path.basename(pdf_file))[0]
192
  target_dir = os.path.join(get_path('md', survey_id), base_name)
193
+ # mineru 会把 md 文件放到 <target_dir>/<mineru_method>/<name>.md
194
+ md_file_path = os.path.join(target_dir, mineru_method, f"{base_name}.md")
195
  print("The md file path is: ", md_file_path)
196
 
197
  if os.path.exists(md_file_path):
198
  print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
199
+ if read_type == 'intro':
200
+ return self.process_md_file(md_file_path, survey_id)
201
+ else:
202
+ return self.process_md_file_full(md_file_path, survey_id)
203
+
204
+ command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m", mineru_method]
205
  try:
206
  subprocess.run(command, check=True)
207
  # 检查是否生成了 Markdown 文件
 
211
  return None
212
  else:
213
  print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
214
+ if read_type == 'intro':
215
+ return self.process_md_file(md_file_path, survey_id)
216
+ else:
217
+ return self.process_md_file_full(md_file_path, survey_id)
218
  except subprocess.CalledProcessError as e:
219
  print(f"An error occurred during conversion: {e}")
220
  # 如果发生错误且文件夹已生成,则删除文件夹
src/demo/path_utils.py CHANGED
@@ -15,13 +15,22 @@ def setup_hf_cache():
15
  # 本地环境使用默认缓存目录
16
  return None
17
 
18
- # 检查是否在 Hugging Face Spaces 环境中
 
 
19
  def get_data_paths():
20
- # 如果在 Hugging Face Spaces 中,使用临时目录
 
 
 
 
 
 
21
  if os.environ.get('SPACE_ID') or os.environ.get('HF_SPACE_ID'):
22
- # 使用临时目录
23
  temp_dir = tempfile.mkdtemp()
24
- return {
 
25
  'DATA_PATH': os.path.join(temp_dir, 'pdf/'),
26
  'TXT_PATH': os.path.join(temp_dir, 'txt/'),
27
  'TSV_PATH': os.path.join(temp_dir, 'tsv/'),
@@ -31,8 +40,7 @@ def get_data_paths():
31
  'RESULTS_PATH': os.path.join(temp_dir, 'results/')
32
  }
33
  else:
34
- # 本地环境使用原来的路径
35
- return {
36
  'DATA_PATH': './src/static/data/pdf/',
37
  'TXT_PATH': './src/static/data/txt/',
38
  'TSV_PATH': './src/static/data/tsv/',
@@ -42,6 +50,8 @@ def get_data_paths():
42
  'RESULTS_PATH': './src/static/data/results/'
43
  }
44
 
 
 
45
  # 全局路径管理函数
46
  def get_path(path_type, survey_id=None, filename=None):
47
  """
 
15
  # 本地环境使用默认缓存目录
16
  return None
17
 
18
+ # 全局缓存,保证多个调用共享同一组路径
19
+ _DATA_PATHS = None
20
+
21
  def get_data_paths():
22
+ """返回统一的数据路径字典。
23
+ 在 Hugging Face Space 中只生成一次临时目录并缓存到 _DATA_PATHS,
24
+ 避免多次调用导致路径不一致。"""
25
+ global _DATA_PATHS
26
+ if _DATA_PATHS is not None:
27
+ return _DATA_PATHS
28
+
29
  if os.environ.get('SPACE_ID') or os.environ.get('HF_SPACE_ID'):
30
+ # 创建一次临时目录并缓存
31
  temp_dir = tempfile.mkdtemp()
32
+ print(f"Using shared temp data directory: {temp_dir}")
33
+ _DATA_PATHS = {
34
  'DATA_PATH': os.path.join(temp_dir, 'pdf/'),
35
  'TXT_PATH': os.path.join(temp_dir, 'txt/'),
36
  'TSV_PATH': os.path.join(temp_dir, 'tsv/'),
 
40
  'RESULTS_PATH': os.path.join(temp_dir, 'results/')
41
  }
42
  else:
43
+ _DATA_PATHS = {
 
44
  'DATA_PATH': './src/static/data/pdf/',
45
  'TXT_PATH': './src/static/data/txt/',
46
  'TSV_PATH': './src/static/data/tsv/',
 
50
  'RESULTS_PATH': './src/static/data/results/'
51
  }
52
 
53
+ return _DATA_PATHS
54
+
55
  # 全局路径管理函数
56
  def get_path(path_type, survey_id=None, filename=None):
57
  """