Spaces:
Sleeping
Sleeping
Commit
·
70cc6ab
1
Parent(s):
ba0c780
update
Browse files
- src/demo/asg_loader.py +39 -12
- src/demo/path_utils.py +16 -6
src/demo/asg_loader.py
CHANGED
@@ -161,23 +161,47 @@ class DocumentLoading:
|
|
161 |
return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
|
162 |
|
163 |
def load_pdf(self, pdf_file, survey_id, mode):
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
base_name = os.path.splitext(os.path.basename(pdf_file))[0]
|
172 |
target_dir = os.path.join(get_path('md', survey_id), base_name)
|
173 |
-
|
|
|
174 |
print("The md file path is: ", md_file_path)
|
175 |
|
176 |
if os.path.exists(md_file_path):
|
177 |
print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
|
178 |
-
|
179 |
-
|
180 |
-
|
|
|
|
|
|
|
181 |
try:
|
182 |
subprocess.run(command, check=True)
|
183 |
# 检查是否生成了 Markdown 文件
|
@@ -187,7 +211,10 @@ class DocumentLoading:
|
|
187 |
return None
|
188 |
else:
|
189 |
print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
|
190 |
-
|
|
|
|
|
|
|
191 |
except subprocess.CalledProcessError as e:
|
192 |
print(f"An error occurred during conversion: {e}")
|
193 |
# 如果发生错误且文件夹已生成,则删除文件夹
|
|
|
161 |
return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
|
162 |
|
163 |
def load_pdf(self, pdf_file, survey_id, mode):
|
164 |
+
"""
|
165 |
+
Parameters
|
166 |
+
----------
|
167 |
+
pdf_file : str
|
168 |
+
绝对路径 PDF 文件
|
169 |
+
survey_id : str
|
170 |
+
当前 survey ID,用于组织输出目录
|
171 |
+
mode : str
|
172 |
+
前端传递的模式,用于控制提取 intro 还是全文,
|
173 |
+
可能为 intro / full / auto / txt / ocr。
|
174 |
+
|
175 |
+
设计:
|
176 |
+
• mineru 只支持 auto / txt / ocr,这里统一用 'auto'(或保留传入的合法值),
|
177 |
+
与前端 intro/full 概念解耦。
|
178 |
+
• read_type 控制返回介绍还是全文:
|
179 |
+
- mode == 'intro' → 只返回 introduction
|
180 |
+
- 其它 → 返回全文(abstract+intro+main)
|
181 |
+
"""
|
182 |
+
|
183 |
+
valid_mineru_methods = ['auto', 'txt', 'ocr']
|
184 |
+
if mode in valid_mineru_methods:
|
185 |
+
mineru_method = mode
|
186 |
+
read_type = 'full'
|
187 |
+
else:
|
188 |
+
mineru_method = 'auto' # 默认的 mineru 解析方式
|
189 |
+
read_type = 'intro' if mode == 'intro' else 'full'
|
190 |
+
|
191 |
base_name = os.path.splitext(os.path.basename(pdf_file))[0]
|
192 |
target_dir = os.path.join(get_path('md', survey_id), base_name)
|
193 |
+
# mineru 会把 md 文件放到 <target_dir>/<mineru_method>/<name>.md
|
194 |
+
md_file_path = os.path.join(target_dir, mineru_method, f"{base_name}.md")
|
195 |
print("The md file path is: ", md_file_path)
|
196 |
|
197 |
if os.path.exists(md_file_path):
|
198 |
print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
|
199 |
+
if read_type == 'intro':
|
200 |
+
return self.process_md_file(md_file_path, survey_id)
|
201 |
+
else:
|
202 |
+
return self.process_md_file_full(md_file_path, survey_id)
|
203 |
+
|
204 |
+
command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m", mineru_method]
|
205 |
try:
|
206 |
subprocess.run(command, check=True)
|
207 |
# 检查是否生成了 Markdown 文件
|
|
|
211 |
return None
|
212 |
else:
|
213 |
print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
|
214 |
+
if read_type == 'intro':
|
215 |
+
return self.process_md_file(md_file_path, survey_id)
|
216 |
+
else:
|
217 |
+
return self.process_md_file_full(md_file_path, survey_id)
|
218 |
except subprocess.CalledProcessError as e:
|
219 |
print(f"An error occurred during conversion: {e}")
|
220 |
# 如果发生错误且文件夹已生成,则删除文件夹
|
src/demo/path_utils.py
CHANGED
@@ -15,13 +15,22 @@ def setup_hf_cache():
|
|
15 |
# 本地环境使用默认缓存目录
|
16 |
return None
|
17 |
|
18 |
-
#
|
|
|
|
|
19 |
def get_data_paths():
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
if os.environ.get('SPACE_ID') or os.environ.get('HF_SPACE_ID'):
|
22 |
-
#
|
23 |
temp_dir = tempfile.mkdtemp()
|
24 |
-
|
|
|
25 |
'DATA_PATH': os.path.join(temp_dir, 'pdf/'),
|
26 |
'TXT_PATH': os.path.join(temp_dir, 'txt/'),
|
27 |
'TSV_PATH': os.path.join(temp_dir, 'tsv/'),
|
@@ -31,8 +40,7 @@ def get_data_paths():
|
|
31 |
'RESULTS_PATH': os.path.join(temp_dir, 'results/')
|
32 |
}
|
33 |
else:
|
34 |
-
|
35 |
-
return {
|
36 |
'DATA_PATH': './src/static/data/pdf/',
|
37 |
'TXT_PATH': './src/static/data/txt/',
|
38 |
'TSV_PATH': './src/static/data/tsv/',
|
@@ -42,6 +50,8 @@ def get_data_paths():
|
|
42 |
'RESULTS_PATH': './src/static/data/results/'
|
43 |
}
|
44 |
|
|
|
|
|
45 |
# 全局路径管理函数
|
46 |
def get_path(path_type, survey_id=None, filename=None):
|
47 |
"""
|
|
|
15 |
# 本地环境使用默认缓存目录
|
16 |
return None
|
17 |
|
18 |
+
# 全局缓存,保证多个调用共享同一组路径
|
19 |
+
_DATA_PATHS = None
|
20 |
+
|
21 |
def get_data_paths():
|
22 |
+
"""返回统一的数据路径字典。
|
23 |
+
在 Hugging Face Space 中只生成一次临时目录并缓存到 _DATA_PATHS,
|
24 |
+
避免多次调用导致路径不一致。"""
|
25 |
+
global _DATA_PATHS
|
26 |
+
if _DATA_PATHS is not None:
|
27 |
+
return _DATA_PATHS
|
28 |
+
|
29 |
if os.environ.get('SPACE_ID') or os.environ.get('HF_SPACE_ID'):
|
30 |
+
# 创建一次临时目录并缓存
|
31 |
temp_dir = tempfile.mkdtemp()
|
32 |
+
print(f"Using shared temp data directory: {temp_dir}")
|
33 |
+
_DATA_PATHS = {
|
34 |
'DATA_PATH': os.path.join(temp_dir, 'pdf/'),
|
35 |
'TXT_PATH': os.path.join(temp_dir, 'txt/'),
|
36 |
'TSV_PATH': os.path.join(temp_dir, 'tsv/'),
|
|
|
40 |
'RESULTS_PATH': os.path.join(temp_dir, 'results/')
|
41 |
}
|
42 |
else:
|
43 |
+
_DATA_PATHS = {
|
|
|
44 |
'DATA_PATH': './src/static/data/pdf/',
|
45 |
'TXT_PATH': './src/static/data/txt/',
|
46 |
'TSV_PATH': './src/static/data/tsv/',
|
|
|
50 |
'RESULTS_PATH': './src/static/data/results/'
|
51 |
}
|
52 |
|
53 |
+
return _DATA_PATHS
|
54 |
+
|
55 |
# 全局路径管理函数
|
56 |
def get_path(path_type, survey_id=None, filename=None):
|
57 |
"""
|