Spaces:
Sleeping
Sleeping
Commit
·
ba0c780
1
Parent(s):
0a8d09f
update
Browse files
- src/demo/asg_loader.py +8 -1
- src/demo/views.py +5 -0
src/demo/asg_loader.py
CHANGED
@@ -161,6 +161,13 @@ class DocumentLoading:
|
|
161 |
return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
|
162 |
|
163 |
def load_pdf(self, pdf_file, survey_id, mode):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
base_name = os.path.splitext(os.path.basename(pdf_file))[0]
|
165 |
target_dir = os.path.join(get_path('md', survey_id), base_name)
|
166 |
md_file_path = os.path.join(target_dir, mode, f"{base_name}.md")
|
@@ -170,7 +177,7 @@ class DocumentLoading:
|
|
170 |
print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
|
171 |
return self.process_md_file(md_file_path, survey_id)
|
172 |
|
173 |
-
command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m",
|
174 |
try:
|
175 |
subprocess.run(command, check=True)
|
176 |
# 检查是否生成了 Markdown 文件
|
|
|
161 |
return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
|
162 |
|
163 |
def load_pdf(self, pdf_file, survey_id, mode):
|
164 |
+
# 确保 mode 合法
|
165 |
+
valid_modes = ['auto', 'txt', 'ocr']
|
166 |
+
mineru_mode = mode if mode in valid_modes else 'auto'
|
167 |
+
if mode not in valid_modes:
|
168 |
+
print(f"Warning: unsupported mineru method '{mode}', defaulting to 'auto'.")
|
169 |
+
# 同时修正用于生成路径的 mode
|
170 |
+
mode = mineru_mode
|
171 |
base_name = os.path.splitext(os.path.basename(pdf_file))[0]
|
172 |
target_dir = os.path.join(get_path('md', survey_id), base_name)
|
173 |
md_file_path = os.path.join(target_dir, mode, f"{base_name}.md")
|
|
|
177 |
print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
|
178 |
return self.process_md_file(md_file_path, survey_id)
|
179 |
|
180 |
+
command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m", mineru_mode]
|
181 |
try:
|
182 |
subprocess.run(command, check=True)
|
183 |
# 检查是否生成了 Markdown 文件
|
src/demo/views.py
CHANGED
@@ -313,6 +313,11 @@ def PosRank_get_top5_ngrams(input_pd):
|
|
313 |
|
314 |
def process_file(file_name, survey_id, mode):
|
315 |
global embedder
|
|
|
|
|
|
|
|
|
|
|
316 |
# 如果 file_name 不是绝对路径,则拼接 MEDIA_ROOT
|
317 |
abs_file_path = file_name if os.path.isabs(file_name) else os.path.join(settings.MEDIA_ROOT, file_name)
|
318 |
result = process_pdf(abs_file_path, survey_id, embedder, mode)
|
|
|
313 |
|
314 |
def process_file(file_name, survey_id, mode):
|
315 |
global embedder
|
316 |
+
# 校验 mode,仅允许 auto/txt/ocr
|
317 |
+
valid_modes = ['auto', 'txt', 'ocr']
|
318 |
+
if mode not in valid_modes:
|
319 |
+
print(f"Warning: unsupported mode '{mode}', defaulting to 'auto'.")
|
320 |
+
mode = 'auto'
|
321 |
# 如果 file_name 不是绝对路径,则拼接 MEDIA_ROOT
|
322 |
abs_file_path = file_name if os.path.isabs(file_name) else os.path.join(settings.MEDIA_ROOT, file_name)
|
323 |
result = process_pdf(abs_file_path, survey_id, embedder, mode)
|