Spaces:

technicolor
/

InteractiveSurvey

Sleeping

App Files Files Community

technicolor committed 4 days ago

Commit

70cc6ab

1 Parent(s): ba0c780

update

Browse files

Files changed (2) hide show

src/demo/asg_loader.py +39 -12
src/demo/path_utils.py +16 -6

src/demo/asg_loader.py CHANGED Viewed

@@ -161,23 +161,47 @@ class DocumentLoading:
         return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
     def load_pdf(self, pdf_file, survey_id, mode):
-        # 确保 mode 合法
-        valid_modes = ['auto', 'txt', 'ocr']
-        mineru_mode = mode if mode in valid_modes else 'auto'
-        if mode not in valid_modes:
-            print(f"Warning: unsupported mineru method '{mode}', defaulting to 'auto'.")
-            # 同时修正用于生成路径的 mode
-            mode = mineru_mode
         base_name = os.path.splitext(os.path.basename(pdf_file))[0]
         target_dir = os.path.join(get_path('md', survey_id), base_name)
-        md_file_path = os.path.join(target_dir, mode, f"{base_name}.md")
         print("The md file path is: ", md_file_path)
         if os.path.exists(md_file_path):
             print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
-            return self.process_md_file(md_file_path, survey_id)
-        command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m", mineru_mode]
         try:
             subprocess.run(command, check=True)
             # 检查是否生成了 Markdown 文件
@@ -187,7 +211,10 @@ class DocumentLoading:
                 return None
             else:
                 print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
-                return self.process_md_file(md_file_path, survey_id)
         except subprocess.CalledProcessError as e:
             print(f"An error occurred during conversion: {e}")
             # 如果发生错误且文件夹已生成，则删除文件夹

         return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
     def load_pdf(self, pdf_file, survey_id, mode):
+        """
+        Parameters
+        ----------
+        pdf_file : str
+            绝对路径 PDF 文件
+        survey_id : str
+            当前 survey ID，用于组织输出目录
+        mode : str
+            前端传递的模式，用于控制提取 intro 还是全文，
+            可能为 intro / full / auto / txt / ocr。
+        设计：
+        • mineru 只支持 auto / txt / ocr，这里统一用 'auto'（或保留传入的合法值），
+          与前端 intro/full 概念解耦。
+        • read_type 控制返回介绍还是全文：
+              - mode == 'intro'  →  只返回 introduction
+              - 其它             →  返回全文（abstract+intro+main）
+        """
+        valid_mineru_methods = ['auto', 'txt', 'ocr']
+        if mode in valid_mineru_methods:
+            mineru_method = mode
+            read_type = 'full'
+        else:
+            mineru_method = 'auto'  # 默认的 mineru 解析方式
+            read_type = 'intro' if mode == 'intro' else 'full'
         base_name = os.path.splitext(os.path.basename(pdf_file))[0]
         target_dir = os.path.join(get_path('md', survey_id), base_name)
+        # mineru 会把 md 文件放到  <target_dir>/<mineru_method>/<name>.md
+        md_file_path = os.path.join(target_dir, mineru_method, f"{base_name}.md")
         print("The md file path is: ", md_file_path)
         if os.path.exists(md_file_path):
             print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
+            if read_type == 'intro':
+                return self.process_md_file(md_file_path, survey_id)
+            else:
+                return self.process_md_file_full(md_file_path, survey_id)
+        command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m", mineru_method]
         try:
             subprocess.run(command, check=True)
             # 检查是否生成了 Markdown 文件
                 return None
             else:
                 print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
+                if read_type == 'intro':
+                    return self.process_md_file(md_file_path, survey_id)
+                else:
+                    return self.process_md_file_full(md_file_path, survey_id)
         except subprocess.CalledProcessError as e:
             print(f"An error occurred during conversion: {e}")
             # 如果发生错误且文件夹已生成，则删除文件夹

src/demo/path_utils.py CHANGED Viewed

@@ -15,13 +15,22 @@ def setup_hf_cache():
         # 本地环境使用默认缓存目录
         return None
-# 检查是否在 Hugging Face Spaces 环境中
 def get_data_paths():
-    # 如果在 Hugging Face Spaces 中，使用临时目录
     if os.environ.get('SPACE_ID') or os.environ.get('HF_SPACE_ID'):
-        # 使用临时目录
         temp_dir = tempfile.mkdtemp()
-        return {
             'DATA_PATH': os.path.join(temp_dir, 'pdf/'),
             'TXT_PATH': os.path.join(temp_dir, 'txt/'),
             'TSV_PATH': os.path.join(temp_dir, 'tsv/'),
@@ -31,8 +40,7 @@ def get_data_paths():
             'RESULTS_PATH': os.path.join(temp_dir, 'results/')
         }
     else:
-        # 本地环境使用原来的路径
-        return {
             'DATA_PATH': './src/static/data/pdf/',
             'TXT_PATH': './src/static/data/txt/',
             'TSV_PATH': './src/static/data/tsv/',
@@ -42,6 +50,8 @@ def get_data_paths():
             'RESULTS_PATH': './src/static/data/results/'
         }
 # Global path-management function
 def get_path(path_type, survey_id=None, filename=None):
     """

         # 本地环境使用默认缓存目录
         return None
+# Module-level cache so that every caller shares the same set of paths
+_DATA_PATHS = None
 def get_data_paths():
+    """返回统一的数据路径字典。
+    在 Hugging Face Space 中只生成一次临时目录并缓存到 _DATA_PATHS，
+    避免多次调用导致路径不一致。"""
+    global _DATA_PATHS
+    if _DATA_PATHS is not None:
+        return _DATA_PATHS
     if os.environ.get('SPACE_ID') or os.environ.get('HF_SPACE_ID'):
+        # 创建一次临时目录并缓存
         temp_dir = tempfile.mkdtemp()
+        print(f"Using shared temp data directory: {temp_dir}")
+        _DATA_PATHS = {
             'DATA_PATH': os.path.join(temp_dir, 'pdf/'),
             'TXT_PATH': os.path.join(temp_dir, 'txt/'),
             'TSV_PATH': os.path.join(temp_dir, 'tsv/'),
             'RESULTS_PATH': os.path.join(temp_dir, 'results/')
         }
     else:
+        _DATA_PATHS = {
             'DATA_PATH': './src/static/data/pdf/',
             'TXT_PATH': './src/static/data/txt/',
             'TSV_PATH': './src/static/data/tsv/',
             'RESULTS_PATH': './src/static/data/results/'
         }
+    return _DATA_PATHS
 # Global path-management function
 def get_path(path_type, survey_id=None, filename=None):
     """