technicolor committed
Commit 92d8c87 · 1 Parent(s): c320a1b
src/demo/asg_add_flowchart.py CHANGED
@@ -2,17 +2,23 @@ import json
2
  import os
3
  import re
4
  from urllib.parse import quote
5
 
6
- import os
7
- import json
8
  import torch
9
  import torchvision.transforms as transforms
10
  from torchvision import models
11
  from PIL import Image
12
 
13
- # 常量定义
14
- BASE_DIR = os.path.normpath("src/static/data/md") # 根目录
15
- INFO_DIR = os.path.normpath("src/static/data/info") # 存放 JSON 结果的目录
16
 
17
  # 加载 PyTorch EfficientNet 训练好的 3 类分类模型
18
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -288,9 +294,9 @@ def insert_tex_images(json_path, ref_names, text):
288
  # 示例用法
289
  if __name__ == "__main__":
290
  # Markdown 文件路径
291
- md_file_path = "src/static/data/info/test/survey_test_processed.md"
292
  # JSON 文件路径
293
- json_file_path = "src/static/data/info/test/flowchart_results.json"
294
 
295
  try:
296
  with open(md_file_path, "r", encoding="utf-8") as f:
 
2
  import os
3
  import re
4
  from urllib.parse import quote
5
+ import cv2
6
+ import numpy as np
7
+ from PIL import Image, ImageDraw, ImageFont
8
+ import matplotlib.pyplot as plt
9
+ import matplotlib.patches as patches
10
+ from matplotlib.patches import Rectangle
11
+ import matplotlib.patches as mpatches
12
+ from .path_utils import get_path
13
 
 
 
14
  import torch
15
  import torchvision.transforms as transforms
16
  from torchvision import models
17
  from PIL import Image
18
 
19
+ # 使用动态路径
20
+ BASE_DIR = get_path('md') # 根目录
21
+ INFO_DIR = get_path('info') # 存放 JSON 结果的目录
22
 
23
  # 加载 PyTorch EfficientNet 训练好的 3 类分类模型
24
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
294
  # 示例用法
295
  if __name__ == "__main__":
296
  # Markdown 文件路径
297
+ md_file_path = get_path('info', 'test', 'survey_test_processed.md')
298
  # JSON 文件路径
299
+ json_file_path = get_path('info', 'test', 'flowchart_results.json')
300
 
301
  try:
302
  with open(md_file_path, "r", encoding="utf-8") as f:
src/demo/asg_latex.py CHANGED
@@ -1,12 +1,14 @@
 
1
  import re
2
  import subprocess
3
- import os
4
-
5
- from openai import OpenAI
6
  import dotenv
7
  from .asg_add_flowchart import insert_tex_images
8
  from .asg_mindmap import insert_outline_figure
9
 
 
 
10
 
11
  def _remove_div_blocks(lines):
12
  """
@@ -435,7 +437,7 @@ def md_to_tex_section_without_jpg(section):
435
  # - 标题看起来只是一个段落号, 形如"3"、"3.1"、"3.1.1" 等 (可根据需要调宽或调窄判断规则)
436
 
437
  # 例:用一个正则匹配 `数字(.数字)*`,可带可不带后缀空格
438
- # 如果 heading_text 完全匹配这个模式,就认为它是个“纯编号标题”,不必调用 LLM
439
  pure_number_pattern = re.compile(r'^\d+(\.\d+)*$')
440
 
441
  # 先去一下两端空格
@@ -548,7 +550,7 @@ def md_to_tex_section_without_jpg(section):
548
 
549
  def insert_section(tex_path: str, section_content: str):
550
  """
551
- 将 section_content 追加到 .tex 文件“最后一个 section(或子节)的正文末尾”。
552
  具体逻辑如下:
553
  1. 如果文件内找不到任何 \section{...}、\subsection{...}、\subsubsection{...},
554
  那么就将 section_content 插入到 \end{abstract} 之后。
@@ -564,7 +566,7 @@ def insert_section(tex_path: str, section_content: str):
564
 
565
  注意:
566
  - 这段逻辑会将新的内容**追加**到最后一个标题所对应正文的末尾,
567
- 这样可以避免把之前的内容“分割”或“顶开”。
568
  """
569
 
570
  if not os.path.exists(tex_path):
@@ -616,11 +618,11 @@ def insert_section(tex_path: str, section_content: str):
616
  )
617
 
618
  else:
619
- # 有标题时,将内容追加到“最后一个标题对应正文”的末尾
620
  last_title_line = title_lines[-1]
621
 
622
  # 找到下一个标题的行号(如果有),或 \end{document} 行号,以确定正文区间结束
623
- # “最后标题正文”从 last_title_line+1 一直到 next_title_line-1(或结束)
624
  next_boundaries = [end_document_line if end_document_line is not None else len(lines)]
625
  for t_line in title_lines:
626
  if t_line > last_title_line:
@@ -628,8 +630,8 @@ def insert_section(tex_path: str, section_content: str):
628
  # next_boundary 是最后标题之后遇到的第一个 boundary(若没有, 就是文件末尾)
629
  next_boundary = min(next_boundaries) if next_boundaries else len(lines)
630
 
631
- # 我们希望将新的内容插在“最后标题正文的最末尾”之后,也就是说在 next_boundary 前。
632
- # 不过若“最后标题”本身就处于全文件最终,next_boundary 可能表示文件末尾/文档结束。
633
  # 这里为了避免把最后一行顶下去,可以先把其中的正文行都保留,再在最后插入 section_content。
634
  new_lines = []
635
  new_lines.extend(lines[:next_boundary]) # 保留从头到最后正文结束
@@ -804,8 +806,8 @@ if __name__ == "__main__":
804
  # 读取环境变量
805
  dotenv.load_dotenv()
806
  # md_path = preprocess_md("src/demo/latex_template/test copy.md", "src/demo/latex_template/test_preprocessed.md")
807
- md_path = 'src/static/data/info/undefined/survey_undefined_preprocessed.md'
808
- tex_path = "src/static/data/info/undefined/template.tex"
809
  md_to_tex(md_path, tex_path, title="A Comprehensive Review of ADMM On Consensus Distributed Optimization")
810
  # insert_figures('src/static/data/info/undefined/outline.png',
811
  # 'src/demo/latex_template/template.tex',
 
1
+ import os
2
  import re
3
  import subprocess
4
+ import shutil
5
+ from .path_utils import get_path
 
6
  import dotenv
7
  from .asg_add_flowchart import insert_tex_images
8
  from .asg_mindmap import insert_outline_figure
9
 
10
+ from openai import OpenAI
11
+
12
 
13
  def _remove_div_blocks(lines):
14
  """
 
437
  # - 标题看起来只是一个段落号, 形如"3"、"3.1"、"3.1.1" 等 (可根据需要调宽或调窄判断规则)
438
 
439
  # 例:用一个正则匹配 `数字(.数字)*`,可带可不带后缀空格
440
+ # 如果 heading_text 完全匹配这个模式,就认为它是个"纯编号标题",不必调用 LLM
441
  pure_number_pattern = re.compile(r'^\d+(\.\d+)*$')
442
 
443
  # 先去一下两端空格
 
550
 
551
  def insert_section(tex_path: str, section_content: str):
552
  """
553
+ 将 section_content 追加到 .tex 文件"最后一个 section(或子节)的正文末尾"。
554
  具体逻辑如下:
555
  1. 如果文件内找不到任何 \section{...}、\subsection{...}、\subsubsection{...},
556
  那么就将 section_content 插入到 \end{abstract} 之后。
 
566
 
567
  注意:
568
  - 这段逻辑会将新的内容**追加**到最后一个标题所对应正文的末尾,
569
+ 这样可以避免把之前的内容"分割"或"顶开"。
570
  """
571
 
572
  if not os.path.exists(tex_path):
 
618
  )
619
 
620
  else:
621
+ # 有标题时,将内容追加到"最后一个标题对应正文"的末尾
622
  last_title_line = title_lines[-1]
623
 
624
  # 找到下一个标题的行号(如果有),或 \end{document} 行号,以确定正文区间结束
625
+ # "最后标题正文"从 last_title_line+1 一直到 next_title_line-1(或结束)
626
  next_boundaries = [end_document_line if end_document_line is not None else len(lines)]
627
  for t_line in title_lines:
628
  if t_line > last_title_line:
 
630
  # next_boundary 是最后标题之后遇到的第一个 boundary(若没有, 就是文件末尾)
631
  next_boundary = min(next_boundaries) if next_boundaries else len(lines)
632
 
633
+ # 我们希望将新的内容插在"最后标题正文的最末尾"之后,也就是说在 next_boundary 前。
634
+ # 不过若"最后标题"本身就处于全文件最终,next_boundary 可能表示文件末尾/文档结束。
635
  # 这里为了避免把最后一行顶下去,可以先把其中的正文行都保留,再在最后插入 section_content。
636
  new_lines = []
637
  new_lines.extend(lines[:next_boundary]) # 保留从头到最后正文结束
 
806
  # 读取环境变量
807
  dotenv.load_dotenv()
808
  # md_path = preprocess_md("src/demo/latex_template/test copy.md", "src/demo/latex_template/test_preprocessed.md")
809
+ md_path = get_path('info', 'undefined', 'survey_undefined_preprocessed.md')
810
+ tex_path = get_path('info', 'undefined', 'template.tex')
811
  md_to_tex(md_path, tex_path, title="A Comprehensive Review of ADMM On Consensus Distributed Optimization")
812
  # insert_figures('src/static/data/info/undefined/outline.png',
813
  # 'src/demo/latex_template/template.tex',
src/demo/asg_loader.py CHANGED
@@ -2,9 +2,14 @@ import os
2
  import re
3
  import json
4
  import subprocess
 
 
 
5
  from langchain_community.document_loaders import UnstructuredMarkdownLoader
6
- from langchain_core.documents import Document
7
  import shutil
 
 
8
 
9
  class DocumentLoading:
10
  def convert_pdf_to_md(self, pdf_file, output_dir="output", method="auto"):
@@ -128,8 +133,8 @@ class DocumentLoading:
128
  for char in invalid_chars:
129
  title_new = title_new.replace(char, ' ')
130
 
131
- os.makedirs(f'./src/static/data/txt/{survey_id}', exist_ok=True)
132
- with open(f'./src/static/data/txt/{survey_id}/{title_new}.json', 'w', encoding='utf-8') as f:
133
  json.dump(extracted_data, f, ensure_ascii=False, indent=4)
134
  return extracted_data['introduction']
135
 
@@ -150,69 +155,73 @@ class DocumentLoading:
150
  for char in invalid_chars:
151
  title_new = title_new.replace(char, ' ')
152
 
153
- os.makedirs(f'./src/static/data/txt/{survey_id}', exist_ok=True)
154
- with open(f'./src/static/data/txt/{survey_id}/{title_new}.json', 'w', encoding='utf-8') as f:
155
  json.dump(extracted_data, f, ensure_ascii=False, indent=4)
156
  return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
157
 
158
-
159
  def load_pdf(self, pdf_file, survey_id, mode):
160
- os.makedirs(f'./src/static/data/md/{survey_id}', exist_ok=True)
161
- output_dir = f"./src/static/data/md/{survey_id}"
162
  base_name = os.path.splitext(os.path.basename(pdf_file))[0]
163
- target_dir = os.path.join(output_dir, base_name, "auto")
164
-
165
- # 1. Convert PDF to markdown if the folder doesn't exist
166
- self.convert_pdf_to_md(pdf_file, output_dir)
167
-
168
- # 2. Process the markdown file in the output directory
169
- md_file_path = os.path.join(target_dir, f"{base_name}.md")
170
- if not os.path.exists(md_file_path):
171
- raise FileNotFoundError(f"Markdown file {md_file_path} does not exist. Conversion might have failed.")
172
 
173
- if mode == "intro":
 
174
  return self.process_md_file(md_file_path, survey_id)
175
- elif mode == "full":
176
- return self.process_md_file_full(md_file_path, survey_id)
177
-
178
- # wrong, still being tested
179
  def load_pdf_new(self, pdf_dir, survey_id):
180
- os.makedirs(f'./src/static/data/md/{survey_id}', exist_ok=True)
181
- output_dir = f"./src/static/data/md/{survey_id}"
182
- self.convert_pdf_to_md_new(pdf_dir, output_dir)
183
- markdown_files = glob.glob(os.path.join(output_dir, "*", "auto", "*.md"))
184
- all_introductions = []
185
-
186
- for md_file_path in markdown_files:
187
- try:
188
- introduction = self.process_md_file(md_file_path, survey_id)
189
- all_introductions.append(introduction)
190
- except FileNotFoundError as e:
191
- print(f"Markdown file {md_file_path} does not exist. Conversion might have failed.")
192
-
193
- return all_introductions
194
 
 
 
 
195
 
 
 
 
 
 
 
 
 
 
196
 
197
  def parallel_load_pdfs(self, pdf_files, survey_id, max_workers=4):
 
198
  with ProcessPoolExecutor(max_workers=max_workers) as executor:
199
- # Submit tasks for parallel execution
200
- futures = [executor.submit(self.load_pdf, pdf, survey_id) for pdf in pdf_files]
201
-
202
- # Collect results
203
  for future in futures:
204
  try:
205
- result = future.result()
206
- print(f"Processed result: {result}")
207
- except Exception as e:
208
- print(f"Error processing PDF: {e}")
209
-
210
  def ensure_non_empty_introduction(self, introduction, full_text):
211
- """
212
- Ensure introduction is not empty. If empty, replace with full text.
213
- """
214
- if introduction == "N/A" or len(introduction.strip()) < 50:
215
- return full_text.strip()
216
  return introduction
217
 
218
  def extract_information_from_md_new(self, md_text):
@@ -240,17 +249,30 @@ class DocumentLoading:
240
 
241
  # Introduction extraction
242
  introduction_match = re.search(
243
- r'\n\n([1I][\.\- ]?\s*)?[Ii]\s*[nN]\s*[tT]\s*[rR]\s*[oO]\s*[dD]\s*[uU]\s*[cC]\s*[tT]\s*[iI]\s*[oO]\s*[nN][\.\- ]?\s*\n\n(.*?)',
244
- md_text, re.DOTALL
 
 
245
  )
246
  introduction = introduction_match.group(2).strip() if introduction_match else "N/A"
247
 
248
- # Ensure introduction is not empty
249
- introduction = self.ensure_non_empty_introduction(introduction, md_text)
250
 
251
- return {
252
  "title": title,
253
  "authors": authors,
254
  "abstract": abstract,
255
- "introduction": introduction
256
- }
 
 
 
2
  import re
3
  import json
4
  import subprocess
5
+ import glob
6
+ from pathlib import Path
7
+ from concurrent.futures import ProcessPoolExecutor
8
  from langchain_community.document_loaders import UnstructuredMarkdownLoader
9
+ from langchain.schema import Document
10
  import shutil
11
+ import tempfile
12
+ from .path_utils import get_path
13
 
14
  class DocumentLoading:
15
  def convert_pdf_to_md(self, pdf_file, output_dir="output", method="auto"):
 
133
  for char in invalid_chars:
134
  title_new = title_new.replace(char, ' ')
135
 
136
+ os.makedirs(get_path('txt', survey_id), exist_ok=True)
137
+ with open(get_path('txt', survey_id, f'{title_new}.json'), 'w', encoding='utf-8') as f:
138
  json.dump(extracted_data, f, ensure_ascii=False, indent=4)
139
  return extracted_data['introduction']
140
 
 
155
  for char in invalid_chars:
156
  title_new = title_new.replace(char, ' ')
157
 
158
+ os.makedirs(get_path('txt', survey_id), exist_ok=True)
159
+ with open(get_path('txt', survey_id, f'{title_new}.json'), 'w', encoding='utf-8') as f:
160
  json.dump(extracted_data, f, ensure_ascii=False, indent=4)
161
  return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
162
 
 
163
  def load_pdf(self, pdf_file, survey_id, mode):
 
 
164
  base_name = os.path.splitext(os.path.basename(pdf_file))[0]
165
+ target_dir = os.path.join(get_path('md', survey_id), base_name)
166
+ md_file_path = os.path.join(target_dir, mode, f"{base_name}.md")
167
+ print("The md file path is: ", md_file_path)
168
 
169
+ if os.path.exists(md_file_path):
170
+ print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
171
  return self.process_md_file(md_file_path, survey_id)
172
+
173
+ command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m", mode]
174
+ try:
175
+ subprocess.run(command, check=True)
176
+ # 检查是否生成了 Markdown 文件
177
+ if not os.path.exists(md_file_path):
178
+ print(f"Conversion failed: Markdown file not found at {md_file_path}. Cleaning up folder...")
179
+ shutil.rmtree(target_dir) # 删除生成的文件夹
180
+ return None
181
+ else:
182
+ print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
183
+ return self.process_md_file(md_file_path, survey_id)
184
+ except subprocess.CalledProcessError as e:
185
+ print(f"An error occurred during conversion: {e}")
186
+ # 如果发生错误且文件夹已生成,则删除文件夹
187
+ if os.path.exists(target_dir):
188
+ print(f"Cleaning up incomplete folder: {target_dir}")
189
+ shutil.rmtree(target_dir)
190
+ return None
191
+
192
  def load_pdf_new(self, pdf_dir, survey_id):
193
+ pdf_files = glob.glob(os.path.join(pdf_dir, "*.pdf"))
194
 
195
+ for pdf_file in pdf_files:
196
+ base_name = os.path.splitext(os.path.basename(pdf_file))[0]
197
+ target_dir = os.path.join(get_path('md', survey_id), base_name)
198
 
199
+ if os.path.exists(target_dir):
200
+ print(f"Folder for {pdf_file} already exists in {get_path('md', survey_id)}. Skipping conversion.")
201
+ else:
202
+ command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m", "auto"]
203
+ try:
204
+ subprocess.run(command, check=True)
205
+ print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
206
+ except subprocess.CalledProcessError as e:
207
+ print(f"An error occurred: {e}")
208
 
209
  def parallel_load_pdfs(self, pdf_files, survey_id, max_workers=4):
210
+ # Create a process pool to run the conversion in parallel
211
  with ProcessPoolExecutor(max_workers=max_workers) as executor:
212
+ # Submit each PDF file to the process pool for conversion
213
+ futures = [executor.submit(self.load_pdf, pdf, survey_id, "auto") for pdf in pdf_files]
214
+
215
+ # Optionally, you can monitor the status of each future as they complete
216
  for future in futures:
217
  try:
218
+ future.result() # This will raise any exceptions that occurred during the processing
219
+ except Exception as exc:
220
+ print(f"An error occurred during processing: {exc}")
221
+
 
222
  def ensure_non_empty_introduction(self, introduction, full_text):
223
+ if len(introduction) < 50:
224
+ return full_text[:1000]
 
 
 
225
  return introduction
226
 
227
  def extract_information_from_md_new(self, md_text):
 
249
 
250
  # Introduction extraction
251
  introduction_match = re.search(
252
+ r'\n\n([1I][\.\- ]?\s*)?[Ii]\s*[nN]\s*[tT]\s*[rR]\s*[oO]\s*[dD]\s*[uU]\s*[cC]\s*[tT]\s*[iI]\s*[oO]\s*[nN][\.\- ]?\s*\n\n(.*?)'
253
+ r'(?=\n\n(?:([2I][I]|\s*2)[^\n]*?\n\n|\n\n(?:[2I][I][^\n]*?\n\n)))',
254
+ md_text,
255
+ re.DOTALL
256
  )
257
  introduction = introduction_match.group(2).strip() if introduction_match else "N/A"
258
 
259
+ # Main content extraction
260
+ main_content_match = re.search(
261
+ r'(.*?)(\n\n([3I][\.\- ]?\s*)?[Rr][Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss][^\n]*\n\n|\Z)',
262
+ md_text,
263
+ re.DOTALL
264
+ )
265
+
266
+ if main_content_match:
267
+ main_content = main_content_match.group(1).strip()
268
+ else:
269
+ main_content = "N/A"
270
 
271
+ extracted_data = {
272
  "title": title,
273
  "authors": authors,
274
  "abstract": abstract,
275
+ "introduction": introduction,
276
+ "main_content": main_content
277
+ }
278
+ return extracted_data
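
For orientation, a minimal sketch of the directory layout the rewritten load_pdf relies on after a successful MinerU run. This is illustrative only, not part of the commit; the survey id "demo123", the file name "example_paper", and the local base path are assumed placeholders.

import os

md_root = "src/static/data/md/demo123"        # what get_path('md', survey_id) resolves to locally
base_name = "example_paper"                    # os.path.splitext(os.path.basename(pdf_file))[0]
mode = "auto"

target_dir = os.path.join(md_root, base_name)  # folder MinerU is expected to create
md_file_path = os.path.join(target_dir, mode, f"{base_name}.md")

# load_pdf skips conversion when md_file_path already exists; otherwise it runs
#   mineru -p <pdf_file> -o <md_root> -m <mode>
# and removes target_dir again if the expected .md file never appears.
print(md_file_path)  # src/static/data/md/demo123/example_paper/auto/example_paper.md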
src/demo/asg_outline.py CHANGED
@@ -10,6 +10,7 @@ from .asg_conclusion import ConclusionGenerator
10
  from .asg_retriever import *
11
  import pandas as df
12
  from .references import generate_references
 
13
 
14
 
15
  class OutlineGenerator():
@@ -260,7 +261,7 @@ class OutlineGenerator():
260
  return messages, clean_text
261
 
262
  def parseOutline(survey_id):
263
- file_path = f'./src/static/data/txt/{survey_id}/outline.json'
264
  try:
265
  with open(file_path, 'r', encoding='utf-8') as file:
266
  data = json.load(file)
@@ -286,7 +287,7 @@ def parseOutline(survey_id):
286
  print("Failed to extract a valid list string from the outline content.")
287
  return []
288
 
289
- # 检查提取结果是否为“列表的列表”格式(应该以 "[[" 开头)
290
  fixed_str = response_extracted.strip()
291
  if not fixed_str.startswith("[["):
292
  # 如果不是,则去掉原有的首尾括号,再重新包装:[[ ... ]]
@@ -531,45 +532,45 @@ def generateOutlineHTML_qwen(survey_id):
531
  }
532
 
533
  // 确认编辑并提交数据
534
- function confirmOutline() {
535
- const outlineData = []; // 用于存储提交到后端的数据
536
-
537
- // 遍历所有的可编辑输入框
538
- document.querySelectorAll("#edit-outline .list-group-item").forEach((item) => {
539
- const level = item.classList.contains("level-1") ? 1 :
540
- item.classList.contains("level-2") ? 2 : 3; // 获取层级
541
- const content = item.querySelector("input").value.trim(); // 获取编辑框的值
542
-
543
- // 将数据转换为数组格式 [level, content]
544
- outlineData.push([level, content]);
545
- });
546
-
547
- console.log("Submitting to backend:", outlineData); // 打印提交数据以供调试
548
-
549
- // 使用 AJAX 提交数据到后端
550
- const csrftoken = getCookie("csrftoken"); // 获取 CSRF token
551
- fetch("/save_outline/", {
552
- method: "POST",
553
- headers: {
554
- "Content-Type": "application/json",
555
- "X-CSRFToken": csrftoken, // Django 的 CSRF 令牌
556
- },
557
- body: JSON.stringify({ outline: outlineData }) // 将数据转换为 JSON 字符串
558
- })
559
- .then((response) => response.json())
560
- .then((data) => {
561
- if (data.status === "success") {
562
- $('#sections_').html(data.html);
563
- alert("Outline updated successfully!");
564
- } else {
565
- alert("Error updating outline: " + data.message);
566
- }
567
- })
568
- .catch((error) => {
569
- console.error("Error:", error);
570
- alert("Error updating outline. Please check the console for details.");
571
- });
572
- }
573
  </script>
574
  '''
575
  return html
@@ -825,7 +826,7 @@ def generateSurvey(survey_id, title, collection_list, pipeline):
825
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
826
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
827
 
828
- output_path = f'./src/static/data/txt/{survey_id}/generated_result.json'
829
  with open(output_path, 'w', encoding='utf-8') as f:
830
  json.dump(temp, f, ensure_ascii=False, indent=4)
831
  print(f"Survey has been saved to {output_path}.")
@@ -910,7 +911,8 @@ def generateSurvey_qwen(survey_id, title, collection_list, pipeline):
910
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
911
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
912
  temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
913
- output_path = f'./src/static/data/txt/{survey_id}/generated_result.json'
 
914
  with open(output_path, 'w', encoding='utf-8') as f:
915
  json.dump(temp, f, ensure_ascii=False, indent=4)
916
  print(f"Survey has been saved to {output_path}.")
@@ -962,7 +964,7 @@ def generateSurvey_qwen_new(survey_id, title, collection_list, pipeline, citatio
962
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
963
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
964
  temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
965
- output_path = f'./src/static/data/txt/{survey_id}/generated_result.json'
966
  with open(output_path, 'w', encoding='utf-8') as f:
967
  json.dump(temp, f, ensure_ascii=False, indent=4)
968
  print(f"Survey has been saved to {output_path}.")
@@ -994,7 +996,7 @@ if __name__ == '__main__':
994
  Many paradigms have been proposed to asses informativeness of data samples for active learning. One of the popular approaches is selecting the most uncertain data sample, i.e the data sample in which current classifier is least confident. Some other approaches are selecting the sample which yields a model with minimum risk or the data sample which yields fastest convergence in gradient based methods.//
995
  An active under-sampling approach is presented in this paper to change the data distribution of training datasets, and improve the classification accuracy of minority classes while maintaining overall classification performance.//
996
  In this paper, we propose an uncertainty-based active learning algorithm which requires only samples of one class and a set of unlabeled data in order to operate.//
997
- The principal contribution of our work is twofold: First, we use Bayes rule and density estimation to avoid the need to have a model of all classes for computing the uncertainty measure.//
998
  This technique reduces the number of input parameters of the problem. At the rest of this paper, we first review recent related works in the fields of active learning and active one-class learning (section II).//
999
  The classifier predicts that all the samples are non-fraud, it will have a quite high accuracy. However, for problems like fraud detection, minority class classification accuracy is more critical.//
1000
  The algorithm used and the features selected are always the key points at design time, and many experiments are needed to select the final algorithm and the best suited feature set.//
 
10
  from .asg_retriever import *
11
  import pandas as df
12
  from .references import generate_references
13
+ from .path_utils import get_path
14
 
15
 
16
  class OutlineGenerator():
 
261
  return messages, clean_text
262
 
263
  def parseOutline(survey_id):
264
+ file_path = get_path('txt', survey_id, 'outline.json')
265
  try:
266
  with open(file_path, 'r', encoding='utf-8') as file:
267
  data = json.load(file)
 
287
  print("Failed to extract a valid list string from the outline content.")
288
  return []
289
 
290
+ # 检查提取结果是否为"列表的列表"格式(应该以 "[[" 开头)
291
  fixed_str = response_extracted.strip()
292
  if not fixed_str.startswith("[["):
293
  # 如果不是,则去掉原有的首尾括号,再重新包装:[[ ... ]]
 
532
  }
533
 
534
  // 确认编辑并提交数据
535
+ function confirmOutline() {
536
+ const outlineData = []; // 用于存储提交到后端的数据
537
+
538
+ // 遍历所有的可编辑输入框
539
+ document.querySelectorAll("#edit-outline .list-group-item").forEach((item) => {
540
+ const level = item.classList.contains("level-1") ? 1 :
541
+ item.classList.contains("level-2") ? 2 : 3; // 获取层级
542
+ const content = item.querySelector("input").value.trim(); // 获取编辑框的值
543
+
544
+ // 将数据转换为数组格式 [level, content]
545
+ outlineData.push([level, content]);
546
+ });
547
+
548
+ console.log("Submitting to backend:", outlineData); // 打印提交数据以供调试
549
+
550
+ // 使用 AJAX 提交数据到后端
551
+ const csrftoken = getCookie("csrftoken"); // 获取 CSRF token
552
+ fetch("/save_outline/", {
553
+ method: "POST",
554
+ headers: {
555
+ "Content-Type": "application/json",
556
+ "X-CSRFToken": csrftoken, // Django 的 CSRF 令牌
557
+ },
558
+ body: JSON.stringify({ outline: outlineData }) // 将数据转换为 JSON 字符串
559
+ })
560
+ .then((response) => response.json())
561
+ .then((data) => {
562
+ if (data.status === "success") {
563
+ $('#sections_').html(data.html);
564
+ alert("Outline updated successfully!");
565
+ } else {
566
+ alert("Error updating outline: " + data.message);
567
+ }
568
+ })
569
+ .catch((error) => {
570
+ console.error("Error:", error);
571
+ alert("Error updating outline. Please check the console for details.");
572
+ });
573
+ }
574
  </script>
575
  '''
576
  return html
 
826
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
827
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
828
 
829
+ output_path = get_path('txt', survey_id, 'generated_result.json')
830
  with open(output_path, 'w', encoding='utf-8') as f:
831
  json.dump(temp, f, ensure_ascii=False, indent=4)
832
  print(f"Survey has been saved to {output_path}.")
 
911
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
912
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
913
  temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
914
+ # references = generate_references_dir(get_path('txt', survey_id))
915
+ output_path = get_path('txt', survey_id, 'generated_result.json')
916
  with open(output_path, 'w', encoding='utf-8') as f:
917
  json.dump(temp, f, ensure_ascii=False, indent=4)
918
  print(f"Survey has been saved to {output_path}.")
 
964
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
965
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
966
  temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
967
+ output_path = get_path('txt', survey_id, 'generated_result.json')
968
  with open(output_path, 'w', encoding='utf-8') as f:
969
  json.dump(temp, f, ensure_ascii=False, indent=4)
970
  print(f"Survey has been saved to {output_path}.")
 
996
  Many paradigms have been proposed to asses informativeness of data samples for active learning. One of the popular approaches is selecting the most uncertain data sample, i.e the data sample in which current classifier is least confident. Some other approaches are selecting the sample which yields a model with minimum risk or the data sample which yields fastest convergence in gradient based methods.//
997
  An active under-sampling approach is presented in this paper to change the data distribution of training datasets, and improve the classification accuracy of minority classes while maintaining overall classification performance.//
998
  In this paper, we propose an uncertainty-based active learning algorithm which requires only samples of one class and a set of unlabeled data in order to operate.//
999
+ The principal contribution of our work is twofold: First, we use Bayes' rule and density estimation to avoid the need to have a model of all classes for computing the uncertainty measure.//
1000
  This technique reduces the number of input parameters of the problem. At the rest of this paper, we first review recent related works in the fields of active learning and active one-class learning (section II).//
1001
  The classifier predicts that all the samples are non-fraud, it will have a quite high accuracy. However, for problems like fraud detection, minority class classification accuracy is more critical.//
1002
  The algorithm used and the features selected are always the key points at design time, and many experiments are needed to select the final algorithm and the best suited feature set.//
src/demo/asg_retriever.py CHANGED
@@ -8,6 +8,7 @@ from .asg_splitter import TextSplitting
8
  from langchain_huggingface import HuggingFaceEmbeddings
9
  import time
10
  import concurrent.futures
 
11
 
12
  class Retriever:
13
  client = None
@@ -355,7 +356,7 @@ def query_multiple_collections(collection_names: list[str], query_list: list[str
355
  results[collection_name] = future.result()
356
 
357
  # Automatically save the results to a JSON file
358
- file_path = f'./src/static/data/info/{survey_id}/retrieved_context.json'
359
  with open(file_path, 'w', encoding='utf-8') as f:
360
  json.dump(results, f, ensure_ascii=False, indent=4)
361
 
 
8
  from langchain_huggingface import HuggingFaceEmbeddings
9
  import time
10
  import concurrent.futures
11
+ from .path_utils import get_path
12
 
13
  class Retriever:
14
  client = None
 
356
  results[collection_name] = future.result()
357
 
358
  # Automatically save the results to a JSON file
359
+ file_path = get_path('info', survey_id, 'retrieved_context.json')
360
  with open(file_path, 'w', encoding='utf-8') as f:
361
  json.dump(results, f, ensure_ascii=False, indent=4)
362
 
src/demo/category_and_tsne.py CHANGED
@@ -1,14 +1,20 @@
1
  from sklearn.metrics import silhouette_score
2
 
3
  import numpy as np
 
4
  import matplotlib.pyplot as plt
5
  import seaborn as sns
6
- import matplotlib.pyplot as plt
7
  from sklearn.manifold import TSNE
8
  from sklearn.cluster import AgglomerativeClustering
9
- import json
10
 
11
- IMG_PATH = './src/static/img/'
12
 
13
  plt.switch_backend('agg')
14
  device = 0
@@ -133,46 +139,6 @@ class ClusteringWithTopic:
133
  print(f"Best n_topics={self.best_n_topics}, Best silhouette_score={self.best_score}")
134
  return self.best_labels, self.best_topic_model, self.best_n_topics
135
 
136
- def clustering(df, n_cluster, survey_id):
137
- text = df['retrieval_result'].astype(str)
138
- clustering = ClusteringWithTopic(text, n_cluster)
139
- df['label'] = clustering.fit_and_get_labels(text)
140
-
141
- print("The clustering result is: ")
142
- for col in df.columns:
143
- print(f"{col}: {df.iloc[0][col]}")
144
-
145
- # Save topic model information as JSON
146
- topic_json = clustering.topic_model.get_topic_info().to_json()
147
- with open(f'./src/static/data/info/{survey_id}/topic.json', 'w', encoding="utf-8") as file:
148
- file.write(topic_json)
149
-
150
- # Create a dictionary from 'ref_title' and 'retrieval_result' columns
151
- description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
152
-
153
- # Save the dictionary to description.json
154
- with open(f'./src/static/data/info/{survey_id}/description.json', 'w', encoding="utf-8") as file:
155
- json.dump(description_dict, file, ensure_ascii=False, indent=4)
156
- # df['top_n_words'] = clustering.topic_model.get_topic_info()['Representation'].tolist()
157
- # df['topic_word'] = clustering.topic_model.get_topic_info()['KeyBERT'].tolist()
158
-
159
-
160
- X = np.array(clustering.embeddings)
161
- perplexity = 10
162
- if X.shape[0] <= perplexity:
163
- perplexity = max(1, X.shape[0] // 2)
164
-
165
- tsne = TSNE(n_components=2, init='pca', perplexity=perplexity, random_state=42)
166
- X_tsne = tsne.fit_transform(X)
167
- colors = scatter(X_tsne, df['label'])
168
-
169
- plt.savefig(IMG_PATH + 'tsne_' + survey_id + '.png', dpi=800, transparent=True)
170
-
171
- plt.close()
172
- output_tsv_filename = "./src/static/data/tsv/" + survey_id + '.tsv'
173
- df.to_csv(output_tsv_filename, sep='\t')
174
- return df, colors
175
-
176
  def clustering(df, n_topics_list, survey_id):
177
  text = df['retrieval_result'].astype(str)
178
  clustering = ClusteringWithTopic(text, n_topics_list)
@@ -184,12 +150,12 @@ def clustering(df, n_topics_list, survey_id):
184
 
185
  # 保存 topic model 信息
186
  topic_json = topic_model.get_topic_info().to_json()
187
- with open(f'./src/static/data/info/{survey_id}/topic.json', 'w', encoding="utf-8") as file:
188
  file.write(topic_json)
189
 
190
  # 创建描述信息
191
  description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
192
- with open(f'./src/static/data/info/{survey_id}/description.json', 'w', encoding="utf-8") as file:
193
  json.dump(description_dict, file, ensure_ascii=False, indent=4)
194
 
195
  # t-SNE 降维可视化
@@ -201,10 +167,10 @@ def clustering(df, n_topics_list, survey_id):
201
 
202
  colors = scatter(X_tsne, df['label']) # 计算颜色
203
 
204
- plt.savefig(IMG_PATH + 'tsne_' + survey_id + '.png', dpi=800, transparent=True)
205
 
206
  plt.close()
207
- output_tsv_filename = "./src/static/data/tsv/" + survey_id + '.tsv'
208
  df.to_csv(output_tsv_filename, sep='\t')
209
  return df, colors, best_n_topics
210
 
 
1
  from sklearn.metrics import silhouette_score
2
 
3
  import numpy as np
4
+ import pandas as pd
5
  import matplotlib.pyplot as plt
6
  import seaborn as sns
7
+ import json
8
  from sklearn.manifold import TSNE
9
  from sklearn.cluster import AgglomerativeClustering
 
10
 
11
+ from sentence_transformers import SentenceTransformer
12
+ from bertopic import BERTopic
13
+ from bertopic.representation import KeyBERTInspired
14
+ from sklearn.feature_extraction.text import CountVectorizer
15
+ from bertopic.vectorizers import ClassTfidfTransformer
16
+ from umap import UMAP
17
+ from .path_utils import get_path
18
 
19
  plt.switch_backend('agg')
20
  device = 0
 
139
  print(f"Best n_topics={self.best_n_topics}, Best silhouette_score={self.best_score}")
140
  return self.best_labels, self.best_topic_model, self.best_n_topics
141
 
142
  def clustering(df, n_topics_list, survey_id):
143
  text = df['retrieval_result'].astype(str)
144
  clustering = ClusteringWithTopic(text, n_topics_list)
 
150
 
151
  # 保存 topic model 信息
152
  topic_json = topic_model.get_topic_info().to_json()
153
+ with open(get_path('info', survey_id, 'topic.json'), 'w', encoding="utf-8") as file:
154
  file.write(topic_json)
155
 
156
  # 创建描述信息
157
  description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
158
+ with open(get_path('info', survey_id, 'description.json'), 'w', encoding="utf-8") as file:
159
  json.dump(description_dict, file, ensure_ascii=False, indent=4)
160
 
161
  # t-SNE 降维可视化
 
167
 
168
  colors = scatter(X_tsne, df['label']) # 计算颜色
169
 
170
+ plt.savefig(get_path('img', filename='tsne_' + survey_id + '.png'), dpi=800, transparent=True)
171
 
172
  plt.close()
173
+ output_tsv_filename = get_path('tsv', survey_id + '.tsv')
174
  df.to_csv(output_tsv_filename, sep='\t')
175
  return df, colors, best_n_topics
176
 
src/demo/path_utils.py ADDED
@@ -0,0 +1,64 @@
1
+ import os
2
+ import tempfile
3
+
4
+ # 检查是否在 Hugging Face Spaces 环境中
5
+ def get_data_paths():
6
+ # 如果在 Hugging Face Spaces 中,使用临时目录
7
+ if os.environ.get('SPACE_ID') or os.environ.get('HF_SPACE_ID'):
8
+ # 使用临时目录
9
+ temp_dir = tempfile.mkdtemp()
10
+ return {
11
+ 'DATA_PATH': os.path.join(temp_dir, 'pdf/'),
12
+ 'TXT_PATH': os.path.join(temp_dir, 'txt/'),
13
+ 'TSV_PATH': os.path.join(temp_dir, 'tsv/'),
14
+ 'MD_PATH': os.path.join(temp_dir, 'md/'),
15
+ 'INFO_PATH': os.path.join(temp_dir, 'info/'),
16
+ 'IMG_PATH': os.path.join(temp_dir, 'img/'),
17
+ 'RESULTS_PATH': os.path.join(temp_dir, 'results/')
18
+ }
19
+ else:
20
+ # 本地环境使用原来的路径
21
+ return {
22
+ 'DATA_PATH': './src/static/data/pdf/',
23
+ 'TXT_PATH': './src/static/data/txt/',
24
+ 'TSV_PATH': './src/static/data/tsv/',
25
+ 'MD_PATH': './src/static/data/md/',
26
+ 'INFO_PATH': './src/static/data/info/',
27
+ 'IMG_PATH': './src/static/img/',
28
+ 'RESULTS_PATH': './src/static/data/results/'
29
+ }
30
+
31
+ # 全局路径管理函数
32
+ def get_path(path_type, survey_id=None, filename=None):
33
+ """
34
+ 获取动态路径
35
+ path_type: 'pdf', 'txt', 'tsv', 'md', 'info', 'img', 'results'
36
+ survey_id: 可选的调查ID
37
+ filename: 可选的文件名
38
+ """
39
+ paths_config = get_data_paths()
40
+
41
+ if path_type == 'pdf':
42
+ base_path = paths_config['DATA_PATH']
43
+ elif path_type == 'txt':
44
+ base_path = paths_config['TXT_PATH']
45
+ elif path_type == 'tsv':
46
+ base_path = paths_config['TSV_PATH']
47
+ elif path_type == 'md':
48
+ base_path = paths_config['MD_PATH']
49
+ elif path_type == 'info':
50
+ base_path = paths_config['INFO_PATH']
51
+ elif path_type == 'img':
52
+ base_path = paths_config['IMG_PATH']
53
+ elif path_type == 'results':
54
+ base_path = paths_config['RESULTS_PATH']
55
+ else:
56
+ raise ValueError(f"Unknown path type: {path_type}")
57
+
58
+ if survey_id:
59
+ base_path = os.path.join(base_path, str(survey_id))
60
+
61
+ if filename:
62
+ return os.path.join(base_path, filename)
63
+
64
+ return base_path
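
A quick usage sketch of the new get_path helper (illustrative only, not part of the commit; the import path and the survey id "demo123" are assumed):

from src.demo.path_utils import get_path  # adjust to your package layout

txt_root = get_path('txt')                             # e.g. './src/static/data/txt/' locally
survey_dir = get_path('txt', survey_id='demo123')      # './src/static/data/txt/demo123'
outline_json = get_path('txt', 'demo123', 'outline.json')  # full file path
tsne_png = get_path('img', filename='tsne_demo123.png')    # filename without a survey id
print(txt_root, survey_dir, outline_json, tsne_png)

Note that in the Spaces branch get_data_paths() calls tempfile.mkdtemp() on every invocation, so successive get_path calls there appear to resolve to different temporary directories rather than one shared location.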
src/demo/survey_generation_pipeline/asg_loader.py CHANGED
@@ -6,9 +6,14 @@ import os
6
  import re
7
  import json
8
  import subprocess
 
 
 
9
  from langchain_community.document_loaders import UnstructuredMarkdownLoader
10
- from langchain_core.documents import Document
11
  import shutil
 
 
12
 
13
  # load spaCy model
14
  # nlp = spacy.load("en_core_web_sm")
@@ -130,7 +135,9 @@ class DocumentLoading:
130
  }
131
  return extracted_data
132
 
133
- def process_md_file(self, md_file_path, survey_id, txt_path='./src/static/data/txt/'):
 
 
134
  loader = UnstructuredMarkdownLoader(md_file_path)
135
  data = loader.load()
136
  assert len(data) == 1, "Expected exactly one document in the markdown file."
@@ -146,15 +153,15 @@ class DocumentLoading:
146
  invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
147
  for char in invalid_chars:
148
  title_new = title_new.replace(char, ' ')
149
- # print("============================")
150
- # print(title_new)
151
- os.makedirs(f'{txt_path}/{survey_id}', exist_ok=True)
152
- with open(f'{txt_path}/{survey_id}/{title_new}.json', 'w', encoding='utf-8') as f:
153
  json.dump(extracted_data, f, ensure_ascii=False, indent=4)
154
- # print(extracted_data)
155
  return extracted_data['introduction']
156
 
157
- def process_md_file_full(self, md_file_path, survey_id, txt_path='./src/static/data/txt/'):
 
 
158
  loader = UnstructuredMarkdownLoader(md_file_path)
159
  data = loader.load()
160
  assert len(data) == 1, "Expected exactly one document in the markdown file."
@@ -170,18 +177,16 @@ class DocumentLoading:
170
  invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
171
  for char in invalid_chars:
172
  title_new = title_new.replace(char, ' ')
173
- # print("============================")
174
- # print(title_new)
175
- os.makedirs(f'{txt_path}/{survey_id}', exist_ok=True)
176
- with open(f'{txt_path}/{survey_id}/{title_new}.json', 'w', encoding='utf-8') as f:
177
  json.dump(extracted_data, f, ensure_ascii=False, indent=4)
178
- # print(extracted_data)
179
  return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
180
 
181
 
182
  def load_pdf(self, pdf_file, survey_id, mode):
183
- os.makedirs(f'./src/static/data/md/{survey_id}', exist_ok=True)
184
- output_dir = f"./src/static/data/md/{survey_id}"
185
  base_name = os.path.splitext(os.path.basename(pdf_file))[0]
186
  target_dir = os.path.join(output_dir, base_name, "auto")
187
 
@@ -200,8 +205,8 @@ class DocumentLoading:
200
 
201
  # wrong, still being tested
202
  def load_pdf_new(self, pdf_dir, survey_id):
203
- os.makedirs(f'./src/static/data/md/{survey_id}', exist_ok=True)
204
- output_dir = f"./src/static/data/md/{survey_id}"
205
  self.convert_pdf_to_md_new(pdf_dir, output_dir)
206
  markdown_files = glob.glob(os.path.join(output_dir, "*", "auto", "*.md"))
207
  all_introductions = []
@@ -416,7 +421,7 @@ class DocumentLoading:
416
  # # clear blocks that are likely annotations
417
  # if re.search(r'\d{4}\s\d+\s\w+\sConference\s.*?\|\s.*?\|\sDOI:.*?\s\|\s\w+:\s.*?\n', block, flags=re.DOTALL) or \
418
  # re.search(r'http\S+', block) or \
419
- # re.search(r'\d+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+', block, flags=re.DOTALL):
420
  # continue
421
  # cleaned_blocks.append(block)
422
  # return cleaned_blocks
 
6
  import re
7
  import json
8
  import subprocess
9
+ import glob
10
+ from pathlib import Path
11
+ from concurrent.futures import ProcessPoolExecutor
12
  from langchain_community.document_loaders import UnstructuredMarkdownLoader
13
+ from langchain.schema import Document
14
  import shutil
15
+ import tempfile
16
+ from ..path_utils import get_path
17
 
18
  # load spaCy model
19
  # nlp = spacy.load("en_core_web_sm")
 
135
  }
136
  return extracted_data
137
 
138
+ def process_md_file(self, md_file_path, survey_id, txt_path=None):
139
+ if txt_path is None:
140
+ txt_path = get_path('txt')
141
  loader = UnstructuredMarkdownLoader(md_file_path)
142
  data = loader.load()
143
  assert len(data) == 1, "Expected exactly one document in the markdown file."
 
153
  invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
154
  for char in invalid_chars:
155
  title_new = title_new.replace(char, ' ')
156
+
157
+ os.makedirs(get_path('txt', survey_id), exist_ok=True)
158
+ with open(get_path('txt', survey_id, f'{title_new}.json'), 'w', encoding='utf-8') as f:
 
159
  json.dump(extracted_data, f, ensure_ascii=False, indent=4)
 
160
  return extracted_data['introduction']
161
 
162
+ def process_md_file_full(self, md_file_path, survey_id, txt_path=None):
163
+ if txt_path is None:
164
+ txt_path = get_path('txt')
165
  loader = UnstructuredMarkdownLoader(md_file_path)
166
  data = loader.load()
167
  assert len(data) == 1, "Expected exactly one document in the markdown file."
 
177
  invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
178
  for char in invalid_chars:
179
  title_new = title_new.replace(char, ' ')
180
+
181
+ os.makedirs(get_path('txt', survey_id), exist_ok=True)
182
+ with open(get_path('txt', survey_id, f'{title_new}.json'), 'w', encoding='utf-8') as f:
 
183
  json.dump(extracted_data, f, ensure_ascii=False, indent=4)
 
184
  return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
185
 
186
 
187
  def load_pdf(self, pdf_file, survey_id, mode):
188
+ os.makedirs(get_path('md', survey_id), exist_ok=True)
189
+ output_dir = get_path('md', survey_id)
190
  base_name = os.path.splitext(os.path.basename(pdf_file))[0]
191
  target_dir = os.path.join(output_dir, base_name, "auto")
192
 
 
205
 
206
  # wrong, still being tested
207
  def load_pdf_new(self, pdf_dir, survey_id):
208
+ os.makedirs(get_path('md', survey_id), exist_ok=True)
209
+ output_dir = get_path('md', survey_id)
210
  self.convert_pdf_to_md_new(pdf_dir, output_dir)
211
  markdown_files = glob.glob(os.path.join(output_dir, "*", "auto", "*.md"))
212
  all_introductions = []
 
421
  # # clear blocks that are likely annotations
422
  # if re.search(r'\d{4}\s\d+\s\w+\sConference\s.*?\|\s.*?\|\sDOI:.*?\s\|\s\w+:\s.*?\n', block, flags=re.DOTALL) or \
423
  # re.search(r'http\S+', block) or \
424
+ # re.search(r'\d+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+', block, flags=re.DOTALL):
425
  # continue
426
  # cleaned_blocks.append(block)
427
  # return cleaned_blocks
src/demo/survey_generation_pipeline/asg_outline.py CHANGED
@@ -9,7 +9,8 @@ from asg_abstract import AbstractGenerator
9
  from asg_conclusion import ConclusionGenerator
10
  from asg_retriever import *
11
  import pandas as df
12
- from references import generate_references
 
13
 
14
 
15
  class OutlineGenerator():
@@ -259,8 +260,10 @@ class OutlineGenerator():
259
  clean_text = re.sub(r'\s+', ' ', text).strip()
260
  return messages, clean_text
261
 
262
- def parseOutline(survey_id, info_path = './src/static/data/txt'):
263
- file_path = f'{info_path}/{survey_id}/outline.json'
 
 
264
  try:
265
  with open(file_path, 'r', encoding='utf-8') as file:
266
  data = json.load(file)
@@ -286,7 +289,7 @@ def parseOutline(survey_id, info_path = './src/static/data/txt'):
286
  print("Failed to extract a valid list string from the outline content.")
287
  return []
288
 
289
- # 检查提取结果是否为“列表的列表”格式(应该以 "[[" 开头)
290
  fixed_str = response_extracted.strip()
291
  if not fixed_str.startswith("[["):
292
  # 如果不是,则去掉原有的首尾括号,再重新包装:[[ ... ]]
@@ -825,7 +828,7 @@ def generateSurvey(survey_id, title, collection_list, pipeline):
825
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
826
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
827
 
828
- output_path = f'./src/static/data/txt/{survey_id}/generated_result.json'
829
  with open(output_path, 'w', encoding='utf-8') as f:
830
  json.dump(temp, f, ensure_ascii=False, indent=4)
831
  print(f"Survey has been saved to {output_path}.")
@@ -910,25 +913,21 @@ def generateSurvey_qwen(survey_id, title, collection_list, pipeline):
910
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
911
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
912
  temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
913
- output_path = f'./src/static/data/txt/{survey_id}/generated_result.json'
 
914
  with open(output_path, 'w', encoding='utf-8') as f:
915
  json.dump(temp, f, ensure_ascii=False, indent=4)
916
  print(f"Survey has been saved to {output_path}.")
917
  return
918
 
919
  # wza
920
- def generateSurvey_qwen_new(survey_id, title, collection_list, pipeline, citation_data_list, txt_path = "./src/static/data/txt"):
921
- outline = str(parseOutline(survey_id, info_path ='./info'))
 
 
922
  client = getQwenClient()
923
  context_list = generate_context_list(outline, collection_list)
924
 
925
- # print("!!!!!!!!")
926
- # print(context_list)
927
- # print("2025")
928
-
929
- # 不再重复查询citation数据,而是直接使用传入的citation_data_list
930
- # citation_data_list来自get_survey_id传入的Global_citation_data
931
-
932
  temp = {
933
  "survey_id": survey_id,
934
  "outline": outline,
@@ -969,7 +968,7 @@ def generateSurvey_qwen_new(survey_id, title, collection_list, pipeline, citatio
969
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
970
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
971
  temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
972
- output_path = f'{txt_path}/{survey_id}/generated_result.json'
973
  with open(output_path, 'w', encoding='utf-8') as f:
974
  json.dump(temp, f, ensure_ascii=False, indent=4)
975
  print(f"Survey has been saved to {output_path}.")
@@ -1002,7 +1001,7 @@ if __name__ == '__main__':
1002
  Many paradigms have been proposed to asses informativeness of data samples for active learning. One of the popular approaches is selecting the most uncertain data sample, i.e the data sample in which current classifier is least confident. Some other approaches are selecting the sample which yields a model with minimum risk or the data sample which yields fastest convergence in gradient based methods.//
1003
  An active under-sampling approach is presented in this paper to change the data distribution of training datasets, and improve the classification accuracy of minority classes while maintaining overall classification performance.//
1004
  In this paper, we propose an uncertainty-based active learning algorithm which requires only samples of one class and a set of unlabeled data in order to operate.//
1005
- The principal contribution of our work is twofold: First, we use Bayes rule and density estimation to avoid the need to have a model of all classes for computing the uncertainty measure.//
1006
  This technique reduces the number of input parameters of the problem. At the rest of this paper, we first review recent related works in the fields of active learning and active one-class learning (section II).//
1007
  The classifier predicts that all the samples are non-fraud, it will have a quite high accuracy. However, for problems like fraud detection, minority class classification accuracy is more critical.//
1008
  The algorithm used and the features selected are always the key points at design time, and many experiments are needed to select the final algorithm and the best suited feature set.//
 
9
  from asg_conclusion import ConclusionGenerator
10
  from asg_retriever import *
11
  import pandas as df
12
+ from .references import generate_references
13
+ from ..path_utils import get_path
14
 
15
 
16
  class OutlineGenerator():
 
260
  clean_text = re.sub(r'\s+', ' ', text).strip()
261
  return messages, clean_text
262
 
263
+ def parseOutline(survey_id, info_path=None):
264
+ if info_path is None:
265
+ info_path = get_path('txt')
266
+ file_path = get_path('txt', survey_id, 'outline.json')
267
  try:
268
  with open(file_path, 'r', encoding='utf-8') as file:
269
  data = json.load(file)
 
289
  print("Failed to extract a valid list string from the outline content.")
290
  return []
291
 
292
+ # 检查提取结果是否为"列表的列表"格式(应该以 "[[" 开头)
293
  fixed_str = response_extracted.strip()
294
  if not fixed_str.startswith("[["):
295
  # 如果不是,则去掉原有的首尾括号,再重新包装:[[ ... ]]
 
828
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
829
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
830
 
831
+ output_path = get_path('txt', survey_id, 'generated_result.json')
832
  with open(output_path, 'w', encoding='utf-8') as f:
833
  json.dump(temp, f, ensure_ascii=False, indent=4)
834
  print(f"Survey has been saved to {output_path}.")
 
913
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
914
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
915
  temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
916
+ # references = generate_references_dir(get_path('txt', survey_id))
917
+ output_path = get_path('txt', survey_id, 'generated_result.json')
918
  with open(output_path, 'w', encoding='utf-8') as f:
919
  json.dump(temp, f, ensure_ascii=False, indent=4)
920
  print(f"Survey has been saved to {output_path}.")
921
  return
922
 
923
  # wza
924
+ def generateSurvey_qwen_new(survey_id, title, collection_list, pipeline, citation_data_list, txt_path=None):
925
+ if txt_path is None:
926
+ txt_path = get_path('txt')
927
+ outline = str(parseOutline(survey_id))
928
  client = getQwenClient()
929
  context_list = generate_context_list(outline, collection_list)
930
 
 
931
  temp = {
932
  "survey_id": survey_id,
933
  "outline": outline,
 
968
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
969
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
970
  temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
971
+ output_path = get_path('txt', survey_id, 'generated_result.json')
972
  with open(output_path, 'w', encoding='utf-8') as f:
973
  json.dump(temp, f, ensure_ascii=False, indent=4)
974
  print(f"Survey has been saved to {output_path}.")
 
1001
  Many paradigms have been proposed to asses informativeness of data samples for active learning. One of the popular approaches is selecting the most uncertain data sample, i.e the data sample in which current classifier is least confident. Some other approaches are selecting the sample which yields a model with minimum risk or the data sample which yields fastest convergence in gradient based methods.//
1002
  An active under-sampling approach is presented in this paper to change the data distribution of training datasets, and improve the classification accuracy of minority classes while maintaining overall classification performance.//
1003
  In this paper, we propose an uncertainty-based active learning algorithm which requires only samples of one class and a set of unlabeled data in order to operate.//
1004
+ The principal contribution of our work is twofold: First, we use Bayes' rule and density estimation to avoid the need to have a model of all classes for computing the uncertainty measure.//
1005
  This technique reduces the number of input parameters of the problem. At the rest of this paper, we first review recent related works in the fields of active learning and active one-class learning (section II).//
1006
  The classifier predicts that all the samples are non-fraud, it will have a quite high accuracy. However, for problems like fraud detection, minority class classification accuracy is more critical.//
1007
  The algorithm used and the features selected are always the key points at design time, and many experiments are needed to select the final algorithm and the best suited feature set.//
src/demo/survey_generation_pipeline/asg_retriever.py CHANGED
@@ -4,10 +4,11 @@ import re
4
  import os
5
  import json
6
  import chromadb
7
- from asg_splitter import TextSplitting
8
  from langchain_huggingface import HuggingFaceEmbeddings
9
  import time
10
  import concurrent.futures
 
11
 
12
  class Retriever:
13
  client = None
@@ -367,7 +368,7 @@ def query_multiple_collections(collection_names: list[str], query_list: list[str
367
  results[collection_name] = future.result()
368
 
369
  # Automatically save the results to a JSON file
370
- file_path = f'./src/static/data/info/{survey_id}/retrieved_context.json'
371
  with open(file_path, 'w', encoding='utf-8') as f:
372
  json.dump(results, f, ensure_ascii=False, indent=4)
373
 
 
4
  import os
5
  import json
6
  import chromadb
7
+ from .asg_splitter import TextSplitting
8
  from langchain_huggingface import HuggingFaceEmbeddings
9
  import time
10
  import concurrent.futures
11
+ from ..path_utils import get_path
12
 
13
  class Retriever:
14
  client = None
 
368
  results[collection_name] = future.result()
369
 
370
  # Automatically save the results to a JSON file
371
+ file_path = get_path('info', survey_id, 'retrieved_context.json')
372
  with open(file_path, 'w', encoding='utf-8') as f:
373
  json.dump(results, f, ensure_ascii=False, indent=4)
374
 
src/demo/survey_generation_pipeline/category_and_tsne.py CHANGED
@@ -7,8 +7,9 @@ import matplotlib.pyplot as plt
7
  from sklearn.manifold import TSNE
8
  from sklearn.cluster import AgglomerativeClustering
9
  import json
 
10
 
11
- IMG_PATH = './src/static/img/'
12
 
13
  plt.switch_backend('agg')
14
  device = 0
@@ -133,47 +134,11 @@ class ClusteringWithTopic:
133
  print(f"Best n_topics={self.best_n_topics}, Best silhouette_score={self.best_score}")
134
  return self.best_labels, self.best_topic_model, self.best_n_topics
135
 
136
- def clustering(df, n_cluster, survey_id):
137
- text = df['retrieval_result'].astype(str)
138
- clustering = ClusteringWithTopic(text, n_cluster)
139
- df['label'] = clustering.fit_and_get_labels(text)
140
-
141
- print("The clustering result is: ")
142
- for col in df.columns:
143
- print(f"{col}: {df.iloc[0][col]}")
144
-
145
- # Save topic model information as JSON
146
- topic_json = clustering.topic_model.get_topic_info().to_json()
147
- with open(f'./src/static/data/info/{survey_id}/topic.json', 'w', encoding="utf-8") as file:
148
- file.write(topic_json)
149
-
150
- # Create a dictionary from 'ref_title' and 'retrieval_result' columns
151
- description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
152
-
153
- # Save the dictionary to description.json
154
- with open(f'./src/static/data/info/{survey_id}/description.json', 'w', encoding="utf-8") as file:
155
- json.dump(description_dict, file, ensure_ascii=False, indent=4)
156
- # df['top_n_words'] = clustering.topic_model.get_topic_info()['Representation'].tolist()
157
- # df['topic_word'] = clustering.topic_model.get_topic_info()['KeyBERT'].tolist()
158
-
159
-
160
- X = np.array(clustering.embeddings)
161
- perplexity = 10
162
- if X.shape[0] <= perplexity:
163
- perplexity = max(1, X.shape[0] // 2)
164
-
165
- tsne = TSNE(n_components=2, init='pca', perplexity=perplexity, random_state=42)
166
- X_tsne = tsne.fit_transform(X)
167
- colors = scatter(X_tsne, df['label'])
168
-
169
- plt.savefig(IMG_PATH + 'tsne_' + survey_id + '.png', dpi=800, transparent=True)
170
-
171
- plt.close()
172
- output_tsv_filename = "./src/static/data/tsv/" + survey_id + '.tsv'
173
- df.to_csv(output_tsv_filename, sep='\t')
174
- return df, colors
175
-
176
- def clustering(df, n_topics_list, survey_id, info_path='./src/static/data/info', tsv_path='./src/static/data/tsv'):
177
  text = df['retrieval_result'].astype(str)
178
  clustering = ClusteringWithTopic(text, n_topics_list)
179
  df['label'], topic_model, best_n_topics = clustering.fit_and_get_labels()
@@ -184,12 +149,12 @@ def clustering(df, n_topics_list, survey_id, info_path='./src/static/data/info',
184
 
185
  # 保存 topic model 信息
186
  topic_json = topic_model.get_topic_info().to_json()
187
- with open(f'{info_path}/{survey_id}/topic.json', 'w', encoding="utf-8") as file:
188
  file.write(topic_json)
189
 
190
  # 创建描述信息
191
  description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
192
- with open(f'{info_path}/{survey_id}/description.json', 'w', encoding="utf-8") as file:
193
  json.dump(description_dict, file, ensure_ascii=False, indent=4)
194
 
195
  # t-SNE 降维可视化
@@ -201,10 +166,10 @@ def clustering(df, n_topics_list, survey_id, info_path='./src/static/data/info',
201
 
202
  colors = scatter(X_tsne, df['label']) # 计算颜色
203
 
204
- # plt.savefig(IMG_PATH + 'tsne_' + survey_id + '.png', dpi=800, transparent=True)
205
 
206
- # plt.close()
207
- output_tsv_filename = f"{tsv_path}/{survey_id}.tsv"
208
  df.to_csv(output_tsv_filename, sep='\t')
209
  return df, colors, best_n_topics
210
 
 
7
  from sklearn.manifold import TSNE
8
  from sklearn.cluster import AgglomerativeClustering
9
  import json
10
+ from ..path_utils import get_path
11
 
12
+ IMG_PATH = get_path('img')
13
 
14
  plt.switch_backend('agg')
15
  device = 0
 
134
  print(f"Best n_topics={self.best_n_topics}, Best silhouette_score={self.best_score}")
135
  return self.best_labels, self.best_topic_model, self.best_n_topics
136
 
137
+ def clustering(df, n_topics_list, survey_id, info_path=None, tsv_path=None):
138
+ if info_path is None:
139
+ info_path = get_path('info')
140
+ if tsv_path is None:
141
+ tsv_path = get_path('tsv')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  text = df['retrieval_result'].astype(str)
143
  clustering = ClusteringWithTopic(text, n_topics_list)
144
  df['label'], topic_model, best_n_topics = clustering.fit_and_get_labels()
 
149
 
150
  # 保存 topic model 信息
151
  topic_json = topic_model.get_topic_info().to_json()
152
+ with open(get_path('info', survey_id, 'topic.json'), 'w', encoding="utf-8") as file:
153
  file.write(topic_json)
154
 
155
  # 创建描述信息
156
  description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
157
+ with open(get_path('info', survey_id, 'description.json'), 'w', encoding="utf-8") as file:
158
  json.dump(description_dict, file, ensure_ascii=False, indent=4)
159
 
160
  # t-SNE 降维可视化
 
166
 
167
  colors = scatter(X_tsne, df['label']) # 计算颜色
168
 
169
+ plt.savefig(get_path('img', filename='tsne_' + survey_id + '.png'), dpi=800, transparent=True)
170
 
171
+ plt.close()
172
+ output_tsv_filename = get_path('tsv', survey_id + '.tsv')
173
  df.to_csv(output_tsv_filename, sep='\t')
174
  return df, colors, best_n_topics
175
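With `info_path` and `tsv_path` now defaulting to `get_path('info')` and `get_path('tsv')`, pipeline callers no longer need to know the storage layout. A hypothetical call, assuming the BERTopic-based `ClusteringWithTopic` dependencies are installed and the per-survey info directory already exists:

import pandas as pd

refs_df = pd.DataFrame({
    'ref_title': ['Paper A', 'Paper B', 'Paper C'],
    'retrieval_result': ['retrieved text for A', 'retrieved text for B', 'retrieved text for C'],
})
# Paths are resolved internally via get_path(); only the survey id is required.
labeled_df, colors, best_n_topics = clustering(refs_df, n_topics_list=[2, 3], survey_id='demo_survey')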
 
src/demo/views.py CHANGED
@@ -41,6 +41,10 @@ import glob
41
 
42
  from langchain_huggingface import HuggingFaceEmbeddings
43
  from dotenv import load_dotenv
 
 
 
 
44
 
45
  dotenv_path = os.path.join(os.path.dirname(__file__), ".env")
46
  load_dotenv()
@@ -55,26 +59,45 @@ load_dotenv()
55
  # print(f"OPENAI_API_KEY: {openai_api_key}")
56
  # print(f"OPENAI_API_BASE: {openai_api_base}")
57
 
58
- import os
59
- from pathlib import Path
60
- from markdown_pdf import MarkdownPdf, Section
61
-
62
- DATA_PATH = './src/static/data/pdf/'
63
- TXT_PATH = './src/static/data/txt/'
64
- TSV_PATH = './src/static/data/tsv/'
65
- MD_PATH = './src/static/data/md/'
66
- INFO_PATH = './src/static/data/info/'
67
- IMG_PATH = './src/static/img/'
68
 
69
  paths = [DATA_PATH, TXT_PATH, TSV_PATH, MD_PATH, INFO_PATH, IMG_PATH]
70
 
 
71
  for path in paths:
72
- path_obj = Path(path)
73
- if not path_obj.exists():
74
- path_obj.mkdir(parents=True, exist_ok=True)
75
- print(f"Created directory: {path}")
76
- else:
77
- print(f"Directory already exists: {path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
 
80
 
@@ -165,17 +188,19 @@ def index(request):
165
  def delete_files(request):
166
  if request.method == 'POST':
167
  try:
168
- folders = ['./src/static/data/pdf/', './src/static/data/tsv/', './src/static/data/txt/', './src/static/data/md/']
 
169
  for folder in folders:
170
- for filename in os.listdir(folder):
171
- file_path = os.path.join(folder, filename)
172
- try:
173
- if os.path.isfile(file_path) or os.path.islink(file_path):
174
- os.unlink(file_path)
175
- elif os.path.isdir(file_path):
176
- shutil.rmtree(file_path)
177
- except Exception as e:
178
- return JsonResponse({'success': False, 'message': str(e)})
 
179
  return JsonResponse({'success': True})
180
  except Exception as e:
181
  return JsonResponse({'success': False, 'message': str(e)})
@@ -279,7 +304,7 @@ def sanitize_filename_py(filename):
279
 
280
  def get_existing_survey_ids():
281
 
282
- tsv_directory = os.path.join("src", "static", "data", "tsv")
283
  survey_ids = []
284
  try:
285
  for file_name in os.listdir(tsv_directory):
@@ -299,7 +324,7 @@ def get_surveys(request):
299
  def upload_refs(request):
300
 
301
  start_time = time.time()
302
- RECOMMENDED_PDF_DIR = os.path.join("src", "static", "data", "pdf", "recommend_pdfs")
303
  if request.method == 'POST':
304
  if not request.FILES:
305
  if not os.path.exists(RECOMMENDED_PDF_DIR):
@@ -369,7 +394,7 @@ def upload_refs(request):
369
  continue
370
  sanitized_filename = f"{sanitized_filename}{file_extension}"
371
 
372
- file_path = os.path.join('src', 'static', 'data', 'pdf', Global_survey_id, sanitized_filename)
373
  if default_storage.exists(file_path):
374
  default_storage.delete(file_path)
375
 
@@ -388,7 +413,7 @@ def upload_refs(request):
388
  csvfile_name = new_file_name + '.'+ file_name.split('.')[-1]
389
 
390
  json_data_pd = pd.DataFrame()
391
- json_files_path = f'./src/static/data/txt/{Global_survey_id}/*.json'
392
  json_files = glob.glob(json_files_path)
393
 
394
  # Dictionary to hold title and abstract pairs
@@ -425,7 +450,7 @@ def upload_refs(request):
425
  title_abstract_dict[title] = abstract
426
 
427
  input_pd = json_data_pd
428
- output_path = f'./src/static/data/info/{Global_survey_id}/title_abstract_pairs.json'
429
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
430
 
431
  with open(output_path, 'w', encoding="utf-8") as outfile:
@@ -446,7 +471,8 @@ def upload_refs(request):
446
  input_pd["label"] = input_pd["reference paper category label (optional)"].apply(lambda x: str(x) if len(str(x))>0 else '')
447
 
448
  try:
449
- output_tsv_filename = "./src/static/data/tsv/" + new_file_name + '.tsv'
 
450
 
451
  output_df = input_pd[["ref_title","ref_context","ref_entry","abstract","intro"]]
452
 
@@ -558,7 +584,7 @@ def generate_arxiv_query(request):
558
  new_count += 1
559
 
560
  attempts += 1
561
- current_query = generic_query # 将本轮的宽松查询作为“新的严格查询”
562
 
563
  if len(total_papers) >= min_results:
564
  # 一旦达到 min_results,就返回此时的查询
@@ -596,7 +622,7 @@ def download_pdfs(request):
596
  if not pdf_links:
597
  return JsonResponse({"message": "No PDFs to download."}, status=400)
598
 
599
- base_dir = os.path.join(os.getcwd(), "src", "static", "data", "pdf", "recommend_pdfs")
600
  os.makedirs(base_dir, exist_ok=True) # 确保文件夹存在
601
 
602
  downloaded_files = []
@@ -668,33 +694,14 @@ def automatic_taxonomy(request):
668
  description = generate(context, query, name)
669
  Global_description_list.append(description)
670
 
671
- # Save citation data to file for debugging or reference
672
- citation_path = f'./src/static/data/info/{Global_survey_id}/citation_data.json'
673
  os.makedirs(os.path.dirname(citation_path), exist_ok=True)
674
- with open(citation_path, 'w', encoding="utf-8") as outfile:
675
- json.dump(Global_citation_data, outfile, indent=4, ensure_ascii=False)
676
-
677
- file_path = f'./src/static/data/tsv/{Global_survey_id}.tsv'
678
- with open(file_path, 'r', newline='', encoding='utf-8') as infile:
679
- reader = csv.reader(infile, delimiter='\t')
680
- rows = list(reader)
681
-
682
- if rows:
683
- headers = rows[0]
684
- headers.append('retrieval_result')
685
 
686
- updated_rows = [headers]
687
- for row, description in zip(rows[1:], Global_description_list):
688
- row.append(description)
689
- updated_rows.append(row)
690
-
691
- with open(file_path, 'w', newline='', encoding='utf-8') as outfile:
692
- writer = csv.writer(outfile, delimiter='\t')
693
- writer.writerows(updated_rows)
694
-
695
- print('Updated file has been saved to', file_path)
696
- else:
697
- print('Input file is empty.')
698
 
699
  Global_ref_list = ref_list
700
 
@@ -708,11 +715,11 @@ def automatic_taxonomy(request):
708
  ref_titles = list(df_tmp.groupby(df_tmp['label'])['ref_title'].apply(list))
709
  ref_indexs = list(df_tmp.groupby(df_tmp['label'])['index'].apply(list))
710
 
711
- info = pd.read_json(f'./src/static/data/info/{Global_survey_id}/topic.json')
712
  category_label = info['KeyBERT'].to_list()
713
  category_label_summarized=[]
714
 
715
- tsv_path = f'./src/static/data/tsv/{Global_survey_id}.tsv'
716
 
717
  cluster_num = Global_cluster_num
718
  category_label_summarized = generate_cluster_name_new(tsv_path, Global_survey_title, cluster_num)
@@ -733,7 +740,7 @@ def automatic_taxonomy(request):
733
  temp = [legal_pdf(i) for i in value]
734
  cluster_info[key] = temp
735
  Global_collection_names_clustered.append(temp)
736
- cluster_info_path = f'./src/static/data/info/{Global_survey_id}/cluster_info.json'
737
  with open(cluster_info_path, 'w', encoding="utf-8") as outfile:
738
  json.dump(cluster_info, outfile, indent=4, ensure_ascii=False)
739
 
@@ -743,7 +750,7 @@ def automatic_taxonomy(request):
743
  messages, outline = outline_generator.generate_outline_qwen(Global_survey_title, Global_cluster_num)
744
 
745
  outline_json = {'messages':messages, 'outline': outline}
746
- output_path = TXT_PATH + Global_survey_id + '/outline.json'
747
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
748
  with open(output_path, 'w', encoding="utf-8") as outfile:
749
  json.dump(outline_json, outfile, indent=4, ensure_ascii=False)
@@ -767,7 +774,7 @@ def save_updated_cluster_info(request):
767
  if not survey_id or not updated_cate_list:
768
  return JsonResponse({"error": "Missing survey_id or updated_cate_list"}, status=400)
769
 
770
- save_dir = os.path.join('./src/static/data/info/', str(survey_id))
771
  os.makedirs(save_dir, exist_ok=True)
772
  save_path = os.path.join(save_dir, 'cluster_info_updated.json')
773
 
@@ -807,7 +814,7 @@ def save_outline(request):
807
  "outline": str(updated_outline)
808
  }
809
 
810
- file_path = os.path.join(settings.BASE_DIR, 'static', 'data', 'txt', Global_survey_id,'outline.json')
811
  os.makedirs(os.path.dirname(file_path), exist_ok=True)
812
 
813
  with open(file_path, 'w', encoding='utf-8') as file:
@@ -952,7 +959,7 @@ def generate_pdf(request):
952
  if request.method == 'POST':
953
  survey_id = request.POST.get('survey_id', '')
954
  markdown_content = request.POST.get('content', '')
955
- markdown_dir = f'./src/static/data/info/{survey_id}/'
956
  markdown_filename = f'survey_{survey_id}_vanilla.md'
957
  markdown_filepath = os.path.join(markdown_dir, markdown_filename)
958
 
@@ -970,7 +977,7 @@ def generate_pdf(request):
970
 
971
  markdown_content = finalize_survey_paper(markdown_content, Global_collection_names, Global_file_names)
972
  # 设置 Markdown 文件的保存路径1
973
- markdown_dir = f'./src/static/data/info/{survey_id}/'
974
  markdown_filename = f'survey_{survey_id}_processed.md'
975
  markdown_filepath = os.path.join(markdown_dir, markdown_filename)
976
 
@@ -990,7 +997,7 @@ def generate_pdf(request):
990
 
991
  # 配置 PDF 文件的保存路径
992
  pdf_filename = f'survey_{survey_id}.pdf'
993
- pdf_dir = './src/static/data/results'
994
  pdf_filepath = os.path.join(pdf_dir, pdf_filename)
995
 
996
  # 检查并创建 results 目录
@@ -1022,13 +1029,13 @@ def generate_pdf_from_tex(request):
1022
 
1023
  global Global_survey_id, Global_survey_title
1024
  if request.method == 'POST':
1025
- base_dir = f'./src/static/data/info/{Global_survey_id}'
1026
  md_path = os.path.join(base_dir, f'survey_{Global_survey_id}_processed.md')
1027
  new_md_path = os.path.join(base_dir, f'survey_{Global_survey_id}_preprocessed.md')
1028
  tex_path = os.path.join(base_dir, 'template.tex')
1029
  new_tex_path = os.path.join(base_dir, 'template_with_figure.tex')
1030
  sty_path = os.path.join(base_dir, 'acl.sty')
1031
- pdf_dir = './src/static/data/results'
1032
 
1033
  os.makedirs(base_dir, exist_ok=True)
1034
  print(f"Directory '{base_dir}' checked or created.")
@@ -1044,9 +1051,9 @@ def generate_pdf_from_tex(request):
1044
  md_to_tex(new_md_path, tex_path, Global_survey_title)
1045
 
1046
  insert_figures(
1047
- png_path=f'src/static/data/info/{Global_survey_id}/outline.png',
1048
  tex_path= tex_path,
1049
- json_path=f'src/static/data/info/{Global_survey_id}/flowchart_results.json',
1050
  ref_names= Global_ref_list,
1051
  survey_title=Global_survey_title,
1052
  new_tex_path=new_tex_path
@@ -1155,7 +1162,7 @@ def get_survey_text(refs=Global_ref_list):
1155
 
1156
  def Clustering_refs(n_clusters):
1157
  global Global_cluster_num
1158
- df = pd.read_csv(TSV_PATH + Global_survey_id + '.tsv', sep='\t', index_col=0, encoding='utf-8')
1159
 
1160
  print(Global_ref_list)
1161
  df_selected = df.iloc[Global_ref_list]
@@ -1232,10 +1239,10 @@ def finalize_survey_paper(paper_text,
1232
  Global_ref_list = ref_list
1233
  print(ref_list)
1234
 
1235
- json_path = os.path.join("src", "static", "data", "txt", Global_survey_id, "outline.json")
1236
- output_png_path = os.path.join("src", "static", "data", "info", Global_survey_id, "outline")
1237
- md_path = os.path.join("src", "static", "data", "info", Global_survey_id, f"survey_{Global_survey_id}_processed.md")
1238
- flowchart_results_path = os.path.join("src", "static", "data", "info", Global_survey_id, "flowchart_results.json")
1239
  detect_flowcharts(Global_survey_id)
1240
  png_path = generate_graphviz_png(
1241
  json_path=json_path,
 
41
 
42
  from langchain_huggingface import HuggingFaceEmbeddings
43
  from dotenv import load_dotenv
44
+ from pathlib import Path
45
+ from markdown_pdf import MarkdownPdf, Section
46
+ import tempfile
47
+ from .path_utils import get_path
48
 
49
  dotenv_path = os.path.join(os.path.dirname(__file__), ".env")
50
  load_dotenv()
 
59
  # print(f"OPENAI_API_KEY: {openai_api_key}")
60
  # print(f"OPENAI_API_BASE: {openai_api_base}")
61
 
62
+ # 获取路径配置
63
+ paths_config = get_path('pdf') # 使用 get_path 函数获取路径配置
64
+ DATA_PATH = get_path('pdf')
65
+ TXT_PATH = get_path('txt')
66
+ TSV_PATH = get_path('tsv')
67
+ MD_PATH = get_path('md')
68
+ INFO_PATH = get_path('info')
69
+ IMG_PATH = get_path('img')
 
 
70
 
71
  paths = [DATA_PATH, TXT_PATH, TSV_PATH, MD_PATH, INFO_PATH, IMG_PATH]
72
 
73
+ # 安全地创建目录
74
  for path in paths:
75
+ try:
76
+ path_obj = Path(path)
77
+ if not path_obj.exists():
78
+ path_obj.mkdir(parents=True, exist_ok=True)
79
+ print(f"Created directory: {path}")
80
+ else:
81
+ print(f"Directory already exists: {path}")
82
+ except (PermissionError, OSError) as e:
83
+ print(f"Warning: Could not create directory {path}: {e}")
84
+ # 在 Hugging Face Spaces 中,如果无法创建目录,使用临时目录
85
+ if os.environ.get('SPACE_ID') or os.environ.get('HF_SPACE_ID'):
86
+ temp_dir = tempfile.mkdtemp()
87
+ # 更新路径为临时目录
88
+ if 'pdf' in path:
89
+ DATA_PATH = os.path.join(temp_dir, 'pdf/')
90
+ elif 'txt' in path:
91
+ TXT_PATH = os.path.join(temp_dir, 'txt/')
92
+ elif 'tsv' in path:
93
+ TSV_PATH = os.path.join(temp_dir, 'tsv/')
94
+ elif 'md' in path:
95
+ MD_PATH = os.path.join(temp_dir, 'md/')
96
+ elif 'info' in path:
97
+ INFO_PATH = os.path.join(temp_dir, 'info/')
98
+ elif 'img' in path:
99
+ IMG_PATH = os.path.join(temp_dir, 'img/')
100
+ print(f"Using temporary directory: {temp_dir}")
101
 
102
 
103
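The per-category fallback above repeats the same pattern for each directory; the underlying technique is simply "try to create the preferred directory, and degrade to a fresh temp directory when the filesystem is read-only", which is the situation on Hugging Face Spaces. A minimal sketch of that idea as a reusable helper (the `ensure_writable_dir` name is hypothetical, not part of this commit):

import os
import tempfile
from pathlib import Path

def ensure_writable_dir(preferred: str) -> str:
    # Return `preferred` if it exists or can be created; otherwise fall back to a
    # temp directory that keeps the last path component (e.g. .../pdf -> /tmp/xxxx/pdf).
    try:
        Path(preferred).mkdir(parents=True, exist_ok=True)
        return preferred
    except (PermissionError, OSError):
        fallback = os.path.join(tempfile.mkdtemp(), Path(preferred).name)
        os.makedirs(fallback, exist_ok=True)
        return fallback

# e.g. DATA_PATH = ensure_writable_dir(get_path('pdf')), and likewise for TXT_PATH, TSV_PATH, ...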
 
 
188
  def delete_files(request):
189
  if request.method == 'POST':
190
  try:
191
+ # 使用动态路径而不是硬编码路径
192
+ folders = [DATA_PATH, TSV_PATH, TXT_PATH, MD_PATH]
193
  for folder in folders:
194
+ if os.path.exists(folder):
195
+ for filename in os.listdir(folder):
196
+ file_path = os.path.join(folder, filename)
197
+ try:
198
+ if os.path.isfile(file_path) or os.path.islink(file_path):
199
+ os.unlink(file_path)
200
+ elif os.path.isdir(file_path):
201
+ shutil.rmtree(file_path)
202
+ except Exception as e:
203
+ return JsonResponse({'success': False, 'message': str(e)})
204
  return JsonResponse({'success': True})
205
  except Exception as e:
206
  return JsonResponse({'success': False, 'message': str(e)})
 
304
 
305
  def get_existing_survey_ids():
306
 
307
+ tsv_directory = get_path('tsv')
308
  survey_ids = []
309
  try:
310
  for file_name in os.listdir(tsv_directory):
 
324
  def upload_refs(request):
325
 
326
  start_time = time.time()
327
+ RECOMMENDED_PDF_DIR = get_path('pdf', 'recommend_pdfs')
328
  if request.method == 'POST':
329
  if not request.FILES:
330
  if not os.path.exists(RECOMMENDED_PDF_DIR):
 
394
  continue
395
  sanitized_filename = f"{sanitized_filename}{file_extension}"
396
 
397
+ file_path = os.path.join(get_path('pdf', Global_survey_id), sanitized_filename)
398
  if default_storage.exists(file_path):
399
  default_storage.delete(file_path)
400
 
 
413
  csvfile_name = new_file_name + '.'+ file_name.split('.')[-1]
414
 
415
  json_data_pd = pd.DataFrame()
416
+ json_files_path = get_path('txt', Global_survey_id) + '/*.json'
417
  json_files = glob.glob(json_files_path)
418
 
419
  # Dictionary to hold title and abstract pairs
 
450
  title_abstract_dict[title] = abstract
451
 
452
  input_pd = json_data_pd
453
+ output_path = get_path('txt', Global_survey_id, 'title_abstract_pairs.json')
454
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
455
 
456
  with open(output_path, 'w', encoding="utf-8") as outfile:
 
471
  input_pd["label"] = input_pd["reference paper category label (optional)"].apply(lambda x: str(x) if len(str(x))>0 else '')
472
 
473
  try:
474
+ output_tsv_filename = get_path('tsv', filename=new_file_name + '.tsv')
475
+ os.makedirs(os.path.dirname(output_tsv_filename), exist_ok=True)
476
 
477
  output_df = input_pd[["ref_title","ref_context","ref_entry","abstract","intro"]]
478
 
 
584
  new_count += 1
585
 
586
  attempts += 1
587
+ current_query = generic_query # 将本轮的宽松查询作为"新的严格查询"
588
 
589
  if len(total_papers) >= min_results:
590
  # 一旦达到 min_results,就返回此时的查询
 
622
  if not pdf_links:
623
  return JsonResponse({"message": "No PDFs to download."}, status=400)
624
 
625
+ base_dir = get_path('pdf', 'recommend_pdfs')
626
  os.makedirs(base_dir, exist_ok=True) # 确保文件夹存在
627
 
628
  downloaded_files = []
 
694
  description = generate(context, query, name)
695
  Global_description_list.append(description)
696
 
697
+ # 保存引用数据
698
+ citation_path = get_path('info', Global_survey_id, 'citation_data.json')
699
  os.makedirs(os.path.dirname(citation_path), exist_ok=True)
700
+ with open(citation_path, 'w', encoding='utf-8') as f:
701
+ json.dump(Global_citation_data, f, ensure_ascii=False, indent=2)
 
 
 
 
 
 
 
 
 
702
 
703
+ # 读取TSV文件
704
+ file_path = get_path('tsv', Global_survey_id + '.tsv')
 
 
 
 
 
 
 
 
 
 
705
 
706
  Global_ref_list = ref_list
707
 
 
715
  ref_titles = list(df_tmp.groupby(df_tmp['label'])['ref_title'].apply(list))
716
  ref_indexs = list(df_tmp.groupby(df_tmp['label'])['index'].apply(list))
717
 
718
+ info = pd.read_json(get_path('info', Global_survey_id, 'topic.json'))
719
  category_label = info['KeyBERT'].to_list()
720
  category_label_summarized=[]
721
 
722
+ tsv_path = get_path('tsv', Global_survey_id + '.tsv')
723
 
724
  cluster_num = Global_cluster_num
725
  category_label_summarized = generate_cluster_name_new(tsv_path, Global_survey_title, cluster_num)
 
740
  temp = [legal_pdf(i) for i in value]
741
  cluster_info[key] = temp
742
  Global_collection_names_clustered.append(temp)
743
+ cluster_info_path = get_path('info', Global_survey_id, 'cluster_info.json')
744
  with open(cluster_info_path, 'w', encoding="utf-8") as outfile:
745
  json.dump(cluster_info, outfile, indent=4, ensure_ascii=False)
746
 
 
750
  messages, outline = outline_generator.generate_outline_qwen(Global_survey_title, Global_cluster_num)
751
 
752
  outline_json = {'messages':messages, 'outline': outline}
753
+ output_path = get_path('txt', Global_survey_id, 'outline.json')
754
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
755
  with open(output_path, 'w', encoding="utf-8") as outfile:
756
  json.dump(outline_json, outfile, indent=4, ensure_ascii=False)
 
774
  if not survey_id or not updated_cate_list:
775
  return JsonResponse({"error": "Missing survey_id or updated_cate_list"}, status=400)
776
 
777
+ save_dir = get_path('info', str(survey_id))
778
  os.makedirs(save_dir, exist_ok=True)
779
  save_path = os.path.join(save_dir, 'cluster_info_updated.json')
780
 
 
814
  "outline": str(updated_outline)
815
  }
816
 
817
+ file_path = get_path('txt', Global_survey_id, 'outline.json')
818
  os.makedirs(os.path.dirname(file_path), exist_ok=True)
819
 
820
  with open(file_path, 'w', encoding='utf-8') as file:
 
959
  if request.method == 'POST':
960
  survey_id = request.POST.get('survey_id', '')
961
  markdown_content = request.POST.get('content', '')
962
+ markdown_dir = get_path('info', survey_id) + '/'
963
  markdown_filename = f'survey_{survey_id}_vanilla.md'
964
  markdown_filepath = os.path.join(markdown_dir, markdown_filename)
965
 
 
977
 
978
  markdown_content = finalize_survey_paper(markdown_content, Global_collection_names, Global_file_names)
979
  # 设置 Markdown 文件的保存路径1
980
+ markdown_dir = get_path('info', survey_id) + '/'
981
  markdown_filename = f'survey_{survey_id}_processed.md'
982
  markdown_filepath = os.path.join(markdown_dir, markdown_filename)
983
 
 
997
 
998
  # 配置 PDF 文件的保存路径
999
  pdf_filename = f'survey_{survey_id}.pdf'
1000
+ pdf_dir = get_path('results')
1001
  pdf_filepath = os.path.join(pdf_dir, pdf_filename)
1002
 
1003
  # 检查并创建 results 目录
 
1029
 
1030
  global Global_survey_id, Global_survey_title
1031
  if request.method == 'POST':
1032
+ base_dir = get_path('info', Global_survey_id)
1033
  md_path = os.path.join(base_dir, f'survey_{Global_survey_id}_processed.md')
1034
  new_md_path = os.path.join(base_dir, f'survey_{Global_survey_id}_preprocessed.md')
1035
  tex_path = os.path.join(base_dir, 'template.tex')
1036
  new_tex_path = os.path.join(base_dir, 'template_with_figure.tex')
1037
  sty_path = os.path.join(base_dir, 'acl.sty')
1038
+ pdf_dir = get_path('results')
1039
 
1040
  os.makedirs(base_dir, exist_ok=True)
1041
  print(f"Directory '{base_dir}' checked or created.")
 
1051
  md_to_tex(new_md_path, tex_path, Global_survey_title)
1052
 
1053
  insert_figures(
1054
+ png_path=get_path('info', Global_survey_id, 'outline.png'),
1055
  tex_path= tex_path,
1056
+ json_path=get_path('info', Global_survey_id, 'flowchart_results.json'),
1057
  ref_names= Global_ref_list,
1058
  survey_title=Global_survey_title,
1059
  new_tex_path=new_tex_path
 
1162
 
1163
  def Clustering_refs(n_clusters):
1164
  global Global_cluster_num
1165
+ df = pd.read_csv(get_path('tsv', Global_survey_id + '.tsv'), sep='\t', index_col=0, encoding='utf-8')
1166
 
1167
  print(Global_ref_list)
1168
  df_selected = df.iloc[Global_ref_list]
 
1239
  Global_ref_list = ref_list
1240
  print(ref_list)
1241
 
1242
+ json_path = get_path('txt', Global_survey_id, 'outline.json')
1243
+ output_png_path = get_path('info', Global_survey_id, 'outline')
1244
+ md_path = get_path('info', Global_survey_id, f'survey_{Global_survey_id}_processed.md')
1245
+ flowchart_results_path = get_path('info', Global_survey_id, 'flowchart_results.json')
1246
  detect_flowcharts(Global_survey_id)
1247
  png_path = generate_graphviz_png(
1248
  json_path=json_path,