technicolor committed
Commit 92d8c87 · 1 Parent(s): c320a1b
src/demo/asg_add_flowchart.py CHANGED
@@ -2,17 +2,23 @@ import json
2
  import os
3
  import re
4
  from urllib.parse import quote
5
 
6
- import os
7
- import json
8
  import torch
9
  import torchvision.transforms as transforms
10
  from torchvision import models
11
  from PIL import Image
12
 
13
- # 常量定义
14
- BASE_DIR = os.path.normpath("src/static/data/md") # 根目录
15
- INFO_DIR = os.path.normpath("src/static/data/info") # 存放 JSON 结果的目录
16
 
17
  # 加载 PyTorch EfficientNet 训练好的 3 类分类模型
18
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -288,9 +294,9 @@ def insert_tex_images(json_path, ref_names, text):
288
  # 示例用法
289
  if __name__ == "__main__":
290
  # Markdown 文件路径
291
- md_file_path = "src/static/data/info/test/survey_test_processed.md"
292
  # JSON 文件路径
293
- json_file_path = "src/static/data/info/test/flowchart_results.json"
294
 
295
  try:
296
  with open(md_file_path, "r", encoding="utf-8") as f:
 
2
  import os
3
  import re
4
  from urllib.parse import quote
5
+ import cv2
6
+ import numpy as np
7
+ from PIL import Image, ImageDraw, ImageFont
8
+ import matplotlib.pyplot as plt
9
+ import matplotlib.patches as patches
10
+ from matplotlib.patches import Rectangle
11
+ import matplotlib.patches as mpatches
12
+ from .path_utils import get_path
13
 
 
 
14
  import torch
15
  import torchvision.transforms as transforms
16
  from torchvision import models
17
  from PIL import Image
18
 
19
+ # 使用动态路径
20
+ BASE_DIR = get_path('md') # 根目录
21
+ INFO_DIR = get_path('info') # 存放 JSON 结果的目录
22
 
23
  # 加载 PyTorch EfficientNet 训练好的 3 类分类模型
24
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
294
  # 示例用法
295
  if __name__ == "__main__":
296
  # Markdown 文件路径
297
+ md_file_path = get_path('info', 'test', 'survey_test_processed.md')
298
  # JSON 文件路径
299
+ json_file_path = get_path('info', 'test', 'flowchart_results.json')
300
 
301
  try:
302
  with open(md_file_path, "r", encoding="utf-8") as f:
src/demo/asg_latex.py CHANGED
@@ -1,12 +1,14 @@
 
1
  import re
2
  import subprocess
3
- import os
4
-
5
- from openai import OpenAI
6
  import dotenv
7
  from .asg_add_flowchart import insert_tex_images
8
  from .asg_mindmap import insert_outline_figure
9
 
 
 
10
 
11
  def _remove_div_blocks(lines):
12
  """
@@ -435,7 +437,7 @@ def md_to_tex_section_without_jpg(section):
435
  # - 标题看起来只是一个段落号, 形如"3"、"3.1"、"3.1.1" 等 (可根据需要调宽或调窄判断规则)
436
 
437
  # 例:用一个正则匹配 `数字(.数字)*`,可带可不带后缀空格
438
- # 如果 heading_text 完全匹配这个模式,就认为它是个“纯编号标题”,不必调用 LLM
439
  pure_number_pattern = re.compile(r'^\d+(\.\d+)*$')
440
 
441
  # 先去一下两端空格
@@ -548,7 +550,7 @@ def md_to_tex_section_without_jpg(section):
548
 
549
  def insert_section(tex_path: str, section_content: str):
550
  """
551
- 将 section_content 追加到 .tex 文件“最后一个 section(或子节)的正文末尾”。
552
  具体逻辑如下:
553
  1. 如果文件内找不到任何 \section{...}、\subsection{...}、\subsubsection{...},
554
  那么就将 section_content 插入到 \end{abstract} 之后。
@@ -564,7 +566,7 @@ def insert_section(tex_path: str, section_content: str):
564
 
565
  注意:
566
  - 这段逻辑会将新的内容**追加**到最后一个标题所对应正文的末尾,
567
- 这样可以避免把之前的内容“分割”或“顶开”。
568
  """
569
 
570
  if not os.path.exists(tex_path):
@@ -616,11 +618,11 @@ def insert_section(tex_path: str, section_content: str):
616
  )
617
 
618
  else:
619
- # 有标题时,将内容追加到“最后一个标题对应正文”的末尾
620
  last_title_line = title_lines[-1]
621
 
622
  # 找到下一个标题的行号(如果有),或 \end{document} 行号,以确定正文区间结束
623
- # “最后标题正文”从 last_title_line+1 一直到 next_title_line-1(或结束)
624
  next_boundaries = [end_document_line if end_document_line is not None else len(lines)]
625
  for t_line in title_lines:
626
  if t_line > last_title_line:
@@ -628,8 +630,8 @@ def insert_section(tex_path: str, section_content: str):
628
  # next_boundary 是最后标题之后遇到的第一个 boundary(若没有, 就是文件末尾)
629
  next_boundary = min(next_boundaries) if next_boundaries else len(lines)
630
 
631
- # 我们希望将新的内容插在“最后标题正文的最末尾”之后,也就是说在 next_boundary 前。
632
- # 不过若“最后标题”本身就处于全文件最终,next_boundary 可能表示文件末尾/文档结束。
633
  # 这里为了避免把最后一行顶下去,可以先把其中的正文行都保留,再在最后插入 section_content。
634
  new_lines = []
635
  new_lines.extend(lines[:next_boundary]) # 保留从头到最后正文结束
@@ -804,8 +806,8 @@ if __name__ == "__main__":
804
  # 读取环境变量
805
  dotenv.load_dotenv()
806
  # md_path = preprocess_md("src/demo/latex_template/test copy.md", "src/demo/latex_template/test_preprocessed.md")
807
- md_path = 'src/static/data/info/undefined/survey_undefined_preprocessed.md'
808
- tex_path = "src/static/data/info/undefined/template.tex"
809
  md_to_tex(md_path, tex_path, title="A Comprehensive Review of ADMM On Consensus Distributed Optimization")
810
  # insert_figures('src/static/data/info/undefined/outline.png',
811
  # 'src/demo/latex_template/template.tex',
 
1
+ import os
2
  import re
3
  import subprocess
4
+ import shutil
5
+ from .path_utils import get_path
 
6
  import dotenv
7
  from .asg_add_flowchart import insert_tex_images
8
  from .asg_mindmap import insert_outline_figure
9
 
10
+ from openai import OpenAI
11
+
12
 
13
  def _remove_div_blocks(lines):
14
  """
 
437
  # - 标题看起来只是一个段落号, 形如"3"、"3.1"、"3.1.1" 等 (可根据需要调宽或调窄判断规则)
438
 
439
  # 例:用一个正则匹配 `数字(.数字)*`,可带可不带后缀空格
440
+ # 如果 heading_text 完全匹配这个模式,就认为它是个"纯编号标题",不必调用 LLM
441
  pure_number_pattern = re.compile(r'^\d+(\.\d+)*$')
442
 
443
  # 先去一下两端空格
 
550
 
551
  def insert_section(tex_path: str, section_content: str):
552
  """
553
+ 将 section_content 追加到 .tex 文件"最后一个 section(或子节)的正文末尾"。
554
  具体逻辑如下:
555
  1. 如果文件内找不到任何 \section{...}、\subsection{...}、\subsubsection{...},
556
  那么就将 section_content 插入到 \end{abstract} 之后。
 
566
 
567
  注意:
568
  - 这段逻辑会将新的内容**追加**到最后一个标题所对应正文的末尾,
569
+ 这样可以避免把之前的内容"分割"或"顶开"。
570
  """
571
 
572
  if not os.path.exists(tex_path):
 
618
  )
619
 
620
  else:
621
+ # 有标题时,将内容追加到"最后一个标题对应正文"的末尾
622
  last_title_line = title_lines[-1]
623
 
624
  # 找到下一个标题的行号(如果有),或 \end{document} 行号,以确定正文区间结束
625
+ # "最后标题正文"从 last_title_line+1 一直到 next_title_line-1(或结束)
626
  next_boundaries = [end_document_line if end_document_line is not None else len(lines)]
627
  for t_line in title_lines:
628
  if t_line > last_title_line:
 
630
  # next_boundary 是最后标题之后遇到的第一个 boundary(若没有, 就是文件末尾)
631
  next_boundary = min(next_boundaries) if next_boundaries else len(lines)
632
 
633
+ # 我们希望将新的内容插在"最后标题正文的最末尾"之后,也就是说在 next_boundary 前。
634
+ # 不过若"最后标题"本身就处于全文件最终,next_boundary 可能表示文件末尾/文档结束。
635
  # 这里为了避免把最后一行顶下去,可以先把其中的正文行都保留,再在最后插入 section_content。
636
  new_lines = []
637
  new_lines.extend(lines[:next_boundary]) # 保留从头到最后正文结束
 
806
  # 读取环境变量
807
  dotenv.load_dotenv()
808
  # md_path = preprocess_md("src/demo/latex_template/test copy.md", "src/demo/latex_template/test_preprocessed.md")
809
+ md_path = get_path('info', 'undefined', 'survey_undefined_preprocessed.md')
810
+ tex_path = get_path('info', 'undefined', 'template.tex')
811
  md_to_tex(md_path, tex_path, title="A Comprehensive Review of ADMM On Consensus Distributed Optimization")
812
  # insert_figures('src/static/data/info/undefined/outline.png',
813
  # 'src/demo/latex_template/template.tex',
src/demo/asg_loader.py CHANGED
@@ -2,9 +2,14 @@ import os
2
  import re
3
  import json
4
  import subprocess
 
 
 
5
  from langchain_community.document_loaders import UnstructuredMarkdownLoader
6
- from langchain_core.documents import Document
7
  import shutil
 
 
8
 
9
  class DocumentLoading:
10
  def convert_pdf_to_md(self, pdf_file, output_dir="output", method="auto"):
@@ -128,8 +133,8 @@ class DocumentLoading:
128
  for char in invalid_chars:
129
  title_new = title_new.replace(char, ' ')
130
 
131
- os.makedirs(f'./src/static/data/txt/{survey_id}', exist_ok=True)
132
- with open(f'./src/static/data/txt/{survey_id}/{title_new}.json', 'w', encoding='utf-8') as f:
133
  json.dump(extracted_data, f, ensure_ascii=False, indent=4)
134
  return extracted_data['introduction']
135
 
@@ -150,69 +155,73 @@ class DocumentLoading:
150
  for char in invalid_chars:
151
  title_new = title_new.replace(char, ' ')
152
 
153
- os.makedirs(f'./src/static/data/txt/{survey_id}', exist_ok=True)
154
- with open(f'./src/static/data/txt/{survey_id}/{title_new}.json', 'w', encoding='utf-8') as f:
155
  json.dump(extracted_data, f, ensure_ascii=False, indent=4)
156
  return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
157
 
158
-
159
  def load_pdf(self, pdf_file, survey_id, mode):
160
- os.makedirs(f'./src/static/data/md/{survey_id}', exist_ok=True)
161
- output_dir = f"./src/static/data/md/{survey_id}"
162
  base_name = os.path.splitext(os.path.basename(pdf_file))[0]
163
- target_dir = os.path.join(output_dir, base_name, "auto")
164
-
165
- # 1. Convert PDF to markdown if the folder doesn't exist
166
- self.convert_pdf_to_md(pdf_file, output_dir)
167
-
168
- # 2. Process the markdown file in the output directory
169
- md_file_path = os.path.join(target_dir, f"{base_name}.md")
170
- if not os.path.exists(md_file_path):
171
- raise FileNotFoundError(f"Markdown file {md_file_path} does not exist. Conversion might have failed.")
172
 
173
- if mode == "intro":
 
174
  return self.process_md_file(md_file_path, survey_id)
175
- elif mode == "full":
176
- return self.process_md_file_full(md_file_path, survey_id)
177
-
178
- # wrong, still being tested
179
  def load_pdf_new(self, pdf_dir, survey_id):
180
- os.makedirs(f'./src/static/data/md/{survey_id}', exist_ok=True)
181
- output_dir = f"./src/static/data/md/{survey_id}"
182
- self.convert_pdf_to_md_new(pdf_dir, output_dir)
183
- markdown_files = glob.glob(os.path.join(output_dir, "*", "auto", "*.md"))
184
- all_introductions = []
185
-
186
- for md_file_path in markdown_files:
187
- try:
188
- introduction = self.process_md_file(md_file_path, survey_id)
189
- all_introductions.append(introduction)
190
- except FileNotFoundError as e:
191
- print(f"Markdown file {md_file_path} does not exist. Conversion might have failed.")
192
-
193
- return all_introductions
194
 
 
 
 
195
 
 
 
 
 
 
 
 
 
 
196
 
197
  def parallel_load_pdfs(self, pdf_files, survey_id, max_workers=4):
 
198
  with ProcessPoolExecutor(max_workers=max_workers) as executor:
199
- # Submit tasks for parallel execution
200
- futures = [executor.submit(self.load_pdf, pdf, survey_id) for pdf in pdf_files]
201
-
202
- # Collect results
203
  for future in futures:
204
  try:
205
- result = future.result()
206
- print(f"Processed result: {result}")
207
- except Exception as e:
208
- print(f"Error processing PDF: {e}")
209
-
210
  def ensure_non_empty_introduction(self, introduction, full_text):
211
- """
212
- Ensure introduction is not empty. If empty, replace with full text.
213
- """
214
- if introduction == "N/A" or len(introduction.strip()) < 50:
215
- return full_text.strip()
216
  return introduction
217
 
218
  def extract_information_from_md_new(self, md_text):
@@ -240,17 +249,30 @@ class DocumentLoading:
240
 
241
  # Introduction extraction
242
  introduction_match = re.search(
243
- r'\n\n([1I][\.\- ]?\s*)?[Ii]\s*[nN]\s*[tT]\s*[rR]\s*[oO]\s*[dD]\s*[uU]\s*[cC]\s*[tT]\s*[iI]\s*[oO]\s*[nN][\.\- ]?\s*\n\n(.*?)',
244
- md_text, re.DOTALL
 
 
245
  )
246
  introduction = introduction_match.group(2).strip() if introduction_match else "N/A"
247
 
248
- # Ensure introduction is not empty
249
- introduction = self.ensure_non_empty_introduction(introduction, md_text)
250
 
251
- return {
252
  "title": title,
253
  "authors": authors,
254
  "abstract": abstract,
255
- "introduction": introduction
256
- }
 
 
 
2
  import re
3
  import json
4
  import subprocess
5
+ import glob
6
+ from pathlib import Path
7
+ from concurrent.futures import ProcessPoolExecutor
8
  from langchain_community.document_loaders import UnstructuredMarkdownLoader
9
+ from langchain.schema import Document
10
  import shutil
11
+ import tempfile
12
+ from .path_utils import get_path
13
 
14
  class DocumentLoading:
15
  def convert_pdf_to_md(self, pdf_file, output_dir="output", method="auto"):
 
133
  for char in invalid_chars:
134
  title_new = title_new.replace(char, ' ')
135
 
136
+ os.makedirs(get_path('txt', survey_id), exist_ok=True)
137
+ with open(get_path('txt', survey_id, f'{title_new}.json'), 'w', encoding='utf-8') as f:
138
  json.dump(extracted_data, f, ensure_ascii=False, indent=4)
139
  return extracted_data['introduction']
140
 
 
155
  for char in invalid_chars:
156
  title_new = title_new.replace(char, ' ')
157
 
158
+ os.makedirs(get_path('txt', survey_id), exist_ok=True)
159
+ with open(get_path('txt', survey_id, f'{title_new}.json'), 'w', encoding='utf-8') as f:
160
  json.dump(extracted_data, f, ensure_ascii=False, indent=4)
161
  return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
162
 
 
163
  def load_pdf(self, pdf_file, survey_id, mode):
 
 
164
  base_name = os.path.splitext(os.path.basename(pdf_file))[0]
165
+ target_dir = os.path.join(get_path('md', survey_id), base_name)
166
+ md_file_path = os.path.join(target_dir, mode, f"{base_name}.md")
167
+ print("The md file path is: ", md_file_path)
168
 
169
+ if os.path.exists(md_file_path):
170
+ print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
171
  return self.process_md_file(md_file_path, survey_id)
172
+
173
+ command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m", mode]
174
+ try:
175
+ subprocess.run(command, check=True)
176
+ # 检查是否生成了 Markdown 文件
177
+ if not os.path.exists(md_file_path):
178
+ print(f"Conversion failed: Markdown file not found at {md_file_path}. Cleaning up folder...")
179
+ shutil.rmtree(target_dir) # 删除生成的文件夹
180
+ return None
181
+ else:
182
+ print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
183
+ return self.process_md_file(md_file_path, survey_id)
184
+ except subprocess.CalledProcessError as e:
185
+ print(f"An error occurred during conversion: {e}")
186
+ # 如果发生错误且文件夹已生成,则删除文件夹
187
+ if os.path.exists(target_dir):
188
+ print(f"Cleaning up incomplete folder: {target_dir}")
189
+ shutil.rmtree(target_dir)
190
+ return None
191
+
192
  def load_pdf_new(self, pdf_dir, survey_id):
193
+ pdf_files = glob.glob(os.path.join(pdf_dir, "*.pdf"))
194
 
195
+ for pdf_file in pdf_files:
196
+ base_name = os.path.splitext(os.path.basename(pdf_file))[0]
197
+ target_dir = os.path.join(get_path('md', survey_id), base_name)
198
 
199
+ if os.path.exists(target_dir):
200
+ print(f"Folder for {pdf_file} already exists in {get_path('md', survey_id)}. Skipping conversion.")
201
+ else:
202
+ command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m", "auto"]
203
+ try:
204
+ subprocess.run(command, check=True)
205
+ print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
206
+ except subprocess.CalledProcessError as e:
207
+ print(f"An error occurred: {e}")
208
 
209
  def parallel_load_pdfs(self, pdf_files, survey_id, max_workers=4):
210
+ # Create a process pool to run the conversion in parallel
211
  with ProcessPoolExecutor(max_workers=max_workers) as executor:
212
+ # Submit each PDF file to the process pool for conversion
213
+ futures = [executor.submit(self.load_pdf, pdf, survey_id, "auto") for pdf in pdf_files]
214
+
215
+ # Optionally, you can monitor the status of each future as they complete
216
  for future in futures:
217
  try:
218
+ future.result() # This will raise any exceptions that occurred during the processing
219
+ except Exception as exc:
220
+ print(f"An error occurred during processing: {exc}")
221
+
 
222
  def ensure_non_empty_introduction(self, introduction, full_text):
223
+ if len(introduction) < 50:
224
+ return full_text[:1000]
 
 
 
225
  return introduction
226
 
227
  def extract_information_from_md_new(self, md_text):
 
249
 
250
  # Introduction extraction
251
  introduction_match = re.search(
252
+ r'\n\n([1I][\.\- ]?\s*)?[Ii]\s*[nN]\s*[tT]\s*[rR]\s*[oO]\s*[dD]\s*[uU]\s*[cC]\s*[tT]\s*[iI]\s*[oO]\s*[nN][\.\- ]?\s*\n\n(.*?)'
253
+ r'(?=\n\n(?:([2I][I]|\s*2)[^\n]*?\n\n|\n\n(?:[2I][I][^\n]*?\n\n)))',
254
+ md_text,
255
+ re.DOTALL
256
  )
257
  introduction = introduction_match.group(2).strip() if introduction_match else "N/A"
258
 
259
+ # Main content extraction
260
+ main_content_match = re.search(
261
+ r'(.*?)(\n\n([3I][\.\- ]?\s*)?[Rr][Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss][^\n]*\n\n|\Z)',
262
+ md_text,
263
+ re.DOTALL
264
+ )
265
+
266
+ if main_content_match:
267
+ main_content = main_content_match.group(1).strip()
268
+ else:
269
+ main_content = "N/A"
270
 
271
+ extracted_data = {
272
  "title": title,
273
  "authors": authors,
274
  "abstract": abstract,
275
+ "introduction": introduction,
276
+ "main_content": main_content
277
+ }
278
+ return extracted_data
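
For orientation, a minimal sketch of the directory layout the rewritten load_pdf relies on after a successful MinerU run. This is illustrative only, not part of the commit; the survey id "demo123", the file name "example_paper", and the local base path are assumed placeholders.

import os

md_root = "src/static/data/md/demo123"        # what get_path('md', survey_id) resolves to locally
base_name = "example_paper"                    # os.path.splitext(os.path.basename(pdf_file))[0]
mode = "auto"

target_dir = os.path.join(md_root, base_name)  # folder MinerU is expected to create
md_file_path = os.path.join(target_dir, mode, f"{base_name}.md")

# load_pdf skips conversion when md_file_path already exists; otherwise it runs
#   mineru -p <pdf_file> -o <md_root> -m <mode>
# and removes target_dir again if the expected .md file never appears.
print(md_file_path)  # src/static/data/md/demo123/example_paper/auto/example_paper.md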
src/demo/asg_outline.py CHANGED
@@ -10,6 +10,7 @@ from .asg_conclusion import ConclusionGenerator
10
  from .asg_retriever import *
11
  import pandas as df
12
  from .references import generate_references
 
13
 
14
 
15
  class OutlineGenerator():
@@ -260,7 +261,7 @@ class OutlineGenerator():
260
  return messages, clean_text
261
 
262
  def parseOutline(survey_id):
263
- file_path = f'./src/static/data/txt/{survey_id}/outline.json'
264
  try:
265
  with open(file_path, 'r', encoding='utf-8') as file:
266
  data = json.load(file)
@@ -286,7 +287,7 @@ def parseOutline(survey_id):
286
  print("Failed to extract a valid list string from the outline content.")
287
  return []
288
 
289
- # 检查提取结果是否为“列表的列表”格式(应该以 "[[" 开头)
290
  fixed_str = response_extracted.strip()
291
  if not fixed_str.startswith("[["):
292
  # 如果不是,则去掉原有的首尾括号,再重新包装:[[ ... ]]
@@ -531,45 +532,45 @@ def generateOutlineHTML_qwen(survey_id):
531
  }
532
 
533
  // 确认编辑并提交数据
534
- function confirmOutline() {
535
- const outlineData = []; // 用于存储提交到后端的数据
536
-
537
- // 遍历所有的可编辑输入框
538
- document.querySelectorAll("#edit-outline .list-group-item").forEach((item) => {
539
- const level = item.classList.contains("level-1") ? 1 :
540
- item.classList.contains("level-2") ? 2 : 3; // 获取层级
541
- const content = item.querySelector("input").value.trim(); // 获取编辑框的值
542
-
543
- // 将数据转换为数组格式 [level, content]
544
- outlineData.push([level, content]);
545
- });
546
-
547
- console.log("Submitting to backend:", outlineData); // 打印提交数据以供调试
548
-
549
- // 使用 AJAX 提交数据到后端
550
- const csrftoken = getCookie("csrftoken"); // 获取 CSRF token
551
- fetch("/save_outline/", {
552
- method: "POST",
553
- headers: {
554
- "Content-Type": "application/json",
555
- "X-CSRFToken": csrftoken, // Django 的 CSRF 令牌
556
- },
557
- body: JSON.stringify({ outline: outlineData }) // 将数据转换为 JSON 字符串
558
- })
559
- .then((response) => response.json())
560
- .then((data) => {
561
- if (data.status === "success") {
562
- $('#sections_').html(data.html);
563
- alert("Outline updated successfully!");
564
- } else {
565
- alert("Error updating outline: " + data.message);
566
- }
567
- })
568
- .catch((error) => {
569
- console.error("Error:", error);
570
- alert("Error updating outline. Please check the console for details.");
571
- });
572
- }
573
  </script>
574
  '''
575
  return html
@@ -825,7 +826,7 @@ def generateSurvey(survey_id, title, collection_list, pipeline):
825
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
826
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
827
 
828
- output_path = f'./src/static/data/txt/{survey_id}/generated_result.json'
829
  with open(output_path, 'w', encoding='utf-8') as f:
830
  json.dump(temp, f, ensure_ascii=False, indent=4)
831
  print(f"Survey has been saved to {output_path}.")
@@ -910,7 +911,8 @@ def generateSurvey_qwen(survey_id, title, collection_list, pipeline):
910
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
911
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
912
  temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
913
- output_path = f'./src/static/data/txt/{survey_id}/generated_result.json'
 
914
  with open(output_path, 'w', encoding='utf-8') as f:
915
  json.dump(temp, f, ensure_ascii=False, indent=4)
916
  print(f"Survey has been saved to {output_path}.")
@@ -962,7 +964,7 @@ def generateSurvey_qwen_new(survey_id, title, collection_list, pipeline, citatio
962
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
963
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
964
  temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
965
- output_path = f'./src/static/data/txt/{survey_id}/generated_result.json'
966
  with open(output_path, 'w', encoding='utf-8') as f:
967
  json.dump(temp, f, ensure_ascii=False, indent=4)
968
  print(f"Survey has been saved to {output_path}.")
@@ -994,7 +996,7 @@ if __name__ == '__main__':
994
  Many paradigms have been proposed to asses informativeness of data samples for active learning. One of the popular approaches is selecting the most uncertain data sample, i.e the data sample in which current classifier is least confident. Some other approaches are selecting the sample which yields a model with minimum risk or the data sample which yields fastest convergence in gradient based methods.//
995
  An active under-sampling approach is presented in this paper to change the data distribution of training datasets, and improve the classification accuracy of minority classes while maintaining overall classification performance.//
996
  In this paper, we propose an uncertainty-based active learning algorithm which requires only samples of one class and a set of unlabeled data in order to operate.//
997
- The principal contribution of our work is twofold: First, we use Bayes rule and density estimation to avoid the need to have a model of all classes for computing the uncertainty measure.//
998
  This technique reduces the number of input parameters of the problem. At the rest of this paper, we first review recent related works in the fields of active learning and active one-class learning (section II).//
999
  The classifier predicts that all the samples are non-fraud, it will have a quite high accuracy. However, for problems like fraud detection, minority class classification accuracy is more critical.//
1000
  The algorithm used and the features selected are always the key points at design time, and many experiments are needed to select the final algorithm and the best suited feature set.//
 
10
  from .asg_retriever import *
11
  import pandas as df
12
  from .references import generate_references
13
+ from .path_utils import get_path
14
 
15
 
16
  class OutlineGenerator():
 
261
  return messages, clean_text
262
 
263
  def parseOutline(survey_id):
264
+ file_path = get_path('txt', survey_id, 'outline.json')
265
  try:
266
  with open(file_path, 'r', encoding='utf-8') as file:
267
  data = json.load(file)
 
287
  print("Failed to extract a valid list string from the outline content.")
288
  return []
289
 
290
+ # 检查提取结果是否为"列表的列表"格式(应该以 "[[" 开头)
291
  fixed_str = response_extracted.strip()
292
  if not fixed_str.startswith("[["):
293
  # 如果不是,则去掉原有的首尾括号,再重新包装:[[ ... ]]
 
532
  }
533
 
534
  // 确认编辑并提交数据
535
+ function confirmOutline() {
536
+ const outlineData = []; // 用于存储提交到后端的数据
537
+
538
+ // 遍历所有的可编辑输入框
539
+ document.querySelectorAll("#edit-outline .list-group-item").forEach((item) => {
540
+ const level = item.classList.contains("level-1") ? 1 :
541
+ item.classList.contains("level-2") ? 2 : 3; // 获取层级
542
+ const content = item.querySelector("input").value.trim(); // 获取编辑框的值
543
+
544
+ // 将数据转换为数组格式 [level, content]
545
+ outlineData.push([level, content]);
546
+ });
547
+
548
+ console.log("Submitting to backend:", outlineData); // 打印提交数据以供调试
549
+
550
+ // 使用 AJAX 提交数据到后端
551
+ const csrftoken = getCookie("csrftoken"); // 获取 CSRF token
552
+ fetch("/save_outline/", {
553
+ method: "POST",
554
+ headers: {
555
+ "Content-Type": "application/json",
556
+ "X-CSRFToken": csrftoken, // Django 的 CSRF 令牌
557
+ },
558
+ body: JSON.stringify({ outline: outlineData }) // 将数据转换为 JSON 字符串
559
+ })
560
+ .then((response) => response.json())
561
+ .then((data) => {
562
+ if (data.status === "success") {
563
+ $('#sections_').html(data.html);
564
+ alert("Outline updated successfully!");
565
+ } else {
566
+ alert("Error updating outline: " + data.message);
567
+ }
568
+ })
569
+ .catch((error) => {
570
+ console.error("Error:", error);
571
+ alert("Error updating outline. Please check the console for details.");
572
+ });
573
+ }
574
  </script>
575
  '''
576
  return html
 
826
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
827
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
828
 
829
+ output_path = get_path('txt', survey_id, 'generated_result.json')
830
  with open(output_path, 'w', encoding='utf-8') as f:
831
  json.dump(temp, f, ensure_ascii=False, indent=4)
832
  print(f"Survey has been saved to {output_path}.")
 
911
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
912
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
913
  temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
914
+ # references = generate_references_dir(get_path('txt', survey_id))
915
+ output_path = get_path('txt', survey_id, 'generated_result.json')
916
  with open(output_path, 'w', encoding='utf-8') as f:
917
  json.dump(temp, f, ensure_ascii=False, indent=4)
918
  print(f"Survey has been saved to {output_path}.")
 
964
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
965
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
966
  temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
967
+ output_path = get_path('txt', survey_id, 'generated_result.json')
968
  with open(output_path, 'w', encoding='utf-8') as f:
969
  json.dump(temp, f, ensure_ascii=False, indent=4)
970
  print(f"Survey has been saved to {output_path}.")
 
996
  Many paradigms have been proposed to asses informativeness of data samples for active learning. One of the popular approaches is selecting the most uncertain data sample, i.e the data sample in which current classifier is least confident. Some other approaches are selecting the sample which yields a model with minimum risk or the data sample which yields fastest convergence in gradient based methods.//
997
  An active under-sampling approach is presented in this paper to change the data distribution of training datasets, and improve the classification accuracy of minority classes while maintaining overall classification performance.//
998
  In this paper, we propose an uncertainty-based active learning algorithm which requires only samples of one class and a set of unlabeled data in order to operate.//
999
+ The principal contribution of our work is twofold: First, we use Bayes' rule and density estimation to avoid the need to have a model of all classes for computing the uncertainty measure.//
1000
  This technique reduces the number of input parameters of the problem. At the rest of this paper, we first review recent related works in the fields of active learning and active one-class learning (section II).//
1001
  The classifier predicts that all the samples are non-fraud, it will have a quite high accuracy. However, for problems like fraud detection, minority class classification accuracy is more critical.//
1002
  The algorithm used and the features selected are always the key points at design time, and many experiments are needed to select the final algorithm and the best suited feature set.//
src/demo/asg_retriever.py CHANGED
@@ -8,6 +8,7 @@ from .asg_splitter import TextSplitting
8
  from langchain_huggingface import HuggingFaceEmbeddings
9
  import time
10
  import concurrent.futures
 
11
 
12
  class Retriever:
13
  client = None
@@ -355,7 +356,7 @@ def query_multiple_collections(collection_names: list[str], query_list: list[str
355
  results[collection_name] = future.result()
356
 
357
  # Automatically save the results to a JSON file
358
- file_path = f'./src/static/data/info/{survey_id}/retrieved_context.json'
359
  with open(file_path, 'w', encoding='utf-8') as f:
360
  json.dump(results, f, ensure_ascii=False, indent=4)
361
 
 
8
  from langchain_huggingface import HuggingFaceEmbeddings
9
  import time
10
  import concurrent.futures
11
+ from .path_utils import get_path
12
 
13
  class Retriever:
14
  client = None
 
356
  results[collection_name] = future.result()
357
 
358
  # Automatically save the results to a JSON file
359
+ file_path = get_path('info', survey_id, 'retrieved_context.json')
360
  with open(file_path, 'w', encoding='utf-8') as f:
361
  json.dump(results, f, ensure_ascii=False, indent=4)
362
 
src/demo/category_and_tsne.py CHANGED
@@ -1,14 +1,20 @@
1
  from sklearn.metrics import silhouette_score
2
 
3
  import numpy as np
 
4
  import matplotlib.pyplot as plt
5
  import seaborn as sns
6
- import matplotlib.pyplot as plt
7
  from sklearn.manifold import TSNE
8
  from sklearn.cluster import AgglomerativeClustering
9
- import json
10
 
11
- IMG_PATH = './src/static/img/'
12
 
13
  plt.switch_backend('agg')
14
  device = 0
@@ -133,46 +139,6 @@ class ClusteringWithTopic:
133
  print(f"Best n_topics={self.best_n_topics}, Best silhouette_score={self.best_score}")
134
  return self.best_labels, self.best_topic_model, self.best_n_topics
135
 
136
- def clustering(df, n_cluster, survey_id):
137
- text = df['retrieval_result'].astype(str)
138
- clustering = ClusteringWithTopic(text, n_cluster)
139
- df['label'] = clustering.fit_and_get_labels(text)
140
-
141
- print("The clustering result is: ")
142
- for col in df.columns:
143
- print(f"{col}: {df.iloc[0][col]}")
144
-
145
- # Save topic model information as JSON
146
- topic_json = clustering.topic_model.get_topic_info().to_json()
147
- with open(f'./src/static/data/info/{survey_id}/topic.json', 'w', encoding="utf-8") as file:
148
- file.write(topic_json)
149
-
150
- # Create a dictionary from 'ref_title' and 'retrieval_result' columns
151
- description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
152
-
153
- # Save the dictionary to description.json
154
- with open(f'./src/static/data/info/{survey_id}/description.json', 'w', encoding="utf-8") as file:
155
- json.dump(description_dict, file, ensure_ascii=False, indent=4)
156
- # df['top_n_words'] = clustering.topic_model.get_topic_info()['Representation'].tolist()
157
- # df['topic_word'] = clustering.topic_model.get_topic_info()['KeyBERT'].tolist()
158
-
159
-
160
- X = np.array(clustering.embeddings)
161
- perplexity = 10
162
- if X.shape[0] <= perplexity:
163
- perplexity = max(1, X.shape[0] // 2)
164
-
165
- tsne = TSNE(n_components=2, init='pca', perplexity=perplexity, random_state=42)
166
- X_tsne = tsne.fit_transform(X)
167
- colors = scatter(X_tsne, df['label'])
168
-
169
- plt.savefig(IMG_PATH + 'tsne_' + survey_id + '.png', dpi=800, transparent=True)
170
-
171
- plt.close()
172
- output_tsv_filename = "./src/static/data/tsv/" + survey_id + '.tsv'
173
- df.to_csv(output_tsv_filename, sep='\t')
174
- return df, colors
175
-
176
  def clustering(df, n_topics_list, survey_id):
177
  text = df['retrieval_result'].astype(str)
178
  clustering = ClusteringWithTopic(text, n_topics_list)
@@ -184,12 +150,12 @@ def clustering(df, n_topics_list, survey_id):
184
 
185
  # 保存 topic model 信息
186
  topic_json = topic_model.get_topic_info().to_json()
187
- with open(f'./src/static/data/info/{survey_id}/topic.json', 'w', encoding="utf-8") as file:
188
  file.write(topic_json)
189
 
190
  # 创建描述信息
191
  description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
192
- with open(f'./src/static/data/info/{survey_id}/description.json', 'w', encoding="utf-8") as file:
193
  json.dump(description_dict, file, ensure_ascii=False, indent=4)
194
 
195
  # t-SNE 降维可视化
@@ -201,10 +167,10 @@ def clustering(df, n_topics_list, survey_id):
201
 
202
  colors = scatter(X_tsne, df['label']) # 计算颜色
203
 
204
- plt.savefig(IMG_PATH + 'tsne_' + survey_id + '.png', dpi=800, transparent=True)
205
 
206
  plt.close()
207
- output_tsv_filename = "./src/static/data/tsv/" + survey_id + '.tsv'
208
  df.to_csv(output_tsv_filename, sep='\t')
209
  return df, colors, best_n_topics
210
 
 
1
  from sklearn.metrics import silhouette_score
2
 
3
  import numpy as np
4
+ import pandas as pd
5
  import matplotlib.pyplot as plt
6
  import seaborn as sns
7
+ import json
8
  from sklearn.manifold import TSNE
9
  from sklearn.cluster import AgglomerativeClustering
 
10
 
11
+ from sentence_transformers import SentenceTransformer
12
+ from bertopic import BERTopic
13
+ from bertopic.representation import KeyBERTInspired
14
+ from sklearn.feature_extraction.text import CountVectorizer
15
+ from bertopic.vectorizers import ClassTfidfTransformer
16
+ from umap import UMAP
17
+ from .path_utils import get_path
18
 
19
  plt.switch_backend('agg')
20
  device = 0
 
139
  print(f"Best n_topics={self.best_n_topics}, Best silhouette_score={self.best_score}")
140
  return self.best_labels, self.best_topic_model, self.best_n_topics
141
 
142
  def clustering(df, n_topics_list, survey_id):
143
  text = df['retrieval_result'].astype(str)
144
  clustering = ClusteringWithTopic(text, n_topics_list)
 
150
 
151
  # 保存 topic model 信息
152
  topic_json = topic_model.get_topic_info().to_json()
153
+ with open(get_path('info', survey_id, 'topic.json'), 'w', encoding="utf-8") as file:
154
  file.write(topic_json)
155
 
156
  # 创建描述信息
157
  description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
158
+ with open(get_path('info', survey_id, 'description.json'), 'w', encoding="utf-8") as file:
159
  json.dump(description_dict, file, ensure_ascii=False, indent=4)
160
 
161
  # t-SNE 降维可视化
 
167
 
168
  colors = scatter(X_tsne, df['label']) # 计算颜色
169
 
170
+ plt.savefig(get_path('img', filename='tsne_' + survey_id + '.png'), dpi=800, transparent=True)
171
 
172
  plt.close()
173
+ output_tsv_filename = get_path('tsv', survey_id + '.tsv')
174
  df.to_csv(output_tsv_filename, sep='\t')
175
  return df, colors, best_n_topics
176
 
src/demo/path_utils.py ADDED
@@ -0,0 +1,64 @@
1
+ import os
2
+ import tempfile
3
+
4
+ # 检查是否在 Hugging Face Spaces 环境中
5
+ def get_data_paths():
6
+ # 如果在 Hugging Face Spaces 中,使用临时目录
7
+ if os.environ.get('SPACE_ID') or os.environ.get('HF_SPACE_ID'):
8
+ # 使用临时目录
9
+ temp_dir = tempfile.mkdtemp()
10
+ return {
11
+ 'DATA_PATH': os.path.join(temp_dir, 'pdf/'),
12
+ 'TXT_PATH': os.path.join(temp_dir, 'txt/'),
13
+ 'TSV_PATH': os.path.join(temp_dir, 'tsv/'),
14
+ 'MD_PATH': os.path.join(temp_dir, 'md/'),
15
+ 'INFO_PATH': os.path.join(temp_dir, 'info/'),
16
+ 'IMG_PATH': os.path.join(temp_dir, 'img/'),
17
+ 'RESULTS_PATH': os.path.join(temp_dir, 'results/')
18
+ }
19
+ else:
20
+ # 本地环境使用原来的路径
21
+ return {
22
+ 'DATA_PATH': './src/static/data/pdf/',
23
+ 'TXT_PATH': './src/static/data/txt/',
24
+ 'TSV_PATH': './src/static/data/tsv/',
25
+ 'MD_PATH': './src/static/data/md/',
26
+ 'INFO_PATH': './src/static/data/info/',
27
+ 'IMG_PATH': './src/static/img/',
28
+ 'RESULTS_PATH': './src/static/data/results/'
29
+ }
30
+
31
+ # 全局路径管理函数
32
+ def get_path(path_type, survey_id=None, filename=None):
33
+ """
34
+ 获取动态路径
35
+ path_type: 'pdf', 'txt', 'tsv', 'md', 'info', 'img', 'results'
36
+ survey_id: 可选的调查ID
37
+ filename: 可选的文件名
38
+ """
39
+ paths_config = get_data_paths()
40
+
41
+ if path_type == 'pdf':
42
+ base_path = paths_config['DATA_PATH']
43
+ elif path_type == 'txt':
44
+ base_path = paths_config['TXT_PATH']
45
+ elif path_type == 'tsv':
46
+ base_path = paths_config['TSV_PATH']
47
+ elif path_type == 'md':
48
+ base_path = paths_config['MD_PATH']
49
+ elif path_type == 'info':
50
+ base_path = paths_config['INFO_PATH']
51
+ elif path_type == 'img':
52
+ base_path = paths_config['IMG_PATH']
53
+ elif path_type == 'results':
54
+ base_path = paths_config['RESULTS_PATH']
55
+ else:
56
+ raise ValueError(f"Unknown path type: {path_type}")
57
+
58
+ if survey_id:
59
+ base_path = os.path.join(base_path, str(survey_id))
60
+
61
+ if filename:
62
+ return os.path.join(base_path, filename)
63
+
64
+ return base_path
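
A quick usage sketch of the new get_path helper (illustrative only, not part of the commit; the import path and the survey id "demo123" are assumed):

from src.demo.path_utils import get_path  # adjust to your package layout

txt_root = get_path('txt')                             # e.g. './src/static/data/txt/' locally
survey_dir = get_path('txt', survey_id='demo123')      # './src/static/data/txt/demo123'
outline_json = get_path('txt', 'demo123', 'outline.json')  # full file path
tsne_png = get_path('img', filename='tsne_demo123.png')    # filename without a survey id
print(txt_root, survey_dir, outline_json, tsne_png)

Note that in the Spaces branch get_data_paths() calls tempfile.mkdtemp() on every invocation, so successive get_path calls there appear to resolve to different temporary directories rather than one shared location.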
src/demo/survey_generation_pipeline/asg_loader.py CHANGED
@@ -6,9 +6,14 @@ import os
6
  import re
7
  import json
8
  import subprocess
 
 
 
9
  from langchain_community.document_loaders import UnstructuredMarkdownLoader
10
- from langchain_core.documents import Document
11
  import shutil
 
 
12
 
13
  # load spaCy model
14
  # nlp = spacy.load("en_core_web_sm")
@@ -130,7 +135,9 @@ class DocumentLoading:
130
  }
131
  return extracted_data
132
 
133
- def process_md_file(self, md_file_path, survey_id, txt_path='./src/static/data/txt/'):
 
 
134
  loader = UnstructuredMarkdownLoader(md_file_path)
135
  data = loader.load()
136
  assert len(data) == 1, "Expected exactly one document in the markdown file."
@@ -146,15 +153,15 @@ class DocumentLoading:
146
  invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
147
  for char in invalid_chars:
148
  title_new = title_new.replace(char, ' ')
149
- # print("============================")
150
- # print(title_new)
151
- os.makedirs(f'{txt_path}/{survey_id}', exist_ok=True)
152
- with open(f'{txt_path}/{survey_id}/{title_new}.json', 'w', encoding='utf-8') as f:
153
  json.dump(extracted_data, f, ensure_ascii=False, indent=4)
154
- # print(extracted_data)
155
  return extracted_data['introduction']
156
 
157
- def process_md_file_full(self, md_file_path, survey_id, txt_path='./src/static/data/txt/'):
 
 
158
  loader = UnstructuredMarkdownLoader(md_file_path)
159
  data = loader.load()
160
  assert len(data) == 1, "Expected exactly one document in the markdown file."
@@ -170,18 +177,16 @@ class DocumentLoading:
170
  invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
171
  for char in invalid_chars:
172
  title_new = title_new.replace(char, ' ')
173
- # print("============================")
174
- # print(title_new)
175
- os.makedirs(f'{txt_path}/{survey_id}', exist_ok=True)
176
- with open(f'{txt_path}/{survey_id}/{title_new}.json', 'w', encoding='utf-8') as f:
177
  json.dump(extracted_data, f, ensure_ascii=False, indent=4)
178
- # print(extracted_data)
179
  return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
180
 
181
 
182
  def load_pdf(self, pdf_file, survey_id, mode):
183
- os.makedirs(f'./src/static/data/md/{survey_id}', exist_ok=True)
184
- output_dir = f"./src/static/data/md/{survey_id}"
185
  base_name = os.path.splitext(os.path.basename(pdf_file))[0]
186
  target_dir = os.path.join(output_dir, base_name, "auto")
187
 
@@ -200,8 +205,8 @@ class DocumentLoading:
200
 
201
  # wrong, still being tested
202
  def load_pdf_new(self, pdf_dir, survey_id):
203
- os.makedirs(f'./src/static/data/md/{survey_id}', exist_ok=True)
204
- output_dir = f"./src/static/data/md/{survey_id}"
205
  self.convert_pdf_to_md_new(pdf_dir, output_dir)
206
  markdown_files = glob.glob(os.path.join(output_dir, "*", "auto", "*.md"))
207
  all_introductions = []
@@ -416,7 +421,7 @@ class DocumentLoading:
416
  # # clear blocks that are likely annotations
417
  # if re.search(r'\d{4}\s\d+\s\w+\sConference\s.*?\|\s.*?\|\sDOI:.*?\s\|\s\w+:\s.*?\n', block, flags=re.DOTALL) or \
418
  # re.search(r'http\S+', block) or \
419
- # re.search(r'\d+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+', block, flags=re.DOTALL):
420
  # continue
421
  # cleaned_blocks.append(block)
422
  # return cleaned_blocks
 
6
  import re
7
  import json
8
  import subprocess
9
+ import glob
10
+ from pathlib import Path
11
+ from concurrent.futures import ProcessPoolExecutor
12
  from langchain_community.document_loaders import UnstructuredMarkdownLoader
13
+ from langchain.schema import Document
14
  import shutil
15
+ import tempfile
16
+ from ..path_utils import get_path
17
 
18
  # load spaCy model
19
  # nlp = spacy.load("en_core_web_sm")
 
135
  }
136
  return extracted_data
137
 
138
+ def process_md_file(self, md_file_path, survey_id, txt_path=None):
139
+ if txt_path is None:
140
+ txt_path = get_path('txt')
141
  loader = UnstructuredMarkdownLoader(md_file_path)
142
  data = loader.load()
143
  assert len(data) == 1, "Expected exactly one document in the markdown file."
 
153
  invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
154
  for char in invalid_chars:
155
  title_new = title_new.replace(char, ' ')
156
+
157
+ os.makedirs(get_path('txt', survey_id), exist_ok=True)
158
+ with open(get_path('txt', survey_id, f'{title_new}.json'), 'w', encoding='utf-8') as f:
 
159
  json.dump(extracted_data, f, ensure_ascii=False, indent=4)
 
160
  return extracted_data['introduction']
161
 
162
+ def process_md_file_full(self, md_file_path, survey_id, txt_path=None):
163
+ if txt_path is None:
164
+ txt_path = get_path('txt')
165
  loader = UnstructuredMarkdownLoader(md_file_path)
166
  data = loader.load()
167
  assert len(data) == 1, "Expected exactly one document in the markdown file."
 
177
  invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
178
  for char in invalid_chars:
179
  title_new = title_new.replace(char, ' ')
180
+
181
+ os.makedirs(get_path('txt', survey_id), exist_ok=True)
182
+ with open(get_path('txt', survey_id, f'{title_new}.json'), 'w', encoding='utf-8') as f:
 
183
  json.dump(extracted_data, f, ensure_ascii=False, indent=4)
 
184
  return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
185
 
186
 
187
  def load_pdf(self, pdf_file, survey_id, mode):
188
+ os.makedirs(get_path('md', survey_id), exist_ok=True)
189
+ output_dir = get_path('md', survey_id)
190
  base_name = os.path.splitext(os.path.basename(pdf_file))[0]
191
  target_dir = os.path.join(output_dir, base_name, "auto")
192
 
 
205
 
206
  # wrong, still being tested
207
  def load_pdf_new(self, pdf_dir, survey_id):
208
+ os.makedirs(get_path('md', survey_id), exist_ok=True)
209
+ output_dir = get_path('md', survey_id)
210
  self.convert_pdf_to_md_new(pdf_dir, output_dir)
211
  markdown_files = glob.glob(os.path.join(output_dir, "*", "auto", "*.md"))
212
  all_introductions = []
 
421
  # # clear blocks that are likely annotations
422
  # if re.search(r'\d{4}\s\d+\s\w+\sConference\s.*?\|\s.*?\|\sDOI:.*?\s\|\s\w+:\s.*?\n', block, flags=re.DOTALL) or \
423
  # re.search(r'http\S+', block) or \
424
+ # re.search(r'\d+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+', block, flags=re.DOTALL):
425
  # continue
426
  # cleaned_blocks.append(block)
427
  # return cleaned_blocks
src/demo/survey_generation_pipeline/asg_outline.py CHANGED
@@ -9,7 +9,8 @@ from asg_abstract import AbstractGenerator
9
  from asg_conclusion import ConclusionGenerator
10
  from asg_retriever import *
11
  import pandas as df
12
- from references import generate_references
 
13
 
14
 
15
  class OutlineGenerator():
@@ -259,8 +260,10 @@ class OutlineGenerator():
259
  clean_text = re.sub(r'\s+', ' ', text).strip()
260
  return messages, clean_text
261
 
262
- def parseOutline(survey_id, info_path = './src/static/data/txt'):
263
- file_path = f'{info_path}/{survey_id}/outline.json'
 
 
264
  try:
265
  with open(file_path, 'r', encoding='utf-8') as file:
266
  data = json.load(file)
@@ -286,7 +289,7 @@ def parseOutline(survey_id, info_path = './src/static/data/txt'):
286
  print("Failed to extract a valid list string from the outline content.")
287
  return []
288
 
289
- # 检查提取结果是否为“列表的列表”格式(应该以 "[[" 开头)
290
  fixed_str = response_extracted.strip()
291
  if not fixed_str.startswith("[["):
292
  # 如果不是,则去掉原有的首尾括号,再重新包装:[[ ... ]]
@@ -825,7 +828,7 @@ def generateSurvey(survey_id, title, collection_list, pipeline):
825
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
826
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
827
 
828
- output_path = f'./src/static/data/txt/{survey_id}/generated_result.json'
829
  with open(output_path, 'w', encoding='utf-8') as f:
830
  json.dump(temp, f, ensure_ascii=False, indent=4)
831
  print(f"Survey has been saved to {output_path}.")
@@ -910,25 +913,21 @@ def generateSurvey_qwen(survey_id, title, collection_list, pipeline):
910
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
911
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
912
  temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
913
- output_path = f'./src/static/data/txt/{survey_id}/generated_result.json'
 
914
  with open(output_path, 'w', encoding='utf-8') as f:
915
  json.dump(temp, f, ensure_ascii=False, indent=4)
916
  print(f"Survey has been saved to {output_path}.")
917
  return
918
 
919
  # wza
920
- def generateSurvey_qwen_new(survey_id, title, collection_list, pipeline, citation_data_list, txt_path = "./src/static/data/txt"):
921
- outline = str(parseOutline(survey_id, info_path ='./info'))
 
 
922
  client = getQwenClient()
923
  context_list = generate_context_list(outline, collection_list)
924
 
925
- # print("!!!!!!!!")
926
- # print(context_list)
927
- # print("2025")
928
-
929
- # 不再重复查询citation数据,而是直接使用传入的citation_data_list
930
- # citation_data_list来自get_survey_id传入的Global_citation_data
931
-
932
  temp = {
933
  "survey_id": survey_id,
934
  "outline": outline,
@@ -969,7 +968,7 @@ def generateSurvey_qwen_new(survey_id, title, collection_list, pipeline, citatio
969
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
970
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
971
  temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
972
- output_path = f'{txt_path}/{survey_id}/generated_result.json'
973
  with open(output_path, 'w', encoding='utf-8') as f:
974
  json.dump(temp, f, ensure_ascii=False, indent=4)
975
  print(f"Survey has been saved to {output_path}.")
@@ -1002,7 +1001,7 @@ if __name__ == '__main__':
1002
  Many paradigms have been proposed to asses informativeness of data samples for active learning. One of the popular approaches is selecting the most uncertain data sample, i.e the data sample in which current classifier is least confident. Some other approaches are selecting the sample which yields a model with minimum risk or the data sample which yields fastest convergence in gradient based methods.//
1003
  An active under-sampling approach is presented in this paper to change the data distribution of training datasets, and improve the classification accuracy of minority classes while maintaining overall classification performance.//
1004
  In this paper, we propose an uncertainty-based active learning algorithm which requires only samples of one class and a set of unlabeled data in order to operate.//
1005
- The principal contribution of our work is twofold: First, we use Bayes rule and density estimation to avoid the need to have a model of all classes for computing the uncertainty measure.//
1006
  This technique reduces the number of input parameters of the problem. At the rest of this paper, we first review recent related works in the fields of active learning and active one-class learning (section II).//
1007
  The classifier predicts that all the samples are non-fraud, it will have a quite high accuracy. However, for problems like fraud detection, minority class classification accuracy is more critical.//
1008
  The algorithm used and the features selected are always the key points at design time, and many experiments are needed to select the final algorithm and the best suited feature set.//
 
9
  from asg_conclusion import ConclusionGenerator
10
  from asg_retriever import *
11
  import pandas as df
12
+ from .references import generate_references
13
+ from ..path_utils import get_path
14
 
15
 
16
  class OutlineGenerator():
 
260
  clean_text = re.sub(r'\s+', ' ', text).strip()
261
  return messages, clean_text
262
 
263
+ def parseOutline(survey_id, info_path=None):
264
+ if info_path is None:
265
+ info_path = get_path('txt')
266
+ file_path = get_path('txt', survey_id, 'outline.json')
267
  try:
268
  with open(file_path, 'r', encoding='utf-8') as file:
269
  data = json.load(file)
 
289
  print("Failed to extract a valid list string from the outline content.")
290
  return []
291
 
292
+ # 检查提取结果是否为"列表的列表"格式(应该以 "[[" 开头)
293
  fixed_str = response_extracted.strip()
294
  if not fixed_str.startswith("[["):
295
  # 如果不是,则去掉原有的首尾括号,再重新包装:[[ ... ]]
 
828
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
829
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
830
 
831
+ output_path = get_path('txt', survey_id, 'generated_result.json')
832
  with open(output_path, 'w', encoding='utf-8') as f:
833
  json.dump(temp, f, ensure_ascii=False, indent=4)
834
  print(f"Survey has been saved to {output_path}.")
 
913
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
914
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
915
  temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
916
+ # references = generate_references_dir(get_path('txt', survey_id))
917
+ output_path = get_path('txt', survey_id, 'generated_result.json')
918
  with open(output_path, 'w', encoding='utf-8') as f:
919
  json.dump(temp, f, ensure_ascii=False, indent=4)
920
  print(f"Survey has been saved to {output_path}.")
921
  return
922
 
923
  # wza
924
+ def generateSurvey_qwen_new(survey_id, title, collection_list, pipeline, citation_data_list, txt_path=None):
925
+ if txt_path is None:
926
+ txt_path = get_path('txt')
927
+ outline = str(parseOutline(survey_id))
928
  client = getQwenClient()
929
  context_list = generate_context_list(outline, collection_list)
930
 
 
931
  temp = {
932
  "survey_id": survey_id,
933
  "outline": outline,
 
968
  temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
969
  temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
970
  temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
971
+ output_path = get_path('txt', survey_id, 'generated_result.json')
972
  with open(output_path, 'w', encoding='utf-8') as f:
973
  json.dump(temp, f, ensure_ascii=False, indent=4)
974
  print(f"Survey has been saved to {output_path}.")
 
1001
  Many paradigms have been proposed to asses informativeness of data samples for active learning. One of the popular approaches is selecting the most uncertain data sample, i.e the data sample in which current classifier is least confident. Some other approaches are selecting the sample which yields a model with minimum risk or the data sample which yields fastest convergence in gradient based methods.//
1002
  An active under-sampling approach is presented in this paper to change the data distribution of training datasets, and improve the classification accuracy of minority classes while maintaining overall classification performance.//
1003
  In this paper, we propose an uncertainty-based active learning algorithm which requires only samples of one class and a set of unlabeled data in order to operate.//
1004
+ The principal contribution of our work is twofold: First, we use Bayes' rule and density estimation to avoid the need to have a model of all classes for computing the uncertainty measure.//
1005
  This technique reduces the number of input parameters of the problem. At the rest of this paper, we first review recent related works in the fields of active learning and active one-class learning (section II).//
1006
  The classifier predicts that all the samples are non-fraud, it will have a quite high accuracy. However, for problems like fraud detection, minority class classification accuracy is more critical.//
1007
  The algorithm used and the features selected are always the key points at design time, and many experiments are needed to select the final algorithm and the best suited feature set.//
src/demo/survey_generation_pipeline/asg_retriever.py CHANGED
@@ -4,10 +4,11 @@ import re
4
  import os
5
  import json
6
  import chromadb
7
- from asg_splitter import TextSplitting
8
  from langchain_huggingface import HuggingFaceEmbeddings
9
  import time
10
  import concurrent.futures
 
11
 
12
  class Retriever:
13
  client = None
@@ -367,7 +368,7 @@ def query_multiple_collections(collection_names: list[str], query_list: list[str
367
  results[collection_name] = future.result()
368
 
369
  # Automatically save the results to a JSON file
370
- file_path = f'./src/static/data/info/{survey_id}/retrieved_context.json'
371
  with open(file_path, 'w', encoding='utf-8') as f:
372
  json.dump(results, f, ensure_ascii=False, indent=4)
373
 
 
4
  import os
5
  import json
6
  import chromadb
7
+ from .asg_splitter import TextSplitting
8
  from langchain_huggingface import HuggingFaceEmbeddings
9
  import time
10
  import concurrent.futures
11
+ from ..path_utils import get_path
12
 
13
  class Retriever:
14
  client = None
 
368
  results[collection_name] = future.result()
369
 
370
  # Automatically save the results to a JSON file
371
+ file_path = get_path('info', survey_id, 'retrieved_context.json')
372
  with open(file_path, 'w', encoding='utf-8') as f:
373
  json.dump(results, f, ensure_ascii=False, indent=4)
374
 
src/demo/survey_generation_pipeline/category_and_tsne.py CHANGED
@@ -7,8 +7,9 @@ import matplotlib.pyplot as plt
7
  from sklearn.manifold import TSNE
8
  from sklearn.cluster import AgglomerativeClustering
9
  import json
 
10
 
11
- IMG_PATH = './src/static/img/'
12
 
13
  plt.switch_backend('agg')
14
  device = 0
@@ -133,47 +134,11 @@ class ClusteringWithTopic:
133
  print(f"Best n_topics={self.best_n_topics}, Best silhouette_score={self.best_score}")
134
  return self.best_labels, self.best_topic_model, self.best_n_topics
135
 
136
- def clustering(df, n_cluster, survey_id):
137
- text = df['retrieval_result'].astype(str)
138
- clustering = ClusteringWithTopic(text, n_cluster)
139
- df['label'] = clustering.fit_and_get_labels(text)
140
-
141
- print("The clustering result is: ")
142
- for col in df.columns:
143
- print(f"{col}: {df.iloc[0][col]}")
144
-
145
- # Save topic model information as JSON
146
- topic_json = clustering.topic_model.get_topic_info().to_json()
147
- with open(f'./src/static/data/info/{survey_id}/topic.json', 'w', encoding="utf-8") as file:
148
- file.write(topic_json)
149
-
150
- # Create a dictionary from 'ref_title' and 'retrieval_result' columns
151
- description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
152
-
153
- # Save the dictionary to description.json
154
- with open(f'./src/static/data/info/{survey_id}/description.json', 'w', encoding="utf-8") as file:
155
- json.dump(description_dict, file, ensure_ascii=False, indent=4)
156
- # df['top_n_words'] = clustering.topic_model.get_topic_info()['Representation'].tolist()
157
- # df['topic_word'] = clustering.topic_model.get_topic_info()['KeyBERT'].tolist()
158
-
159
-
160
- X = np.array(clustering.embeddings)
161
- perplexity = 10
162
- if X.shape[0] <= perplexity:
163
- perplexity = max(1, X.shape[0] // 2)
164
-
165
- tsne = TSNE(n_components=2, init='pca', perplexity=perplexity, random_state=42)
166
- X_tsne = tsne.fit_transform(X)
167
- colors = scatter(X_tsne, df['label'])
168
-
169
- plt.savefig(IMG_PATH + 'tsne_' + survey_id + '.png', dpi=800, transparent=True)
170
-
171
- plt.close()
172
- output_tsv_filename = "./src/static/data/tsv/" + survey_id + '.tsv'
173
- df.to_csv(output_tsv_filename, sep='\t')
174
- return df, colors
175
-
176
- def clustering(df, n_topics_list, survey_id, info_path='./src/static/data/info', tsv_path='./src/static/data/tsv'):
177
  text = df['retrieval_result'].astype(str)
178
  clustering = ClusteringWithTopic(text, n_topics_list)
179
  df['label'], topic_model, best_n_topics = clustering.fit_and_get_labels()
@@ -184,12 +149,12 @@ def clustering(df, n_topics_list, survey_id, info_path='./src/static/data/info',
184
 
185
  # 保存 topic model 信息
186
  topic_json = topic_model.get_topic_info().to_json()
187
- with open(f'{info_path}/{survey_id}/topic.json', 'w', encoding="utf-8") as file:
188
  file.write(topic_json)
189
 
190
  # 创建描述信息
191
  description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
192
- with open(f'{info_path}/{survey_id}/description.json', 'w', encoding="utf-8") as file:
193
  json.dump(description_dict, file, ensure_ascii=False, indent=4)
194
 
195
  # t-SNE 降维可视化
@@ -201,10 +166,10 @@ def clustering(df, n_topics_list, survey_id, info_path='./src/static/data/info',
201
 
202
  colors = scatter(X_tsne, df['label']) # 计算颜色
203
 
204
- # plt.savefig(IMG_PATH + 'tsne_' + survey_id + '.png', dpi=800, transparent=True)
205
 
206
- # plt.close()
207
- output_tsv_filename = f"{tsv_path}/{survey_id}.tsv"
208
  df.to_csv(output_tsv_filename, sep='\t')
209
  return df, colors, best_n_topics
210
 
 
7
  from sklearn.manifold import TSNE
8
  from sklearn.cluster import AgglomerativeClustering
9
  import json
10
+ from ..path_utils import get_path
11
 
12
+ IMG_PATH = get_path('img')
13
 
14
  plt.switch_backend('agg')
15
  device = 0
 
134
  print(f"Best n_topics={self.best_n_topics}, Best silhouette_score={self.best_score}")
135
  return self.best_labels, self.best_topic_model, self.best_n_topics
136
 
137
+ def clustering(df, n_topics_list, survey_id, info_path=None, tsv_path=None):
138
+ if info_path is None:
139
+ info_path = get_path('info')
140
+ if tsv_path is None:
141
+ tsv_path = get_path('tsv')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  text = df['retrieval_result'].astype(str)
143
  clustering = ClusteringWithTopic(text, n_topics_list)
144
  df['label'], topic_model, best_n_topics = clustering.fit_and_get_labels()
 
149
 
150
  # 保存 topic model 信息
151
  topic_json = topic_model.get_topic_info().to_json()
152
+ with open(get_path('info', survey_id, 'topic.json'), 'w', encoding="utf-8") as file:
153
  file.write(topic_json)
154
 
155
  # 创建描述信息
156
  description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
157
+ with open(get_path('info', survey_id, 'description.json'), 'w', encoding="utf-8") as file:
158
  json.dump(description_dict, file, ensure_ascii=False, indent=4)
159
 
160
  # t-SNE 降维可视化
 
166
 
167
  colors = scatter(X_tsne, df['label']) # 计算颜色
168
 
169
+ plt.savefig(get_path('img', filename='tsne_' + survey_id + '.png'), dpi=800, transparent=True)
170
 
171
+ plt.close()
172
+ output_tsv_filename = get_path('tsv', survey_id + '.tsv')
173
  df.to_csv(output_tsv_filename, sep='\t')
174
  return df, colors, best_n_topics
175
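With `info_path` and `tsv_path` now defaulting to `get_path('info')` and `get_path('tsv')`, pipeline callers no longer need to know the storage layout. A hypothetical call, assuming the BERTopic-based `ClusteringWithTopic` dependencies are installed and the per-survey info directory already exists:

import pandas as pd

refs_df = pd.DataFrame({
    'ref_title': ['Paper A', 'Paper B', 'Paper C'],
    'retrieval_result': ['retrieved text for A', 'retrieved text for B', 'retrieved text for C'],
})
# Paths are resolved internally via get_path(); only the survey id is required.
labeled_df, colors, best_n_topics = clustering(refs_df, n_topics_list=[2, 3], survey_id='demo_survey')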
 
src/demo/views.py CHANGED
@@ -41,6 +41,10 @@ import glob
41
 
42
  from langchain_huggingface import HuggingFaceEmbeddings
43
  from dotenv import load_dotenv
 
 
 
 
44
 
45
  dotenv_path = os.path.join(os.path.dirname(__file__), ".env")
46
  load_dotenv()
@@ -55,26 +59,45 @@ load_dotenv()
55
  # print(f"OPENAI_API_KEY: {openai_api_key}")
56
  # print(f"OPENAI_API_BASE: {openai_api_base}")
57
 
58
- import os
59
- from pathlib import Path
60
- from markdown_pdf import MarkdownPdf, Section
61
-
62
- DATA_PATH = './src/static/data/pdf/'
63
- TXT_PATH = './src/static/data/txt/'
64
- TSV_PATH = './src/static/data/tsv/'
65
- MD_PATH = './src/static/data/md/'
66
- INFO_PATH = './src/static/data/info/'
67
- IMG_PATH = './src/static/img/'
68
 
69
  paths = [DATA_PATH, TXT_PATH, TSV_PATH, MD_PATH, INFO_PATH, IMG_PATH]
70
 
 
71
  for path in paths:
72
- path_obj = Path(path)
73
- if not path_obj.exists():
74
- path_obj.mkdir(parents=True, exist_ok=True)
75
- print(f"Created directory: {path}")
76
- else:
77
- print(f"Directory already exists: {path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
 
80
 
@@ -165,17 +188,19 @@ def index(request):
165
  def delete_files(request):
166
  if request.method == 'POST':
167
  try:
168
- folders = ['./src/static/data/pdf/', './src/static/data/tsv/', './src/static/data/txt/', './src/static/data/md/']
 
169
  for folder in folders:
170
- for filename in os.listdir(folder):
171
- file_path = os.path.join(folder, filename)
172
- try:
173
- if os.path.isfile(file_path) or os.path.islink(file_path):
174
- os.unlink(file_path)
175
- elif os.path.isdir(file_path):
176
- shutil.rmtree(file_path)
177
- except Exception as e:
178
- return JsonResponse({'success': False, 'message': str(e)})
 
179
  return JsonResponse({'success': True})
180
  except Exception as e:
181
  return JsonResponse({'success': False, 'message': str(e)})
@@ -279,7 +304,7 @@ def sanitize_filename_py(filename):
279
 
280
  def get_existing_survey_ids():
281
 
282
- tsv_directory = os.path.join("src", "static", "data", "tsv")
283
  survey_ids = []
284
  try:
285
  for file_name in os.listdir(tsv_directory):
@@ -299,7 +324,7 @@ def get_surveys(request):
299
  def upload_refs(request):
300
 
301
  start_time = time.time()
302
- RECOMMENDED_PDF_DIR = os.path.join("src", "static", "data", "pdf", "recommend_pdfs")
303
  if request.method == 'POST':
304
  if not request.FILES:
305
  if not os.path.exists(RECOMMENDED_PDF_DIR):
@@ -369,7 +394,7 @@ def upload_refs(request):
369
  continue
370
  sanitized_filename = f"{sanitized_filename}{file_extension}"
371
 
372
- file_path = os.path.join('src', 'static', 'data', 'pdf', Global_survey_id, sanitized_filename)
373
  if default_storage.exists(file_path):
374
  default_storage.delete(file_path)
375
 
@@ -388,7 +413,7 @@ def upload_refs(request):
388
  csvfile_name = new_file_name + '.'+ file_name.split('.')[-1]
389
 
390
  json_data_pd = pd.DataFrame()
391
- json_files_path = f'./src/static/data/txt/{Global_survey_id}/*.json'
392
  json_files = glob.glob(json_files_path)
393
 
394
  # Dictionary to hold title and abstract pairs
@@ -425,7 +450,7 @@ def upload_refs(request):
425
  title_abstract_dict[title] = abstract
426
 
427
  input_pd = json_data_pd
428
- output_path = f'./src/static/data/info/{Global_survey_id}/title_abstract_pairs.json'
429
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
430
 
431
  with open(output_path, 'w', encoding="utf-8") as outfile:
@@ -446,7 +471,8 @@ def upload_refs(request):
446
  input_pd["label"] = input_pd["reference paper category label (optional)"].apply(lambda x: str(x) if len(str(x))>0 else '')
447
 
448
  try:
449
- output_tsv_filename = "./src/static/data/tsv/" + new_file_name + '.tsv'
 
450
 
451
  output_df = input_pd[["ref_title","ref_context","ref_entry","abstract","intro"]]
452
 
@@ -558,7 +584,7 @@ def generate_arxiv_query(request):
558
  new_count += 1
559
 
560
  attempts += 1
561
- current_query = generic_query # 将本轮的宽松查询作为“新的严格查询”
562
 
563
  if len(total_papers) >= min_results:
564
  # 一旦达到 min_results,就返回此时的查询
@@ -596,7 +622,7 @@ def download_pdfs(request):
596
  if not pdf_links:
597
  return JsonResponse({"message": "No PDFs to download."}, status=400)
598
 
599
- base_dir = os.path.join(os.getcwd(), "src", "static", "data", "pdf", "recommend_pdfs")
600
  os.makedirs(base_dir, exist_ok=True) # 确保文件夹存在
601
 
602
  downloaded_files = []
@@ -668,33 +694,14 @@ def automatic_taxonomy(request):
668
  description = generate(context, query, name)
669
  Global_description_list.append(description)
670
 
671
- # Save citation data to file for debugging or reference
672
- citation_path = f'./src/static/data/info/{Global_survey_id}/citation_data.json'
673
  os.makedirs(os.path.dirname(citation_path), exist_ok=True)
674
- with open(citation_path, 'w', encoding="utf-8") as outfile:
675
- json.dump(Global_citation_data, outfile, indent=4, ensure_ascii=False)
676
-
677
- file_path = f'./src/static/data/tsv/{Global_survey_id}.tsv'
678
- with open(file_path, 'r', newline='', encoding='utf-8') as infile:
679
- reader = csv.reader(infile, delimiter='\t')
680
- rows = list(reader)
681
-
682
- if rows:
683
- headers = rows[0]
684
- headers.append('retrieval_result')
685
 
686
- updated_rows = [headers]
687
- for row, description in zip(rows[1:], Global_description_list):
688
- row.append(description)
689
- updated_rows.append(row)
690
-
691
- with open(file_path, 'w', newline='', encoding='utf-8') as outfile:
692
- writer = csv.writer(outfile, delimiter='\t')
693
- writer.writerows(updated_rows)
694
-
695
- print('Updated file has been saved to', file_path)
696
- else:
697
- print('Input file is empty.')
698
 
699
  Global_ref_list = ref_list
700
 
@@ -708,11 +715,11 @@ def automatic_taxonomy(request):
708
  ref_titles = list(df_tmp.groupby(df_tmp['label'])['ref_title'].apply(list))
709
  ref_indexs = list(df_tmp.groupby(df_tmp['label'])['index'].apply(list))
710
 
711
- info = pd.read_json(f'./src/static/data/info/{Global_survey_id}/topic.json')
712
  category_label = info['KeyBERT'].to_list()
713
  category_label_summarized=[]
714
 
715
- tsv_path = f'./src/static/data/tsv/{Global_survey_id}.tsv'
716
 
717
  cluster_num = Global_cluster_num
718
  category_label_summarized = generate_cluster_name_new(tsv_path, Global_survey_title, cluster_num)
@@ -733,7 +740,7 @@ def automatic_taxonomy(request):
733
  temp = [legal_pdf(i) for i in value]
734
  cluster_info[key] = temp
735
  Global_collection_names_clustered.append(temp)
736
- cluster_info_path = f'./src/static/data/info/{Global_survey_id}/cluster_info.json'
737
  with open(cluster_info_path, 'w', encoding="utf-8") as outfile:
738
  json.dump(cluster_info, outfile, indent=4, ensure_ascii=False)
739
 
@@ -743,7 +750,7 @@ def automatic_taxonomy(request):
743
  messages, outline = outline_generator.generate_outline_qwen(Global_survey_title, Global_cluster_num)
744
 
745
  outline_json = {'messages':messages, 'outline': outline}
746
- output_path = TXT_PATH + Global_survey_id + '/outline.json'
747
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
748
  with open(output_path, 'w', encoding="utf-8") as outfile:
749
  json.dump(outline_json, outfile, indent=4, ensure_ascii=False)
@@ -767,7 +774,7 @@ def save_updated_cluster_info(request):
767
  if not survey_id or not updated_cate_list:
768
  return JsonResponse({"error": "Missing survey_id or updated_cate_list"}, status=400)
769
 
770
- save_dir = os.path.join('./src/static/data/info/', str(survey_id))
771
  os.makedirs(save_dir, exist_ok=True)
772
  save_path = os.path.join(save_dir, 'cluster_info_updated.json')
773
 
@@ -807,7 +814,7 @@ def save_outline(request):
807
  "outline": str(updated_outline)
808
  }
809
 
810
- file_path = os.path.join(settings.BASE_DIR, 'static', 'data', 'txt', Global_survey_id,'outline.json')
811
  os.makedirs(os.path.dirname(file_path), exist_ok=True)
812
 
813
  with open(file_path, 'w', encoding='utf-8') as file:
@@ -952,7 +959,7 @@ def generate_pdf(request):
952
  if request.method == 'POST':
953
  survey_id = request.POST.get('survey_id', '')
954
  markdown_content = request.POST.get('content', '')
955
- markdown_dir = f'./src/static/data/info/{survey_id}/'
956
  markdown_filename = f'survey_{survey_id}_vanilla.md'
957
  markdown_filepath = os.path.join(markdown_dir, markdown_filename)
958
 
@@ -970,7 +977,7 @@ def generate_pdf(request):
970
 
971
  markdown_content = finalize_survey_paper(markdown_content, Global_collection_names, Global_file_names)
972
  # 设置 Markdown 文件的保存路径1
973
- markdown_dir = f'./src/static/data/info/{survey_id}/'
974
  markdown_filename = f'survey_{survey_id}_processed.md'
975
  markdown_filepath = os.path.join(markdown_dir, markdown_filename)
976
 
@@ -990,7 +997,7 @@ def generate_pdf(request):
990
 
991
  # 配置 PDF 文件的保存路径
992
  pdf_filename = f'survey_{survey_id}.pdf'
993
- pdf_dir = './src/static/data/results'
994
  pdf_filepath = os.path.join(pdf_dir, pdf_filename)
995
 
996
  # 检查并创建 results 目录
@@ -1022,13 +1029,13 @@ def generate_pdf_from_tex(request):
1022
 
1023
  global Global_survey_id, Global_survey_title
1024
  if request.method == 'POST':
1025
- base_dir = f'./src/static/data/info/{Global_survey_id}'
1026
  md_path = os.path.join(base_dir, f'survey_{Global_survey_id}_processed.md')
1027
  new_md_path = os.path.join(base_dir, f'survey_{Global_survey_id}_preprocessed.md')
1028
  tex_path = os.path.join(base_dir, 'template.tex')
1029
  new_tex_path = os.path.join(base_dir, 'template_with_figure.tex')
1030
  sty_path = os.path.join(base_dir, 'acl.sty')
1031
- pdf_dir = './src/static/data/results'
1032
 
1033
  os.makedirs(base_dir, exist_ok=True)
1034
  print(f"Directory '{base_dir}' checked or created.")
@@ -1044,9 +1051,9 @@ def generate_pdf_from_tex(request):
1044
  md_to_tex(new_md_path, tex_path, Global_survey_title)
1045
 
1046
  insert_figures(
1047
- png_path=f'src/static/data/info/{Global_survey_id}/outline.png',
1048
  tex_path= tex_path,
1049
- json_path=f'src/static/data/info/{Global_survey_id}/flowchart_results.json',
1050
  ref_names= Global_ref_list,
1051
  survey_title=Global_survey_title,
1052
  new_tex_path=new_tex_path
@@ -1155,7 +1162,7 @@ def get_survey_text(refs=Global_ref_list):
1155
 
1156
  def Clustering_refs(n_clusters):
1157
  global Global_cluster_num
1158
- df = pd.read_csv(TSV_PATH + Global_survey_id + '.tsv', sep='\t', index_col=0, encoding='utf-8')
1159
 
1160
  print(Global_ref_list)
1161
  df_selected = df.iloc[Global_ref_list]
@@ -1232,10 +1239,10 @@ def finalize_survey_paper(paper_text,
1232
  Global_ref_list = ref_list
1233
  print(ref_list)
1234
 
1235
- json_path = os.path.join("src", "static", "data", "txt", Global_survey_id, "outline.json")
1236
- output_png_path = os.path.join("src", "static", "data", "info", Global_survey_id, "outline")
1237
- md_path = os.path.join("src", "static", "data", "info", Global_survey_id, f"survey_{Global_survey_id}_processed.md")
1238
- flowchart_results_path = os.path.join("src", "static", "data", "info", Global_survey_id, "flowchart_results.json")
1239
  detect_flowcharts(Global_survey_id)
1240
  png_path = generate_graphviz_png(
1241
  json_path=json_path,
 
41
 
42
  from langchain_huggingface import HuggingFaceEmbeddings
43
  from dotenv import load_dotenv
44
+ from pathlib import Path
45
+ from markdown_pdf import MarkdownPdf, Section
46
+ import tempfile
47
+ from .path_utils import get_path
48
 
49
  dotenv_path = os.path.join(os.path.dirname(__file__), ".env")
50
  load_dotenv()
 
59
  # print(f"OPENAI_API_KEY: {openai_api_key}")
60
  # print(f"OPENAI_API_BASE: {openai_api_base}")
61
 
62
+ # 获取路径配置
63
+ paths_config = get_path('pdf') # 使用 get_path 函数获取路径配置
64
+ DATA_PATH = get_path('pdf')
65
+ TXT_PATH = get_path('txt')
66
+ TSV_PATH = get_path('tsv')
67
+ MD_PATH = get_path('md')
68
+ INFO_PATH = get_path('info')
69
+ IMG_PATH = get_path('img')
 
 
70
 
71
  paths = [DATA_PATH, TXT_PATH, TSV_PATH, MD_PATH, INFO_PATH, IMG_PATH]
72
 
73
+ # 安全地创建目录
74
  for path in paths:
75
+ try:
76
+ path_obj = Path(path)
77
+ if not path_obj.exists():
78
+ path_obj.mkdir(parents=True, exist_ok=True)
79
+ print(f"Created directory: {path}")
80
+ else:
81
+ print(f"Directory already exists: {path}")
82
+ except (PermissionError, OSError) as e:
83
+ print(f"Warning: Could not create directory {path}: {e}")
84
+ # 在 Hugging Face Spaces 中,如果无法创建目录,使用临时目录
85
+ if os.environ.get('SPACE_ID') or os.environ.get('HF_SPACE_ID'):
86
+ temp_dir = tempfile.mkdtemp()
87
+ # 更新路径为临时目录
88
+ if 'pdf' in path:
89
+ DATA_PATH = os.path.join(temp_dir, 'pdf/')
90
+ elif 'txt' in path:
91
+ TXT_PATH = os.path.join(temp_dir, 'txt/')
92
+ elif 'tsv' in path:
93
+ TSV_PATH = os.path.join(temp_dir, 'tsv/')
94
+ elif 'md' in path:
95
+ MD_PATH = os.path.join(temp_dir, 'md/')
96
+ elif 'info' in path:
97
+ INFO_PATH = os.path.join(temp_dir, 'info/')
98
+ elif 'img' in path:
99
+ IMG_PATH = os.path.join(temp_dir, 'img/')
100
+ print(f"Using temporary directory: {temp_dir}")
101
 
102
 
103
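The per-category fallback above repeats the same pattern for each directory; the underlying technique is simply "try to create the preferred directory, and degrade to a fresh temp directory when the filesystem is read-only", which is the situation on Hugging Face Spaces. A minimal sketch of that idea as a reusable helper (the `ensure_writable_dir` name is hypothetical, not part of this commit):

import os
import tempfile
from pathlib import Path

def ensure_writable_dir(preferred: str) -> str:
    # Return `preferred` if it exists or can be created; otherwise fall back to a
    # temp directory that keeps the last path component (e.g. .../pdf -> /tmp/xxxx/pdf).
    try:
        Path(preferred).mkdir(parents=True, exist_ok=True)
        return preferred
    except (PermissionError, OSError):
        fallback = os.path.join(tempfile.mkdtemp(), Path(preferred).name)
        os.makedirs(fallback, exist_ok=True)
        return fallback

# e.g. DATA_PATH = ensure_writable_dir(get_path('pdf')), and likewise for TXT_PATH, TSV_PATH, ...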
 
 
188
  def delete_files(request):
189
  if request.method == 'POST':
190
  try:
191
+ # 使用动态路径而不是硬编码路径
192
+ folders = [DATA_PATH, TSV_PATH, TXT_PATH, MD_PATH]
193
  for folder in folders:
194
+ if os.path.exists(folder):
195
+ for filename in os.listdir(folder):
196
+ file_path = os.path.join(folder, filename)
197
+ try:
198
+ if os.path.isfile(file_path) or os.path.islink(file_path):
199
+ os.unlink(file_path)
200
+ elif os.path.isdir(file_path):
201
+ shutil.rmtree(file_path)
202
+ except Exception as e:
203
+ return JsonResponse({'success': False, 'message': str(e)})
204
  return JsonResponse({'success': True})
205
  except Exception as e:
206
  return JsonResponse({'success': False, 'message': str(e)})
 
304
 
305
  def get_existing_survey_ids():
306
 
307
+ tsv_directory = get_path('tsv')
308
  survey_ids = []
309
  try:
310
  for file_name in os.listdir(tsv_directory):
 
324
  def upload_refs(request):
325
 
326
  start_time = time.time()
327
+ RECOMMENDED_PDF_DIR = get_path('pdf', 'recommend_pdfs')
328
  if request.method == 'POST':
329
  if not request.FILES:
330
  if not os.path.exists(RECOMMENDED_PDF_DIR):
 
394
  continue
395
  sanitized_filename = f"{sanitized_filename}{file_extension}"
396
 
397
+ file_path = os.path.join(get_path('pdf', Global_survey_id), sanitized_filename)
398
  if default_storage.exists(file_path):
399
  default_storage.delete(file_path)
400
 
 
413
  csvfile_name = new_file_name + '.'+ file_name.split('.')[-1]
414
 
415
  json_data_pd = pd.DataFrame()
416
+ json_files_path = get_path('txt', Global_survey_id) + '/*.json'
417
  json_files = glob.glob(json_files_path)
418
 
419
  # Dictionary to hold title and abstract pairs
 
450
  title_abstract_dict[title] = abstract
451
 
452
  input_pd = json_data_pd
453
+ output_path = get_path('txt', Global_survey_id, 'title_abstract_pairs.json')
454
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
455
 
456
  with open(output_path, 'w', encoding="utf-8") as outfile:
 
471
  input_pd["label"] = input_pd["reference paper category label (optional)"].apply(lambda x: str(x) if len(str(x))>0 else '')
472
 
473
  try:
474
+ output_tsv_filename = get_path('tsv', filename=new_file_name + '.tsv')
475
+ os.makedirs(os.path.dirname(output_tsv_filename), exist_ok=True)
476
 
477
  output_df = input_pd[["ref_title","ref_context","ref_entry","abstract","intro"]]
478
 
 
584
  new_count += 1
585
 
586
  attempts += 1
587
+ current_query = generic_query # 将本轮的宽松查询作为"新的严格查询"
588
 
589
  if len(total_papers) >= min_results:
590
  # 一旦达到 min_results,就返回此时的查询
 
622
  if not pdf_links:
623
  return JsonResponse({"message": "No PDFs to download."}, status=400)
624
 
625
+ base_dir = get_path('pdf', 'recommend_pdfs')
626
  os.makedirs(base_dir, exist_ok=True) # 确保文件夹存在
627
 
628
  downloaded_files = []
 
694
  description = generate(context, query, name)
695
  Global_description_list.append(description)
696
 
697
+ # 保存引用数据
698
+ citation_path = get_path('info', Global_survey_id, 'citation_data.json')
699
  os.makedirs(os.path.dirname(citation_path), exist_ok=True)
700
+ with open(citation_path, 'w', encoding='utf-8') as f:
701
+ json.dump(Global_citation_data, f, ensure_ascii=False, indent=2)
 
 
 
 
 
 
 
 
 
702
 
703
+ # 读取TSV文件
704
+ file_path = get_path('tsv', Global_survey_id + '.tsv')
 
 
 
 
 
 
 
 
 
 
705
 
706
  Global_ref_list = ref_list
707
 
 
715
  ref_titles = list(df_tmp.groupby(df_tmp['label'])['ref_title'].apply(list))
716
  ref_indexs = list(df_tmp.groupby(df_tmp['label'])['index'].apply(list))
717
 
718
+ info = pd.read_json(get_path('info', Global_survey_id, 'topic.json'))
719
  category_label = info['KeyBERT'].to_list()
720
  category_label_summarized=[]
721
 
722
+ tsv_path = get_path('tsv', Global_survey_id + '.tsv')
723
 
724
  cluster_num = Global_cluster_num
725
  category_label_summarized = generate_cluster_name_new(tsv_path, Global_survey_title, cluster_num)
 
740
  temp = [legal_pdf(i) for i in value]
741
  cluster_info[key] = temp
742
  Global_collection_names_clustered.append(temp)
743
+ cluster_info_path = get_path('info', Global_survey_id, 'cluster_info.json')
744
  with open(cluster_info_path, 'w', encoding="utf-8") as outfile:
745
  json.dump(cluster_info, outfile, indent=4, ensure_ascii=False)
746
 
 
750
  messages, outline = outline_generator.generate_outline_qwen(Global_survey_title, Global_cluster_num)
751
 
752
  outline_json = {'messages':messages, 'outline': outline}
753
+ output_path = get_path('txt', Global_survey_id, 'outline.json')
754
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
755
  with open(output_path, 'w', encoding="utf-8") as outfile:
756
  json.dump(outline_json, outfile, indent=4, ensure_ascii=False)
 
774
  if not survey_id or not updated_cate_list:
775
  return JsonResponse({"error": "Missing survey_id or updated_cate_list"}, status=400)
776
 
777
+ save_dir = get_path('info', str(survey_id))
778
  os.makedirs(save_dir, exist_ok=True)
779
  save_path = os.path.join(save_dir, 'cluster_info_updated.json')
780
 
 
814
  "outline": str(updated_outline)
815
  }
816
 
817
+ file_path = get_path('txt', Global_survey_id, 'outline.json')
818
  os.makedirs(os.path.dirname(file_path), exist_ok=True)
819
 
820
  with open(file_path, 'w', encoding='utf-8') as file:
 
959
  if request.method == 'POST':
960
  survey_id = request.POST.get('survey_id', '')
961
  markdown_content = request.POST.get('content', '')
962
+ markdown_dir = get_path('info', survey_id) + '/'
963
  markdown_filename = f'survey_{survey_id}_vanilla.md'
964
  markdown_filepath = os.path.join(markdown_dir, markdown_filename)
965
 
 
977
 
978
  markdown_content = finalize_survey_paper(markdown_content, Global_collection_names, Global_file_names)
979
  # 设置 Markdown 文件的保存路径1
980
+ markdown_dir = get_path('info', survey_id) + '/'
981
  markdown_filename = f'survey_{survey_id}_processed.md'
982
  markdown_filepath = os.path.join(markdown_dir, markdown_filename)
983
 
 
997
 
998
  # 配置 PDF 文件的保存路径
999
  pdf_filename = f'survey_{survey_id}.pdf'
1000
+ pdf_dir = get_path('results')
1001
  pdf_filepath = os.path.join(pdf_dir, pdf_filename)
1002
 
1003
  # 检查并创建 results 目录
 
1029
 
1030
  global Global_survey_id, Global_survey_title
1031
  if request.method == 'POST':
1032
+ base_dir = get_path('info', Global_survey_id)
1033
  md_path = os.path.join(base_dir, f'survey_{Global_survey_id}_processed.md')
1034
  new_md_path = os.path.join(base_dir, f'survey_{Global_survey_id}_preprocessed.md')
1035
  tex_path = os.path.join(base_dir, 'template.tex')
1036
  new_tex_path = os.path.join(base_dir, 'template_with_figure.tex')
1037
  sty_path = os.path.join(base_dir, 'acl.sty')
1038
+ pdf_dir = get_path('results')
1039
 
1040
  os.makedirs(base_dir, exist_ok=True)
1041
  print(f"Directory '{base_dir}' checked or created.")
 
1051
  md_to_tex(new_md_path, tex_path, Global_survey_title)
1052
 
1053
  insert_figures(
1054
+ png_path=get_path('info', Global_survey_id, 'outline.png'),
1055
  tex_path= tex_path,
1056
+ json_path=get_path('info', Global_survey_id, 'flowchart_results.json'),
1057
  ref_names= Global_ref_list,
1058
  survey_title=Global_survey_title,
1059
  new_tex_path=new_tex_path
 
1162
 
1163
  def Clustering_refs(n_clusters):
1164
  global Global_cluster_num
1165
+ df = pd.read_csv(get_path('tsv', Global_survey_id + '.tsv'), sep='\t', index_col=0, encoding='utf-8')
1166
 
1167
  print(Global_ref_list)
1168
  df_selected = df.iloc[Global_ref_list]
 
1239
  Global_ref_list = ref_list
1240
  print(ref_list)
1241
 
1242
+ json_path = get_path('txt', Global_survey_id, 'outline.json')
1243
+ output_png_path = get_path('info', Global_survey_id, 'outline')
1244
+ md_path = get_path('info', Global_survey_id, f'survey_{Global_survey_id}_processed.md')
1245
+ flowchart_results_path = get_path('info', Global_survey_id, 'flowchart_results.json')
1246
  detect_flowcharts(Global_survey_id)
1247
  png_path = generate_graphviz_png(
1248
  json_path=json_path,