Commit 92d8c87 · Parent: c320a1b · update
Changed files:
- src/demo/asg_add_flowchart.py (+13 −7)
- src/demo/asg_latex.py (+14 −12)
- src/demo/asg_loader.py (+79 −57)
- src/demo/asg_outline.py (+47 −45)
- src/demo/asg_retriever.py (+2 −1)
- src/demo/category_and_tsne.py (+13 −47)
- src/demo/path_utils.py (+64 −0)
- src/demo/survey_generation_pipeline/asg_loader.py (+23 −18)
- src/demo/survey_generation_pipeline/asg_outline.py (+16 −17)
- src/demo/survey_generation_pipeline/asg_retriever.py (+3 −2)
- src/demo/survey_generation_pipeline/category_and_tsne.py (+12 −47)
- src/demo/views.py (+84 −77)
src/demo/asg_add_flowchart.py
CHANGED
@@ -2,17 +2,23 @@ import json
 import os
 import re
 from urllib.parse import quote
+import cv2
+import numpy as np
+from PIL import Image, ImageDraw, ImageFont
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.patches import Rectangle
+import matplotlib.patches as mpatches
+from .path_utils import get_path
 
-import os
-import json
 import torch
 import torchvision.transforms as transforms
 from torchvision import models
 from PIL import Image
 
-#
-BASE_DIR =
-INFO_DIR =
+# Use dynamic paths
+BASE_DIR = get_path('md')    # root directory
+INFO_DIR = get_path('info')  # directory holding the JSON results
 
 # Load the trained 3-class PyTorch EfficientNet classification model
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -288,9 +294,9 @@ def insert_tex_images(json_path, ref_names, text):
 # Example usage
 if __name__ == "__main__":
     # Markdown file path
-    md_file_path =
+    md_file_path = get_path('info', 'test', 'survey_test_processed.md')
     # JSON file path
-    json_file_path =
+    json_file_path = get_path('info', 'test', 'flowchart_results.json')
 
     try:
         with open(md_file_path, "r", encoding="utf-8") as f:
src/demo/asg_latex.py
CHANGED
@@ -1,12 +1,14 @@
+import os
 import re
 import subprocess
-import
-
-from openai import OpenAI
+import shutil
+from .path_utils import get_path
 import dotenv
 from .asg_add_flowchart import insert_tex_images
 from .asg_mindmap import insert_outline_figure
 
+from openai import OpenAI
+
 
 def _remove_div_blocks(lines):
     """
@@ -435,7 +437,7 @@ def md_to_tex_section_without_jpg(section):
     # - the heading looks like a bare section number, e.g. "3", "3.1", "3.1.1" (the rule can be widened or narrowed as needed)
 
     # Example: match `digits(.digits)*` with a regex, with or without trailing whitespace
-    # If heading_text
+    # If heading_text fully matches this pattern, treat it as a "pure-number heading" and skip the LLM call
     pure_number_pattern = re.compile(r'^\d+(\.\d+)*$')
 
     # Strip surrounding whitespace first
@@ -548,7 +550,7 @@ def md_to_tex_section_without_jpg(section):
 
 def insert_section(tex_path: str, section_content: str):
     """
-    Append section_content to the .tex
+    Append section_content to the end of the body of the last section (or subsection) in the .tex file.
     The logic is as follows:
     1. If no \section{...}, \subsection{...}, or \subsubsection{...} can be found in the file,
        insert section_content right after \end{abstract}.
@@ -564,7 +566,7 @@ def insert_section(tex_path: str, section_content: str):
 
     Note:
     - This logic **appends** the new content to the end of the body under the last heading,
-
+      which avoids "splitting" or "pushing apart" the existing content.
     """
 
     if not os.path.exists(tex_path):
@@ -616,11 +618,11 @@ def insert_section(tex_path: str, section_content: str):
         )
 
     else:
-        #
+        # When headings exist, append the content to the end of the body under the last heading
         last_title_line = title_lines[-1]
 
         # Find the line number of the next heading (if any), or of \end{document}, to bound the body region
-        #
+        # The "last heading's body" runs from last_title_line+1 through next_title_line-1 (or the end)
         next_boundaries = [end_document_line if end_document_line is not None else len(lines)]
         for t_line in title_lines:
             if t_line > last_title_line:
@@ -628,8 +630,8 @@ def insert_section(tex_path: str, section_content: str):
         # next_boundary is the first boundary after the last heading (or the end of the file if none)
         next_boundary = min(next_boundaries) if next_boundaries else len(lines)
 
-        #
-        #
+        # We want to insert the new content right after the end of the last heading's body, i.e. before next_boundary.
+        # If the last heading sits at the very end of the file, next_boundary may point at the end of the file/document.
        # To avoid pushing the last line down, keep the body lines first and insert section_content at the end.
        new_lines = []
        new_lines.extend(lines[:next_boundary])  # keep everything up to the end of the last body
@@ -804,8 +806,8 @@ if __name__ == "__main__":
     # Load environment variables
     dotenv.load_dotenv()
     # md_path = preprocess_md("src/demo/latex_template/test copy.md", "src/demo/latex_template/test_preprocessed.md")
-    md_path = '
-    tex_path =
+    md_path = get_path('info', 'undefined', 'survey_undefined_preprocessed.md')
+    tex_path = get_path('info', 'undefined', 'template.tex')
     md_to_tex(md_path, tex_path, title="A Comprehensive Review of ADMM On Consensus Distributed Optimization")
     # insert_figures('src/static/data/info/undefined/outline.png',
     #                'src/demo/latex_template/template.tex',
src/demo/asg_loader.py
CHANGED
@@ -2,9 +2,14 @@ import os
 import re
 import json
 import subprocess
+import glob
+from pathlib import Path
+from concurrent.futures import ProcessPoolExecutor
 from langchain_community.document_loaders import UnstructuredMarkdownLoader
-from
+from langchain.schema import Document
 import shutil
+import tempfile
+from .path_utils import get_path
 
 class DocumentLoading:
     def convert_pdf_to_md(self, pdf_file, output_dir="output", method="auto"):
@@ -128,8 +133,8 @@ class DocumentLoading:
         for char in invalid_chars:
             title_new = title_new.replace(char, ' ')
 
-        os.makedirs(
-        with open(
+        os.makedirs(get_path('txt', survey_id), exist_ok=True)
+        with open(get_path('txt', survey_id, f'{title_new}.json'), 'w', encoding='utf-8') as f:
             json.dump(extracted_data, f, ensure_ascii=False, indent=4)
         return extracted_data['introduction']
 
@@ -150,69 +155,73 @@ class DocumentLoading:
         for char in invalid_chars:
             title_new = title_new.replace(char, ' ')
 
-        os.makedirs(
-        with open(
+        os.makedirs(get_path('txt', survey_id), exist_ok=True)
+        with open(get_path('txt', survey_id, f'{title_new}.json'), 'w', encoding='utf-8') as f:
             json.dump(extracted_data, f, ensure_ascii=False, indent=4)
         return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
 
-
     def load_pdf(self, pdf_file, survey_id, mode):
-        os.makedirs(f'./src/static/data/md/{survey_id}', exist_ok=True)
-        output_dir = f"./src/static/data/md/{survey_id}"
         base_name = os.path.splitext(os.path.basename(pdf_file))[0]
-        target_dir = os.path.join(
-
-
-        self.convert_pdf_to_md(pdf_file, output_dir)
-
-        # 2. Process the markdown file in the output directory
-        md_file_path = os.path.join(target_dir, f"{base_name}.md")
-        if not os.path.exists(md_file_path):
-            raise FileNotFoundError(f"Markdown file {md_file_path} does not exist. Conversion might have failed.")
+        target_dir = os.path.join(get_path('md', survey_id), base_name)
+        md_file_path = os.path.join(target_dir, mode, f"{base_name}.md")
+        print("The md file path is: ", md_file_path)
 
-        if
+        if os.path.exists(md_file_path):
+            print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
             return self.process_md_file(md_file_path, survey_id)
+
+        command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m", mode]
+        try:
+            subprocess.run(command, check=True)
+            # Check whether the Markdown file was generated
+            if not os.path.exists(md_file_path):
+                print(f"Conversion failed: Markdown file not found at {md_file_path}. Cleaning up folder...")
+                shutil.rmtree(target_dir)  # remove the generated folder
+                return None
+            else:
+                print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
+                return self.process_md_file(md_file_path, survey_id)
+        except subprocess.CalledProcessError as e:
+            print(f"An error occurred during conversion: {e}")
+            # If an error occurred and the folder was already created, delete it
+            if os.path.exists(target_dir):
+                print(f"Cleaning up incomplete folder: {target_dir}")
+                shutil.rmtree(target_dir)
+            return None
 
     def load_pdf_new(self, pdf_dir, survey_id):
-        os.
-        output_dir = f"./src/static/data/md/{survey_id}"
-        self.convert_pdf_to_md_new(pdf_dir, output_dir)
-        markdown_files = glob.glob(os.path.join(output_dir, "*", "auto", "*.md"))
-        all_introductions = []
-
-        for md_file_path in markdown_files:
-            try:
-                introduction = self.process_md_file(md_file_path, survey_id)
-                all_introductions.append(introduction)
-            except FileNotFoundError as e:
-                print(f"Markdown file {md_file_path} does not exist. Conversion might have failed.")
-
-        return all_introductions
+        pdf_files = glob.glob(os.path.join(pdf_dir, "*.pdf"))
 
+        for pdf_file in pdf_files:
+            base_name = os.path.splitext(os.path.basename(pdf_file))[0]
+            target_dir = os.path.join(get_path('md', survey_id), base_name)
 
+            if os.path.exists(target_dir):
+                print(f"Folder for {pdf_file} already exists in {get_path('md', survey_id)}. Skipping conversion.")
+            else:
+                command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m", "auto"]
+                try:
+                    subprocess.run(command, check=True)
+                    print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
+                except subprocess.CalledProcessError as e:
+                    print(f"An error occurred: {e}")
 
     def parallel_load_pdfs(self, pdf_files, survey_id, max_workers=4):
+        # Create a process pool to run the conversion in parallel
         with ProcessPoolExecutor(max_workers=max_workers) as executor:
-            # Submit
-            futures = [executor.submit(self.load_pdf, pdf, survey_id) for pdf in pdf_files]
-
-            #
+            # Submit each PDF file to the process pool for conversion
+            futures = [executor.submit(self.load_pdf, pdf, survey_id, "auto") for pdf in pdf_files]
+
+            # Optionally, monitor the status of each future as it completes
            for future in futures:
                try:
-
+                    future.result()  # re-raises any exception that occurred during processing
+                except Exception as exc:
+                    print(f"An error occurred during processing: {exc}")
+
     def ensure_non_empty_introduction(self, introduction, full_text):
-
-        """
-        if introduction == "N/A" or len(introduction.strip()) < 50:
-            return full_text.strip()
+        if len(introduction) < 50:
+            return full_text[:1000]
         return introduction
 
     def extract_information_from_md_new(self, md_text):
@@ -240,17 +249,30 @@ class DocumentLoading:
 
     # Introduction extraction
     introduction_match = re.search(
-        r'\n\n([1I][\.\- ]?\s*)?[Ii]\s*[nN]\s*[tT]\s*[rR]\s*[oO]\s*[dD]\s*[uU]\s*[cC]\s*[tT]\s*[iI]\s*[oO]\s*[nN][\.\- ]?\s*\n\n(.*?)'
-
+        r'\n\n([1I][\.\- ]?\s*)?[Ii]\s*[nN]\s*[tT]\s*[rR]\s*[oO]\s*[dD]\s*[uU]\s*[cC]\s*[tT]\s*[iI]\s*[oO]\s*[nN][\.\- ]?\s*\n\n(.*?)'
+        r'(?=\n\n(?:([2I][I]|\s*2)[^\n]*?\n\n|\n\n(?:[2I][I][^\n]*?\n\n)))',
+        md_text,
+        re.DOTALL
     )
     introduction = introduction_match.group(2).strip() if introduction_match else "N/A"
 
-    #
-
+    # Main content extraction
+    main_content_match = re.search(
+        r'(.*?)(\n\n([3I][\.\- ]?\s*)?[Rr][Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss][^\n]*\n\n|\Z)',
+        md_text,
+        re.DOTALL
+    )
+
+    if main_content_match:
+        main_content = main_content_match.group(1).strip()
+    else:
+        main_content = "N/A"
 
-
+    extracted_data = {
         "title": title,
         "authors": authors,
         "abstract": abstract,
-        "introduction": introduction
-
+        "introduction": introduction,
+        "main_content": main_content
+    }
+    return extracted_data
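
Note: a minimal sketch of how the reworked loader might be driven, assuming the mineru CLI used by the diff is installed and on PATH; the PDF name and survey ID below are made-up placeholders, not values from the commit.

    from src.demo.asg_loader import DocumentLoading

    loader = DocumentLoading()
    # Runs `mineru -p paper.pdf -o <md dir> -m auto` unless the markdown output
    # already exists, then extracts the introduction and saves it as JSON.
    intro = loader.load_pdf('paper.pdf', 'demo123', 'auto')
    if intro is None:
        print('Conversion failed; the incomplete output folder was cleaned up.')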
src/demo/asg_outline.py
CHANGED
@@ -10,6 +10,7 @@ from .asg_conclusion import ConclusionGenerator
 from .asg_retriever import *
 import pandas as df
 from .references import generate_references
+from .path_utils import get_path
 
 
 class OutlineGenerator():
@@ -260,7 +261,7 @@ class OutlineGenerator():
         return messages, clean_text
 
 def parseOutline(survey_id):
-    file_path =
+    file_path = get_path('txt', survey_id, 'outline.json')
     try:
         with open(file_path, 'r', encoding='utf-8') as file:
             data = json.load(file)
@@ -286,7 +287,7 @@ def parseOutline(survey_id):
         print("Failed to extract a valid list string from the outline content.")
         return []
 
-    #
+    # Check that the extracted result is a "list of lists" (it should start with "[[")
     fixed_str = response_extracted.strip()
     if not fixed_str.startswith("[["):
         # If not, strip the original outer brackets and re-wrap as [[ ... ]]
@@ -531,45 +532,45 @@ def generateOutlineHTML_qwen(survey_id):
     }
 
     // Confirm the edits and submit the data
-    function confirmOutline() {
-        ...
-    }
+    function confirmOutline() {
+        const outlineData = []; // data to submit to the backend
+
+        // Iterate over all editable input boxes
+        document.querySelectorAll("#edit-outline .list-group-item").forEach((item) => {
+            const level = item.classList.contains("level-1") ? 1 :
+                          item.classList.contains("level-2") ? 2 : 3; // determine the level
+            const content = item.querySelector("input").value.trim(); // read the edited value
+
+            // Store the entry in the array format [level, content]
+            outlineData.push([level, content]);
+        });
+
+        console.log("Submitting to backend:", outlineData); // log the payload for debugging
+
+        // Submit the data to the backend via AJAX
+        const csrftoken = getCookie("csrftoken"); // fetch the CSRF token
+        fetch("/save_outline/", {
+            method: "POST",
+            headers: {
+                "Content-Type": "application/json",
+                "X-CSRFToken": csrftoken, // Django's CSRF token
+            },
+            body: JSON.stringify({ outline: outlineData }) // serialize the payload as JSON
+        })
+        .then((response) => response.json())
+        .then((data) => {
+            if (data.status === "success") {
+                $('#sections_').html(data.html);
+                alert("Outline updated successfully!");
+            } else {
+                alert("Error updating outline: " + data.message);
+            }
+        })
+        .catch((error) => {
+            console.error("Error:", error);
+            alert("Error updating outline. Please check the console for details.");
+        });
+    }
     </script>
     '''
     return html
@@ -825,7 +826,7 @@ def generateSurvey(survey_id, title, collection_list, pipeline):
     temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
     temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
 
-    output_path =
+    output_path = get_path('txt', survey_id, 'generated_result.json')
     with open(output_path, 'w', encoding='utf-8') as f:
         json.dump(temp, f, ensure_ascii=False, indent=4)
     print(f"Survey has been saved to {output_path}.")
@@ -910,7 +911,8 @@ def generateSurvey_qwen(survey_id, title, collection_list, pipeline):
     temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
     temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
     temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
-
+    # references = generate_references_dir(get_path('txt', survey_id))
+    output_path = get_path('txt', survey_id, 'generated_result.json')
     with open(output_path, 'w', encoding='utf-8') as f:
         json.dump(temp, f, ensure_ascii=False, indent=4)
     print(f"Survey has been saved to {output_path}.")
@@ -962,7 +964,7 @@ def generateSurvey_qwen_new(survey_id, title, collection_list, pipeline, citation_data_list):
     temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
     temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
     temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
-    output_path =
+    output_path = get_path('txt', survey_id, 'generated_result.json')
     with open(output_path, 'w', encoding='utf-8') as f:
         json.dump(temp, f, ensure_ascii=False, indent=4)
     print(f"Survey has been saved to {output_path}.")
@@ -994,7 +996,7 @@ if __name__ == '__main__':
     Many paradigms have been proposed to asses informativeness of data samples for active learning. One of the popular approaches is selecting the most uncertain data sample, i.e the data sample in which current classifier is least confident. Some other approaches are selecting the sample which yields a model with minimum risk or the data sample which yields fastest convergence in gradient based methods.//
     An active under-sampling approach is presented in this paper to change the data distribution of training datasets, and improve the classification accuracy of minority classes while maintaining overall classification performance.//
     In this paper, we propose an uncertainty-based active learning algorithm which requires only samples of one class and a set of unlabeled data in order to operate.//
-    The principal contribution of our work is twofold: First, we use Bayes
+    The principal contribution of our work is twofold: First, we use Bayes' rule and density estimation to avoid the need to have a model of all classes for computing the uncertainty measure.//
     This technique reduces the number of input parameters of the problem. At the rest of this paper, we first review recent related works in the fields of active learning and active one-class learning (section II).//
     The classifier predicts that all the samples are non-fraud, it will have a quite high accuracy. However, for problems like fraud detection, minority class classification accuracy is more critical.//
     The algorithm used and the features selected are always the key points at design time, and many experiments are needed to select the final algorithm and the best suited feature set.//
src/demo/asg_retriever.py
CHANGED
@@ -8,6 +8,7 @@ from .asg_splitter import TextSplitting
 from langchain_huggingface import HuggingFaceEmbeddings
 import time
 import concurrent.futures
+from .path_utils import get_path
 
 class Retriever:
     client = None
@@ -355,7 +356,7 @@ def query_multiple_collections(collection_names: list[str], query_list: list[str]
         results[collection_name] = future.result()
 
     # Automatically save the results to a JSON file
-    file_path =
+    file_path = get_path('info', survey_id, 'retrieved_context.json')
     with open(file_path, 'w', encoding='utf-8') as f:
         json.dump(results, f, ensure_ascii=False, indent=4)
 
src/demo/category_and_tsne.py
CHANGED
@@ -1,14 +1,20 @@
 from sklearn.metrics import silhouette_score
 
 import numpy as np
+import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
-import
+import json
 from sklearn.manifold import TSNE
 from sklearn.cluster import AgglomerativeClustering
-import json
 
-
+from sentence_transformers import SentenceTransformer
+from bertopic import BERTopic
+from bertopic.representation import KeyBERTInspired
+from sklearn.feature_extraction.text import CountVectorizer
+from bertopic.vectorizers import ClassTfidfTransformer
+from umap import UMAP
+from .path_utils import get_path
 
 plt.switch_backend('agg')
 device = 0
@@ -133,46 +139,6 @@ class ClusteringWithTopic:
         print(f"Best n_topics={self.best_n_topics}, Best silhouette_score={self.best_score}")
         return self.best_labels, self.best_topic_model, self.best_n_topics
 
-def clustering(df, n_cluster, survey_id):
-    text = df['retrieval_result'].astype(str)
-    clustering = ClusteringWithTopic(text, n_cluster)
-    df['label'] = clustering.fit_and_get_labels(text)
-
-    print("The clustering result is: ")
-    for col in df.columns:
-        print(f"{col}: {df.iloc[0][col]}")
-
-    # Save topic model information as JSON
-    topic_json = clustering.topic_model.get_topic_info().to_json()
-    with open(f'./src/static/data/info/{survey_id}/topic.json', 'w', encoding="utf-8") as file:
-        file.write(topic_json)
-
-    # Create a dictionary from 'ref_title' and 'retrieval_result' columns
-    description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
-
-    # Save the dictionary to description.json
-    with open(f'./src/static/data/info/{survey_id}/description.json', 'w', encoding="utf-8") as file:
-        json.dump(description_dict, file, ensure_ascii=False, indent=4)
-    # df['top_n_words'] = clustering.topic_model.get_topic_info()['Representation'].tolist()
-    # df['topic_word'] = clustering.topic_model.get_topic_info()['KeyBERT'].tolist()
-
-
-    X = np.array(clustering.embeddings)
-    perplexity = 10
-    if X.shape[0] <= perplexity:
-        perplexity = max(1, X.shape[0] // 2)
-
-    tsne = TSNE(n_components=2, init='pca', perplexity=perplexity, random_state=42)
-    X_tsne = tsne.fit_transform(X)
-    colors = scatter(X_tsne, df['label'])
-
-    plt.savefig(IMG_PATH + 'tsne_' + survey_id + '.png', dpi=800, transparent=True)
-
-    plt.close()
-    output_tsv_filename = "./src/static/data/tsv/" + survey_id + '.tsv'
-    df.to_csv(output_tsv_filename, sep='\t')
-    return df, colors
-
 def clustering(df, n_topics_list, survey_id):
     text = df['retrieval_result'].astype(str)
     clustering = ClusteringWithTopic(text, n_topics_list)
@@ -184,12 +150,12 @@ def clustering(df, n_topics_list, survey_id):
 
     # Save the topic model info
     topic_json = topic_model.get_topic_info().to_json()
-    with open(
+    with open(get_path('info', survey_id, 'topic.json'), 'w', encoding="utf-8") as file:
         file.write(topic_json)
 
     # Build the description dict
     description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
-    with open(
+    with open(get_path('info', survey_id, 'description.json'), 'w', encoding="utf-8") as file:
         json.dump(description_dict, file, ensure_ascii=False, indent=4)
 
     # t-SNE dimensionality-reduction visualization
@@ -201,10 +167,10 @@ def clustering(df, n_topics_list, survey_id):
 
     colors = scatter(X_tsne, df['label'])  # compute colors
 
-    plt.savefig(
+    plt.savefig(get_path('img', filename='tsne_' + survey_id + '.png'), dpi=800, transparent=True)
 
     plt.close()
-    output_tsv_filename =
+    output_tsv_filename = get_path('tsv', survey_id + '.tsv')
     df.to_csv(output_tsv_filename, sep='\t')
     return df, colors, best_n_topics
 
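
Note: a minimal sketch of calling the consolidated clustering() entry point, assuming a DataFrame with the 'ref_title' and 'retrieval_result' columns the function reads; the rows are made-up placeholders, and a realistic corpus would need far more documents for BERTopic and t-SNE to behave well.

    import pandas as pd
    from src.demo.category_and_tsne import clustering

    df = pd.DataFrame({
        'ref_title': ['Paper A', 'Paper B', 'Paper C'],
        'retrieval_result': ['chunk about ADMM', 'chunk about consensus', 'chunk about optimization'],
    })
    # Tries each candidate topic count, keeps the best silhouette score, and
    # writes topic.json, description.json, a t-SNE PNG, and a TSV via get_path.
    df, colors, best_n_topics = clustering(df, [2, 3], 'demo123')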
src/demo/path_utils.py
ADDED
@@ -0,0 +1,64 @@
+import os
+import tempfile
+
+# Check whether we are running in a Hugging Face Spaces environment
+def get_data_paths():
+    # If running in Hugging Face Spaces, use a temporary directory
+    if os.environ.get('SPACE_ID') or os.environ.get('HF_SPACE_ID'):
+        # Create the temporary directory
+        temp_dir = tempfile.mkdtemp()
+        return {
+            'DATA_PATH': os.path.join(temp_dir, 'pdf/'),
+            'TXT_PATH': os.path.join(temp_dir, 'txt/'),
+            'TSV_PATH': os.path.join(temp_dir, 'tsv/'),
+            'MD_PATH': os.path.join(temp_dir, 'md/'),
+            'INFO_PATH': os.path.join(temp_dir, 'info/'),
+            'IMG_PATH': os.path.join(temp_dir, 'img/'),
+            'RESULTS_PATH': os.path.join(temp_dir, 'results/')
+        }
+    else:
+        # Local environment: use the original paths
+        return {
+            'DATA_PATH': './src/static/data/pdf/',
+            'TXT_PATH': './src/static/data/txt/',
+            'TSV_PATH': './src/static/data/tsv/',
+            'MD_PATH': './src/static/data/md/',
+            'INFO_PATH': './src/static/data/info/',
+            'IMG_PATH': './src/static/img/',
+            'RESULTS_PATH': './src/static/data/results/'
+        }
+
+# Global path-management helper
+def get_path(path_type, survey_id=None, filename=None):
+    """
+    Resolve a dynamic path.
+    path_type: 'pdf', 'txt', 'tsv', 'md', 'info', 'img', 'results'
+    survey_id: optional survey ID
+    filename: optional file name
+    """
+    paths_config = get_data_paths()
+
+    if path_type == 'pdf':
+        base_path = paths_config['DATA_PATH']
+    elif path_type == 'txt':
+        base_path = paths_config['TXT_PATH']
+    elif path_type == 'tsv':
+        base_path = paths_config['TSV_PATH']
+    elif path_type == 'md':
+        base_path = paths_config['MD_PATH']
+    elif path_type == 'info':
+        base_path = paths_config['INFO_PATH']
+    elif path_type == 'img':
+        base_path = paths_config['IMG_PATH']
+    elif path_type == 'results':
+        base_path = paths_config['RESULTS_PATH']
+    else:
+        raise ValueError(f"Unknown path type: {path_type}")
+
+    if survey_id:
+        base_path = os.path.join(base_path, str(survey_id))
+
+    if filename:
+        return os.path.join(base_path, filename)
+
+    return base_path
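
Note: a minimal sketch of the new helper's behaviour; 'demo123' and the file names are illustrative. One caveat visible in the code above: in the Spaces branch, get_data_paths() calls tempfile.mkdtemp() on every invocation, so two separate get_path() calls resolve to two different temporary directories.

    from src.demo.path_utils import get_path

    get_path('txt')                               # base dir, e.g. ./src/static/data/txt/ locally
    get_path('info', 'demo123')                   # per-survey dir, e.g. ./src/static/data/info/demo123
    get_path('info', 'demo123', 'outline.json')   # full file path inside the survey dir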
src/demo/survey_generation_pipeline/asg_loader.py
CHANGED
@@ -6,9 +6,14 @@ import os
 import re
 import json
 import subprocess
+import glob
+from pathlib import Path
+from concurrent.futures import ProcessPoolExecutor
 from langchain_community.document_loaders import UnstructuredMarkdownLoader
-from
+from langchain.schema import Document
 import shutil
+import tempfile
+from ..path_utils import get_path
 
 # load spaCy model
 # nlp = spacy.load("en_core_web_sm")
@@ -130,7 +135,9 @@ class DocumentLoading:
         }
         return extracted_data
 
-    def process_md_file(self, md_file_path, survey_id, txt_path=
+    def process_md_file(self, md_file_path, survey_id, txt_path=None):
+        if txt_path is None:
+            txt_path = get_path('txt')
         loader = UnstructuredMarkdownLoader(md_file_path)
         data = loader.load()
         assert len(data) == 1, "Expected exactly one document in the markdown file."
@@ -146,15 +153,15 @@ class DocumentLoading:
         invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
         for char in invalid_chars:
             title_new = title_new.replace(char, ' ')
-
-
-
-        with open(f'{txt_path}/{survey_id}/{title_new}.json', 'w', encoding='utf-8') as f:
+
+        os.makedirs(get_path('txt', survey_id), exist_ok=True)
+        with open(get_path('txt', survey_id, f'{title_new}.json'), 'w', encoding='utf-8') as f:
             json.dump(extracted_data, f, ensure_ascii=False, indent=4)
-        # print(extracted_data)
         return extracted_data['introduction']
 
-    def process_md_file_full(self, md_file_path, survey_id, txt_path=
+    def process_md_file_full(self, md_file_path, survey_id, txt_path=None):
+        if txt_path is None:
+            txt_path = get_path('txt')
         loader = UnstructuredMarkdownLoader(md_file_path)
         data = loader.load()
         assert len(data) == 1, "Expected exactly one document in the markdown file."
@@ -170,18 +177,16 @@ class DocumentLoading:
         invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
         for char in invalid_chars:
             title_new = title_new.replace(char, ' ')
-
-
-
-        with open(f'{txt_path}/{survey_id}/{title_new}.json', 'w', encoding='utf-8') as f:
+
+        os.makedirs(get_path('txt', survey_id), exist_ok=True)
+        with open(get_path('txt', survey_id, f'{title_new}.json'), 'w', encoding='utf-8') as f:
             json.dump(extracted_data, f, ensure_ascii=False, indent=4)
-        # print(extracted_data)
         return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
 
     def load_pdf(self, pdf_file, survey_id, mode):
-        os.makedirs(
-        output_dir =
+        os.makedirs(get_path('md', survey_id), exist_ok=True)
+        output_dir = get_path('md', survey_id)
         base_name = os.path.splitext(os.path.basename(pdf_file))[0]
         target_dir = os.path.join(output_dir, base_name, "auto")
 
@@ -200,8 +205,8 @@ class DocumentLoading:
 
     # wrong, still being tested
     def load_pdf_new(self, pdf_dir, survey_id):
-        os.makedirs(
-        output_dir =
+        os.makedirs(get_path('md', survey_id), exist_ok=True)
+        output_dir = get_path('md', survey_id)
         self.convert_pdf_to_md_new(pdf_dir, output_dir)
         markdown_files = glob.glob(os.path.join(output_dir, "*", "auto", "*.md"))
         all_introductions = []
@@ -416,7 +421,7 @@ class DocumentLoading:
     # # clear blocks that are likely annotations
     # if re.search(r'\d{4}\s\d+\s\w+\sConference\s.*?\|\s.*?\|\sDOI:.*?\s\|\s\w+:\s.*?\n', block, flags=re.DOTALL) or \
     #    re.search(r'http\S+', block) or \
-    #    re.search(r'\d+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w
+    #    re.search(r'\d+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+\s\w+', block, flags=re.DOTALL):
    #     continue
    # cleaned_blocks.append(block)
    # return cleaned_blocks
src/demo/survey_generation_pipeline/asg_outline.py
CHANGED
@@ -9,7 +9,8 @@ from asg_abstract import AbstractGenerator
 from asg_conclusion import ConclusionGenerator
 from asg_retriever import *
 import pandas as df
-from references import generate_references
+from .references import generate_references
+from ..path_utils import get_path
 
 
 class OutlineGenerator():
@@ -259,8 +260,10 @@ class OutlineGenerator():
         clean_text = re.sub(r'\s+', ' ', text).strip()
         return messages, clean_text
 
-def parseOutline(survey_id, info_path
-
+def parseOutline(survey_id, info_path=None):
+    if info_path is None:
+        info_path = get_path('txt')
+    file_path = get_path('txt', survey_id, 'outline.json')
     try:
         with open(file_path, 'r', encoding='utf-8') as file:
             data = json.load(file)
@@ -286,7 +289,7 @@ def parseOutline(survey_id, info_path=None):
     print("Failed to extract a valid list string from the outline content.")
     return []
 
-    #
+    # Check that the extracted result is a "list of lists" (it should start with "[[")
     fixed_str = response_extracted.strip()
     if not fixed_str.startswith("[["):
         # If not, strip the original outer brackets and re-wrap as [[ ... ]]
@@ -825,7 +828,7 @@ def generateSurvey(survey_id, title, collection_list, pipeline):
     temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
     temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
 
-    output_path =
+    output_path = get_path('txt', survey_id, 'generated_result.json')
     with open(output_path, 'w', encoding='utf-8') as f:
         json.dump(temp, f, ensure_ascii=False, indent=4)
     print(f"Survey has been saved to {output_path}.")
@@ -910,25 +913,21 @@ def generateSurvey_qwen(survey_id, title, collection_list, pipeline):
     temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
     temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
     temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
-
+    # references = generate_references_dir(get_path('txt', survey_id))
+    output_path = get_path('txt', survey_id, 'generated_result.json')
     with open(output_path, 'w', encoding='utf-8') as f:
         json.dump(temp, f, ensure_ascii=False, indent=4)
     print(f"Survey has been saved to {output_path}.")
     return
 
 # wza
-def generateSurvey_qwen_new(survey_id, title, collection_list, pipeline, citation_data_list, txt_path
-
+def generateSurvey_qwen_new(survey_id, title, collection_list, pipeline, citation_data_list, txt_path=None):
+    if txt_path is None:
+        txt_path = get_path('txt')
+    outline = str(parseOutline(survey_id))
     client = getQwenClient()
     context_list = generate_context_list(outline, collection_list)
 
-    # print("!!!!!!!!")
-    # print(context_list)
-    # print("2025")
-
-    # Do not re-query the citation data; use the citation_data_list passed in directly
-    # citation_data_list comes from the Global_citation_data passed in by get_survey_id
-
     temp = {
         "survey_id": survey_id,
         "outline": outline,
@@ -969,7 +968,7 @@ def generateSurvey_qwen_new(survey_id, title, collection_list, pipeline, citation_data_list, txt_path=None):
     temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
     temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
     temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
-    output_path =
+    output_path = get_path('txt', survey_id, 'generated_result.json')
     with open(output_path, 'w', encoding='utf-8') as f:
         json.dump(temp, f, ensure_ascii=False, indent=4)
     print(f"Survey has been saved to {output_path}.")
@@ -1002,7 +1001,7 @@ if __name__ == '__main__':
     Many paradigms have been proposed to asses informativeness of data samples for active learning. One of the popular approaches is selecting the most uncertain data sample, i.e the data sample in which current classifier is least confident. Some other approaches are selecting the sample which yields a model with minimum risk or the data sample which yields fastest convergence in gradient based methods.//
     An active under-sampling approach is presented in this paper to change the data distribution of training datasets, and improve the classification accuracy of minority classes while maintaining overall classification performance.//
     In this paper, we propose an uncertainty-based active learning algorithm which requires only samples of one class and a set of unlabeled data in order to operate.//
-    The principal contribution of our work is twofold: First, we use Bayes
+    The principal contribution of our work is twofold: First, we use Bayes' rule and density estimation to avoid the need to have a model of all classes for computing the uncertainty measure.//
     This technique reduces the number of input parameters of the problem. At the rest of this paper, we first review recent related works in the fields of active learning and active one-class learning (section II).//
     The classifier predicts that all the samples are non-fraud, it will have a quite high accuracy. However, for problems like fraud detection, minority class classification accuracy is more critical.//
     The algorithm used and the features selected are always the key points at design time, and many experiments are needed to select the final algorithm and the best suited feature set.//
src/demo/survey_generation_pipeline/asg_retriever.py
CHANGED
@@ -4,10 +4,11 @@ import re
 import os
 import json
 import chromadb
-from asg_splitter import TextSplitting
+from .asg_splitter import TextSplitting
 from langchain_huggingface import HuggingFaceEmbeddings
 import time
 import concurrent.futures
+from ..path_utils import get_path
 
 class Retriever:
     client = None
@@ -367,7 +368,7 @@ def query_multiple_collections(collection_names: list[str], query_list: list[str]
         results[collection_name] = future.result()
 
     # Automatically save the results to a JSON file
-    file_path =
+    file_path = get_path('info', survey_id, 'retrieved_context.json')
     with open(file_path, 'w', encoding='utf-8') as f:
         json.dump(results, f, ensure_ascii=False, indent=4)
 
src/demo/survey_generation_pipeline/category_and_tsne.py
CHANGED
@@ -7,8 +7,9 @@ import matplotlib.pyplot as plt
 from sklearn.manifold import TSNE
 from sklearn.cluster import AgglomerativeClustering
 import json
+from ..path_utils import get_path

-IMG_PATH = './src/static/img/'
+IMG_PATH = get_path('img')

 plt.switch_backend('agg')
 device = 0
@@ -133,47 +134,11 @@ class ClusteringWithTopic:
     print(f"Best n_topics={self.best_n_topics}, Best silhouette_score={self.best_score}")
     return self.best_labels, self.best_topic_model, self.best_n_topics

-def clustering(df,
-    print("The clustering result is: ")
-    for col in df.columns:
-        print(f"{col}: {df.iloc[0][col]}")
-
-    # Save topic model information as JSON
-    topic_json = clustering.topic_model.get_topic_info().to_json()
-    with open(f'./src/static/data/info/{survey_id}/topic.json', 'w', encoding="utf-8") as file:
-        file.write(topic_json)
-
-    # Create a dictionary from 'ref_title' and 'retrieval_result' columns
-    description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
-
-    # Save the dictionary to description.json
-    with open(f'./src/static/data/info/{survey_id}/description.json', 'w', encoding="utf-8") as file:
-        json.dump(description_dict, file, ensure_ascii=False, indent=4)
-    # df['top_n_words'] = clustering.topic_model.get_topic_info()['Representation'].tolist()
-    # df['topic_word'] = clustering.topic_model.get_topic_info()['KeyBERT'].tolist()
-
-
-    X = np.array(clustering.embeddings)
-    perplexity = 10
-    if X.shape[0] <= perplexity:
-        perplexity = max(1, X.shape[0] // 2)
-
-    tsne = TSNE(n_components=2, init='pca', perplexity=perplexity, random_state=42)
-    X_tsne = tsne.fit_transform(X)
-    colors = scatter(X_tsne, df['label'])
-
-    plt.savefig(IMG_PATH + 'tsne_' + survey_id + '.png', dpi=800, transparent=True)
-
-    plt.close()
-    output_tsv_filename = "./src/static/data/tsv/" + survey_id + '.tsv'
-    df.to_csv(output_tsv_filename, sep='\t')
-    return df, colors
-
-def clustering(df, n_topics_list, survey_id, info_path='./src/static/data/info', tsv_path='./src/static/data/tsv'):
+def clustering(df, n_topics_list, survey_id, info_path=None, tsv_path=None):
+    if info_path is None:
+        info_path = get_path('info')
+    if tsv_path is None:
+        tsv_path = get_path('tsv')
     text = df['retrieval_result'].astype(str)
     clustering = ClusteringWithTopic(text, n_topics_list)
     df['label'], topic_model, best_n_topics = clustering.fit_and_get_labels()
@@ -184,12 +149,12 @@ def clustering(df, n_topics_list, survey_id, info_path='./src/static/data/info',

     # Save topic model info
     topic_json = topic_model.get_topic_info().to_json()
-    with open(
+    with open(get_path('info', survey_id, 'topic.json'), 'w', encoding="utf-8") as file:
         file.write(topic_json)

     # Build the description info
     description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
-    with open(
+    with open(get_path('info', survey_id, 'description.json'), 'w', encoding="utf-8") as file:
         json.dump(description_dict, file, ensure_ascii=False, indent=4)

     # t-SNE visualization
@@ -201,10 +166,10 @@

     colors = scatter(X_tsne, df['label'])  # compute colors

+    plt.savefig(get_path('img', filename='tsne_' + survey_id + '.png'), dpi=800, transparent=True)

+    plt.close()
-    output_tsv_filename =
+    output_tsv_filename = get_path('tsv', survey_id + '.tsv')
     df.to_csv(output_tsv_filename, sep='\t')
     return df, colors, best_n_topics
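After the refactor, `clustering` accepts optional `info_path`/`tsv_path` overrides, though in the hunks shown the body still resolves its files through `get_path` directly, so the overrides appear to go unused beyond the None check. A caller-side sketch (toy DataFrame; a real run needs enough documents for topic modeling and t-SNE to be meaningful):

    # Hypothetical caller of the refactored clustering(); assumes this repo's
    # package layout makes the module importable from the project root.
    import pandas as pd
    from src.demo.survey_generation_pipeline.category_and_tsne import clustering

    # Toy frame with the two columns the function reads: 'retrieval_result'
    # (clustered text) and 'ref_title' (zipped into description.json).
    df = pd.DataFrame({
        "ref_title": [f"paper {i}" for i in range(20)],
        "retrieval_result": [f"retrieved text for paper {i}" for i in range(20)],
    })

    # Candidate topic counts; the class picks the best by silhouette score.
    df, colors, best_n_topics = clustering(df, n_topics_list=[2, 3, 4], survey_id="demo")
    print(best_n_topics, df["label"].value_counts())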
src/demo/views.py
CHANGED
@@ -41,6 +41,10 @@ import glob

 from langchain_huggingface import HuggingFaceEmbeddings
 from dotenv import load_dotenv
+from pathlib import Path
+from markdown_pdf import MarkdownPdf, Section
+import tempfile
+from .path_utils import get_path

 dotenv_path = os.path.join(os.path.dirname(__file__), ".env")
 load_dotenv()
@@ -55,26 +59,45 @@
 # print(f"OPENAI_API_KEY: {openai_api_key}")
 # print(f"OPENAI_API_BASE: {openai_api_base}")

-INFO_PATH = './src/static/data/info/'
-IMG_PATH = './src/static/img/'
+# Resolve the path configuration
+paths_config = get_path('pdf')  # resolve paths via the get_path helper
+DATA_PATH = get_path('pdf')
+TXT_PATH = get_path('txt')
+TSV_PATH = get_path('tsv')
+MD_PATH = get_path('md')
+INFO_PATH = get_path('info')
+IMG_PATH = get_path('img')

 paths = [DATA_PATH, TXT_PATH, TSV_PATH, MD_PATH, INFO_PATH, IMG_PATH]

+# Create the directories safely
 for path in paths:
-    path_obj.
+    try:
+        path_obj = Path(path)
+        if not path_obj.exists():
+            path_obj.mkdir(parents=True, exist_ok=True)
+            print(f"Created directory: {path}")
+        else:
+            print(f"Directory already exists: {path}")
+    except (PermissionError, OSError) as e:
+        print(f"Warning: Could not create directory {path}: {e}")
+        # On Hugging Face Spaces, fall back to a temporary directory if creation fails
+        if os.environ.get('SPACE_ID') or os.environ.get('HF_SPACE_ID'):
+            temp_dir = tempfile.mkdtemp()
+            # Redirect the path to the temporary directory
+            if 'pdf' in path:
+                DATA_PATH = os.path.join(temp_dir, 'pdf/')
+            elif 'txt' in path:
+                TXT_PATH = os.path.join(temp_dir, 'txt/')
+            elif 'tsv' in path:
+                TSV_PATH = os.path.join(temp_dir, 'tsv/')
+            elif 'md' in path:
+                MD_PATH = os.path.join(temp_dir, 'md/')
+            elif 'info' in path:
+                INFO_PATH = os.path.join(temp_dir, 'info/')
+            elif 'img' in path:
+                IMG_PATH = os.path.join(temp_dir, 'img/')
+            print(f"Using temporary directory: {temp_dir}")

@@ -165,17 +188,19 @@ def index(request):
 def delete_files(request):
     if request.method == 'POST':
         try:
+            # Use dynamic paths instead of hardcoded ones
+            folders = [DATA_PATH, TSV_PATH, TXT_PATH, MD_PATH]
             for folder in folders:
-                os.
+                if os.path.exists(folder):
+                    for filename in os.listdir(folder):
+                        file_path = os.path.join(folder, filename)
+                        try:
+                            if os.path.isfile(file_path) or os.path.islink(file_path):
+                                os.unlink(file_path)
+                            elif os.path.isdir(file_path):
+                                shutil.rmtree(file_path)
+                        except Exception as e:
+                            return JsonResponse({'success': False, 'message': str(e)})
             return JsonResponse({'success': True})
         except Exception as e:
             return JsonResponse({'success': False, 'message': str(e)})
@@ -279,7 +304,7 @@ def sanitize_filename_py(filename):

 def get_existing_survey_ids():
-    tsv_directory =
+    tsv_directory = get_path('tsv')
     survey_ids = []
     try:
         for file_name in os.listdir(tsv_directory):
@@ -299,7 +324,7 @@ def get_surveys(request):
 def upload_refs(request):

     start_time = time.time()
-    RECOMMENDED_PDF_DIR =
+    RECOMMENDED_PDF_DIR = get_path('pdf', 'recommend_pdfs')
     if request.method == 'POST':
         if not request.FILES:
             if not os.path.exists(RECOMMENDED_PDF_DIR):
@@ -369,7 +394,7 @@
             continue
         sanitized_filename = f"{sanitized_filename}{file_extension}"

-        file_path = os.path.join('
+        file_path = os.path.join(get_path('pdf', Global_survey_id), sanitized_filename)
         if default_storage.exists(file_path):
             default_storage.delete(file_path)
@@ -388,7 +413,7 @@
     csvfile_name = new_file_name + '.'+ file_name.split('.')[-1]

     json_data_pd = pd.DataFrame()
-    json_files_path =
+    json_files_path = get_path('txt', Global_survey_id) + '/*.json'
     json_files = glob.glob(json_files_path)

     # Dictionary to hold title and abstract pairs
@@ -425,7 +450,7 @@
         title_abstract_dict[title] = abstract

     input_pd = json_data_pd
-    output_path =
+    output_path = get_path('txt', Global_survey_id, 'title_abstract_pairs.json')
     os.makedirs(os.path.dirname(output_path), exist_ok=True)

     with open(output_path, 'w', encoding="utf-8") as outfile:
@@ -446,7 +471,8 @@
     input_pd["label"] = input_pd["reference paper category label (optional)"].apply(lambda x: str(x) if len(str(x))>0 else '')

     try:
-        output_tsv_filename =
+        output_tsv_filename = get_path('tsv', filename=new_file_name + '.tsv')
+        os.makedirs(os.path.dirname(output_tsv_filename), exist_ok=True)

         output_df = input_pd[["ref_title","ref_context","ref_entry","abstract","intro"]]
@@ -558,7 +584,7 @@ def generate_arxiv_query(request):
             new_count += 1

         attempts += 1
-        current_query = generic_query  #
+        current_query = generic_query  # use this round's relaxed query as the "new strict query"

     if len(total_papers) >= min_results:
         # Once min_results is reached, return the query at that point
@@ -596,7 +622,7 @@ def download_pdfs(request):
     if not pdf_links:
         return JsonResponse({"message": "No PDFs to download."}, status=400)

-    base_dir =
+    base_dir = get_path('pdf', 'recommend_pdfs')
     os.makedirs(base_dir, exist_ok=True)  # make sure the folder exists

     downloaded_files = []
@@ -668,33 +694,14 @@ def automatic_taxonomy(request):
         description = generate(context, query, name)
         Global_description_list.append(description)

-        citation_path =
+        # Save the citation data
+        citation_path = get_path('info', Global_survey_id, 'citation_data.json')
         os.makedirs(os.path.dirname(citation_path), exist_ok=True)
-        with open(citation_path, 'w', encoding=
-            json.dump(Global_citation_data,
+        with open(citation_path, 'w', encoding='utf-8') as f:
+            json.dump(Global_citation_data, f, ensure_ascii=False, indent=2)

-        file_path = f'./src/static/data/tsv/{Global_survey_id}.tsv'
-        with open(file_path, 'r', newline='', encoding='utf-8') as infile:
-            reader = csv.reader(infile, delimiter='\t')
-            rows = list(reader)
-
-        if rows:
-            headers = rows[0]
-            headers.append('retrieval_result')
-                row.append(description)
-                updated_rows.append(row)
-            with open(file_path, 'w', newline='', encoding='utf-8') as outfile:
-                writer = csv.writer(outfile, delimiter='\t')
-                writer.writerows(updated_rows)
-            print('Updated file has been saved to', file_path)
-        else:
-            print('Input file is empty.')
+        # Read the TSV file
+        file_path = get_path('tsv', Global_survey_id + '.tsv')

     Global_ref_list = ref_list
@@ -708,11 +715,11 @@
     ref_titles = list(df_tmp.groupby(df_tmp['label'])['ref_title'].apply(list))
     ref_indexs = list(df_tmp.groupby(df_tmp['label'])['index'].apply(list))

-    info = pd.read_json(
+    info = pd.read_json(get_path('info', Global_survey_id, 'topic.json'))
     category_label = info['KeyBERT'].to_list()
     category_label_summarized=[]

-    tsv_path =
+    tsv_path = get_path('tsv', Global_survey_id + '.tsv')

     cluster_num = Global_cluster_num
     category_label_summarized = generate_cluster_name_new(tsv_path, Global_survey_title, cluster_num)
@@ -733,7 +740,7 @@
         temp = [legal_pdf(i) for i in value]
         cluster_info[key] = temp
         Global_collection_names_clustered.append(temp)
-    cluster_info_path =
+    cluster_info_path = get_path('info', Global_survey_id, 'cluster_info.json')
     with open(cluster_info_path, 'w', encoding="utf-8") as outfile:
         json.dump(cluster_info, outfile, indent=4, ensure_ascii=False)
@@ -743,7 +750,7 @@
     messages, outline = outline_generator.generate_outline_qwen(Global_survey_title, Global_cluster_num)

     outline_json = {'messages':messages, 'outline': outline}
-    output_path =
+    output_path = get_path('txt', Global_survey_id, 'outline.json')
     os.makedirs(os.path.dirname(output_path), exist_ok=True)
     with open(output_path, 'w', encoding="utf-8") as outfile:
         json.dump(outline_json, outfile, indent=4, ensure_ascii=False)
@@ -767,7 +774,7 @@ def save_updated_cluster_info(request):
     if not survey_id or not updated_cate_list:
         return JsonResponse({"error": "Missing survey_id or updated_cate_list"}, status=400)

-    save_dir =
+    save_dir = get_path('info', str(survey_id))
     os.makedirs(save_dir, exist_ok=True)
     save_path = os.path.join(save_dir, 'cluster_info_updated.json')
@@ -807,7 +814,7 @@ def save_outline(request):
         "outline": str(updated_outline)
     }

-    file_path =
+    file_path = get_path('txt', Global_survey_id, 'outline.json')
     os.makedirs(os.path.dirname(file_path), exist_ok=True)

     with open(file_path, 'w', encoding='utf-8') as file:
@@ -952,7 +959,7 @@ def generate_pdf(request):
     if request.method == 'POST':
         survey_id = request.POST.get('survey_id', '')
         markdown_content = request.POST.get('content', '')
-        markdown_dir =
+        markdown_dir = get_path('info', survey_id) + '/'
         markdown_filename = f'survey_{survey_id}_vanilla.md'
         markdown_filepath = os.path.join(markdown_dir, markdown_filename)
@@ -970,7 +977,7 @@

     markdown_content = finalize_survey_paper(markdown_content, Global_collection_names, Global_file_names)
     # Set the save path for the Markdown file
-    markdown_dir =
+    markdown_dir = get_path('info', survey_id) + '/'
     markdown_filename = f'survey_{survey_id}_processed.md'
     markdown_filepath = os.path.join(markdown_dir, markdown_filename)
@@ -990,7 +997,7 @@

     # Configure the save path for the PDF file
     pdf_filename = f'survey_{survey_id}.pdf'
-    pdf_dir = '
+    pdf_dir = get_path('results')
     pdf_filepath = os.path.join(pdf_dir, pdf_filename)

     # Check and create the results directory
@@ -1022,13 +1029,13 @@ def generate_pdf_from_tex(request):

     global Global_survey_id, Global_survey_title
     if request.method == 'POST':
-        base_dir =
+        base_dir = get_path('info', Global_survey_id)
         md_path = os.path.join(base_dir, f'survey_{Global_survey_id}_processed.md')
         new_md_path = os.path.join(base_dir, f'survey_{Global_survey_id}_preprocessed.md')
         tex_path = os.path.join(base_dir, 'template.tex')
         new_tex_path = os.path.join(base_dir, 'template_with_figure.tex')
         sty_path = os.path.join(base_dir, 'acl.sty')
-        pdf_dir = '
+        pdf_dir = get_path('results')

         os.makedirs(base_dir, exist_ok=True)
         print(f"Directory '{base_dir}' checked or created.")
@@ -1044,9 +1051,9 @@
         md_to_tex(new_md_path, tex_path, Global_survey_title)

         insert_figures(
-            png_path=
+            png_path=get_path('info', Global_survey_id, 'outline.png'),
             tex_path= tex_path,
-            json_path=
+            json_path=get_path('info', Global_survey_id, 'flowchart_results.json'),
             ref_names= Global_ref_list,
             survey_title=Global_survey_title,
             new_tex_path=new_tex_path
@@ -1155,7 +1162,7 @@ def get_survey_text(refs=Global_ref_list):

 def Clustering_refs(n_clusters):
     global Global_cluster_num
-    df = pd.read_csv(
+    df = pd.read_csv(get_path('tsv', Global_survey_id + '.tsv'), sep='\t', index_col=0, encoding='utf-8')

     print(Global_ref_list)
     df_selected = df.iloc[Global_ref_list]
@@ -1232,10 +1239,10 @@ def finalize_survey_paper(paper_text,
     Global_ref_list = ref_list
     print(ref_list)

-    json_path =
-    output_png_path =
-    md_path =
-    flowchart_results_path =
+    json_path = get_path('txt', Global_survey_id, 'outline.json')
+    output_png_path = get_path('info', Global_survey_id, 'outline')
+    md_path = get_path('info', Global_survey_id, f'survey_{Global_survey_id}_processed.md')
+    flowchart_results_path = get_path('info', Global_survey_id, 'flowchart_results.json')
     detect_flowcharts(Global_survey_id)
     png_path = generate_graphviz_png(
         json_path=json_path,
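The module-level fallback in the `@@ -55,26 +59,45 @@` hunk above rebinds each `*_PATH` name case by case inside the loop; the same writable-directory probe can be factored into a reusable helper. A sketch of that variant (`ensure_writable_dir` is our name, not part of the commit):

    # Hypothetical refactor of the Spaces fallback shown above; not in the commit.
    import os
    import tempfile
    from pathlib import Path

    def ensure_writable_dir(path):
        """Create `path` if possible; otherwise fall back to a temp dir,
        which helps on read-only hosts such as Hugging Face Spaces."""
        try:
            Path(path).mkdir(parents=True, exist_ok=True)
            return path
        except (PermissionError, OSError) as e:
            print(f"Warning: could not create {path}: {e}")
            fallback = os.path.join(tempfile.mkdtemp(),
                                    os.path.basename(os.path.normpath(path)))
            os.makedirs(fallback, exist_ok=True)
            print(f"Using temporary directory: {fallback}")
            return fallback

    # e.g. DATA_PATH = ensure_writable_dir(get_path('pdf'))

This keeps the assignment next to each path definition instead of mutating module globals from an exception handler.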