Commit a97d040 (1 parent: 8d551fa)
Add Django InteractiveSurvey project
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
- .env +3 -0
- .gitattributes +3 -0
- .gitattributes copy +35 -0
- .gitignore +19 -0
- Dockerfile +34 -0
- README copy.md +12 -0
- flowchart_classifier.pth +3 -0
- scripts/additional_scripts.py +40 -0
- scripts/setup_env.py +48 -0
- src/.idea/asg.iml +12 -0
- src/.idea/misc.xml +6 -0
- src/.idea/modules.xml +8 -0
- src/.idea/vcs.xml +6 -0
- src/.idea/workspace.xml +271 -0
- src/DATA_PATH +0 -0
- src/__init__.py +0 -0
- src/asg/__init__.py +0 -0
- src/asg/asgi.py +16 -0
- src/asg/settings.py +126 -0
- src/asg/urls.py +22 -0
- src/asg/wsgi.py +16 -0
- src/db.sqlite3 +3 -0
- src/demo/__init__.py +0 -0
- src/demo/admin.py +3 -0
- src/demo/apps.py +5 -0
- src/demo/asg_abstract.py +247 -0
- src/demo/asg_add_flowchart.py +313 -0
- src/demo/asg_clustername.py +228 -0
- src/demo/asg_conclusion.py +253 -0
- src/demo/asg_generator.py +90 -0
- src/demo/asg_latex.py +816 -0
- src/demo/asg_loader.py +256 -0
- src/demo/asg_mindmap.py +302 -0
- src/demo/asg_outline.py +1029 -0
- src/demo/asg_query.py +326 -0
- src/demo/asg_retriever.py +364 -0
- src/demo/asg_splitter.py +25 -0
- src/demo/category_and_tsne.py +231 -0
- src/demo/count_files.py +20 -0
- src/demo/download.py +225 -0
- src/demo/latex_template/acl.sty +312 -0
- src/demo/latex_template/template.tex +22 -0
- src/demo/main.py +448 -0
- src/demo/migrations/__init__.py +0 -0
- src/demo/models.py +14 -0
- src/demo/postprocess.py +39 -0
- src/demo/query1.py +234 -0
- src/demo/references.py +83 -0
- src/demo/survey_generation_pipeline/asg_abstract.py +247 -0
- src/demo/survey_generation_pipeline/asg_clustername.py +228 -0
.env
ADDED
@@ -0,0 +1,3 @@
+OPENAI_API_KEY=sk-d474bcdf6cac4cceb472233d66d637bd
+OPENAI_API_BASE=https://dashscope.aliyuncs.com/compatible-mode/v1
+MODEL=qwen-plus-latest
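These three variables point an OpenAI-compatible client at Alibaba Cloud's DashScope endpoint with a Qwen model. A minimal consumption sketch, assuming the `python-dotenv` and `openai` packages (neither is installed by scripts/setup_env.py in this commit):

import os

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()  # reads .env into the process environment

# The DashScope "compatible-mode" endpoint speaks the OpenAI chat API.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    base_url=os.environ["OPENAI_API_BASE"],
)

response = client.chat.completions.create(
    model=os.environ["MODEL"],  # qwen-plus-latest
    messages=[{"role": "user", "content": "Say hello."}],
)
print(response.choices[0].message.content)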
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+src/static/img/paper_with_arrow.png filter=lfs diff=lfs merge=lfs -text
+src/static/img/papers.png filter=lfs diff=lfs merge=lfs -text
+src/db.sqlite3 filter=lfs diff=lfs merge=lfs -text
.gitattributes copy
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,19 @@
+# Ignore Python cache files
+__pycache__/
+*.pyc
+
+src/static/data
+chromadb
+resources
+logs
+examples
+
+src/demo/survey_generation_pipeline/info
+src/demo/survey_generation_pipeline/txt
+src/demo/survey_generation_pipeline/logs
+src/demo/survey_generation_pipeline/md
+src/demo/survey_generation_pipeline/pdfs
+src/demo/survey_generation_pipeline/result
+src/demo/survey_generation_pipeline/tsv
+src/demo/survey_generation_pipeline/txt
+InteractiveSurvey-default-report.pdf
Dockerfile
ADDED
@@ -0,0 +1,34 @@
+# Use the official slim base image
+FROM python:3.10-slim
+
+# Set the working directory
+WORKDIR /app
+
+# Update the system and install dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git wget curl build-essential \
+    graphviz texlive-xetex texlive-fonts-recommended texlive-latex-recommended libgl1 && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
+# Copy the dependency scripts and set up the environment
+COPY scripts/ /app/
+RUN python setup_env.py
+
+# Download model files and adjust storage paths
+RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py && \
+    python download_models_hf.py && \
+    rm -rf /root/.cache/pip && \
+    python additional_scripts.py
+
+# Copy the project code
+COPY . /app/
+
+# Expose the service port
+EXPOSE 7860
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s CMD curl -f http://localhost:7860/ || exit 1
+
+# Startup command
+CMD ["python", "src/manage.py", "runserver", "0.0.0.0:7860"]
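Port 7860 matches the Hugging Face Spaces Docker convention declared in README copy.md. For local testing, the image should build and run with something like `docker build -t interactivesurvey .` followed by `docker run -p 7860:7860 interactivesurvey` (the image tag is illustrative); the HEALTHCHECK then probes `http://localhost:7860/` inside the container.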
README copy.md
ADDED
@@ -0,0 +1,12 @@
+---
+title: InteractiveSurvey
+emoji: 🦀
+colorFrom: yellow
+colorTo: purple
+sdk: docker
+pinned: false
+license: apache-2.0
+short_description: 'InteractiveSurvey: An LLM-based Personalized and Interactive'
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
flowchart_classifier.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a69d2097e1bc09e3e2b69272c5afade1087acebead83c30f1c6dc6561804badd
+size 16348318
scripts/additional_scripts.py
ADDED
@@ -0,0 +1,40 @@
+import nltk
+nltk.download('averaged_perceptron_tagger', download_dir='/usr/local/nltk_data')
+
+import json
+import os
+
+file_path = "/root/magic-pdf.json"
+
+new_config = {
+    "device-mode": "cuda",
+    "layout-config": {
+        "model": "layoutlmv3"
+    },
+    "formula-config": {
+        "mfd_model": "yolo_v8_mfd",
+        "mfr_model": "unimernet_small",
+        "enable": False
+    },
+    "table-config": {
+        "model": "tablemaster",
+        "enable": False,
+        "max_time": 400
+    }
+}
+
+if os.path.exists(file_path):
+    with open(file_path, "r", encoding="utf-8") as file:
+        try:
+            data = json.load(file)
+        except json.JSONDecodeError:
+            data = {}
+else:
+    data = {}
+
+data.update(new_config)
+
+with open(file_path, "w", encoding="utf-8") as file:
+    json.dump(data, file, indent=4)
+
+print(f"File '{file_path}' has been updated.")
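Worth noting: `dict.update` is a shallow merge, so if `/root/magic-pdf.json` already contains a `layout-config` or `table-config` section, the script replaces that section wholesale rather than merging key by key. If key-level merging were wanted, a minimal recursive sketch (a hypothetical helper, not part of this commit) could be:

def deep_merge(base: dict, overrides: dict) -> dict:
    """Recursively merge `overrides` into `base` in place and return `base`."""
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            deep_merge(base[key], value)  # descend into nested sections
        else:
            base[key] = value  # scalars and new sections replace outright
    return base

# deep_merge(data, new_config) would preserve unrelated keys that already
# exist inside sections such as "layout-config".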
scripts/setup_env.py
ADDED
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+import subprocess
+import sys
+import shutil
+import os
+
+def clear_pip_cache():
+    print("🧹 Cleaning...")
+    try:
+        # Locate the pip cache directory
+        result = subprocess.run([sys.executable, "-m", "pip", "cache", "dir"], stdout=subprocess.PIPE, check=True, text=True)
+        cache_dir = result.stdout.strip()
+        if os.path.exists(cache_dir):
+            shutil.rmtree(cache_dir)
+            print(f"✅ Removed {cache_dir}")
+        else:
+            print("No cache dir found.")
+    except Exception as e:
+        print(f"❌ {e}")
+
+# Build the ordered list of pip install commands (all with --no-cache-dir)
+commands = [
+    [sys.executable, "-m", "pip", "install", "--no-cache-dir", "unstructured==0.16.10"],
+    [sys.executable, "-m", "pip", "install", "--no-cache-dir", "requests==2.32.3"],
+    [sys.executable, "-m", "pip", "install", "--no-cache-dir", "chromadb==0.5.4"],
+    [sys.executable, "-m", "pip", "install", "--no-cache-dir", "langchain-huggingface==0.1.2"],
+    [sys.executable, "-m", "pip", "install", "--no-cache-dir", "markdown_pdf==1.3"],
+    [sys.executable, "-m", "pip", "install", "--no-cache-dir", "bertopic==0.16.3"],
+    [sys.executable, "-m", "pip", "install", "--no-cache-dir", "-U", "langchain-community"],
+    [sys.executable, "-m", "pip", "install", "--no-cache-dir", "--force-reinstall", "torch==2.3.1", "torchvision==0.18.1", "numpy<2.0.0", "--index-url", "https://download.pytorch.org/whl/cu118"],
+    [sys.executable, "-m", "pip", "install", "--no-cache-dir", "-U", "magic-pdf[full]", "--extra-index-url", "https://wheels.myhloli.com"],
+    [sys.executable, "-m", "pip", "install", "--no-cache-dir", "Django==2.2.5"],
+    [sys.executable, "-m", "pip", "install", "--no-cache-dir", "graphviz"]
+]
+
+def run_commands(cmds):
+    for cmd in cmds:
+        cmd_str = " ".join(cmd)
+        print(f"🚀 Running: {cmd_str}")
+        try:
+            subprocess.run(cmd, check=True)
+        except subprocess.CalledProcessError:
+            print(f"❌ Failed: {cmd_str}")
+            sys.exit(1)
+
+if __name__ == "__main__":
+    clear_pip_cache()
+    run_commands(commands)
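After the script finishes, the pinned versions can be spot-checked from the standard library alone; a small sketch (distribution names as installed above; `langchain-community` and `graphviz` are left unpinned):

from importlib.metadata import version, PackageNotFoundError

for name in ["unstructured", "requests", "chromadb", "langchain-huggingface",
             "markdown_pdf", "bertopic", "torch", "torchvision", "Django"]:
    try:
        print(f"{name}=={version(name)}")  # e.g. torch==2.3.1
    except PackageNotFoundError:
        print(f"{name}: not installed")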
src/.idea/asg.iml
ADDED
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="WEB_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/.tmp" />
+      <excludeFolder url="file://$MODULE_DIR$/temp" />
+      <excludeFolder url="file://$MODULE_DIR$/tmp" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
src/.idea/misc.xml
ADDED
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="JavaScriptSettings">
+    <option name="languageLevel" value="ES6" />
+  </component>
+</project>
src/.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/asg.iml" filepath="$PROJECT_DIR$/.idea/asg.iml" />
+    </modules>
+  </component>
+</project>
src/.idea/vcs.xml
ADDED
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
+  </component>
+</project>
src/.idea/workspace.xml
ADDED
@@ -0,0 +1,271 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ChangeListManager">
+    <list default="true" id="cf214f68-be48-41a9-bfbc-51edc15fe9c5" name="Default Changelist" comment="">
+      <change afterPath="$PROJECT_DIR$/demo/templates/demo/index_beta.html" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/../.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/../.idea/workspace.xml" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/demo/category_and_tsne.py" beforeDir="false" afterPath="$PROJECT_DIR$/demo/category_and_tsne.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/demo/taskDes.py" beforeDir="false" afterPath="$PROJECT_DIR$/demo/taskDes.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/demo/templates/demo/index.html" beforeDir="false" afterPath="$PROJECT_DIR$/demo/templates/demo/index.html" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/demo/views.py" beforeDir="false" afterPath="$PROJECT_DIR$/demo/views.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/model_dm" beforeDir="false" afterPath="$PROJECT_DIR$/model_dm" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/static/data/001.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/static/data/001.tsv" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/static/img/tsne_2907070.png" beforeDir="false" afterPath="$PROJECT_DIR$/static/img/tsne_2907070.png" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/static/img/tsne_3274658.png" beforeDir="false" afterPath="$PROJECT_DIR$/static/img/tsne_3274658.png" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/test.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/test.tsv" afterDir="false" />
+    </list>
+    <ignored path="$PROJECT_DIR$/.tmp/" />
+    <ignored path="$PROJECT_DIR$/temp/" />
+    <ignored path="$PROJECT_DIR$/tmp/" />
+    <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="FUSProjectUsageTrigger">
+    <session id="-1703485925">
+      <usages-collector id="statistics.lifecycle.project">
+        <counts>
+          <entry key="project.closed" value="6" />
+          <entry key="project.open.time.1" value="1" />
+          <entry key="project.open.time.3" value="4" />
+          <entry key="project.open.time.4" value="1" />
+          <entry key="project.opened" value="6" />
+        </counts>
+      </usages-collector>
+      <usages-collector id="statistics.file.extensions.open">
+        <counts>
+          <entry key="html" value="2" />
+        </counts>
+      </usages-collector>
+      <usages-collector id="statistics.file.types.open">
+        <counts>
+          <entry key="HTML" value="2" />
+        </counts>
+      </usages-collector>
+      <usages-collector id="statistics.file.extensions.edit">
+        <counts>
+          <entry key="html" value="1590" />
+        </counts>
+      </usages-collector>
+      <usages-collector id="statistics.file.types.edit">
+        <counts>
+          <entry key="HTML" value="1590" />
+        </counts>
+      </usages-collector>
+    </session>
+  </component>
+  <component name="FileEditorManager">
+    <leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
+      <file pinned="false" current-in-tab="true">
+        <entry file="file://$PROJECT_DIR$/demo/templates/demo/index_beta.html">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="858">
+              <caret line="370" column="5" lean-forward="true" selection-start-line="322" selection-start-column="4" selection-end-line="370" selection-end-column="5" />
+              <folding>
+                <element signature="n#div#1;n#div#0;n#div#0;n#body#0;n#html#0;n#!!top" />
+                <element signature="n#div#2;n#div#0;n#div#0;n#body#0;n#html#0;n#!!top" />
+                <element signature="n#div#0;n#div#2;n#div#0;n#div#0;n#body#0;n#html#0;n#!!top" />
+                <element signature="n#style#0;n#li#0;n#!!top" expanded="true" />
+                <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                <element signature="n#style#0;n#h3#0;n#!!top" expanded="true" />
+                <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                <element signature="n#style#0;n#p#0;n#!!top" expanded="true" />
+                <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+              </folding>
+            </state>
+          </provider>
+        </entry>
+      </file>
+    </leaf>
+  </component>
+  <component name="Git.Settings">
+    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$/.." />
+  </component>
+  <component name="IdeDocumentHistory">
+    <option name="CHANGED_PATHS">
+      <list>
+        <option value="$PROJECT_DIR$/demo/templates/demo/index.html" />
+        <option value="$PROJECT_DIR$/demo/templates/demo/index_beta.html" />
+      </list>
+    </option>
+  </component>
+  <component name="JsBuildToolGruntFileManager" detection-done="true" sorting="DEFINITION_ORDER" />
+  <component name="JsBuildToolPackageJson" detection-done="true" sorting="DEFINITION_ORDER" />
+  <component name="JsGulpfileManager">
+    <detection-done>true</detection-done>
+    <sorting>DEFINITION_ORDER</sorting>
+  </component>
+  <component name="ProjectFrameBounds" fullScreen="true">
+    <option name="x" value="937" />
+    <option name="y" value="1440" />
+    <option name="width" value="1680" />
+    <option name="height" value="1050" />
+  </component>
+  <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
+  <component name="ProjectView">
+    <navigator proportions="" version="1">
+      <foldersAlwaysOnTop value="true" />
+    </navigator>
+    <panes>
+      <pane id="ProjectPane">
+        <subPane>
+          <expand>
+            <path>
+              <item name="asg" type="b2602c69:ProjectViewProjectNode" />
+              <item name="asg" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="asg" type="b2602c69:ProjectViewProjectNode" />
+              <item name="asg" type="462c0819:PsiDirectoryNode" />
+              <item name="demo" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="asg" type="b2602c69:ProjectViewProjectNode" />
+              <item name="asg" type="462c0819:PsiDirectoryNode" />
+              <item name="demo" type="462c0819:PsiDirectoryNode" />
+              <item name="templates" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="asg" type="b2602c69:ProjectViewProjectNode" />
+              <item name="asg" type="462c0819:PsiDirectoryNode" />
+              <item name="demo" type="462c0819:PsiDirectoryNode" />
+              <item name="templates" type="462c0819:PsiDirectoryNode" />
+              <item name="demo" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="asg" type="b2602c69:ProjectViewProjectNode" />
+              <item name="asg" type="462c0819:PsiDirectoryNode" />
+              <item name="static" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="asg" type="b2602c69:ProjectViewProjectNode" />
+              <item name="asg" type="462c0819:PsiDirectoryNode" />
+              <item name="static" type="462c0819:PsiDirectoryNode" />
+              <item name="img" type="462c0819:PsiDirectoryNode" />
+            </path>
+          </expand>
+          <select />
+        </subPane>
+      </pane>
+      <pane id="Scope" />
+    </panes>
+  </component>
+  <component name="PropertiesComponent">
+    <property name="WebServerToolWindowFactoryState" value="false" />
+    <property name="last_opened_file_path" value="$PROJECT_DIR$" />
+    <property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
+    <property name="nodejs_npm_path_reset_for_default_project" value="true" />
+  </component>
+  <component name="RecentsManager">
+    <key name="CopyFile.RECENT_KEYS">
+      <recent name="$PROJECT_DIR$/demo/templates/demo" />
+      <recent name="$PROJECT_DIR$/static/img" />
+    </key>
+  </component>
+  <component name="RunDashboard">
+    <option name="ruleStates">
+      <list>
+        <RuleState>
+          <option name="name" value="ConfigurationTypeDashboardGroupingRule" />
+        </RuleState>
+        <RuleState>
+          <option name="name" value="StatusDashboardGroupingRule" />
+        </RuleState>
+      </list>
+    </option>
+  </component>
+  <component name="SvnConfiguration">
+    <configuration />
+  </component>
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="cf214f68-be48-41a9-bfbc-51edc15fe9c5" name="Default Changelist" comment="" />
+      <created>1655724718572</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1655724718572</updated>
+      <workItem from="1655724720238" duration="1478000" />
+      <workItem from="1657850351467" duration="27000" />
+      <workItem from="1657851735564" duration="5134000" />
+      <workItem from="1657960829388" duration="4835000" />
+      <workItem from="1658119972542" duration="6469000" />
+      <workItem from="1658150456661" duration="1476000" />
+    </task>
+    <servers />
+  </component>
+  <component name="TimeTrackingManager">
+    <option name="totallyTimeSpent" value="19419000" />
+  </component>
+  <component name="ToolWindowManager">
+    <frame x="937" y="1440" width="1680" height="1050" extended-state="0" />
+    <editor active="true" />
+    <layout>
+      <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.16696805" />
+      <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
+      <window_info id="Favorites" order="2" side_tool="true" />
+      <window_info anchor="bottom" id="Message" order="0" />
+      <window_info anchor="bottom" id="Find" order="1" />
+      <window_info anchor="bottom" id="Run" order="2" />
+      <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
+      <window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
+      <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
+      <window_info anchor="bottom" id="TODO" order="6" />
+      <window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
+      <window_info anchor="bottom" id="Version Control" order="8" show_stripe_button="false" />
+      <window_info anchor="bottom" id="Terminal" order="9" />
+      <window_info anchor="bottom" id="Event Log" order="10" side_tool="true" />
+      <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
+      <window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
+      <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
+    </layout>
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="1" />
+  </component>
+  <component name="VcsContentAnnotationSettings">
+    <option name="myLimit" value="2678400000" />
+  </component>
+  <component name="editorHistoryManager">
+    <entry file="file://$PROJECT_DIR$/demo/templates/demo/index.html">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="983">
+          <caret line="121" lean-forward="true" selection-start-line="121" selection-end-line="121" />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/demo/templates/demo/index_beta.html">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="858">
+          <caret line="370" column="5" lean-forward="true" selection-start-line="322" selection-start-column="4" selection-end-line="370" selection-end-column="5" />
+          <folding>
+            <element signature="n#div#1;n#div#0;n#div#0;n#body#0;n#html#0;n#!!top" />
+            <element signature="n#div#2;n#div#0;n#div#0;n#body#0;n#html#0;n#!!top" />
+            <element signature="n#div#0;n#div#2;n#div#0;n#div#0;n#body#0;n#html#0;n#!!top" />
+            <element signature="n#style#0;n#li#0;n#!!top" expanded="true" />
+            <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+            <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+            <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+            <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+            <element signature="n#style#0;n#h3#0;n#!!top" expanded="true" />
+            <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+            <element signature="n#style#0;n#p#0;n#!!top" expanded="true" />
+            <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+            <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+            <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+            <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+            <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+  </component>
+</project>
src/DATA_PATH
ADDED
Binary file (17.2 kB).
src/__init__.py
ADDED
File without changes

src/asg/__init__.py
ADDED
File without changes
src/asg/asgi.py
ADDED
@@ -0,0 +1,16 @@
+"""
+ASGI config for asg project.
+
+It exposes the ASGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/3.0/howto/deployment/asgi/
+"""
+
+import os
+
+from django.core.asgi import get_asgi_application
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'asg.settings')
+
+application = get_asgi_application()
src/asg/settings.py
ADDED
@@ -0,0 +1,126 @@
+"""
+Django settings for asg project.
+
+Generated by 'django-admin startproject' using Django 3.0.7.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/3.0/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/3.0/ref/settings/
+"""
+
+import os
+
+# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+
+# Quick-start development settings - unsuitable for production
+# See https://docs.djangoproject.com/en/3.0/howto/deployment/checklist/
+
+# SECURITY WARNING: keep the secret key used in production secret!
+SECRET_KEY = 'g#7@+76ha9hwc1hl8!tnvr6rh1k_z8_@sleb=*8i@nh=h3_oi#'
+
+# SECURITY WARNING: don't run with debug turned on in production!
+DEBUG = True
+
+
+ALLOWED_HOSTS = ['*',]
+
+
+# Application definition
+
+INSTALLED_APPS = [
+    'django.contrib.admin',
+    'django.contrib.auth',
+    'django.contrib.contenttypes',
+    'django.contrib.sessions',
+    'django.contrib.messages',
+    'django.contrib.staticfiles',
+    'demo',
+]
+
+MIDDLEWARE = [
+    'django.middleware.security.SecurityMiddleware',
+    'django.contrib.sessions.middleware.SessionMiddleware',
+    'django.middleware.common.CommonMiddleware',
+    'django.middleware.csrf.CsrfViewMiddleware',
+    'django.contrib.auth.middleware.AuthenticationMiddleware',
+    'django.contrib.messages.middleware.MessageMiddleware',
+    'django.middleware.clickjacking.XFrameOptionsMiddleware',
+]
+
+ROOT_URLCONF = 'asg.urls'
+
+TEMPLATES = [
+    {
+        'BACKEND': 'django.template.backends.django.DjangoTemplates',
+        'DIRS': ['demo/templates'],
+        'APP_DIRS': True,
+        'OPTIONS': {
+            'context_processors': [
+                'django.template.context_processors.debug',
+                'django.template.context_processors.request',
+                'django.contrib.auth.context_processors.auth',
+                'django.contrib.messages.context_processors.messages',
+            ],
+        },
+    },
+]
+
+WSGI_APPLICATION = 'asg.wsgi.application'
+
+
+# Database
+# https://docs.djangoproject.com/en/3.0/ref/settings/#databases
+
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.sqlite3',
+        'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
+    }
+}
+
+
+# Password validation
+# https://docs.djangoproject.com/en/3.0/ref/settings/#auth-password-validators
+
+AUTH_PASSWORD_VALIDATORS = [
+    {
+        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
+    },
+]
+
+
+# Internationalization
+# https://docs.djangoproject.com/en/3.0/topics/i18n/
+
+LANGUAGE_CODE = 'en-us'
+
+TIME_ZONE = 'UTC'
+
+USE_I18N = True
+
+USE_L10N = True
+
+USE_TZ = True
+
+
+# Static files (CSS, JavaScript, Images)
+# https://docs.djangoproject.com/en/3.0/howto/static-files/
+
+STATIC_URL = '/static/'
+
+STATICFILES_DIRS = (
+    os.path.join(BASE_DIR, "static"),
+)
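The file ships with a hardcoded SECRET_KEY and DEBUG = True, both flagged by the generated security warnings above. A common hardening sketch (the environment variable names are illustrative, not part of this commit) is to let the environment override them:

import os

SECRET_KEY = os.environ.get('DJANGO_SECRET_KEY', 'dev-only-fallback-key')
DEBUG = os.environ.get('DJANGO_DEBUG', '0') == '1'  # off unless explicitly enabled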
src/asg/urls.py
ADDED
@@ -0,0 +1,22 @@
+"""asg URL Configuration
+
+The `urlpatterns` list routes URLs to views. For more information please see:
+    https://docs.djangoproject.com/en/3.0/topics/http/urls/
+Examples:
+Function views
+    1. Add an import:  from my_app import views
+    2. Add a URL to urlpatterns:  path('', views.home, name='home')
+Class-based views
+    1. Add an import:  from other_app.views import Home
+    2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
+Including another URLconf
+    1. Import the include() function: from django.urls import include, path
+    2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
+"""
+from django.contrib import admin
+from django.urls import path, include
+
+urlpatterns = [
+    path('admin/', admin.site.urls),
+    path('', include('demo.urls')),
+]
src/asg/wsgi.py
ADDED
@@ -0,0 +1,16 @@
+"""
+WSGI config for asg project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/3.0/howto/deployment/wsgi/
+"""
+
+import os
+
+from django.core.wsgi import get_wsgi_application
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'asg.settings')
+
+application = get_wsgi_application()
src/db.sqlite3
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1424881b6d73f9e786d8ed6e666f3ac3e9e3159cab7bfe62ed52b1b7259b3eae
+size 131072
src/demo/__init__.py
ADDED
File without changes
src/demo/admin.py
ADDED
@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.
src/demo/apps.py
ADDED
@@ -0,0 +1,5 @@
+from django.apps import AppConfig
+
+
+class DemoConfig(AppConfig):
+    name = 'demo'
src/demo/asg_abstract.py
ADDED
@@ -0,0 +1,247 @@
+import os
+
+
+class AbstractGenerator:
+    def __init__(self, pipeline):
+        self.pipeline = pipeline
+
+    def generate(self, title, intro, mode='lora'):
+        if mode == 'lora' or mode == 'test':
+            if mode == 'lora':
+                self.pipeline.model.set_adapter("abstract")
+
+            system_prompt = f'''You are a helpful assistant that helps to generate the abstract of a survey paper given the survey title and survey introduction.'''
+            # user_prompt = {"survey_title":survey_title, "claims":cluster_with_claims}
+            user_prompt = f'''Help me to generate the abstract of a survey paper given the title: *{title}*, and the introduction: {intro}'''
+
+            messages = [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt},
+                {"role": "assistant", "content": "Abstract: This survey "}
+            ]
+
+            outputs = self.pipeline(
+                messages,
+                max_new_tokens=4096,
+            )
+            result = outputs[0]["generated_text"][-1]['content']
+            return result
+        else:
+            raise ValueError('mode not supported')
+
+
+if __name__ == '__main__':
+    import torch
+    import transformers
+
+    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+    Global_pipeline = transformers.pipeline(
+        "text-generation",
+        model=model_id,
+        model_kwargs={"torch_dtype": torch.bfloat16},
+        token=os.getenv('HF_API_KEY'),
+        device_map="auto",
+    )
+    Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_outline_generation", adapter_name="outline")
+    Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_abstract_generation", adapter_name="abstract")
+    Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_conclusion_generation", adapter_name="conclusion")
+    title = "A Survey of Large Language Models"
+    intro = '''LANGUAGE is a prominent ability in human beings to express and communicate, which develops in early childhood and evolves over a lifetime [3, 4]. Machines, however, cannot naturally grasp the abilities of understanding and communicating in the form of human language, unless equipped with powerful artificial intelligence (AI) algorithms. It has been a longstanding research challenge to achieve this goal, to enable machines to read, write, and communicate like humans [5].
+Technically, language modeling (LM) is one of the major approaches to advancing language intelligence of machines. In general, LM aims to model the generative likelihood of word sequences, so as to predict the probabilities of future (or missing) tokens. The research of LM has received extensive attention in the literature, which can be divided into four major development stages:
+• Statistical language models (SLM). SLMs [6–9] are developed based on statistical learning methods that rose in the 1990s. The basic idea is to build the word prediction model based on the Markov assumption, e.g., predicting the next word based on the most recent context. The SLMs with a fixed context length n are also called n-gram language models, e.g., bigram and trigram language models. SLMs have been widely applied to enhance task performance in information retrieval (IR) [10, 11] and natural language processing (NLP) [12–14]. However, they often suffer from the curse of dimensionality: it is difficult to accurately estimate high-order language models since an exponential number of transition probabilities need to be estimated. Thus, specially designed smoothing strategies such as backoff estimation [15] and Good–Turing estimation [16] have been introduced to alleviate the data sparsity problem.
+• Neural language models (NLM). NLMs [1, 17, 18] characterize the probability of word sequences by neural networks, e.g., multi-layer perceptron (MLP) and recurrent neural networks (RNNs). As a remarkable contribution, the work in [1] introduced the concept of distributed representation of words and built the word prediction function conditioned on the aggregated context features (i.e., the distributed word vectors). By extending the idea of learning effective features for text data, a general neural network approach was developed to build a unified, end-to-end solution for various NLP tasks [2]. Furthermore, word2vec [19, 20] was proposed to build a simplified shallow neural network for learning distributed word representations, which were demonstrated to be very effective across a variety of NLP tasks. These studies have initiated the use of language models for representation learning (beyond word sequence modeling), having an important impact on the field of NLP.
+• Pre-trained language models (PLM). As an early attempt, ELMo [21] was proposed to capture context-aware word representations by first pre-training a bidirectional LSTM (biLSTM) network (instead of learning fixed word representations) and then fine-tuning the biLSTM network according to specific downstream tasks. Furthermore, based on the highly parallelizable Transformer architecture [22] with self-attention mechanisms, BERT [23] was proposed by pre-training bidirectional language models with specially designed pre-training tasks on large-scale unlabeled corpora. These pre-trained context-aware word representations are very effective as general-purpose semantic features, which have largely raised the performance bar of NLP tasks. This study has inspired a large number of follow-up works, which set the "pre-training and fine-tuning" learning paradigm. Following this paradigm, a great number of studies on PLMs have been developed, introducing either different architectures [24, 25] (e.g., GPT-2 [26] and BART [24]) or improved pre-training strategies [27–29]. In this paradigm, it often requires fine-tuning the PLM for adapting to different downstream tasks.
+• Large language models (LLM). Researchers find that scaling PLM (e.g., scaling model size or data size) often leads to an improved model capacity on downstream tasks (i.e., following the scaling law [30]). A number of studies have explored the performance limit by training an ever larger PLM (e.g., the 175B-parameter GPT-3 and the 540B-parameter PaLM). Although scaling is mainly conducted in model size (with similar architectures and pre-training tasks), these large-sized PLMs display different behaviors from smaller PLMs (e.g., 330M-parameter BERT and 1.5B-parameter GPT-2) and show surprising abilities (called emergent abilities [31]) in solving a series of complex tasks. For example, GPT-3 can solve few-shot tasks through in-context learning, whereas GPT-2 cannot do well. Thus, the research community coins the term "large language models (LLM)" for these large-sized PLMs [32–35], which attract increasing research attention (see Figure 1). A remarkable application of LLMs is ChatGPT, which adapts the LLMs from the GPT series for dialogue and presents an amazing conversation ability with humans. We can observe a sharp increase of the arXiv papers that are related to LLMs after the release of ChatGPT in Figure 1.
+As discussed before, language modeling is not a new technical concept special to LLMs, but has evolved with the advance of artificial intelligence over the decades. Early language models mainly aim to model and generate text data, while the latest language models (e.g., GPT-4) focus on complex task solving. From language modeling to task solving, it is an important leap in scientific thinking, which is the key to understanding the development of language models in the research history. From the perspective of task solving, the four generations of language models have exhibited different levels of model capacities. In Figure 2, we describe the evolution process of language models in terms of the task solving capacity. At first, statistical language models mainly assisted in some specific tasks (e.g., retrieval or speech tasks), in which the predicted or estimated probabilities can enhance the performance of task-specific approaches. Subsequently, neural language models focused on learning task-agnostic representations (e.g., features), aiming to reduce the efforts for human feature engineering. Furthermore, pre-trained language models learned context-aware representations that can be optimized according to downstream tasks. For the latest generation of language models, LLMs are enhanced by exploring the scaling effect on model capacity, and can be considered as general-purpose task solvers. To summarize, in the evolution process, the task scope that can be solved by language models has been greatly extended, and the task performance attained by language models has been significantly enhanced.
+In the existing literature, PLMs have been widely discussed and surveyed [36–39], while LLMs are seldom reviewed in a systematic way. To motivate our survey, we first highlight three major differences between LLMs and PLMs. First, LLMs display some surprising emergent abilities that may not be observed in previous smaller PLMs. These abilities are key to the performance of language models on complex tasks, making AI algorithms unprecedentedly powerful and effective. Second, LLMs would revolutionize the way that humans develop and use AI algorithms. Unlike small PLMs, the major approach to accessing LLMs is through the prompting interface (e.g., GPT-4 API). Humans have to understand how LLMs work and format their tasks in a way that LLMs can follow. Third, the development of LLMs no longer draws a clear distinction between research and engineering. The training of LLMs requires extensive practical experience in large-scale data processing and distributed parallel training. To develop capable LLMs, researchers have to solve complicated engineering issues, working with engineers or being engineers.
+Nowadays, LLMs are posing a significant impact on the AI community, and the advent of ChatGPT and GPT-4 leads to the rethinking of the possibilities of artificial general intelligence (AGI). OpenAI has published a technical article entitled "Planning for AGI and beyond", which discusses the short-term and long-term plans to approach AGI [40], and a more recent paper has argued that GPT-4 might be considered as an early version of an AGI system [41]. The research areas of AI are being revolutionized by the rapid progress of LLMs. In the field of NLP, LLMs can serve as a general-purpose language task solver (to some extent), and the research paradigm has been shifting towards the use of LLMs. In the field of IR, traditional search engines are challenged by the new information-seeking way through AI chatbots (i.e., ChatGPT), and New Bing presents an initial attempt that enhances the search results based on LLMs. In the field of CV, researchers try to develop ChatGPT-like vision-language models that can better serve multimodal dialogues [42–45], and GPT-4 [46] has supported multimodal input by integrating the visual information. This new wave of technology would potentially lead to a prosperous ecosystem of real-world applications based on LLMs. For instance, Microsoft 365 is being empowered by LLMs (i.e., Copilot) to automate office work, and OpenAI supports the use of plugins in ChatGPT for implementing special functions.
+Despite the progress and impact, the underlying principles of LLMs are still not well explored. Firstly, it is mysterious why emergent abilities occur in LLMs, instead of smaller PLMs. As a more general issue, there lacks a deep, detailed investigation of the key factors that contribute to the superior abilities of LLMs. It is important to study when and how LLMs obtain such abilities [47]. Although there are some meaningful discussions about this problem [31, 47], more principled investigations are needed to uncover the "secrets" of LLMs. Secondly, it is difficult for the research community to train capable LLMs. Due to the huge demand for computation resources, it is very costly to carry out repetitive, ablating studies for investigating the effect of various strategies for training LLMs. Indeed, LLMs are mainly trained by industry, where many important training details (e.g., data collection and cleaning) are not revealed to the public. Thirdly, it is challenging to align LLMs with human values or preferences. Despite the capacities, LLMs are also likely to produce toxic, fictitious, or harmful content. It requires effective and efficient control approaches to eliminate the potential risks of the use of LLMs [46].
+Faced with both opportunities and challenges, the research and development of LLMs needs more attention. In order to provide a basic understanding of LLMs, this survey conducts a literature review of the recent advances in LLMs from four major aspects, including pre-training (how to pre-train a capable LLM), adaptation (how to effectively adapt pre-trained LLMs for better use), utilization (how to use LLMs for solving various downstream tasks) and capability evaluation (how to evaluate the abilities of LLMs and existing empirical findings). We thoroughly comb the literature and summarize the key findings, techniques, and methods of LLMs. For this survey, we also create a GitHub project website by collecting the supporting resources for LLMs, at the link https://github.com/RUCAIBox/LLMSurvey. We are also aware of several related review articles on PLMs or LLMs [32, 36, 38, 39, 43, 48–54]. These papers either discuss PLMs or some specific (or general) aspects of LLMs. Compared with them, we focus on the techniques and methods to develop and use LLMs and provide a relatively comprehensive reference to important aspects of LLMs.
+The remainder of this survey is organized as follows: Section 2 introduces the background for LLMs and the evolution of GPT-series models, followed by the summarization of available resources for developing LLMs in Section 3. Sections 4, 5, 6, and 7 review and summarize the recent progress from the four aspects of pre-training, adaptation, utilization, and capacity evaluation, respectively. Then, Section 8 discusses the practical guide for prompt design, and Section 9 reviews the applications of LLMs in several representative domains. Finally, we conclude the survey in Section 10 by summarizing the major findings and discussing the remaining issues for future work.
+'''
+
+    abstract_generator = AbstractGenerator(Global_pipeline)
+    with_lora = abstract_generator.generate(title, intro, mode='lora')
+    with_test = abstract_generator.generate(title, intro, mode='test')
src/demo/asg_add_flowchart.py
ADDED
@@ -0,0 +1,313 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
import os
import re
from urllib.parse import quote

import torch
import torchvision.transforms as transforms
from torchvision import models
from PIL import Image

# Constants
BASE_DIR = os.path.normpath("src/static/data/md")    # root directory
INFO_DIR = os.path.normpath("src/static/data/info")  # directory for the JSON results

# Load the trained 3-class PyTorch EfficientNet classifier
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.efficientnet_b0(pretrained=False)

# Replace the final layer to fit 3 classes (flowchart, non-flowchart, other)
num_features = model.classifier[1].in_features
model.classifier[1] = torch.nn.Linear(num_features, 3)  # 3 classes
model.load_state_dict(torch.load("flowchart_classifier.pth", map_location=device))
model.to(device)  # make sure the model is moved to the right device
model.eval()

# Image preprocessing
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

def detect_flowcharts(survey_id):
    """Search the directory of the given survey_id for flowcharts and save the result as JSON."""
    survey_path = os.path.join(BASE_DIR, survey_id)  # directory of this survey_id
    if not os.path.exists(survey_path):
        print(f"❌ Directory {survey_path} does not exist!")
        return

    flowchart_dict = {}  # stores the flowchart results

    # Iterate over all PDF folders under this survey directory
    for pdf_folder in os.listdir(survey_path):
        pdf_folder_path = os.path.join(survey_path, pdf_folder)

        if not os.path.isdir(pdf_folder_path):
            continue  # only process folders

        print(f"🔍 Processing PDF folder: {pdf_folder}")

        # Walk all `xxx/auto/images` directories
        for root, dirs, files in os.walk(pdf_folder_path):
            if "auto/images" in root.replace("\\", "/"):  # compatible with Windows and Linux
                for filename in sorted(files):  # sort by filename so the first flowchart found is used
                    if not filename.lower().endswith(".jpg"):  # only process JPGs
                        continue

                    image_path = os.path.join(root, filename)
                    img = Image.open(image_path).convert("RGB")  # open the image and convert to RGB

                    # Preprocess the image into a tensor
                    img_tensor = transform(img).unsqueeze(0).to(device)

                    # Run the classifier
                    with torch.no_grad():
                        output = model(img_tensor)
                        predicted_class = torch.argmax(output).item()

                    # Class index 2 is treated as the flowchart class here; this must
                    # match the label order used to train flowchart_classifier.pth
                    if predicted_class == 2:
                        print(f"✅ Flowchart detected: {image_path}")
                        flowchart_dict[pdf_folder] = image_path
                        break  # keep only the first flowchart of the current PDF folder

    # Save the JSON only when at least one flowchart was detected
    if flowchart_dict:
        os.makedirs(os.path.join(INFO_DIR, survey_id), exist_ok=True)  # make sure the directory exists
        json_path = os.path.join(INFO_DIR, survey_id, "flowchart_results.json")
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(flowchart_dict, f, indent=4, ensure_ascii=False)

        print(f"📁 Flowchart results saved: {json_path}")
    else:
        print("⚠️ No flowchart detected, no JSON generated")

# Example call
# survey_id = "test"  # e.g. "test"
# detect_flowcharts(survey_id)

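A minimal consumption sketch, assuming detect_flowcharts has already run for a hypothetical survey_id "test"; the snippet below is illustrative and not part of the committed file:

import json
import os

json_path = os.path.join("src/static/data/info", "test", "flowchart_results.json")
with open(json_path, "r", encoding="utf-8") as f:
    flowcharts = json.load(f)  # maps each PDF folder name to its first flowchart image path

for pdf_folder, image_path in flowcharts.items():
    print(f"{pdf_folder} -> {image_path}")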
def insert_ref_images(json_path, ref_names, text):
    """
    Args:
        json_path: path to a JSON file whose content looks like:
            {
                "Accelerating federated learning with data and model parallelism in edge computing":
                    "src/static/data/md/test/Accelerating federated learning with data and model parallelism in edge computing/auto/images/xxx.jpg",
                ...
            }
        ref_names: list of reference names; the 1st element corresponds to [1], the 2nd to [2], and so on.
        text: Markdown text containing references such as [1] and [2].

    Returns:
        The modified text string. Below the line where each reference marker first
        appears, an HTML block of the following form is inserted:

        <div style="text-align:center">
            <img src="image_path" alt="the flow chart of [ref_name]" style="width:50%;"/>
        </div>
        <div style="text-align:center">
            Fig [ref_num]: The flow chart of [ref_name]
        </div>

        where [ref_num] is the reference number (the 1-based index into ref_names)
        and [ref_name] is the reference name.

    Notes:
        1. The paths stored in the JSON are already the target paths, but they may mix
           forward slashes and backslashes.
        2. The code first splits the path string, joins the parts with os.path.join into
           a path that is canonical for the current system, then converts it to a uniform
           forward-slash format and URL-encodes it so that it works on every system.
    """
    # Load the JSON file
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            img_mapping = json.load(f)
    except Exception as e:
        raise Exception(f"Error loading JSON file: {e}")

    inserted_refs = {}  # tracks whether an image has already been inserted for each marker
    lines = text.splitlines()
    new_lines = []
    # Match reference markers such as [1] and [2]
    ref_pattern = re.compile(r'\[(\d+)\]')
    img_index = 2  # figure numbering starts at 2
    for line in lines:
        new_lines.append(line)
        matches = ref_pattern.findall(line)
        for ref_num_str in matches:
            try:
                ref_num = int(ref_num_str)
            except ValueError:
                continue

            # Insert the HTML block only on the first occurrence of a reference marker
            if ref_num not in inserted_refs:
                inserted_refs[ref_num] = True

                if 1 <= ref_num <= len(ref_names):
                    ref_name = ref_names[ref_num - 1]
                    jpg_path = img_mapping.get(ref_name, "")
                else:
                    ref_name = f"ref_{ref_num}"
                    jpg_path = ""

                if jpg_path:
                    # Split the path, which may mix forward slashes and backslashes, into parts
                    parts = re.split(r'[\\/]+', jpg_path)
                    # Join the parts into a canonical path for the current system
                    normalized_jpg_path = os.path.join(*parts)
                    # Convert to an HTML-friendly format (forward slashes only)
                    normalized_jpg_path = normalized_jpg_path.replace(os.sep, '/')
                    # URL-encode the path (keeping the slashes)
                    normalized_jpg_path_url = quote(normalized_jpg_path, safe="/")

                    html_block = (
                        f"<div style=\"text-align:center\">\n"
                        f"  <img src=\"{normalized_jpg_path_url}\" alt=\"the chart of {ref_name}\" style=\"width:60%;\"/>\n"
                        f"</div>\n"
                        f"<div style=\"text-align:center;font-size:smaller;\">\n"
                        f"  Fig {img_index}: Chart from '{ref_name}'\n"
                        f"</div>"
                    )
                    new_lines.append(html_block)
                    new_lines.append("")  # add a blank line as a separator
                    img_index += 1

    return "\n".join(new_lines)

def insert_tex_images(json_path, ref_names, text):
    """
    Replace numeric references in Markdown text (e.g. [1], \[1], \[1\]) with LaTeX
    figure environments. An image is inserted only on the first occurrence of each
    reference number; later occurrences of the same number are skipped.

    Args:
        json_path: path to a JSON file whose content looks like:
            {
                "Accelerating federated learning with data and model parallelism in edge computing":
                    "src/static/data/md/test/Accelerating federated learning with data and model parallelism in edge computing/auto/images/xxx.jpg",
                ...
            }
        ref_names: list of reference names; the 1st element corresponds to [1], the 2nd to [2], and so on.
        text: Markdown text containing references such as [1], \[1], or \[1\].

    Returns:
        The modified text string. Below the line where each reference marker first
        appears, a LaTeX figure environment is inserted:

        \begin{figure}[htbp]
            \centering
            \includegraphics[width=0.6\textwidth]{image_path}
            \caption{Fig 2: Chart from 'ref_name'}
        \end{figure}

    Notes:
        1. The paths stored in the JSON may mix forward slashes and backslashes.
        2. Paths are joined for the current system, then normalized to forward slashes.
        3. The figure counter starts at 1 (adjust as needed).
        4. If no image is found in the JSON for a reference number, no figure is inserted.
    """

    # Read the JSON
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            img_mapping = json.load(f)
    except Exception as e:
        raise Exception(f"Error loading JSON file: {e}")

    # Tracks whether a figure has already been inserted for a given number
    inserted_refs = {}

    # Process the text line by line
    lines = text.splitlines()
    new_lines = []

    # --------------------------
    # Match numeric references such as [1], \[1], \[1\]
    # --------------------------
    # Meaning:
    #   (?:\\)?  -> an optional backslash (0 or 1 times)
    #   \[       -> a literal '[' (escaped in the regex)
    #   (\d+)    -> capture one or more digits
    #   (?:\\)?  -> an optional backslash (0 or 1 times)
    #   \]       -> a literal ']' (escaped)
    # Altogether this matches forms such as:
    #   [1], \[1], \[1\], [12], \[12], etc.
    ref_pattern = re.compile(r'(?:\\)?\[(\d+)(?:\\)?\]')

    # Figure counter
    figure_index = 1

    for line in lines:
        new_lines.append(line)  # write this line into the new text first

        # Find all references in this line that match the pattern
        matches = ref_pattern.findall(line)
        for ref_num_str in matches:
            try:
                ref_num = int(ref_num_str)
            except ValueError:
                continue

            # Insert an image only if this reference number has not been handled yet
            if ref_num not in inserted_refs:
                inserted_refs[ref_num] = True

                # Check whether this number is within the range of ref_names
                if 1 <= ref_num <= len(ref_names):
                    ref_name = ref_names[ref_num - 1]
                    jpg_path = img_mapping.get(ref_name, "")
                else:
                    ref_name = f"ref_{ref_num}"
                    jpg_path = ""

                if jpg_path:
                    # Normalize the path
                    parts = re.split(r'[\\/]+', jpg_path)
                    normalized_jpg_path = os.path.join(*parts)
                    normalized_jpg_path = normalized_jpg_path.replace(os.sep, '/')
                    # URL encoding (keeping '/') is intentionally skipped here
                    # normalized_jpg_path_url = quote(normalized_jpg_path, safe="/")
                    normalized_jpg_path_url = normalized_jpg_path

                    # Build the LaTeX figure block; the doubled braces around
                    # ref_name give \textit a properly braced argument
                    tex_block = (
                        r"\begin{figure}[htbp]" "\n"
                        r"    \centering" "\n"
                        f"    \\includegraphics[width=0.5\\textwidth]{{{normalized_jpg_path_url}}}\n"
                        f"    \\caption{{Chart from \\textit{{{ref_name}}}}}\n"
                        r"\end{figure}"
                    )

                    # Append it to the new text, followed by a blank separator line
                    new_lines.append(tex_block)
                    new_lines.append("")
                    figure_index += 1

    return "\n".join(new_lines)


# Example usage
if __name__ == "__main__":
    # Markdown file path
    md_file_path = "src/static/data/info/test/survey_test_processed.md"
    # JSON file path
    json_file_path = "src/static/data/info/test/flowchart_results.json"

    try:
        with open(md_file_path, "r", encoding="utf-8") as f:
            text = f.read()
    except FileNotFoundError:
        print(f"Error: Markdown file {md_file_path} not found!")
        text = ""

    ref_names = [
        "An explainable federated learning and blockchain based secure credit modeling method",
        "Bafl a blockchain based asynchronous",
        "Biscotti a blockchain system for private and secure federated learning",
        "Blockdfl a blockchain based fully decentralized peer to peer",
        "Accelerating blockchain enabled federated learning with clustered clients",
        "A fast blockchain based federated learning framework with compressed communications"
    ]

    result = insert_ref_images(json_file_path, ref_names, text)
    print("The modified text is:\n")
    print(result)
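A self-contained sketch of insert_tex_images on a tiny input, for illustration only; the reference name, mapping, and image path below are made up:

import json
import tempfile

sample_refs = ["Biscotti a blockchain system for private and secure federated learning"]
mapping = {sample_refs[0]: "src/static/data/md/test/Biscotti/auto/images/fig1.jpg"}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False, encoding="utf-8") as tmp:
    json.dump(mapping, tmp)
    tmp_path = tmp.name

sample_text = "Decentralized training has been explored [1].\nLater work builds on [1]."
print(insert_tex_images(tmp_path, sample_refs, sample_text))
# The figure environment is appended only after the first line; the second [1]
# is skipped because inserted_refs already records it.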
src/demo/asg_clustername.py
ADDED
@@ -0,0 +1,228 @@
import os
import pandas as pd
import re  # Import the regular expressions module
from openai import OpenAI
import ast

def generate_cluster_name_qwen_sep(tsv_path, survey_title):
    data = pd.read_csv(tsv_path, sep='\t')

    # Define the system prompt once, outside the loop
    system_prompt = f'''You are a research assistant working on a survey paper. The survey paper is about "{survey_title}". \
'''

    result = []  # Initialize the result list

    for i in range(3):  # Assuming labels are 0, 1, 2
        sentence_list = []  # Reset sentence_list for each label
        for j in range(len(data)):
            if data['label'][j] == i:
                sentence_list.append(data['retrieval_result'][j])

        # Convert the sentence list to a string representation
        user_prompt = f'''
Given a list of descriptions of sentences about an aspect of the survey, you need to use one phrase (within 8 words) to summarize it and treat it as a section title of your survey paper. \
Your response should be a list with only one element and without any other information, for example, ["Post-training of LLMs"] \
Your response must contain one keyword of the survey title, unspecified or irrelevant results are not allowed. \
The description list is: {sentence_list}'''

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        openai_api_key = os.getenv("OPENAI_API_KEY")
        openai_api_base = os.getenv("OPENAI_API_BASE")
        client = OpenAI(
            api_key=openai_api_key,
            base_url=openai_api_base,
        )

        chat_response = client.chat.completions.create(
            model=os.environ.get("MODEL"),
            max_tokens=768,
            temperature=0.5,
            stop="<|im_end|>",
            stream=True,
            messages=messages
        )

        # Stream the response to a single text string
        text = ""
        for chunk in chat_response:
            if chunk.choices[0].delta.content:
                text += chunk.choices[0].delta.content

        # Use regex to extract the first content within []
        match = re.search(r'\[(.*?)\]', text)
        if match:
            cluster_name = match.group(1).strip()  # Extract and clean the cluster name
            # Strip the quotes around the cluster name, if present
            cluster_name = cluster_name.strip('"').strip("'")
            result.append(cluster_name)
        else:
            result.append("No Cluster Name Found")  # Handle cases where pattern isn't found
    # print("The generated cluster names are:")
    # print(result)
    return result  # This will be a list with three elements

# Example usage:
# result = generate_cluster_name_qwen_sep('path_to_your_file.tsv', 'Your Survey Title')
# print(result)  # Output might look like ["Cluster One", "Cluster Two", "Cluster Three"]

def refine_cluster_name(cluster_names, survey_title):
    num_names = len(cluster_names)          # remember how many titles were passed in
    cluster_names = str(cluster_names)      # Convert to string to embed in the prompt
    # Define the system prompt to set the context
    system_prompt = f'''You are a research assistant tasked with optimizing and refining a set of section titles for a survey paper. The survey paper is about "{survey_title}".
'''

    # Construct the user prompt, including all cluster names
    user_prompt = f'''
Here is a set of section titles generated for the survey topic "{survey_title}":
{cluster_names}
Please ensure that all cluster names are coherent and consistent with each other, and that each name is clear, concise, and accurately reflects the corresponding section.
Remember to remove the overlapping information between the cluster names.
Each cluster name should be within 8 words and include a keyword from the survey title.
Respond with a list of section titles in the following format without any other irrelevant information,
For example, ["Refined Title 1", "Refined Title 2", "Refined Title 3"]
'''

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Initialize OpenAI client
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    try:
        chat_response = client.chat.completions.create(
            model=os.environ.get("MODEL"),
            max_tokens=256,
            temperature=0.5,
            stop="<|im_end|>",
            stream=True,
            messages=messages
        )

        # Stream the response and concatenate into a complete text
        text = ""
        for chunk in chat_response:
            if chunk.choices[0].delta.content:
                text += chunk.choices[0].delta.content

        # print("The raw response text is:")
        # print(text)

        # Use regex to extract the bracketed list and parse it into a Python list;
        # group(0) keeps the surrounding brackets so ast.literal_eval can parse it
        match = re.search(r'\[(.*?)\]', text)
        if match:
            refined_cluster_names = ast.literal_eval(match.group(0))
        else:
            refined_cluster_names = [
                survey_title + ": Definition",
                survey_title + ": Methods",
                survey_title + ": Evaluation"
            ]  # Handle cases where the pattern isn't found

    except Exception as e:
        print(f"An error occurred while refining cluster names: {e}")
        refined_cluster_names = ["Refinement Error"] * num_names

    # print("The refined cluster names are:")
    # print(refined_cluster_names)
    return refined_cluster_names  # Returns a list with the refined cluster names




def generate_cluster_name_new(tsv_path, survey_title, cluster_num=3):
    data = pd.read_csv(tsv_path, sep='\t')
    desp = []


    for i in range(cluster_num):  # Assuming labels are 0, 1, 2, ...
        sentence_list = []  # Initialize the sentence list
        for j in range(len(data)):
            if data['label'][j] == i:
                sentence_list.append(data['retrieval_result'][j])
        desp.append(sentence_list)

    system_prompt = f'''
You are a research assistant working on a survey paper. The survey paper is about "{survey_title}". '''

    cluster_info = "\n".join([f'Cluster {i+1}: "{desp[i]}"' for i in range(cluster_num)])

    user_prompt = f'''
Your task is to generate {cluster_num} distinctive cluster names (e.g., "Pre-training of LLMs") of the given clusters of reference papers, each reference paper is described by a sentence.

The clusters of reference papers are:
{cluster_info}

Your output should be a single list of {cluster_num} cluster names, e.g., ["Pre-training of LLMs", "Fine-tuning of LLMs", "Evaluation of LLMs"]
Do not output any other text or information.
'''

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    chat_response = client.chat.completions.create(
        model=os.environ.get("MODEL"),
        max_tokens=768,
        temperature=0.5,
        stop="<|im_end|>",
        stream=True,
        messages=messages
    )

    # Stream the response to a single text string
    text = ""
    for chunk in chat_response:
        if chunk.choices[0].delta.content:
            text += chunk.choices[0].delta.content
    # print("The raw response text is:")
    # print(text)

    # Use regex to extract the bracketed list and parse it into a Python list
    match = re.search(r'\[(.*?)\]', text)
    if match:
        refined_cluster_names = ast.literal_eval(match.group(0))
    else:
        predefined_sections = [
            "Definition", "Methods", "Evaluation", "Applications",
            "Challenges", "Future Directions", "Comparisons", "Case Studies"
        ]

        # Fall back to the first cluster_num predefined categories
        refined_cluster_names = [
            f"{survey_title}: {predefined_sections[i]}" for i in range(cluster_num)
        ]

    # print("The refined cluster names are:")
    # print(refined_cluster_names)
    return refined_cluster_names  # Returns a list with the refined cluster names


if __name__ == "__main__":
    refined_result = refine_cluster_name(["Pre-training of LLMs", "Fine-tuning of LLMs", "Evaluation of LLMs"], 'Survey of LLMs')
    # print(refined_result)

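The bracketed-list parsing used by the functions above can be checked offline; a small sketch on a canned model reply (no API call involved):

import ast
import re

fake_reply = 'Sure! ["Pre-training of LLMs", "Fine-tuning of LLMs", "Evaluation of LLMs"]'
match = re.search(r'\[(.*?)\]', fake_reply)
# group(0) keeps the surrounding brackets, so ast.literal_eval yields a Python list
names = ast.literal_eval(match.group(0)) if match else []
print(names)  # ['Pre-training of LLMs', 'Fine-tuning of LLMs', 'Evaluation of LLMs']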
src/demo/asg_conclusion.py
ADDED
@@ -0,0 +1,253 @@
import os


class ConclusionGenerator:
    def __init__(self, pipeline):
        self.pipeline = pipeline

    def generate(self, title, intro, mode='lora'):
        if mode == 'lora' or mode == 'test':
            if mode == 'lora':
                self.pipeline.model.set_adapter("conclusion")

            system_prompt = f'''You are a helpful assistant that helps to generate the conclusion of the survey paper given the survey title and survey introduction.'''
            # user_prompt = {"survey_title":survey_title, "claims":cluster_with_claims}
            user_prompt = f'''Help me to generate the conclusion of a survey paper given the title: *{title}*, and the introduction: {intro}'''

            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
                {"role": "assistant", "content": "Conclusion: This survey "}
            ]

            outputs = self.pipeline(
                messages,
                max_new_tokens=4096,
            )
            result = outputs[0]["generated_text"][-1]['content']
            return result
        else:
            raise ValueError('mode not supported')

if __name__ == '__main__':
    from transformers import pipeline
    import torch
    import transformers

    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    Global_pipeline = transformers.pipeline(
        "text-generation",
        model=model_id,
        model_kwargs={"torch_dtype": torch.bfloat16},
        token=os.getenv('HF_API_KEY'),
        device_map="auto",
    )
    Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_outline_generation", adapter_name="outline")
    Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_conclusion_generation", adapter_name="conclusion")
    Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_abstract_generation", adapter_name="abstract")


    title = "A Survey of Large Language Models"
    intro = '''LANGUAGE is a prominent ability in human beings to
express and communicate, which develops in early
childhood and evolves over a lifetime [3, 4]. Machines,
however, cannot naturally grasp the abilities of understanding and communicating in the form of human language,
unless equipped with powerful artificial intelligence (AI)
algorithms. It has been a longstanding research challenge
to achieve this goal, to enable machines to read, write, and
communicate like humans [5].
Technically, language modeling (LM) is one of the major
approaches to advancing language intelligence of machines.
In general, LM aims to model the generative likelihood
of word sequences, so as to predict the probabilities of
future (or missing) tokens. The research of LM has received
extensive attention in the literature, which can be divided
into four major development stages:
• Statistical language models (SLM). SLMs [6–9] are developed based on statistical learning methods that rose in
the 1990s. The basic idea is to build the word prediction
model based on the Markov assumption, e.g., predicting the
next word based on the most recent context. The SLMs with
a fixed context length n are also called n-gram language
models, e.g., bigram and trigram language models. SLMs
have been widely applied to enhance task performance
in information retrieval (IR) [10, 11] and natural language
processing (NLP) [12–14]. However, they often suffer from
the curse of dimensionality: it is difficult to accurately
estimate high-order language models since an exponential
number of transition probabilities need to be estimated.
Thus, specially designed smoothing strategies such as backoff estimation [15] and Good–Turing estimation [16] have
been introduced to alleviate the data sparsity problem.
• Neural language models (NLM). NLMs [1, 17, 18] characterize the probability of word sequences by neural networks,
e.g., multi-layer perceptron (MLP) and recurrent neural networks (RNNs). As a remarkable contribution, the work in
[1] introduced the concept of distributed representation of
words and built the word prediction function conditioned
on the aggregated context features (i.e., the distributed
word vectors). By extending the idea of learning effective
features for text data, a general neural network approach
was developed to build a unified, end-to-end solution for
various NLP tasks [2]. Furthermore, word2vec [19, 20] was
proposed to build a simplified shallow neural network
for learning distributed word representations, which were
demonstrated to be very effective across a variety of NLP
tasks. These studies have initiated the use of language
models for representation learning (beyond word sequence
modeling), having an important impact on the field of NLP.
• Pre-trained language models (PLM). As an early attempt, ELMo [21] was proposed to capture context-aware
word representations by first pre-training a bidirectional
LSTM (biLSTM) network (instead of learning fixed word
representations) and then fine-tuning the biLSTM network
according to specific downstream tasks. Furthermore, based
on the highly parallelizable Transformer architecture [22]
with self-attention mechanisms, BERT [23] was proposed by
pre-training bidirectional language models with specially
designed pre-training tasks on large-scale unlabeled corpora. These pre-trained context-aware word representations
are very effective as general-purpose semantic features,
which have largely raised the performance bar of NLP
tasks. This study has inspired a large number of follow-up
work, which sets the “pre-training and fine-tuning” learning
paradigm. Following this paradigm, a great number of studies on PLMs have been developed, introducing either different architectures [24, 25] (e.g., GPT-2 [26] and BART [24]) or
improved pre-training strategies [27–29]. In this paradigm, it
often requires fine-tuning the PLM for adapting to different
downstream tasks.
• Large language models (LLM). Researchers find that
scaling PLM (e.g., scaling model size or data size) often
leads to an improved model capacity on downstream tasks
(i.e., following the scaling law [30]). A number of studies
have explored the performance limit by training an ever
larger PLM (e.g., the 175B-parameter GPT-3 and the 540B-parameter PaLM). Although scaling is mainly conducted
in model size (with similar architectures and pre-training
tasks), these large-sized PLMs display different behaviors
from smaller PLMs (e.g., 330M-parameter BERT and 1.5B-parameter GPT-2) and show surprising abilities (called emergent abilities [31]) in solving a series of complex tasks. For
example, GPT-3 can solve few-shot tasks through in-context
learning, whereas GPT-2 cannot do well. Thus, the research
community coins the term “large language models (LLM)”
for these large-sized PLMs [32–35], which attract increasing
research attention (See Figure 1). A remarkable application
of LLMs is ChatGPT
that adapts the LLMs from the GPT
series for dialogue, which presents an amazing conversation
ability with humans. We can observe a sharp increase of the
arXiv papers that are related to LLMs after the release of
ChatGPT in Figure 1.
As discussed before, language model is not a new technical concept specially for LLMs, but has evolved with the
advance of artificial intelligence over the decades. Early language models mainly aim to model and generate text data,
while latest language models (e.g., GPT-4) focus on complex
task solving. From language modeling to task solving, it is an
important leap in scientific thinking, which is the key to
understand the development of language models in the research history. From the perspective of task solving, the four
generations of language models have exhibited different levels of model capacities. In Figure 2, we describe the evolution process of language models in terms of the task solving
capacity. At first, statistical language models mainly assisted
in some specific tasks (e.g., retrieval or speech tasks), in
which the predicted or estimated probabilities can enhance
the performance of task-specific approaches. Subsequently,
neural language models focused on learning task-agnostic
representations (e.g., features), aiming to reduce the efforts
for human feature engineering. Furthermore, pre-trained
language models learned context-aware representations that
can be optimized according to downstream tasks. For the
latest generation of language model, LLMs are enhanced by
exploring the scaling effect on model capacity, which can be
considered as general-purpose task solvers. To summarize,
in the evolution process, the task scope that can be solved
by language models have been greatly extended, and the
task performance attained by language models have been
significantly enhanced.
In the existing literature, PLMs have been widely discussed and surveyed [36–39], while LLMs are seldom reviewed in a systematic way. To motivate our survey, we first
highlight three major differences between LLMs and PLMs.
First, LLMs display some surprising emergent abilities that
may not be observed in previous smaller PLMs. These abilities are key to the performance of language models on complex tasks, making AI algorithms unprecedentedly powerful
and effective. Second, LLMs would revolutionize the way
that humans develop and use AI algorithms. Unlike small
PLMs, the major approach to accessing LLMs is through
the prompting interface (e.g., GPT-4 API). Humans have to
understand how LLMs work and format their tasks in a way
that LLMs can follow. Third, the development of LLMs no
longer draws a clear distinction between research and engineering. The training of LLMs requires extensive practical
experiences in large-scale data processing and distributed
parallel training. To develop capable LLMs, researchers
have to solve complicated engineering issues, working with
engineers or being engineers.
Nowadays, LLMs are posing a significant impact on
the AI community, and the advent of ChatGPT and GPT-4
leads to the rethinking of the possibilities of artificial general
intelligence (AGI). OpenAI has published a technical article
entitled “Planning for AGI and beyond”, which discusses
the short-term and long-term plans to approach AGI [40],
and a more recent paper has argued that GPT-4 might be
considered as an early version of an AGI system [41]. The
research areas of AI are being revolutionized by the rapid
progress of LLMs. In the field of NLP, LLMs can serve as a
general-purpose language task solver (to some extent), and
the research paradigm has been shifting towards the use
of LLMs. In the field of IR, traditional search engines are
challenged by the new information seeking way through AI
chatbots (i.e., ChatGPT), and New Bing presents an initial
attempt that enhances the search results based on LLMs. In
the field of CV, the researchers try to develop ChatGPT-like
vision-language models that can better serve multimodal
dialogues [42–45], and GPT-4 [46] has supported multimodal input by integrating the visual information. This new
wave of technology would potentially lead to a prosperous
ecosystem of real-world applications based on LLMs. For
instance, Microsoft 365 is being empowered by LLMs (i.e.,
Copilot) to automate the office work, and OpenAI supports
the use of plugins in ChatGPT for implementing special
functions.
Despite the progress and impact, the underlying principles of LLMs are still not well explored. Firstly, it is
mysterious why emergent abilities occur in LLMs, instead of
smaller PLMs. As a more general issue, there lacks a deep,
detailed investigation of the key factors that contribute to
the superior abilities of LLMs. It is important to study when
and how LLMs obtain such abilities [47]. Although there are
some meaningful discussions about this problem [31, 47],
more principled investigations are needed to uncover the
“secrets” of LLMs. Secondly, it is difficult for the research
community to train capable LLMs. Due to the huge demand of computation resources, it is very costly to carry
out repetitive, ablating studies for investigating the effect
of various strategies for training LLMs. Indeed, LLMs are
mainly trained by industry, where many important training
details (e.g., data collection and cleaning) are not revealed
to the public. Thirdly, it is challenging to align LLMs with
human values or preferences. Despite the capacities, LLMs
are also likely to produce toxic, fictitious, or harmful contents. It requires effective and efficient control approaches
to eliminating the potential risk of the use of LLMs [46].
Faced with both opportunities and challenges, it needs
more attention on the research and development of LLMs. In
order to provide a basic understanding of LLMs, this survey
conducts a literature review of the recent advances in LLMs
from four major aspects, including pre-training (how to pre-train a capable LLM), adaptation (how to effectively adapt
pre-trained LLMs for better use), utilization (how to use
LLMs for solving various downstream tasks) and capability
evaluation (how to evaluate the abilities of LLMs and existing
empirical findings). We thoroughly comb the literature and
summarize the key findings, techniques, and methods of
LLMs. For this survey, we also create a GitHub project
website by collecting the supporting resources for LLMs, at
the link https://github.com/RUCAIBox/LLMSurvey. We
are also aware of several related review articles on PLMs
or LLMs [32, 36, 38, 39, 43, 48–54]. These papers either
discuss PLMs or some specific (or general) aspects of LLMs.
Compared with them, we focus on the techniques and
methods to develop and use LLMs and provide a relatively
comprehensive reference to important aspects of LLMs.
The remainder of this survey is organized as follows:
Section 2 introduces the background for LLMs and the evolution of GPT-series models, followed by the summarization
of available resources for developing LLMs in Section 3.
Sections 4, 5, 6, and 7 review and summarize the recent
progress from the four aspects of pre-training, adaptation,
utilization, and capacity evaluation, respectively. Then, Section 8 discusses the practical guide for prompt design,
and Section 9 reviews the applications of LLMs in several
representative domains. Finally, we conclude the survey in
Section 10 by summarizing the major findings and discuss
the remaining issues for future work.
'''


    conclusion_generator = ConclusionGenerator(Global_pipeline)
    with_lora = conclusion_generator.generate(title, intro, mode='lora')
    # print("The conclusion generated with LORA is: \n", with_lora)
    # print("=============================================================")
    with_test = conclusion_generator.generate(title, intro, mode='test')
    # print("The conclusion generated with test is: \n", with_test)
    # print("=============================================================")
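Worth noting is the assistant-priming pattern in generate: the message list ends with a partially written assistant turn, so the model continues the seeded sentence instead of starting fresh. A minimal sketch of the pattern (contents are illustrative):

messages = [
    {"role": "system", "content": "You write survey conclusions."},
    {"role": "user", "content": "Write the conclusion for 'A Survey of Large Language Models'."},
    # The seeded assistant turn below is extended by the model during generation.
    {"role": "assistant", "content": "Conclusion: This survey "},
]
# With a transformers text-generation pipeline, the completed turn is read back
# from outputs[0]["generated_text"][-1]["content"], as generate() does above.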
src/demo/asg_generator.py
ADDED
@@ -0,0 +1,90 @@
from openai import OpenAI
import transformers
import os
import re
import ast
import json
import base64

def getQwenClient():
    openai_api_key = os.environ.get("OPENAI_API_KEY")
    openai_api_base = os.environ.get("OPENAI_API_BASE")

    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )
    return client

def generateResponse(client, prompt):
    chat_response = client.chat.completions.create(
        model=os.environ.get("MODEL"),
        max_tokens=768,
        temperature=0.5,
        stop="<|im_end|>",
        stream=True,
        messages=[{"role": "user", "content": prompt}]
    )

    text = ""
    for chunk in chat_response:
        if chunk.choices[0].delta.content:
            text += chunk.choices[0].delta.content
    return text

def generate_sentence_patterns(keyword, num_patterns=5, temp=0.7):
    template = f"""
You are a helpful assistant that provides only the output requested, without any additional text.

Please generate {num_patterns} commonly used sentence templates in academic papers to describe the '{keyword}'.
- Do not include any explanations, sign-offs, or additional text.
- The list should be in the following format:
[
    "First template should be here",
    "Second template should be here",
    ...
]

Begin your response immediately with the list, and do not include any other text.
"""
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )
    response = generateResponse(client, template)
    return response

def generate(context, keyword, paper_title, temp=0.7):
    template = f"""
Context:
{context}
------------------------------------------------------------
Based on the above context, answer the question: What {keyword} are mentioned in the paper {paper_title}?
Please provide a direct answer in one paragraph, no longer than 100 words.

If the context provides enough information, answer strictly based on it.
If the context provided does not contain any specified {keyword}, deduce and integrate your own opinion as if the {keyword} were described in the context.
Ensure that your answer remains consistent with the style and format of the provided context, as if the information you provide is naturally part of it.
------------------------------------------------------------
Answer:
The {keyword} mentioned in this paper discuss [Your observation or opinion]...
"""
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )
    response = generateResponse(client, template)
    return response

def extract_query_list(text):
    # Matches a bracketed list of exactly ten double-quoted strings
    pattern = re.compile(
        r'\[\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"[^"]+"\s*\]'
    )
    match = pattern.search(text)
    if match:
        return match.group(0)
    return None
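Note that the regex in extract_query_list only matches a bracketed list of exactly ten double-quoted strings; a quick offline illustration:

reply = '["q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10"] plus trailing text'
print(extract_query_list(reply))         # -> the ten-item list, as a string
print(extract_query_list('["a", "b"]'))  # -> None (fewer than ten items)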
src/demo/asg_latex.py
ADDED
@@ -0,0 +1,816 @@
import re
import subprocess
import os

from openai import OpenAI
import dotenv
from .asg_add_flowchart import insert_tex_images
from .asg_mindmap import insert_outline_figure


def _remove_div_blocks(lines):
    """
    From the given list of lines, remove every HTML block of the form:
        <div style="...">
        ... (any number of lines)
        </div>
    skipping the whole block, including the opening <div> and the closing </div>.
    Returns the new, processed list of lines.
    """
    new_lines = []
    i = 0
    n = len(lines)

    while i < n:
        line = lines[i]
        # If this line starts with <div style=, enter skip mode
        if line.strip().startswith("<div style="):
            # Skip this line
            i += 1
            # Keep scanning forward until the '</div>' line is found
            while i < n and not lines[i].strip().startswith("</div>"):
                i += 1
            # Then skip the '</div>' line itself
            i += 1
        else:
            new_lines.append(line)
            i += 1

    return new_lines

def _convert_setext_to_atx(lines):
    """
    Convert headings of the form:

        Heading text
        ===

    into:

        # Heading text

    and headings of the form:

        Heading text
        ---

    into:

        ## Heading text
    """
    setext_equal_pattern = re.compile(r'^\s*=+\s*$')  # matches a line of `===`
    setext_dash_pattern = re.compile(r'^\s*-+\s*$')   # matches a line of `---`

    new_lines = []
    i = 0
    n = len(lines)

    while i < n:
        line = lines[i]
        if i < n - 1:
            next_line = lines[i + 1].strip()
            # If the next line is ===
            if setext_equal_pattern.match(next_line):
                heading_text = line.strip()
                new_lines.append(f"# {heading_text}")
                i += 2  # skip the next line
                continue
            # If the next line is ---
            if setext_dash_pattern.match(next_line):
                heading_text = line.strip()
                new_lines.append(f"## {heading_text}")
                i += 2
                continue
        # Otherwise leave the line unchanged
        new_lines.append(line)
        i += 1

    return new_lines

def preprocess_md(md_input_path: str, md_output_path: str = None) -> str:
    """
    Preprocess a Markdown file:
    1. Remove all HTML blocks of the form <div style="..."> ... </div>
    2. Convert setext headings (===, ---) to ATX headings (#, ##)
    3. Write back in place, or write to a new file

    Args:
        md_input_path: path of the original Markdown file
        md_output_path: path to write the processed Markdown to; if None, the original file is overwritten.

    Returns:
        str: the actual output path of the processed Markdown file (md_output_path).
    """
    if md_output_path is None:
        md_output_path = md_input_path

    # 1) Read the lines
    with open(md_input_path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()

    # 2) Remove <div style="..."> ... </div> fragments
    lines_no_div = _remove_div_blocks(lines)

    # 3) Convert setext headings to ATX
    lines_atx = _convert_setext_to_atx(lines_no_div)

    # 4) Write out
    with open(md_output_path, 'w', encoding='utf-8') as f:
        for ln in lines_atx:
            f.write(ln + "\n")

    return md_output_path

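A tiny illustration of the setext-to-ATX step performed by preprocess_md, exercising the helper directly on an in-memory list:

sample = ["Survey Title", "===", "Some intro text.", "Subsection", "---"]
print(_convert_setext_to_atx(sample))
# -> ['# Survey Title', 'Some intro text.', '## Subsection']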
def search_sections(md_path: str):
    """
    Parse a Markdown file that contains only ATX-style headings and return a list
    whose elements are triples: (level, heading_text, content_string)

    Notes:
    - Heading lines look like "# Heading", "## Heading", "### Heading", etc. (one space after the hashes).
    - level = (number of hashes - 1), i.e. "# -> level=0", "## -> level=1", "### -> level=2" ...
    - Numeric dotted prefixes such as "3.1.3 " or "2.10.1 " (including the trailing space) are removed.
    - content_string is all text after the heading, up to the next heading line or the end of the file (joined by newlines).
    """

    # Matches ATX headings (e.g. "# Heading", "## 3.1.3 Heading")
    atx_pattern = re.compile(r'^(#+)\s+(.*)$')

    # Removes numeric dotted prefixes of the form digits.digits.digits... (possibly with trailing spaces)
    # Example matches: "3.1.3 ", "2.10.1 "
    leading_numbers_pattern = re.compile(r'^\d+(\.\d+)*\s*')

    # Read the lines
    with open(md_path, "r", encoding="utf-8") as f:
        lines = f.read().splitlines()

    sections = []
    i = 0
    n = len(lines)

    def gather_content(start_idx: int):
        """
        Starting from start_idx, collect body text until the next ATX heading or the end of the document.
        Returns (content_string, end_idx).
        """
        content_lines = []
        idx = start_idx
        while idx < n:
            line = lines[idx].rstrip()
            # Stop collecting body text once a line matches the ATX heading pattern
            if atx_pattern.match(line):
                break
            content_lines.append(lines[idx])
            idx += 1
        return "\n".join(content_lines), idx

    while i < n:
        line = lines[i].rstrip()

        # Check whether this is an ATX heading
        match_atx = atx_pattern.match(line)
        if match_atx:
            # group(1) is e.g. "##"
            # group(2) is e.g. "3.1 Introduction"
            hashes = match_atx.group(1)
            heading_text_raw = match_atx.group(2).strip()

            # Compute the heading level: "# -> level=0, ## -> level=1, ### -> level=2"
            heading_level = len(hashes) - 1

            # Remove prefixes such as "3.1.3 "
            heading_text = leading_numbers_pattern.sub('', heading_text_raw).strip()

            i += 1  # skip the heading line and prepare to collect body text
            content_string, new_idx = gather_content(i)

            sections.append((heading_level, heading_text, content_string))
            i = new_idx
            print(heading_level, heading_text)
        else:
            # Otherwise move on to the next line
            i += 1

    # [Optional debug output] print the current heading level and its text

    return sections[1:]

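A self-contained sketch of what search_sections yields; the Markdown content is made up, and the first heading is dropped by `sections[1:]`:

import tempfile

demo_md = "# A Survey of X\nIntro.\n## 2.1 Methods\nBody A.\n### 2.1.1 Deep Models\nBody B.\n"
with tempfile.NamedTemporaryFile("w", suffix=".md", delete=False, encoding="utf-8") as tmp:
    tmp.write(demo_md)
    demo_path = tmp.name

print(search_sections(demo_path))
# -> [(1, 'Methods', 'Body A.'), (2, 'Deep Models', 'Body B.')]
# (the function also prints each heading as it parses; numeric prefixes like
# "2.1" are stripped by leading_numbers_pattern)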
199 |
+
def abstract_to_tex(section):
|
200 |
+
"""
|
201 |
+
将 Markdown 中的 abstract 段落转化为 LaTeX 片段。
|
202 |
+
|
203 |
+
参数:
|
204 |
+
section: (level, heading_text, content_string)
|
205 |
+
level: 0 表示一级标题, 1 表示二级标题, etc.
|
206 |
+
heading_text: 当前标题文字
|
207 |
+
content_string: 该标题下的 Markdown 文本
|
208 |
+
|
209 |
+
返回:
|
210 |
+
一个字符串,包含对应的 LaTeX abstract 环境。
|
211 |
+
"""
|
212 |
+
level, heading_text, content_string = section
|
213 |
+
|
214 |
+
# 如果标题不是 "Abstract",则直接返回空字符串
|
215 |
+
if heading_text.lower() != "abstract":
|
216 |
+
return ""
|
217 |
+
|
218 |
+
# 生成 LaTeX abstract 环境
|
219 |
+
latex_abstract = (
|
220 |
+
"\\begin{abstract}\n"
|
221 |
+
f"{content_string}\n"
|
222 |
+
"\\end{abstract}"
|
223 |
+
)
|
224 |
+
return latex_abstract
|
225 |
+
|
226 |
+
def references_to_tex(section):
|
227 |
+
"""
|
228 |
+
将 Markdown 中的 references 段落转化为 LaTeX 片段。
|
229 |
+
|
230 |
+
参数:
|
231 |
+
section: (level, heading_text, content_string)
|
232 |
+
level: 0 表示一级标题, 1 表示二级标题, etc.
|
233 |
+
heading_text: 当前标题文字
|
234 |
+
content_string: 该标题下的 Markdown 文本
|
235 |
+
|
236 |
+
返回:
|
237 |
+
一个字符串,包含对应的 LaTeX references 环境。
|
238 |
+
"""
|
239 |
+
level, heading_text, content_string = section
|
240 |
+
|
241 |
+
# 如果标题不是 "References",则直接返回空字符串
|
242 |
+
if heading_text.lower() != "references":
|
243 |
+
return ""
|
244 |
+
|
245 |
+
# 在每一行的末尾添加 \\ 以实现换行
|
246 |
+
lines = content_string.splitlines()
|
247 |
+
latex_content = " \\\\{}\n".join(line.strip() for line in lines if line.strip())
|
248 |
+
|
249 |
+
# 生成 LaTeX 片段,使用 \section* 创建不带编号的标题
|
250 |
+
latex_references = (
|
251 |
+
"\\section*{References}\n" # 不带编号的 section
|
252 |
+
f"{latex_content}"
|
253 |
+
)
|
254 |
+
return latex_references
|
255 |
+
|
256 |
+
def md_to_tex_section(section):
|
257 |
+
|
258 |
+
"""
|
259 |
+
将单个 Markdown 分段 (level, heading, content) 转化为 LaTeX 片段。
|
260 |
+
会根据标题的深度生成 \\section, \\subsection, 或 \\subsubsection 等。
|
261 |
+
同时对 markdown 中的图片 div 进行正则替换,转化为 LaTeX figure 环境。
|
262 |
+
|
263 |
+
参数:
|
264 |
+
section: (level, heading_text, content_string)
|
265 |
+
level: 0 表示一级标题, 1 表示二级标题, etc.
|
266 |
+
heading_text: 当前标题文字
|
267 |
+
content_string: 该标题下的 Markdown 文本
|
268 |
+
|
269 |
+
返回:
|
270 |
+
一个字符串,包含对应的 LaTeX 标题以及内容。
|
271 |
+
内容由 OpenAI 模型将 Markdown 转为 LaTeX,并将图片 div 转为 LaTeX figure。
|
272 |
+
"""
|
273 |
+
level, heading_text, content_string = section
|
274 |
+
|
275 |
+
# 根据 heading level 生成对应的 LaTeX 命令
|
276 |
+
if level == 0:
|
277 |
+
latex_heading = f"\\section{{{heading_text}}}"
|
278 |
+
elif level == 1:
|
279 |
+
latex_heading = f"\\subsection{{{heading_text}}}"
|
280 |
+
elif level == 2:
|
281 |
+
latex_heading = f"\\subsubsection{{{heading_text}}}"
|
282 |
+
else:
|
283 |
+
# 更深入的层级可自行添加
|
284 |
+
latex_heading = f"\\paragraph{{{heading_text}}}"
|
285 |
+
|
286 |
+
# 先粗略替换图片 div 为占位符,后续交由 OpenAI 模型或自身再做处理
|
287 |
+
# 这里我们先把 <div style="text-align:center">...<img ...>...</div><div ...>Fig x: ...</div> 转换为一个自定义标记 [IMG_BLOCK] ... [END_IMG_BLOCK]
|
288 |
+
# 这样后面可以更好控制让 OpenAI 转成正确的 LaTeX 也行,或在本地处理也行。
|
289 |
+
# 这里我们本地进行处理,将它直接转换为 LaTeX figure。
|
290 |
+
|
291 |
+
def replace_img_div(match):
|
292 |
+
"""
|
293 |
+
将 <div style="text-align:center"> <img src="..." alt="..." style="width:60%;"/> </div>
|
294 |
+
<div style="text-align:center;font-size:smaller;">Fig x: ...</div>
|
295 |
+
这种模式转换为标准 LaTeX figure 环境
|
296 |
+
"""
|
297 |
+
whole_block = match.group(0)
|
298 |
+
|
299 |
+
# 提取 src
|
300 |
+
src_match = re.search(r'<img.*?src="(.*?)".*?>', whole_block, re.DOTALL)
|
301 |
+
src_path = src_match.group(1) if src_match else "image_not_found"
|
302 |
+
|
303 |
+
# 提取 alt
|
304 |
+
alt_match = re.search(r'<img.*?alt="(.*?)".*?>', whole_block, re.DOTALL)
|
305 |
+
alt_text = alt_match.group(1) if alt_match else ""
|
306 |
+
|
307 |
+
# 提取 caption (Fig x: ...)
|
308 |
+
fig_match = re.search(r'Fig\s*\d+:\s*(.*?)<\/div>', whole_block, re.DOTALL)
|
309 |
+
fig_caption = fig_match.group(1).strip() if fig_match else ""
|
310 |
+
|
311 |
+
# 生成 LaTeX figure
|
312 |
+
latex_figure = (
|
313 |
+
"\\begin{figure}[htbp]\n"
|
314 |
+
" \\centering\n"
|
315 |
+
f" \\includegraphics[width=0.6\\textwidth]{{{src_path}}}\n"
|
316 |
+
f" \\caption{{{alt_text if alt_text else fig_caption}}}\n"
|
317 |
+
# 也可以根据需求决定是否加 label
|
318 |
+
"\\end{figure}\n"
|
319 |
+
)
|
320 |
+
|
321 |
+
return latex_figure
|
322 |
+
|
323 |
+
# 用正则定位该模式并转换为 latex figure
|
324 |
+
# 该模式大概是:
|
325 |
+
# <div style="text-align:center">.*?<img src="...".*?>.*?</div>\s*<div style="text-align:center;font-size:smaller;">.*?</div>
|
326 |
+
# 这里用非贪婪模式, DOTALL 允许匹配换行
|
327 |
+
pattern_img_div = re.compile(
|
328 |
+
r'<div\s+style="text-align:center".*?>.*?<img.*?>.*?</div>\s*<div\s+style="text-align:center;font-size:smaller;">.*?<\/div>',
|
329 |
+
re.DOTALL
|
330 |
+
)
|
331 |
+
|
332 |
+
content_converted_images = re.sub(pattern_img_div, replace_img_div, content_string)
|
333 |
+
|
334 |
+
# ------------------------------------------------
|
335 |
+
# 调用 OpenAI 接口,将 (转换好图片 div 的) Markdown 转为 LaTeX
|
336 |
+
# ------------------------------------------------
|
337 |
+
system_prompt = (
|
338 |
+
"You are a helpful assistant that converts Markdown text to rigorous LaTeX. "
|
339 |
+
"Maintain inline formatting like bold, italics, and code blocks when possible. "
|
340 |
+
"Simply format horizontally aligned text, lists, tables, etc. into valid LaTeX."
|
341 |
+
"Use [LaTeX] ... [/LaTeX] to wrap the final content without the \\section\{\}."
|
342 |
+
"If the content is mathematically descriptive, please insert exactly one LaTeX math equation with explaination ($...$)to describe it."
|
343 |
+
"Do not include any other irrelevant information."
|
344 |
+
"Remember to clean the refs such as \[1], \[2], \[3] inside the text to strip the backslashes to [1], [2], [3]. No any extra backslashes."
|
345 |
+
)
|
346 |
+
|
347 |
+
user_prompt = (
|
348 |
+
"Convert the following Markdown content to LaTeX. The text may already contain "
|
349 |
+
"some partial LaTeX for figures:\n\n"
|
350 |
+
f"{content_converted_images}"
|
351 |
+
)
|
352 |
+
|
353 |
+
messages = [
|
354 |
+
{"role": "system", "content": system_prompt},
|
355 |
+
{"role": "user", "content": user_prompt},
|
356 |
+
]
|
357 |
+
|
358 |
+
# Read the OpenAI API key and base URL from environment variables
|
359 |
+
openai_api_key = os.getenv("OPENAI_API_KEY")
|
360 |
+
openai_api_base = os.getenv("OPENAI_API_BASE")
|
361 |
+
|
362 |
+
# Initialize the client
|
363 |
+
client = OpenAI(
|
364 |
+
api_key=openai_api_key,
|
365 |
+
base_url=openai_api_base,
|
366 |
+
)
|
367 |
+
|
368 |
+
chat_response = client.chat.completions.create(
|
369 |
+
model=os.environ.get("MODEL"),
|
370 |
+
max_tokens=2048,
|
371 |
+
temperature=0.5,
|
372 |
+
stop="<|im_end|>",
|
373 |
+
stream=True,
|
374 |
+
messages=messages
|
375 |
+
)
|
376 |
+
|
377 |
+
# Stream the response
|
378 |
+
tex_body = ""
|
379 |
+
for chunk in chat_response:
|
380 |
+
if chunk.choices[0].delta.content:
|
381 |
+
tex_body += chunk.choices[0].delta.content
|
382 |
+
|
383 |
+
# The reply is expected to wrap the final content in [LaTeX] ... [/LaTeX], e.g.:
# [LaTeX]
# ... the desired tex body ...
# [/LaTeX]
# Extract the content in between with a regex:
|
389 |
+
pattern = r'\[LaTeX\](.*?)\[/LaTeX\]'
|
390 |
+
match = re.search(pattern, tex_body, re.DOTALL)
|
391 |
+
if match:
|
392 |
+
# Use the captured content if present; otherwise keep the full response
|
393 |
+
tex_body = match.group(1).strip()
|
394 |
+
|
395 |
+
# Collapse redundant whitespace
|
396 |
+
tex_body = re.sub(r'\s+', ' ', tex_body).strip()
|
397 |
+
|
398 |
+
# Combine the LaTeX heading with the converted body
|
399 |
+
final_tex_snippet = latex_heading + "\n\n" + tex_body + "\n"
|
400 |
+
print("Tex snippet:")
|
401 |
+
print(final_tex_snippet)
|
402 |
+
return final_tex_snippet
|
403 |
+
|
404 |
+
def md_to_tex_section_without_jpg(section):
|
405 |
+
"""
|
406 |
+
Convert a single Markdown section (level, heading_text, content_string) into a LaTeX snippet,
without handling any HTML or image divs; the OpenAI model only converts plain Markdown to LaTeX.

Parameters:
section: (level, heading_text, content_string)
- level: 0 for a top-level heading, 1 for a second-level heading, 2 for a third-level heading, etc.
- heading_text: the text of the current heading
- content_string: the Markdown text under this heading

Returns:
A string containing the corresponding LaTeX heading and the converted body.
|
417 |
+
"""
|
418 |
+
|
419 |
+
level, heading_text, content_string = section
|
420 |
+
|
421 |
+
# 1) Generate the corresponding LaTeX command based on the level
|
422 |
+
# This could be made more flexible, e.g. supporting more levels.
|
423 |
+
if level == 0:
|
424 |
+
latex_heading = f"\\section{{{heading_text}}}"
|
425 |
+
elif level == 1:
|
426 |
+
latex_heading = f"\\subsection{{{heading_text}}}"
|
427 |
+
elif level == 2:
|
428 |
+
latex_heading = f"\\subsubsection{{{heading_text}}}"
|
429 |
+
else:
|
430 |
+
latex_heading = f"\\paragraph{{{heading_text}}}"
|
431 |
+
|
432 |
+
# 2) Decide whether to skip the LLM conversion.
# Common reasons to skip:
# - the content string is empty or all whitespace (content_string.strip() == "")
# - the heading looks like a bare section number, e.g. "3", "3.1", "3.1.1"
#   (the rule can be loosened or tightened as needed)

# Example: a regex matching `digits(.digits)*`.
# If heading_text fully matches this pattern, treat it as a "number-only heading" and skip the LLM.
|
439 |
+
pure_number_pattern = re.compile(r'^\d+(\.\d+)*$')
|
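# For instance, pure_number_pattern.match("3.1.1") matches,
# while pure_number_pattern.match("3 Methods") returns None.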
440 |
+
|
441 |
+
# Strip surrounding whitespace first
|
442 |
+
ht_stripped = heading_text.strip()
|
443 |
+
# Skip the LLM if the body is empty or the heading is a bare number
|
444 |
+
skip_llm = (not content_string.strip()) or bool(pure_number_pattern.match(ht_stripped))
|
445 |
+
|
446 |
+
if skip_llm:
|
447 |
+
# Return the heading plus the raw body (kept if present).
# To output only the heading, the body could be dropped instead:
tex_body = content_string
# tex_body = ""
|
452 |
+
else:
|
453 |
+
# 3) The case where the LLM is needed
|
454 |
+
# system_prompt = (
|
455 |
+
# "You are a helpful assistant that converts Markdown text to rigorous LaTeX. "
|
456 |
+
# "Maintain inline formatting like bold, italics, and code blocks when possible. "
|
457 |
+
# "Simply format horizontally aligned text, lists, tables, etc. into valid LaTeX."
|
458 |
+
# "Use [LaTeX] ... [/LaTeX] to wrap the final content without the \\section\{\}."
|
459 |
+
# "If the content is mathematically descriptive, please insert exactly one LaTeX math equation with explaination (\\[...\\])to describe it."
|
460 |
+
# "You are forced to use \\begin{dmath} and \\end{dmath} to replace the origin square brackets and wrap the equation"
|
461 |
+
# "Do not include any other irrelevant information."
|
462 |
+
# "Remember to clean the refs such as \[1], \[2], \[3] inside the text to strip the backslashes to [1], [2], [3]. No any extra backslashes."
|
463 |
+
# )
|
464 |
+
system_prompt = (
|
465 |
+
"You are a helpful assistant that converts Markdown text to rigorous LaTeX. "
|
466 |
+
"Maintain inline formatting like bold, italics, and code blocks when possible. "
|
467 |
+
"Format horizontally aligned text, lists, and tables into valid LaTeX.\n\n"
|
468 |
+
|
469 |
+
"Use [LaTeX] ... [/LaTeX] to wrap the final content without the \\section{}.\n\n"
|
470 |
+
"If the content is mathematically descriptive, please insert exactly one LaTeX math equation to describe it."
|
471 |
+
"For mathematical content, strictly follow the **standard equation format** below:\n\n"
|
472 |
+
|
473 |
+
"1. **Wrap equations inside `equation`**:\n"
|
474 |
+
" ```latex\n"
|
475 |
+
" \\begin{equation}\n"
|
476 |
+
" \\resizebox{0.95\\columnwidth}{!}{$\n"
|
477 |
+
" ... % (Insert the equation here)\n"
|
478 |
+
" $}\n"
|
479 |
+
" \\end{equation}\n"
|
480 |
+
" ```\n"
|
481 |
+
" - **All equations must be enclosed in `\\resizebox{0.95\\columnwidth}{!}{...}`**.\n"
|
482 |
+
" - **Ensure the equation fits within `\\columnwidth`** in two-column layouts.\n\n"
|
483 |
+
|
484 |
+
"2. **For descriptions, simply use plain text with double backslashes, for example:\n"
|
485 |
+
"$f_i(x)$ is the local objective function of node $i$.\\"
|
486 |
+
"$\mathcal{N}_i$ is the set of in-neighbors of node $i$.\\"
|
487 |
+
|
488 |
+
"3. **Ensure proper formatting**:\n"
|
489 |
+
" - **DO NOT use `align`, `multline`, or `split`**—only `equation` with `resizebox`.\n"
|
490 |
+
" - **DO NOT allow formulas to exceed column width**.\n"
|
491 |
+
" - **DO NOT allow any other latex syntax such as"
|
492 |
+
" \\documentclass{article} \\usepackage{amsmath} \\usepackage{graphicx} \\begin{document}** use the plain content with formula.\n"
|
493 |
+
|
494 |
+
" - **Maintain the original refs and ensure that references like [1], [2], [3], do not contain unnecessary backslashes**.\n\n"
|
495 |
+
|
496 |
+
"All generated LaTeX content **must strictly adhere to this structure**."
|
497 |
+
)
|
498 |
+
|
499 |
+
user_prompt = (
|
500 |
+
"Convert the following Markdown content to LaTeX. "
|
501 |
+
f"{content_string}"
|
502 |
+
)
|
503 |
+
|
504 |
+
messages = [
|
505 |
+
{"role": "system", "content": system_prompt},
|
506 |
+
{"role": "user", "content": user_prompt},
|
507 |
+
]
|
508 |
+
|
509 |
+
# Read the OpenAI API key and base URL from environment variables
|
510 |
+
openai_api_key = os.getenv("OPENAI_API_KEY")
|
511 |
+
openai_api_base = os.getenv("OPENAI_API_BASE")
|
512 |
+
|
513 |
+
# Initialize the client
|
514 |
+
client = OpenAI(
|
515 |
+
api_key=openai_api_key,
|
516 |
+
base_url=openai_api_base,
|
517 |
+
)
|
518 |
+
|
519 |
+
chat_response = client.chat.completions.create(
|
520 |
+
model=os.environ.get("MODEL"),
|
521 |
+
max_tokens=2048,
|
522 |
+
temperature=0.5,
|
523 |
+
stop="<|im_end|>",
|
524 |
+
stream=True,
|
525 |
+
messages=messages
|
526 |
+
)
|
527 |
+
|
528 |
+
# Stream the response
|
529 |
+
tex_body = ""
|
530 |
+
for chunk in chat_response:
|
531 |
+
if chunk.choices[0].delta.content:
|
532 |
+
tex_body += chunk.choices[0].delta.content
|
533 |
+
|
534 |
+
# Extract the content between [LaTeX] ... [/LaTeX]
|
535 |
+
pattern = r'\[LaTeX\](.*?)\[/LaTeX\]'
|
536 |
+
match = re.search(pattern, tex_body, re.DOTALL)
|
537 |
+
if match:
|
538 |
+
tex_body = match.group(1).strip()
|
539 |
+
|
540 |
+
# Collapse redundant whitespace
|
541 |
+
tex_body = re.sub(r'\s+', ' ', tex_body).strip()
|
542 |
+
|
543 |
+
# 4) Final assembly
|
544 |
+
final_tex_snippet = latex_heading + "\n\n" + tex_body + "\n"
|
545 |
+
print("Tex snippet:")
|
546 |
+
print(final_tex_snippet)
|
547 |
+
return final_tex_snippet
|
548 |
+
|
549 |
+
def insert_section(tex_path: str, section_content: str):
|
550 |
+
"""
|
551 |
+
Append section_content to the end of the body of the last section (or subsection) in the .tex file.
The logic is:
1. If no \section{...}, \subsection{...}, or \subsubsection{...} is found in the file,
insert section_content right after \end{abstract}.
2. If one or more headings (\section, \subsection, \subsubsection) are found,
insert section_content at the end of the last heading's body (i.e. between it and the next heading / end of file).
3. If there is neither an abstract environment nor any heading, insert before \end{document}.

Parameters:
tex_path: str
Path to the .tex file.
section_content: str
The content string to insert (LaTeX format).

Note:
- This logic **appends** the new content to the end of the last heading's body,
so existing content is not split or displaced.
|
568 |
+
"""
|
569 |
+
|
570 |
+
if not os.path.exists(tex_path):
|
571 |
+
print(f"TeX 文件不存在: {tex_path}")
|
572 |
+
return
|
573 |
+
|
574 |
+
with open(tex_path, 'r', encoding='utf-8') as f:
|
575 |
+
lines = f.readlines()
|
576 |
+
|
577 |
+
# Regexes for headings, abstract, and document
|
578 |
+
# Note that \section, \subsection, and \subsubsection are all matched, so heading line numbers are easy to collect
|
579 |
+
title_pattern = re.compile(r'^(\\section|\\subsection|\\subsubsection)\{[^}]*\}')
|
580 |
+
end_abstract_pattern = re.compile(r'^\\end\{abstract\}')
|
581 |
+
end_document_pattern = re.compile(r'^\\end\{document\}')
|
582 |
+
|
583 |
+
# Collect the line numbers of all headings into a list
|
584 |
+
title_lines = []
|
585 |
+
end_abstract_line = None
|
586 |
+
end_document_line = None
|
587 |
+
|
588 |
+
for i, line in enumerate(lines):
|
589 |
+
if title_pattern.match(line.strip()):
|
590 |
+
title_lines.append(i)
|
591 |
+
elif end_abstract_pattern.match(line.strip()):
|
592 |
+
end_abstract_line = i
|
593 |
+
elif end_document_pattern.match(line.strip()):
|
594 |
+
end_document_line = i
|
595 |
+
|
596 |
+
# The content lines to insert
|
597 |
+
insert_content_lines = section_content.strip().split('\n')
|
598 |
+
|
599 |
+
# If no heading was found
|
600 |
+
if not title_lines:
|
601 |
+
# If \end{abstract} exists, insert right after it
|
602 |
+
if end_abstract_line is not None:
|
603 |
+
insert_idx = end_abstract_line + 1
|
604 |
+
else:
|
605 |
+
# No \end{abstract}; try inserting before \end{document}
|
606 |
+
if end_document_line is not None:
|
607 |
+
insert_idx = end_document_line
|
608 |
+
else:
|
609 |
+
# No \end{document} either; insert at the end of the file
|
610 |
+
insert_idx = len(lines)
|
611 |
+
|
612 |
+
new_lines = (
|
613 |
+
lines[:insert_idx]
|
614 |
+
+ [l + "\n" for l in insert_content_lines]
|
615 |
+
+ lines[insert_idx:]
|
616 |
+
)
|
617 |
+
|
618 |
+
else:
|
619 |
+
# With headings present, append the content to the end of the last heading's body
|
620 |
+
last_title_line = title_lines[-1]
|
621 |
+
|
622 |
+
# Find the next heading's line number (if any), or the \end{document} line, to delimit the body
|
623 |
+
# The last heading's body runs from last_title_line+1 up to next_title_line-1 (or the end)
|
624 |
+
next_boundaries = [end_document_line if end_document_line is not None else len(lines)]
|
625 |
+
for t_line in title_lines:
|
626 |
+
if t_line > last_title_line:
|
627 |
+
next_boundaries.append(t_line)
|
628 |
+
# next_boundary is the first boundary after the last heading (or the end of the file if none)
|
629 |
+
next_boundary = min(next_boundaries) if next_boundaries else len(lines)
|
630 |
+
|
631 |
+
# The new content should go at the very end of the last heading's body, i.e. before next_boundary.
# If the last heading sits at the very end of the file, next_boundary may be the end of file / end of document.
# To avoid pushing the last line down, keep all body lines first, then insert section_content.
|
634 |
+
new_lines = []
|
635 |
+
new_lines.extend(lines[:next_boundary]) # keep everything up to the end of the last body
|
636 |
+
new_lines.extend([l + "\n" for l in insert_content_lines])
|
637 |
+
new_lines.extend(lines[next_boundary:])
|
638 |
+
|
639 |
+
with open(tex_path, 'w', encoding='utf-8') as f:
|
640 |
+
f.writelines(new_lines)
|
641 |
+
|
642 |
+
print("成功插入 section 内容:", tex_path)
|
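# Minimal usage sketch (hypothetical paths; assumes the .tex file already
# contains a preamble and an \end{document} line):
# insert_section("out/template.tex", "\\section{Methods}\nSome body text.")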
643 |
+
|
644 |
+
def md_to_tex(md_path, tex_path, title):
|
645 |
+
"""
|
646 |
+
Convert a Markdown file into a LaTeX file, section by section.

Parameters:
md_path (str): Path to the input Markdown file.
tex_path (str): Path to the output LaTeX file.
title (str): Survey title, passed to postprocess to insert the \title{}.
|
651 |
+
"""
|
652 |
+
sections = search_sections(md_path)
|
653 |
+
section_index = 0
|
654 |
+
while section_index < len(sections):
|
655 |
+
print(f"Converting section {section_index+1}/{len(sections)}")
|
656 |
+
if section_index == 0:
|
657 |
+
tex = abstract_to_tex(sections[section_index])
|
658 |
+
print(tex)
|
659 |
+
elif section_index == len(sections) - 1:
|
660 |
+
postprocess(tex_path, title)
|
661 |
+
tex = references_to_tex(sections[section_index])
|
662 |
+
print(tex)
|
663 |
+
else:
|
664 |
+
tex = md_to_tex_section_without_jpg(sections[section_index])
|
665 |
+
print(tex)
|
666 |
+
insert_section(tex_path, tex)
|
667 |
+
section_index += 1
|
668 |
+
# tex_to_pdf(tex_path, output_dir=os.path.dirname(tex_path), compiler="pdflatex")
|
669 |
+
|
670 |
+
def tex_to_pdf(tex_path, output_dir=None, compiler="xelatex"):
|
671 |
+
"""
|
672 |
+
Compile a LaTeX file into a PDF.

Parameters:
tex_path (str): Path to the input LaTeX file.
output_dir (str): Directory for the output PDF.
compiler (str): LaTeX compiler, defaults to "xelatex".
|
678 |
+
"""
|
679 |
+
if output_dir is None:
|
680 |
+
output_dir = os.path.dirname(tex_path)
|
681 |
+
tex_name = os.path.basename(tex_path)
|
682 |
+
tex_name_no_ext = os.path.splitext(tex_name)[0]
|
683 |
+
pdf_path = os.path.join(output_dir, f"{tex_name_no_ext}.pdf")
|
684 |
+
|
685 |
+
subprocess.run([
|
686 |
+
compiler,
|
687 |
+
"-interaction=nonstopmode",
|
688 |
+
"-output-directory",
|
689 |
+
output_dir,
|
690 |
+
tex_path
|
691 |
+
])
|
692 |
+
|
693 |
+
print(f"PDF 文件已生成: {pdf_path}")
|
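# Usage sketch (assumes a LaTeX toolchain with xelatex on PATH; hypothetical paths):
# tex_to_pdf("out/survey.tex", output_dir="out", compiler="xelatex")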
694 |
+
|
695 |
+
def insert_figures(png_path, tex_path, json_path, ref_names, survey_title, new_tex_path):
|
696 |
+
"""
|
697 |
+
Read the given TeX file (tex_path): first call insert_outline_figure to insert the overview image,
then call insert_tex_images to insert figure environments at the citation markers found in the text.
Finally, write the processed text to new_tex_path.

Parameters:
png_path: Path to the outline image (passed to insert_outline_figure).
tex_path: Path to the original TeX file.
json_path: JSON describing the images (passed to insert_tex_images; maps reference names -> image paths).
ref_names: List of reference names (0-indexed).
survey_title: Caption text for the outline figure.
new_tex_path: Output path for the processed TeX file.
|
708 |
+
"""
|
709 |
+
# 1. Read the original tex file
|
710 |
+
with open(tex_path, 'r', encoding='utf-8') as f:
|
711 |
+
tex_content = f.read()
|
712 |
+
|
713 |
+
# 2. Insert a full-page overview figure before '2 Introduction'
|
714 |
+
updated_tex = insert_outline_figure(
|
715 |
+
png_path=png_path,
|
716 |
+
tex_content=tex_content,
|
717 |
+
survey_title=survey_title
|
718 |
+
)
|
719 |
+
|
720 |
+
# 3. Insert figures at the other citation markers such as [n], \[n], \[n\]
|
721 |
+
updated_tex = insert_tex_images(
|
722 |
+
json_path=json_path,
|
723 |
+
ref_names=ref_names,
|
724 |
+
text=updated_tex
|
725 |
+
)
|
726 |
+
|
727 |
+
# 4. Write the result to the new path
|
728 |
+
with open(new_tex_path, 'w', encoding='utf-8') as f:
|
729 |
+
f.write(updated_tex)
|
730 |
+
|
731 |
+
print(f"已生成新的 TeX 文件: {new_tex_path}")
|
732 |
+
return new_tex_path
|
733 |
+
|
734 |
+
def postprocess(tex_path, new_title):
|
735 |
+
"""
|
736 |
+
Read the given TeX file (tex_path):
1) Insert \title{new_title} on the line before the first \author line.
2) Normalize citation markers such as "\[1\]", "\[1]", and "\[12\]",
as well as mixed forms like "[1\]", stripping the backslashes so they all become [1], [12].
3) Replace math wrapped in \[ \] with \begin{dmath} \end{dmath} (this step is currently disabled below).
Finally, overwrite the original file with the result and return tex_path.
|
742 |
+
"""
|
743 |
+
new_title = 'A Survey of ' + new_title
|
744 |
+
# 1) Read the file lines
|
745 |
+
with open(tex_path, 'r', encoding='utf-8') as f:
|
746 |
+
lines = f.readlines()
|
747 |
+
|
748 |
+
# 2) Find the line containing "\author" and insert \title{...} on the line before it
|
749 |
+
inserted = False
|
750 |
+
for i, line in enumerate(lines):
|
751 |
+
# If this line starts with \author, or contains \author{...}, etc.
|
752 |
+
if line.strip().startswith(r'\author'):
|
753 |
+
# Insert a \title{new_title} line before it
|
754 |
+
lines.insert(i, f'\\title{{{new_title}}}\n')
|
755 |
+
inserted = True
|
756 |
+
break
|
757 |
+
|
758 |
+
if not inserted:
|
759 |
+
# If the whole file has no \author{...}, insert at the end (or the beginning) of the document, as needed
|
760 |
+
print(f"[警告] 未找到 '\\author' 行,未能插入 '\\title{{{new_title}}}'。")
|
761 |
+
# Here we simply append at the end of the document (adjust as needed)
|
762 |
+
lines.append(f'\\title{{{new_title}}}\n')
|
763 |
+
|
764 |
+
# Join the lines into one string for regex replacement
|
765 |
+
text_joined = ''.join(lines)
|
766 |
+
|
767 |
+
# 3) Replace forms like "\[1\]", "\[12]", "[12\]" with "[1]", "[12]"
# Core regex: '(?:\\)?\[(\d+)(?:\\)?\]'
# (?:\\)? ---- an optional backslash
# \[ ---- matches the opening bracket '['
# (\d+) ---- matches and captures one or more digits
# (?:\\)? ---- an optional backslash
# \] ---- matches the closing bracket ']'
|
774 |
+
ref_pattern = re.compile(r'(?:\\)?\[(\d+)(?:\\)?\]')
|
775 |
+
text_processed = ref_pattern.sub(r'[\1]', text_joined)
|
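# e.g. ref_pattern.sub(r'[\1]', r'see \[12\] and [3\]') == 'see [12] and [3]'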
776 |
+
|
777 |
+
# 4) Replace math wrapped in \[ \] with \begin{dmath} \end{dmath} (disabled)
# Example regex: matches anything between \[ ... \] (non-greedy)
# DOTALL lets '.' match newlines
|
780 |
+
# eq_pattern = re.compile(r'\\\[(.*?)\\\]', re.DOTALL)
|
781 |
+
# text_processed = eq_pattern.sub(r'\\begin{dmath}\1\\end{dmath}', text_processed)
|
782 |
+
|
783 |
+
# 5) Write back to the original file
|
784 |
+
with open(tex_path, 'w', encoding='utf-8') as f:
|
785 |
+
f.write(text_processed)
|
786 |
+
|
787 |
+
print(f"[完成] 已在 '{tex_path}' 中插入/追加 \\title{{{new_title}}},替换引用标记并将公式转为 dmath 格式。")
|
788 |
+
return tex_path
|
789 |
+
|
790 |
+
def md_to_tex_to_pdf(md_path, tex_path, pdf_path, png_path, json_path, ref_names, survey_title):
|
791 |
+
"""
|
792 |
+
Convert a Markdown file to a LaTeX file and insert figures;
the final PDF compilation step is currently commented out.

Parameters:
md_path (str): Path to the input Markdown file.
tex_path (str): Path to the output LaTeX file.
pdf_path (str): Path to the output PDF file.
|
798 |
+
"""
|
799 |
+
md_to_tex(md_path, tex_path, survey_title)
|
800 |
+
new_tex_path = insert_figures(png_path, tex_path, json_path, ref_names, survey_title, tex_path)
|
801 |
+
# tex_to_pdf(new_tex_path, output_dir=os.path.dirname(tex_path), compiler="pdflatex")
|
802 |
+
|
803 |
+
if __name__ == "__main__":
|
804 |
+
# Load environment variables
|
805 |
+
dotenv.load_dotenv()
|
806 |
+
# md_path = preprocess_md("src/demo/latex_template/test copy.md", "src/demo/latex_template/test_preprocessed.md")
|
807 |
+
md_path = 'src/static/data/info/undefined/survey_undefined_preprocessed.md'
|
808 |
+
tex_path = "src/static/data/info/undefined/template.tex"
|
809 |
+
md_to_tex(md_path, tex_path, title="A Comprehensive Review of ADMM On Consensus Distributed Optimization")
|
810 |
+
# insert_figures('src/static/data/info/undefined/outline.png',
|
811 |
+
# 'src/demo/latex_template/template.tex',
|
812 |
+
# 'src/static/data/info/undefined/flowchart_results.json',
|
813 |
+
# ['A comprehensive review of recommender systems transitioning from theory to practice', 'A large language model enhanced conversational recommender system'],
|
814 |
+
# 'Survey Title',
|
815 |
+
# 'src/demo/latex_template/template_with_figures.tex')
|
816 |
+
tex_to_pdf(tex_path, output_dir=os.path.dirname(tex_path), compiler="xelatex")
|
src/demo/asg_loader.py
ADDED
@@ -0,0 +1,256 @@
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
import json
|
4 |
+
import subprocess
|
5 |
+
from langchain_community.document_loaders import UnstructuredMarkdownLoader
|
6 |
+
from langchain_core.documents import Document
|
7 |
+
import shutil
import glob
from concurrent.futures import ProcessPoolExecutor
|
8 |
+
|
9 |
+
class DocumentLoading:
|
10 |
+
def convert_pdf_to_md(self, pdf_file, output_dir="output", method="auto"):
|
11 |
+
base_name = os.path.splitext(os.path.basename(pdf_file))[0]
|
12 |
+
target_dir = os.path.join(output_dir, base_name)
|
13 |
+
md_file_path = os.path.join(target_dir, method, f"{base_name}.md")
|
14 |
+
print("The md file path is: ", md_file_path)
|
15 |
+
|
16 |
+
if os.path.exists(md_file_path):
|
17 |
+
print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
|
18 |
+
return
|
19 |
+
|
20 |
+
command = ["magic-pdf", "-p", pdf_file, "-o", output_dir, "-m", method]
|
21 |
+
try:
|
22 |
+
subprocess.run(command, check=True)
|
23 |
+
# Check whether the Markdown file was generated
|
24 |
+
if not os.path.exists(md_file_path):
|
25 |
+
print(f"Conversion failed: Markdown file not found at {md_file_path}. Cleaning up folder...")
|
26 |
+
shutil.rmtree(target_dir) # remove the generated folder
|
27 |
+
else:
|
28 |
+
print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
|
29 |
+
except subprocess.CalledProcessError as e:
|
30 |
+
print(f"An error occurred during conversion: {e}")
|
31 |
+
# If an error occurred and the folder was created, remove it
|
32 |
+
if os.path.exists(target_dir):
|
33 |
+
print(f"Cleaning up incomplete folder: {target_dir}")
|
34 |
+
shutil.rmtree(target_dir)
|
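# Usage sketch (hypothetical path; requires the magic-pdf CLI to be installed):
# DocumentLoading().convert_pdf_to_md("papers/example.pdf", output_dir="output", method="auto")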
35 |
+
# New variant: batch-convert every PDF in a directory
|
36 |
+
def convert_pdf_to_md_new(self, pdf_dir, output_dir="output", method="auto"):
|
37 |
+
pdf_files = glob.glob(os.path.join(pdf_dir, "*.pdf"))
|
38 |
+
|
39 |
+
for pdf_file in pdf_files:
|
40 |
+
base_name = os.path.splitext(os.path.basename(pdf_file))[0]
|
41 |
+
target_dir = os.path.join(output_dir, base_name)
|
42 |
+
|
43 |
+
if os.path.exists(target_dir):
|
44 |
+
print(f"Folder for {pdf_file} already exists in {output_dir}. Skipping conversion.")
|
45 |
+
else:
|
46 |
+
command = ["magic-pdf", "-p", pdf_file, "-o", output_dir, "-m", method]
|
47 |
+
try:
|
48 |
+
subprocess.run(command, check=True)
|
49 |
+
print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
|
50 |
+
except subprocess.CalledProcessError as e:
|
51 |
+
print(f"An error occurred: {e}")
|
52 |
+
|
53 |
+
def batch_convert_pdfs(self, pdf_files, output_dir="output", method="auto", max_workers=None):
|
54 |
+
# Create a process pool to run the conversion in parallel
|
55 |
+
with ProcessPoolExecutor(max_workers=max_workers) as executor:
|
56 |
+
# Submit each PDF file to the process pool for conversion
|
57 |
+
futures = [executor.submit(self.convert_pdf_to_md, pdf, output_dir, method) for pdf in pdf_files]
|
58 |
+
|
59 |
+
# Optionally, you can monitor the status of each future as they complete
|
60 |
+
for future in futures:
|
61 |
+
try:
|
62 |
+
future.result() # This will raise any exceptions that occurred during the processing
|
63 |
+
except Exception as exc:
|
64 |
+
print(f"An error occurred during processing: {exc}")
|
65 |
+
|
66 |
+
def extract_information_from_md(self, md_text):
|
67 |
+
title_match = re.search(r'^(.*?)(\n\n|\Z)', md_text, re.DOTALL)
|
68 |
+
title = title_match.group(1).strip() if title_match else "N/A"
|
69 |
+
|
70 |
+
authors_match = re.search(
|
71 |
+
r'\n\n(.*?)(\n\n[aA][\s]*[bB][\s]*[sS][\s]*[tT][\s]*[rR][\s]*[aA][\s]*[cC][\s]*[tT][^\n]*\n\n)',
|
72 |
+
md_text,
|
73 |
+
re.DOTALL
|
74 |
+
)
|
75 |
+
authors = authors_match.group(1).strip() if authors_match else "N/A"
|
76 |
+
|
77 |
+
abstract_match = re.search(
|
78 |
+
r'(\n\n[aA][\s]*[bB][\s]*[sS][\s]*[tT][\s]*[rR][\s]*[aA][\s]*[cC][\s]*[tT][^\n]*\n\n)(.*?)(\n\n|\Z)',
|
79 |
+
md_text,
|
80 |
+
re.DOTALL
|
81 |
+
)
|
82 |
+
abstract = abstract_match.group(0).strip() if abstract_match else "N/A"
|
83 |
+
abstract = re.sub(r'^[aA]\s*[bB]\s*[sS]\s*[tT]\s*[rR]\s*[aA]\s*[cC]\s*[tT][^\w]*', '', abstract)
|
84 |
+
abstract = re.sub(r'^[^a-zA-Z]*', '', abstract)
|
85 |
+
|
86 |
+
introduction_match = re.search(
|
87 |
+
r'\n\n([1I][\.\- ]?\s*)?[Ii]\s*[nN]\s*[tT]\s*[rR]\s*[oO]\s*[dD]\s*[uU]\s*[cC]\s*[tT]\s*[iI]\s*[oO]\s*[nN][\.\- ]?\s*\n\n(.*?)'
|
88 |
+
r'(?=\n\n(?:([2I][I]|\s*2)[^\n]*?\n\n|\n\n(?:[2I][I][^\n]*?\n\n)))',
|
89 |
+
md_text,
|
90 |
+
re.DOTALL
|
91 |
+
)
|
92 |
+
introduction = introduction_match.group(2).strip() if introduction_match else "N/A"
|
93 |
+
|
94 |
+
main_content_match = re.search(
|
95 |
+
r'(.*?)(\n\n([3I][\.\- ]?\s*)?[Rr][Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss][^\n]*\n\n|\Z)',
|
96 |
+
md_text,
|
97 |
+
re.DOTALL
|
98 |
+
)
|
99 |
+
|
100 |
+
if main_content_match:
|
101 |
+
main_content = main_content_match.group(1).strip()
|
102 |
+
else:
|
103 |
+
main_content = "N/A"
|
104 |
+
|
105 |
+
extracted_data = {
|
106 |
+
"title": title,
|
107 |
+
"authors": authors,
|
108 |
+
"abstract": abstract,
|
109 |
+
"introduction": introduction,
|
110 |
+
"main_content": main_content
|
111 |
+
}
|
112 |
+
return extracted_data
|
113 |
+
|
114 |
+
def process_md_file(self, md_file_path, survey_id):
|
115 |
+
loader = UnstructuredMarkdownLoader(md_file_path)
|
116 |
+
data = loader.load()
|
117 |
+
assert len(data) == 1, "Expected exactly one document in the markdown file."
|
118 |
+
assert isinstance(data[0], Document), "The loaded data is not of type Document."
|
119 |
+
extracted_text = data[0].page_content
|
120 |
+
|
121 |
+
extracted_data = self.extract_information_from_md(extracted_text)
|
122 |
+
if len(extracted_data["abstract"]) < 10:
|
123 |
+
extracted_data["abstract"] = extracted_data['title']
|
124 |
+
|
125 |
+
title = os.path.splitext(os.path.basename(md_file_path))[0]
|
126 |
+
title_new = title.strip()
|
127 |
+
invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
|
128 |
+
for char in invalid_chars:
|
129 |
+
title_new = title_new.replace(char, ' ')
|
130 |
+
|
131 |
+
os.makedirs(f'./src/static/data/txt/{survey_id}', exist_ok=True)
|
132 |
+
with open(f'./src/static/data/txt/{survey_id}/{title_new}.json', 'w', encoding='utf-8') as f:
|
133 |
+
json.dump(extracted_data, f, ensure_ascii=False, indent=4)
|
134 |
+
return extracted_data['introduction']
|
135 |
+
|
136 |
+
def process_md_file_full(self, md_file_path, survey_id):
|
137 |
+
loader = UnstructuredMarkdownLoader(md_file_path)
|
138 |
+
data = loader.load()
|
139 |
+
assert len(data) == 1, "Expected exactly one document in the markdown file."
|
140 |
+
assert isinstance(data[0], Document), "The loaded data is not of type Document."
|
141 |
+
extracted_text = data[0].page_content
|
142 |
+
|
143 |
+
extracted_data = self.extract_information_from_md(extracted_text)
|
144 |
+
if len(extracted_data["abstract"]) < 10:
|
145 |
+
extracted_data["abstract"] = extracted_data['title']
|
146 |
+
|
147 |
+
title = os.path.splitext(os.path.basename(md_file_path))[0]
|
148 |
+
title_new = title.strip()
|
149 |
+
invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
|
150 |
+
for char in invalid_chars:
|
151 |
+
title_new = title_new.replace(char, ' ')
|
152 |
+
|
153 |
+
os.makedirs(f'./src/static/data/txt/{survey_id}', exist_ok=True)
|
154 |
+
with open(f'./src/static/data/txt/{survey_id}/{title_new}.json', 'w', encoding='utf-8') as f:
|
155 |
+
json.dump(extracted_data, f, ensure_ascii=False, indent=4)
|
156 |
+
return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
|
157 |
+
|
158 |
+
|
159 |
+
def load_pdf(self, pdf_file, survey_id, mode):
|
160 |
+
os.makedirs(f'./src/static/data/md/{survey_id}', exist_ok=True)
|
161 |
+
output_dir = f"./src/static/data/md/{survey_id}"
|
162 |
+
base_name = os.path.splitext(os.path.basename(pdf_file))[0]
|
163 |
+
target_dir = os.path.join(output_dir, base_name, "auto")
|
164 |
+
|
165 |
+
# 1. Convert PDF to markdown if the folder doesn't exist
|
166 |
+
self.convert_pdf_to_md(pdf_file, output_dir)
|
167 |
+
|
168 |
+
# 2. Process the markdown file in the output directory
|
169 |
+
md_file_path = os.path.join(target_dir, f"{base_name}.md")
|
170 |
+
if not os.path.exists(md_file_path):
|
171 |
+
raise FileNotFoundError(f"Markdown file {md_file_path} does not exist. Conversion might have failed.")
|
172 |
+
|
173 |
+
if mode == "intro":
|
174 |
+
return self.process_md_file(md_file_path, survey_id)
|
175 |
+
elif mode == "full":
|
176 |
+
return self.process_md_file_full(md_file_path, survey_id)
|
177 |
+
|
178 |
+
# Known issue: not working correctly yet; still being tested
|
179 |
+
def load_pdf_new(self, pdf_dir, survey_id):
|
180 |
+
os.makedirs(f'./src/static/data/md/{survey_id}', exist_ok=True)
|
181 |
+
output_dir = f"./src/static/data/md/{survey_id}"
|
182 |
+
self.convert_pdf_to_md_new(pdf_dir, output_dir)
|
183 |
+
markdown_files = glob.glob(os.path.join(output_dir, "*", "auto", "*.md"))
|
184 |
+
all_introductions = []
|
185 |
+
|
186 |
+
for md_file_path in markdown_files:
|
187 |
+
try:
|
188 |
+
introduction = self.process_md_file(md_file_path, survey_id)
|
189 |
+
all_introductions.append(introduction)
|
190 |
+
except FileNotFoundError as e:
|
191 |
+
print(f"Markdown file {md_file_path} does not exist. Conversion might have failed.")
|
192 |
+
|
193 |
+
return all_introductions
|
194 |
+
|
195 |
+
|
196 |
+
|
197 |
+
def parallel_load_pdfs(self, pdf_files, survey_id, mode="intro", max_workers=4):  # mode parameter added; the "intro" default is an assumption
|
198 |
+
with ProcessPoolExecutor(max_workers=max_workers) as executor:
|
199 |
+
# Submit tasks for parallel execution
|
200 |
+
futures = [executor.submit(self.load_pdf, pdf, survey_id, mode) for pdf in pdf_files]
|
201 |
+
|
202 |
+
# Collect results
|
203 |
+
for future in futures:
|
204 |
+
try:
|
205 |
+
result = future.result()
|
206 |
+
print(f"Processed result: {result}")
|
207 |
+
except Exception as e:
|
208 |
+
print(f"Error processing PDF: {e}")
|
209 |
+
|
210 |
+
def ensure_non_empty_introduction(self, introduction, full_text):
|
211 |
+
"""
|
212 |
+
Ensure introduction is not empty. If empty, replace with full text.
|
213 |
+
"""
|
214 |
+
if introduction == "N/A" or len(introduction.strip()) < 50:
|
215 |
+
return full_text.strip()
|
216 |
+
return introduction
|
217 |
+
|
218 |
+
def extract_information_from_md_new(self, md_text):
|
219 |
+
# Title extraction
|
220 |
+
title_match = re.search(r'^(.*?)(\n\n|\Z)', md_text, re.DOTALL)
|
221 |
+
title = title_match.group(1).strip() if title_match else "N/A"
|
222 |
+
|
223 |
+
# Authors extraction
|
224 |
+
authors_match = re.search(
|
225 |
+
r'\n\n(.*?)(\n\n[aA][\s]*[bB][\s]*[sS][\s]*[tT][\s]*[rR][\s]*[aA][\s]*[cC][\s]*[tT][^\n]*\n\n)',
|
226 |
+
md_text,
|
227 |
+
re.DOTALL
|
228 |
+
)
|
229 |
+
authors = authors_match.group(1).strip() if authors_match else "N/A"
|
230 |
+
|
231 |
+
# Abstract extraction
|
232 |
+
abstract_match = re.search(
|
233 |
+
r'(\n\n[aA][\s]*[bB][\s]*[sS][\s]*[tT][\s]*[rR][\s]*[aA][\s]*[cC][\s]*[tT][^\n]*\n\n)(.*?)(\n\n|\Z)',
|
234 |
+
md_text,
|
235 |
+
re.DOTALL
|
236 |
+
)
|
237 |
+
abstract = abstract_match.group(0).strip() if abstract_match else "N/A"
|
238 |
+
abstract = re.sub(r'^[aA]\s*[bB]\s*[sS]\s*[tT]\s*[rR]\s*[aA]\s*[cC]\s*[tT][^\w]*', '', abstract)
|
239 |
+
abstract = re.sub(r'^[^a-zA-Z]*', '', abstract)
|
240 |
+
|
241 |
+
# Introduction extraction
|
242 |
+
introduction_match = re.search(
|
243 |
+
r'\n\n([1I][\.\- ]?\s*)?[Ii]\s*[nN]\s*[tT]\s*[rR]\s*[oO]\s*[dD]\s*[uU]\s*[cC]\s*[tT]\s*[iI]\s*[oO]\s*[nN][\.\- ]?\s*\n\n(.*?)',
|
244 |
+
md_text, re.DOTALL
|
245 |
+
)
|
246 |
+
introduction = introduction_match.group(2).strip() if introduction_match else "N/A"
|
247 |
+
|
248 |
+
# Ensure introduction is not empty
|
249 |
+
introduction = self.ensure_non_empty_introduction(introduction, md_text)
|
250 |
+
|
251 |
+
return {
|
252 |
+
"title": title,
|
253 |
+
"authors": authors,
|
254 |
+
"abstract": abstract,
|
255 |
+
"introduction": introduction
|
256 |
+
}
|
src/demo/asg_mindmap.py
ADDED
@@ -0,0 +1,302 @@
|
1 |
+
import json
|
2 |
+
import re
|
3 |
+
import textwrap
|
4 |
+
from graphviz import Digraph
|
5 |
+
import os
|
6 |
+
|
7 |
+
def wrap_text(text, max_chars):
|
8 |
+
"""
|
9 |
+
Wrap the text automatically so that each line has at most max_chars characters.
|
10 |
+
"""
|
11 |
+
return textwrap.fill(text, width=max_chars)
|
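# e.g. wrap_text("a b c d", 3) == "a b\nc d"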
12 |
+
|
13 |
+
def parse_md_refs(md_content):
|
14 |
+
"""
|
15 |
+
Parse Markdown content and extract the references for headings in x.y.z format.

For each matching section, every reference of the form [number] in its content
is extracted, de-duplicated, sorted in ascending numeric order, and joined into
a reference string such as "[1,2,3]".

When an undesired header (e.g. "6 Future Directions" or "7 Conclusion") is reached,
parsing stops, so the last section only contains content up to that header.

Returns a dict mapping section numbers (e.g. "3.1.1") to reference strings (e.g. "[1,2,3]").
|
24 |
+
"""
|
25 |
+
ref_dict = {}
|
26 |
+
|
27 |
+
# Process the Markdown content (split into lines)
|
28 |
+
lines = md_content.split("\n") if md_content else []
|
29 |
+
|
30 |
+
# Match leaf headings that start with x.y.z (e.g. "5.1.1 Neural Topic...")
|
31 |
+
section_header_regex = re.compile(r'^\s*#+\s*(\d+\.\d+\.\d+).*')
|
32 |
+
# Match undesired headers such as "6 Future Directions" or "7 Conclusion"
|
33 |
+
undesired_header_regex = re.compile(r'^\s*#+\s*(6 Future Directions|7 Conclusion)\b')
|
34 |
+
# Match references like [number]
|
35 |
+
ref_pattern = re.compile(r'\[(\d+)\]')
|
36 |
+
|
37 |
+
current_section = None
|
38 |
+
current_content = []
|
39 |
+
|
40 |
+
for line in lines:
|
41 |
+
# On an undesired header, stop; the current section is finalized after the loop
|
42 |
+
if undesired_header_regex.match(line):
|
43 |
+
break
|
44 |
+
|
45 |
+
header_match = section_header_regex.match(line)
|
46 |
+
if header_match:
|
47 |
+
# Finalize the previous section
|
48 |
+
if current_section is not None:
|
49 |
+
all_refs = [int(num) for content_line in current_content for num in ref_pattern.findall(content_line)]
|
50 |
+
if all_refs:
|
51 |
+
ref_dict[current_section] = "[" + ",".join(map(str, sorted(set(all_refs)))) + "]"
|
52 |
+
|
53 |
+
# Start a new current section
|
54 |
+
current_section = header_match.group(1)
|
55 |
+
current_content = []
|
56 |
+
else:
|
57 |
+
if current_section is not None:
|
58 |
+
current_content.append(line)
|
59 |
+
|
60 |
+
# Finalize the last section
|
61 |
+
if current_section is not None and current_content:
|
62 |
+
all_refs = [int(num) for content_line in current_content for num in ref_pattern.findall(content_line)]
|
63 |
+
if all_refs:
|
64 |
+
ref_dict[current_section] = "[" + ",".join(map(str, sorted(set(all_refs)))) + "]"
|
65 |
+
|
66 |
+
return ref_dict
|
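# Illustrative example (hypothetical Markdown input):
# parse_md_refs("### 3.1.1 Topic\nSee [2] and [1].\n") == {"3.1.1": "[1,2]"}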
67 |
+
|
68 |
+
def generate_graphviz_png(json_path, output_png_path, md_content=None, title="Document Outline", max_root_chars=20):
|
69 |
+
"""
|
70 |
+
Read the outline from a JSON file, build a tree structure, and render the mindmap as a PNG image.

If md_content is provided, references for x.y.z-style headings are extracted from the
Markdown; when rendering the mindmap, the label of each leaf node (no children, title
starting with x.y.z) gets a newline followed by its reference info (e.g. "[1,2,3]"),
with the references numerically sorted.

Only the root node text is wrapped to limit the root's maximum width;
all other nodes keep their original text.

Parameters:
json_path: Path to the JSON file containing the outline
output_png_path: Output PNG path (without extension)
md_content: Markdown content (string), optional
title: Title used for the mindmap root node, defaults to "Document Outline"
max_root_chars: Maximum characters per line for the root node, defaults to 20
|
86 |
+
"""
|
87 |
+
# Parse references from the Markdown content
|
88 |
+
ref_dict = parse_md_refs(md_content) if md_content else {}
|
89 |
+
|
90 |
+
# Read the JSON outline
|
91 |
+
with open(json_path, "r", encoding="utf-8") as f:
|
92 |
+
data = json.load(f)
|
93 |
+
|
94 |
+
outline_str = data.get("outline", "")
|
95 |
+
|
96 |
+
# Parse items of the form [level, 'title']
|
97 |
+
pattern = re.compile(r"\[(\d+),\s*'([^']+)'\]")
|
98 |
+
items = pattern.findall(outline_str)
|
99 |
+
items = [(int(level), title) for level, title in items]
|
100 |
+
|
101 |
+
# Heading keywords to exclude
|
102 |
+
undesired_keywords = {"Abstract", "Introduction", "Future Directions", "Conclusion"}
|
103 |
+
# Filter out unwanted items
|
104 |
+
filtered_items = [
|
105 |
+
(lvl, title) for lvl, title in items
|
106 |
+
if not re.match(r"^\d+\s+(.+)", title) or re.match(r"^\d+\s+(.+)", title).group(1) not in undesired_keywords
|
107 |
+
]
|
108 |
+
|
109 |
+
# Build the tree structure
|
110 |
+
tree = []
|
111 |
+
stack = []
|
112 |
+
for lvl, title_item in filtered_items:
|
113 |
+
node = {"title": title_item, "children": []}
|
114 |
+
while stack and lvl <= stack[-1][0]:
|
115 |
+
stack.pop()
|
116 |
+
if stack:
|
117 |
+
stack[-1][1]["children"].append(node)
|
118 |
+
else:
|
119 |
+
tree.append(node)
|
120 |
+
stack.append((lvl, node))
|
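# e.g. the items [(1, 'A'), (2, 'A.1'), (1, 'B')] yield two top-level nodes:
# 'A' (with child 'A.1') and 'B'.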
121 |
+
|
122 |
+
# Render the mindmap
|
123 |
+
dot = Digraph(comment=title, format='png', engine='dot')
|
124 |
+
dot.graph_attr.update(rankdir='LR', splines='ortho', bgcolor='white', dpi="150")
|
125 |
+
dot.attr('node', shape='box', style='rounded,filled', fillcolor='white', color='gray')
|
126 |
+
dot.edge_attr.update(arrowhead='none', color="black")
|
127 |
+
|
128 |
+
# Handle the root node
|
129 |
+
wrapped_title = wrap_text(title, max_root_chars)
|
130 |
+
dot.node('root', label=wrapped_title, shape='ellipse', style='filled', fillcolor='lightgray')
|
131 |
+
|
132 |
+
node_counter = [0]
|
133 |
+
section_pattern = re.compile(r'^(\d+\.\d+\.\d+)')
|
134 |
+
|
135 |
+
def add_nodes(node, parent_id):
|
136 |
+
current_id = f'node_{node_counter[0]}'
|
137 |
+
node_counter[0] += 1
|
138 |
+
safe_label = node['title'].replace('"', r'\"')
|
139 |
+
|
140 |
+
# For leaf nodes whose title starts with x.y.z, append reference info if available
|
141 |
+
if not node["children"]:
|
142 |
+
m = section_pattern.match(safe_label)
|
143 |
+
if m:
|
144 |
+
section_id = m.group(1)
|
145 |
+
if section_id in ref_dict:
|
146 |
+
safe_label += "\n" + ref_dict[section_id]
|
147 |
+
|
148 |
+
dot.node(current_id, label=safe_label)
|
149 |
+
dot.edge(parent_id, current_id)
|
150 |
+
for child in node.get("children", []):
|
151 |
+
add_nodes(child, current_id)
|
152 |
+
|
153 |
+
for top_node in tree:
|
154 |
+
add_nodes(top_node, "root")
|
155 |
+
|
156 |
+
dot.render(output_png_path, cleanup=True)
|
157 |
+
print("生成 PNG 文件:", output_png_path + ".png")
|
158 |
+
return output_png_path + ".png"
|
159 |
+
|
160 |
+
|
161 |
+
|
162 |
+
def insert_outline_image(png_path, md_content, survey_title):
|
163 |
+
"""
|
164 |
+
Find the line containing "2 Introduction" in the given Markdown content string,
then insert the HTML block for the outline image before it, making sure the
HTML block is separated from the surrounding Markdown by blank lines.

Parameters:
png_path: Path of the PNG image to insert, used as the img src attribute.
md_content: The Markdown content string.
survey_title: Survey title used in the figure caption.

The inserted HTML looks like:

<div style="text-align:center">
<img src="{png_path}" alt="Outline" style="width:100%;"/>
</div>
<div style="text-align:center">
Fig 1. The outline of the {survey_title}
</div>

Returns the updated Markdown content string.
|
183 |
+
"""
|
184 |
+
|
185 |
+
# Split the Markdown content into lines (keeping line breaks)
|
186 |
+
lines = md_content.splitlines(keepends=True)
|
187 |
+
print(lines)
|
188 |
+
|
189 |
+
# Find the index of the line containing "2 Introduction"
|
190 |
+
intro_index = None
|
191 |
+
for i, line in enumerate(lines):
|
192 |
+
if '2 Introduction' in line:
|
193 |
+
intro_index = i
|
194 |
+
break
|
195 |
+
|
196 |
+
if intro_index is None:
|
197 |
+
print("没有找到 '2 Introduction' 这一行!")
|
198 |
+
return md_content
|
199 |
+
|
200 |
+
# Replace backslashes in the path with forward slashes
|
201 |
+
png_path_fixed = png_path.replace("\\", "/")
|
202 |
+
|
203 |
+
# Build the HTML block to insert, with blank lines before and after
|
204 |
+
html_snippet = (
|
205 |
+
"\n\n" # 添加换行确保与上文/下文分隔
|
206 |
+
f'<div style="text-align:center">\n'
|
207 |
+
f' <img src="{png_path_fixed}" alt="Outline" style="width:100%;"/>\n'
|
208 |
+
f'</div>\n'
|
209 |
+
f'<div style="text-align:center">\n'
|
210 |
+
f' Fig 1. The outline of the {survey_title}\n'
|
211 |
+
f'</div>\n'
|
212 |
+
"\n" # 再添加一个空行确保与下方内容分隔
|
213 |
+
)
|
214 |
+
|
215 |
+
print(f"将在第 {intro_index} 行插入如下 HTML 代码块(插入在 '2 Introduction' 之前):\n{html_snippet}")
|
216 |
+
|
217 |
+
# Insert html_snippet before the "2 Introduction" line
|
218 |
+
lines.insert(intro_index, html_snippet)
|
219 |
+
|
220 |
+
# Join all lines into the updated Markdown content
|
221 |
+
updated_md = "".join(lines)
|
222 |
+
|
223 |
+
print("已在 Markdown 内容中插入 outline 图片。")
|
224 |
+
return updated_md
|
225 |
+
|
226 |
+
def insert_outline_figure(png_path, tex_content, survey_title):
|
227 |
+
"""
|
228 |
+
Find the line containing "Introduction" in the given TeX content string,
then insert a two-column-spanning figure* environment with a full-page image before it.
It generates a LaTeX snippet like:

\begin{figure*}[htbp]
\centering
\includegraphics[width=\textwidth]{path/to/xxx.png}
\caption{Fig 1. The outline of the XXX}
\end{figure*}

Parameters:
png_path: Path of the PNG image to insert
tex_content: The TeX content string
survey_title: Survey title used in the figure caption

Returns:
The updated TeX content string
|
245 |
+
"""
|
246 |
+
|
247 |
+
# Split the TeX content into lines (keeping line breaks)
|
248 |
+
lines = tex_content.splitlines(keepends=True)
|
249 |
+
|
250 |
+
# Find the index of the line containing "Introduction"
|
251 |
+
intro_index = None
|
252 |
+
for i, line in enumerate(lines):
|
253 |
+
if 'Introduction' in line:
|
254 |
+
intro_index = i
|
255 |
+
break
|
256 |
+
|
257 |
+
# If not found, return the original content unchanged
|
258 |
+
if intro_index is None:
|
259 |
+
print("没有找到 'Introduction' 这一行,未执行插入。")
|
260 |
+
return tex_content
|
261 |
+
|
262 |
+
# Build the TeX figure* block.
# Use [p] or [htbp] for full-page placement; adjust to the layout as needed.
# A plain \begin{figure} ... \end{figure} works instead if spanning both columns is not needed.
|
265 |
+
figure_block = (
|
266 |
+
"\n" # 加一个空行,确保与上文分隔
|
267 |
+
"\\begin{figure*}[htbp]\n"
|
268 |
+
" \\centering\n"
|
269 |
+
f" \\includegraphics[width=\\textwidth]{{{png_path}}}\n"
|
270 |
+
f" \\caption{{The outline of our survey: {survey_title}}}\n"
|
271 |
+
"\\end{figure*}\n\n" # 再留一个空行分隔
|
272 |
+
)
|
273 |
+
|
274 |
+
# Insert the figure environment before the located 'Introduction' line
|
275 |
+
lines.insert(intro_index, figure_block)
|
276 |
+
|
277 |
+
# Rejoin all lines
|
278 |
+
updated_tex = "".join(lines)
|
279 |
+
return updated_tex
|
280 |
+
# Usage example:
|
281 |
+
# if __name__ == "__main__":
|
282 |
+
# png_path = 'src/static/data/info/test_4/outline.png'
|
283 |
+
# md_content = ''
|
284 |
+
# survey_title = "My Survey Title"
|
285 |
+
# updated_md = insert_outline_image(png_path, md_content, survey_title)
|
286 |
+
# --------------------------
|
287 |
+
# Usage example
|
288 |
+
# --------------------------
|
289 |
+
if __name__ == "__main__":
|
290 |
+
json_path = os.path.join("src", "static", "data", "txt", 'test_2', "outline.json")
|
291 |
+
output_png_path = os.path.join("src", "static", "data", "info", 'test_2', "outline")
|
292 |
+
md_path = os.path.join("src", "static", "data", "info", 'test_2', f"survey_{'test_2'}_processed.md")
|
293 |
+
flowchart_results_path = os.path.join("src", "static", "data", "info", 'test_2', "flowchart_results.json")
|
294 |
+
png_path = generate_graphviz_png(
|
295 |
+
json_path=json_path,
|
296 |
+
output_png_path=output_png_path,
|
297 |
+
md_content=open(md_path, "r", encoding="utf-8").read(), # generate_graphviz_png expects md_content, not md_path
|
298 |
+
title='test',
|
299 |
+
max_root_chars=30
|
300 |
+
)
|
301 |
+
|
302 |
+
# generate_graphviz_png(json_file_path, output_png_file, md_file_path, title=mindmap_title, max_root_chars=20)
|
src/demo/asg_outline.py
ADDED
@@ -0,0 +1,1029 @@
|
1 |
+
import transformers
|
2 |
+
import torch
|
3 |
+
import os
|
4 |
+
import json
|
5 |
+
import re
|
6 |
+
import ast
|
7 |
+
from .survey_generator_api import *
|
8 |
+
from .asg_abstract import AbstractGenerator
|
9 |
+
from .asg_conclusion import ConclusionGenerator
|
10 |
+
from .asg_retriever import *
|
11 |
+
import pandas as pd # 'pd' avoids shadowing the 'df' DataFrame arguments used below
|
12 |
+
from .references import generate_references
|
13 |
+
|
14 |
+
|
15 |
+
class OutlineGenerator():
|
16 |
+
def __init__(self, pipeline, df, cluster_names, mode='desp'):
|
17 |
+
self.pipeline = pipeline
|
18 |
+
# self.pipeline.model.load_adapter("technicolor/llama3.1_8b_outline_generation")
|
19 |
+
self.pipeline.model.set_adapter("outline")
|
20 |
+
self.df = df
|
21 |
+
self.cluster = [{'label': i, 'name': cluster_names[i]} for i in range(len(cluster_names))]
|
22 |
+
self._add_cluster_info()
|
23 |
+
self.mode = mode
|
24 |
+
|
25 |
+
def __init__(self, df, cluster_names, mode='desp'): # without local LLMs; note: this definition overrides the pipeline-based __init__ above
|
26 |
+
self.df = df
|
27 |
+
self.cluster = [{'label': i, 'name': cluster_names[i]} for i in range(len(cluster_names))]
|
28 |
+
self._add_cluster_info()
|
29 |
+
self.mode = mode
|
30 |
+
|
31 |
+
def _add_cluster_info(self):
|
32 |
+
label_to_info = {label: self.df[self.df['label'] == label] for label in range(len(self.cluster))}
|
33 |
+
for cluster in self.cluster:
|
34 |
+
cluster['info'] = label_to_info[cluster['label']]
|
35 |
+
|
36 |
+
def get_cluster_info(self):
|
37 |
+
return self.cluster
|
38 |
+
|
39 |
+
def generate_claims(self):
|
40 |
+
result = []
|
41 |
+
if self.mode == 'desp':
|
42 |
+
for i in range(len(self.cluster)):
|
43 |
+
cluster = self.cluster[i]
|
44 |
+
claims = ''
|
45 |
+
for j in range(len(cluster['info'])):
|
46 |
+
claims = cluster['info'].iloc[j]['retrieval_result'] + '\n' + claims
|
47 |
+
# claims = cluster['info'].iloc[j]['ref_title'] + '\n' + claims
|
48 |
+
result.append(claims)
|
49 |
+
else:
|
50 |
+
for i in range(len(self.cluster)):
|
51 |
+
cluster = self.cluster[i]
|
52 |
+
claims = ''
|
53 |
+
data = cluster['info']
|
54 |
+
for j in range(len(data)):
|
55 |
+
entry = data.iloc[j]
|
56 |
+
title = entry['title']
|
57 |
+
abstract = entry['abstract']
|
58 |
+
prompt = f'''
|
59 |
+
Title:
|
60 |
+
{title}
|
61 |
+
Abstract:
|
62 |
+
{abstract}
|
63 |
+
Task:
|
64 |
+
Conclude new findings and null findings from the abstract in one sentence in the atomic format. Do not separate
|
65 |
+
new findings and null findings. The finding must be relevant to the title. Do not include any other information.
|
66 |
+
Definition:
|
67 |
+
A scientific claim is an atomic verifiable statement expressing a finding about one aspect of a scientific entity or
|
68 |
+
process, which can be verified from a single source.'''
|
69 |
+
|
70 |
+
messages = [
|
71 |
+
{"role": "system", "content": "You are a helpful assistant."},
|
72 |
+
{"role": "user", "content": prompt},
|
73 |
+
]
|
74 |
+
|
75 |
+
outputs = self.pipeline(
|
76 |
+
messages,
|
77 |
+
max_new_tokens=256,
|
78 |
+
)
|
79 |
+
claim = outputs[0]["generated_text"][-1]['content']
|
80 |
+
# print(claim)
|
81 |
+
# print('+++++++++++++++++++++++++++++++++')
|
82 |
+
claims = claims + '\n' + claim
|
83 |
+
result.append(claims)
|
84 |
+
return result
|
85 |
+
|
86 |
+
|
87 |
+
def generate_claims_qwen(self):
|
88 |
+
"""
|
89 |
+
Generate claims for each cluster using Qwen API.
|
90 |
+
|
91 |
+
Returns:
|
92 |
+
list: A list of strings, where each string contains the claims generated
|
93 |
+
for a cluster.
|
94 |
+
"""
|
95 |
+
result = []
|
96 |
+
openai_api_key = os.getenv("OPENAI_API_KEY")
|
97 |
+
openai_api_base = os.getenv("OPENAI_API_BASE")
|
98 |
+
client = OpenAI(
|
99 |
+
api_key=openai_api_key,
|
100 |
+
base_url=openai_api_base,
|
101 |
+
)
|
102 |
+
|
103 |
+
for i in range(len(self.cluster)):
|
104 |
+
cluster = self.cluster[i]
|
105 |
+
claims = ''
|
106 |
+
data = cluster['info']
|
107 |
+
|
108 |
+
for j in range(len(data)):
|
109 |
+
entry = data.iloc[j]
|
110 |
+
title = entry['title']
|
111 |
+
abstract = entry['abstract']
|
112 |
+
|
113 |
+
# Construct the prompt for Qwen
|
114 |
+
prompt = f'''
|
115 |
+
Title:
|
116 |
+
{title}
|
117 |
+
Abstract:
|
118 |
+
{abstract}
|
119 |
+
Task:
|
120 |
+
Conclude new findings and null findings from the abstract in one sentence in the atomic format. Do not separate
|
121 |
+
new findings and null findings. The finding must be relevant to the title. Do not include any other information.
|
122 |
+
Definition:
|
123 |
+
A scientific claim is an atomic verifiable statement expressing a finding about one aspect of a scientific entity or
|
124 |
+
process, which can be verified from a single source.
|
125 |
+
'''
|
126 |
+
|
127 |
+
# Define the input for Qwen
|
128 |
+
messages = [
|
129 |
+
{"role": "system", "content": "You are a helpful assistant."},
|
130 |
+
{"role": "user", "content": prompt},
|
131 |
+
]
|
132 |
+
|
133 |
+
try:
|
134 |
+
# Call Qwen API
|
135 |
+
chat_response = client.chat.completions.create(
|
136 |
+
model=os.environ.get("MODEL"),
|
137 |
+
max_tokens=512,
|
138 |
+
temperature=0.5,
|
139 |
+
messages=messages
|
140 |
+
)
|
141 |
+
|
142 |
+
# Extract the generated claim from the (non-streaming) response
claim = chat_response.choices[0].message.content or ""
|
147 |
+
|
148 |
+
# Clean and append the claim
|
149 |
+
claims = claims + '\n' + claim.strip()
|
150 |
+
# print("Generated claim:", claim)
|
151 |
+
# print("+++++++++++++++++++++++++++++++++")
|
152 |
+
|
153 |
+
except Exception as e:
|
154 |
+
print(f"Error generating claim for entry {j} in cluster {i}: {e}")
|
155 |
+
continue
|
156 |
+
|
157 |
+
result.append(claims)
|
158 |
+
|
159 |
+
return result
|
160 |
+
|
161 |
+
def generate_outline(self, survey_title):
|
162 |
+
claims = self.generate_claims()
|
163 |
+
cluster_with_claims = ""
|
164 |
+
for i in range(len(self.cluster)):
|
165 |
+
cluster = self.cluster[i]
|
166 |
+
cluster_with_claims = cluster_with_claims + f'Cluster {i}: {cluster["name"]}\n' + "Descriptions for entities in this cluster: \n" + claims[i] + '\n\n'
|
167 |
+
# system_prompt = f'''
|
168 |
+
# You are a helpful assistant who is helping a researcher to generate an outline for a survey paper.
|
169 |
+
# The references used by this survey paper have been clustered into different categories.
|
170 |
+
# The researcher will provides you with the title of the survey paper
|
171 |
+
# together with the cluster names and the descriptions for entities in each cluster.
|
172 |
+
# '''
|
173 |
+
system_prompt = f'''Generate the outline of the survey paper following the format of the example : [[1, '1 Introduction'], [1, '2 Perturbations of (co)differentials'], [2, '2.1 Derivations of the tensor algebra'], [more sections...]].\
|
174 |
+
The first element in the sub-list refers to the hierarchy of the section name (from 1 to 3). Sections like Introduction and Conclusion should have the highest level (1)\
|
175 |
+
The second element in the sub-list refers to the section name.
|
176 |
+
'''
|
177 |
+
|
178 |
+
example_json = {"title":"A Survey of Huebschmann and Stasheff's Paper: Formal Solution of the Master Equation via HPT and Deformation Theory","outline":[{"title":"1 Introduction","outline":[]},{"title":"2 Perturbations of (co)differentials","outline":[{"title":"2.1 Derivations of the tensor algebra","outline":[]},{"title":"2.2 Coderivations of the tensor coalgebra","outline":[]},{"title":"2.3 Coderivations of the symmetric coalgebra","outline":[]},{"title":"2.4 DGLA\u2019s and perturbations of the codifferential","outline":[]},{"title":"2.5 Strongly homotopy Lie algebras","outline":[]},{"title":"2.6 The Hochschild chain complex and DGA\u2019s","outline":[]},{"title":"2.7 Strongly homotopy associative algebras","outline":[]}]},{"title":"3 Master equation","outline":[]},{"title":"4 Twisting cochain","outline":[{"title":"4.1 Differential on Hom","outline":[]},{"title":"4.2 Cup product and cup bracket","outline":[]},{"title":"4.3 Twisting cochain","outline":[]}]},{"title":"5 Homological perturbation theory (HPT)","outline":[{"title":"5.1 Contraction","outline":[]},{"title":"5.2 The first main theorem.","outline":[]}]},{"title":"6 Corollaries and the second main theorem","outline":[{"title":"6.1 Other corollaries of Theorem\u00a01.","outline":[]},{"title":"6.2 The second main theorem","outline":[]}]},{"title":"7 Differential Gerstenhaber and BV algebras","outline":[{"title":"7.1 Differential Gerstenhaber algebras","outline":[]},{"title":"7.2 Differential BV algebras","outline":[]},{"title":"7.3 Formality","outline":[{"title":"7.3.1 Formality of differential graded P\ud835\udc43Pitalic_P-algebras","outline":[]},{"title":"7.3.2 Examples","outline":[]}]},{"title":"7.4 Differential BV algebras and formality","outline":[]}]},{"title":"8 Deformation theory","outline":[]},{"title":"References","outline":[]}]}
|
179 |
+
# user_prompt = {"survey_title":survey_title, "claims":cluster_with_claims}
|
180 |
+
user_prompt = f'''Generate the outline of the survey paper given the title:{survey_title}, and three lists of sentences describing each cluster of the references used by this survey:{cluster_with_claims}'''
|
181 |
+
|
182 |
+
messages = [
|
183 |
+
{"role": "system", "content": system_prompt},
|
184 |
+
{"role": "user", "content": user_prompt},
|
185 |
+
{"role": "assistant", "content":"[[1, '1 Abstract'], [1, '2 Introduction'], "}
|
186 |
+
]
|
187 |
+
|
188 |
+
outputs = self.pipeline(
|
189 |
+
messages,
|
190 |
+
max_new_tokens=9192,
|
191 |
+
)
|
192 |
+
result = outputs[0]["generated_text"][-1]['content']
|
193 |
+
|
194 |
+
self.pipeline.model.disable_adapters()
|
195 |
+
|
196 |
+
return messages, result
|
197 |
+
|
198 |
+
    def generate_outline_qwen(self, survey_title, cluster_num=3):
        claims = self.generate_claims()
        cluster_with_claims = ""
        cluster_names = []
        for i in range(cluster_num):  # changed to cluster_num
            cluster = self.cluster[i]
            cluster_with_claims += f'Cluster {i}: {cluster["name"]}\nDescriptions for reference papers in this cluster:\n{claims[i]}\n\n'
            cluster_names.append(cluster["name"])
        # system_prompt = f'''
        # You are a helpful assistant who is helping a researcher to generate an outline for a survey paper.
        # The references used by this survey paper have been clustered into different categories.
        # The researcher will provide you with the title of the survey paper
        # together with the cluster names and the descriptions for entities in each cluster.
        # '''
        system_prompt = f'''Finish the outline of the survey paper following the format of the example: [[1, '1 Introduction'], [1, '2 Perturbations of (co)differentials'], [2, '2.1 Derivations of the tensor algebra'], [3, '2.2.1 ...']......].\
        The first element in the sub-list refers to the hierarchy of the section name (from 1 to 3). Sections like Introduction and Conclusion should have the highest level (1).\
        The second element in the sub-list refers to the section name.
        You are required to finish the second and third level subsection names under [1, '3 <Cluster 0's name>'], [1, '4 <Cluster 1's name>'] and [1, '5 <Cluster 2's name>'].
        You must not generate more than *3* third level subsections for each second level subsection; for example, [3, '3.1.4 xxx'], [3, '3.1.5 xxx'] are not allowed.
        *Try to conclude the main findings of each cluster in the second and third level subsections, use highly abstract terms and phrases to describe*
        *Do not include colons, e.g. AutoSurvey: Large Language Models Can Automatically Write Surveys should be written as Large Language Models in Writing Surveys*
        '''
        # user_prompt = {"survey_title":survey_title, "claims":cluster_with_claims}
        cluster_sections = "\n".join([f"[1, '{i+3} {cluster_names[i]}'], [level 2 and 3 sections to finish...]" for i in range(cluster_num)])

        user_prompt = f'''Finish the outline of the survey paper given the title: {survey_title}, and lists of sentences describing each cluster of the references used by this survey:\n{cluster_with_claims}
        The first level sections' hierarchy is given: [[1, '1 Abstract'], [1, '2 Introduction'], {cluster_sections}, [1, '{cluster_num+3} Future Directions'], [1, '{cluster_num+4} Conclusion']].
        You are required to finish the second and third level subsections under each cluster section with [2, 'a.b xxx'] and [3, 'a.b.c xxx'].
        You must not generate more than *3* third level subsections for each second level subsection; for example, [3, '3.1.4 xxx'], [3, '3.1.5 xxx'] are not allowed.
        *Try to conclude the main findings of each cluster in the second and third level subsections, use highly abstract terms and phrases to describe*
        *Do not include colons, e.g. AutoSurvey: Large Language Models Can Automatically Write Surveys should be written as Large Language Models in Writing Surveys*
        '''

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        openai_api_key = os.getenv("OPENAI_API_KEY")
        openai_api_base = os.getenv("OPENAI_API_BASE")
        client = OpenAI(
            # defaults to os.environ.get("OPENAI_API_KEY")
            api_key=openai_api_key,
            base_url=openai_api_base,
        )
        chat_response = client.chat.completions.create(
            model=os.environ.get("MODEL"),
            max_tokens=2048,
            temperature=0.5,
            stop="<|im_end|>",
            stream=True,
            messages=messages
        )
        # Stream the response into a single string
        text = ""
        for chunk in chat_response:
            if chunk.choices[0].delta.content:
                text += chunk.choices[0].delta.content
        # print('The response is :', text)
        pattern = r'\[(.*)\]'
        match = re.search(pattern, text, re.DOTALL)  # re.DOTALL lets '.' match newlines
        text = match.group(1)
        clean_text = re.sub(r'\s+', ' ', text).strip()
        return messages, clean_text

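# Illustrative note (not part of the original file): `clean_text` above is the
# inner body of the model's bracketed list, so callers re-wrap it before
# parsing, roughly:
#
#   import ast
#   ast.literal_eval("[" + "[1, '1 Abstract'], [1, '2 Introduction']" + "]")
#   # -> [[1, '1 Abstract'], [1, '2 Introduction']]
#
# parseOutline() below performs this re-wrapping and parsing for saved outlines.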
def parseOutline(survey_id):
    file_path = f'./src/static/data/txt/{survey_id}/outline.json'
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except Exception as e:
        print(f"Error loading JSON file {file_path}: {e}")
        return []

    response = data.get('outline', '')
    if not response:
        print("No outline content found in JSON.")
        return []

    # Extract the content between the first '[' and the last ']' in the text
    def extract_first_last(text):
        first_match = re.search(r'\[', text)
        last_match = re.search(r'\](?!.*\])', text)  # negative lookahead finds the last ']'
        if first_match and last_match:
            return '[' + text[first_match.start() + 1:last_match.start()] + ']'
        return None

    response_extracted = extract_first_last(response)
    if not response_extracted:
        print("Failed to extract a valid list string from the outline content.")
        return []

    # Check whether the extracted string is a list of lists (it should start with "[[")
    fixed_str = response_extracted.strip()
    if not fixed_str.startswith("[["):
        # If not, strip the original outer brackets and re-wrap as [[ ... ]].
        # Note: this assumes the inner structure is several comma-separated lists,
        # not a single list.
        fixed_str = "[[" + fixed_str[1:-1] + "]]"
        # Alternatively, depending on the data, simply wrap another outer bracket:
        # fixed_str = "[" + fixed_str + "]"

    try:
        outline_list = ast.literal_eval(fixed_str)
    except Exception as e:
        print(f"Error converting extracted outline to a list.\nExtracted text: {fixed_str}\nError: {e}")
        return []

    # If the result is not a list, convert it to one
    if not isinstance(outline_list, list):
        outline_list = list(outline_list)

    # If the parsed result is a single list (e.g. [a, b, c]) rather than a list of lists, wrap it
    if outline_list and not all(isinstance(item, list) for item in outline_list):
        outline_list = [outline_list]

    result = []
    for item in outline_list:
        result.append(item)
    return result


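# A minimal sketch of the round trip (illustrative; "demo_survey" is a
# hypothetical id). Given an outline.json such as
#   {"outline": "[[1, '1 Abstract'], [1, '2 Introduction'], [2, '2.1 Scope']]"}
# the call
#   parseOutline("demo_survey")
# returns
#   [[1, '1 Abstract'], [1, '2 Introduction'], [2, '2.1 Scope']]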
def generateOutlineHTML_qwen(survey_id):
    outline_list = parseOutline(survey_id)
    html = '''
    <div class="container-fluid w-50 d-flex flex-column justify-content-center align-items-center">

    <style>
        /* Styles for each heading level */
        .level-1 {
            font-size: 20px;
            font-weight: bold;
            position: relative;
            padding-right: 40px; /* leave room for the arrow */
        }
        .level-2 {
            font-size: 18px;
            padding-left: 40px;
        }
        .level-3 {
            font-size: 16px;
            padding-left: 80px;
        }
        .list-group-item {
            border: none;
        }

        /* Custom card styles */
        .custom-card {
            background-color: #fff;
            border-radius: 8px;
            padding: 20px;
            margin-top: 20px;
            width: 100%;
            max-width: 800px;
            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1),
                        0 6px 20px rgba(0, 0, 0, 0.1);
        }

        /* Custom card body styles */
        .custom-card-body {
            padding: 20px;
        }

        /* Collapse icon styles */
        .collapse-icon {
            background: none;
            border: none;
            padding: 0;
            position: absolute;
            right: 10px;
            top: 50%;
            transform: translateY(-50%) rotate(0deg);
            cursor: pointer;
            font-size: 16px;
            /* rotation transition */
            transition: transform 0.2s;
        }
        /* Remove the focus outline on the button */
        .collapse-icon:focus {
            outline: none;
        }
        /* Rotate the icon when the section is expanded */
        .collapsed .collapse-icon {
            transform: translateY(-50%) rotate(0deg);
        }
        .in .collapse-icon {
            transform: translateY(-50%) rotate(90deg);
        }
    </style>

    <div class="custom-card">
        <div class="custom-card-body" id="display-outline">
            <ul class="list-group list-group-flush">
    '''

    # Default level-1 heading entries
    default_items = []

    # Merge the default items with the parsed outline list
    combined_list = default_items + outline_list

    # Build a tree structure so we can tell whether a level-1 heading has subheadings
    def build_outline_tree(outline_list):
        sections = []
        stack = []
        for level, content in outline_list:
            level = int(level)
            node = {'level': level, 'content': content, 'subitems': []}
            if level == 1:
                sections.append(node)
                stack = [node]
            elif level == 2:
                if stack:
                    parent = stack[-1]
                    parent['subitems'].append(node)
                    # stack.append(node)
                else:
                    sections.append(node)
            elif level == 3:
                if stack:
                    parent = stack[-1]
                    parent['subitems'].append(node)
                else:
                    sections.append(node)
        return sections

    sections = build_outline_tree(combined_list)

    # Generate the HTML
    def generate_html_from_sections(sections):
        html = ''
        section_index = 1  # used to generate unique IDs

        def generate_node_html(node):
            nonlocal section_index
            level = node['level']
            content = node['content']
            has_subitems = len(node['subitems']) > 0
            if level == 1:
                # Level-1 heading
                if has_subitems:
                    # With subheadings, add a toggle icon and collapse behavior
                    section_id = f"outline_collapseSection{section_index}"
                    section_index += 1
                    node_html = f'''
                    <li class="list-group-item level-1">
                        {content}
                        <a class="collapsed" data-toggle="collapse" data-target="#{section_id}" aria-expanded="true" aria-controls="{section_id}">
                            ▶ <!-- right arrow marks the collapsed state -->
                        </a>
                        <ul class="list-group collapse in" id="{section_id}">
                    '''
                    for subitem in node['subitems']:
                        node_html += generate_node_html(subitem)
                    node_html += '''
                        </ul>
                    </li>
                    '''
                else:
                    # Without subheadings, no toggle icon is shown
                    node_html = f'''
                    <li class="list-group-item level-1">
                        {content}
                    </li>
                    '''
            elif level == 2:
                node_html = f'<li class="list-group-item level-2">{content}</li>'
            elif level == 3:
                # Level-3 headings render directly; nesting was handled at level 2
                node_html = f'<li class="list-group-item level-3">{content}</li>'
            return node_html

        for section in sections:
            html += generate_node_html(section)

        return html

    def generate_list_html(combined_list, editable=True):
        html = '<ul class="list-group list-group-flush">\n'  # open the <ul>
        for level, content in combined_list:
            # Add the class matching each level
            if level == 1:  # level-1 inputs are disabled
                if editable:
                    html += f'<li class="list-group-item level-1"><input type="text" class="form-control" value="{content}" disabled></li>\n'
                else:
                    html += f'<li class="list-group-item level-1">{content}</li>\n'
            elif level == 2:
                if editable:
                    html += f'<li class="list-group-item level-2" style="padding-left: 20px;"><input type="text" class="form-control" value="{content}"></li>\n'
                else:
                    html += f'<li class="list-group-item level-2" style="padding-left: 20px;">{content}</li>\n'
            elif level == 3:
                if editable:
                    html += f'<li class="list-group-item level-3" style="padding-left: 40px;"><input type="text" class="form-control" value="{content}"></li>\n'
                else:
                    html += f'<li class="list-group-item level-3" style="padding-left: 40px;">{content}</li>\n'
        html += '</ul>'  # close the </ul>
        return html

    # Generate the list HTML
    list_html = generate_list_html(combined_list)
    html += generate_html_from_sections(sections)

    html += f'''
            </ul>
        </div>
        <div class="custom-card-body" style="display: none" id="edit-outline">
            {list_html}
        </div>
        <button type="button" class="btn btn-secondary btn-lg" id="edit-btn" onclick="editOutline()"><i class="bi bi-pen"></i></button>
        <button type="button" class="btn btn-success btn-lg" id="confirm-btn" style="display: none;" onclick="confirmOutline()"><i class="bi bi-check"></i></button>
    </div>
    <!-- Bootstrap v3.3.0 JavaScript for the collapse behavior -->
    <script>
    $(document).ready(function(){{
        $('.collapsed').click(function(){{
            $(this).toggleClass('collapsed');
        }});
    }});
    </script>

    </div>
    '''

    html += '''
    <script>
    // Switch to edit mode
    function editOutline() {
        document.getElementById("display-outline").style.display = "none"; // hide the read-only view
        document.getElementById("edit-outline").style.display = "block"; // show the editable view

        // Show the "Confirm" button and hide the "Edit" button
        document.getElementById("edit-btn").style.display = "none";
        document.getElementById("confirm-btn").style.display = "inline-block";
    }

    // Confirm the edits and submit the data
    function confirmOutline() {
        const outlineData = []; // payload submitted to the backend

        // Iterate over every editable input
        document.querySelectorAll("#edit-outline .list-group-item").forEach((item) => {
            const level = item.classList.contains("level-1") ? 1 :
                          item.classList.contains("level-2") ? 2 : 3; // determine the level
            const content = item.querySelector("input").value.trim(); // read the input value

            // Store each row as [level, content]
            outlineData.push([level, content]);
        });

        console.log("Submitting to backend:", outlineData); // log the payload for debugging

        // Submit the data to the backend via AJAX
        const csrftoken = getCookie("csrftoken"); // fetch the CSRF token
        fetch("/save_outline/", {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                "X-CSRFToken": csrftoken, // Django's CSRF token
            },
            body: JSON.stringify({ outline: outlineData }) // serialize to a JSON string
        })
        .then((response) => response.json())
        .then((data) => {
            if (data.status === "success") {
                $('#sections_').html(data.html);
                alert("Outline updated successfully!");
            } else {
                alert("Error updating outline: " + data.message);
            }
        })
        .catch((error) => {
            console.error("Error:", error);
            alert("Error updating outline. Please check the console for details.");
        });
    }
    </script>
    '''
    return html

def insert_section(content, section_header, section_content):
    """
    Find the line in content that starts with section_header and insert
    section_content right after it.
    section_header: the heading name, e.g. "Abstract" or "Conclusion"
    section_content: the content to insert (a string)
    """
    # The regex makes the dot after the section number optional
    pattern = re.compile(
        r'(^#\s+\d+\.?\s+' + re.escape(section_header) + r'\s*$)',
        re.MULTILINE | re.IGNORECASE
    )
    replacement = r'\1\n\n' + section_content + '\n'
    new_content, count = pattern.subn(replacement, content)
    if count == 0:
        print(f"Warning: heading '{section_header}' not found; content was not inserted.")
    return new_content

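# Quick illustration (not in the original file) of the heading pattern matched
# above: a Markdown H1 of the form "# <number>[.] <header>".
#
#   doc = "# 1 Abstract\n\n# 2 Introduction\n"
#   insert_section(doc, "Abstract", "This survey reviews ...")
#   # -> roughly "# 1 Abstract\n\nThis survey reviews ...\n\n# 2 Introduction\n"
#
# A heading like "# 2. Introduction" (with a dot after the number) matches as well.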
def generateOutlineHTML(survey_id):
    outline_list = parseOutline(survey_id)
    html = '''
    <div class="container-fluid w-50 d-flex flex-column justify-content-center align-items-center">

    <style>
        /* Styles for each heading level */
        .level-1 {
            font-size: 20px;
            font-weight: bold;
            position: relative;
            padding-right: 40px; /* leave room for the arrow */
        }
        .level-2 {
            font-size: 18px;
            padding-left: 40px;
        }
        .level-3 {
            font-size: 16px;
            padding-left: 80px;
        }
        .list-group-item {
            border: none;
        }

        /* Custom card styles */
        .custom-card {
            background-color: #fff;
            border-radius: 8px;
            padding: 20px;
            margin-top: 20px;
            width: 100%;
            max-width: 800px;
            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1),
                        0 6px 20px rgba(0, 0, 0, 0.1);
        }

        /* Custom card body styles */
        .custom-card-body {
            padding: 20px;
        }

        /* Collapse icon styles */
        .collapse-icon {
            background: none;
            border: none;
            padding: 0;
            position: absolute;
            right: 10px;
            top: 50%;
            transform: translateY(-50%) rotate(0deg);
            cursor: pointer;
            font-size: 16px;
            /* rotation transition */
            transition: transform 0.2s;
        }
        /* Remove the focus outline on the button */
        .collapse-icon:focus {
            outline: none;
        }
        /* Rotate the icon when the section is expanded */
        .collapsed .collapse-icon {
            transform: translateY(-50%) rotate(0deg);
        }
        .in .collapse-icon {
            transform: translateY(-50%) rotate(90deg);
        }
    </style>

    <div class="custom-card">
        <div class="custom-card-body">
            <ul class="list-group list-group-flush">
    '''

    # Default level-1 heading entries
    default_items = [[1, '1 Abstract'], [1, '2 Introduction']]

    # Merge the default items with the parsed outline list
    combined_list = default_items + outline_list

    # Build a tree structure so we can tell whether a level-1 heading has subheadings
    def build_outline_tree(outline_list):
        sections = []
        stack = []
        for level, content in outline_list:
            level = int(level)
            node = {'level': level, 'content': content, 'subitems': []}
            if level == 1:
                sections.append(node)
                stack = [node]
            elif level == 2:
                if stack:
                    parent = stack[-1]
                    parent['subitems'].append(node)
                    # stack.append(node)
                else:
                    sections.append(node)
            elif level == 3:
                if stack:
                    parent = stack[-1]
                    parent['subitems'].append(node)
                else:
                    sections.append(node)
        return sections

    sections = build_outline_tree(combined_list)

    # Generate the HTML
    def generate_html_from_sections(sections):
        html = ''
        section_index = 1  # used to generate unique IDs

        def generate_node_html(node):
            nonlocal section_index
            level = node['level']
            content = node['content']
            has_subitems = len(node['subitems']) > 0
            if level == 1:
                # Level-1 heading
                if has_subitems:
                    # With subheadings, add a toggle icon and collapse behavior
                    section_id = f"outline_collapseSection{section_index}"
                    section_index += 1
                    node_html = f'''
                    <li class="list-group-item level-1">
                        {content}
                        <a class="collapsed" data-toggle="collapse" data-target="#{section_id}" aria-expanded="true" aria-controls="{section_id}">
                            ▶ <!-- right arrow marks the collapsed state -->
                        </a>
                        <ul class="list-group collapse in" id="{section_id}">
                    '''
                    for subitem in node['subitems']:
                        node_html += generate_node_html(subitem)
                    node_html += '''
                        </ul>
                    </li>
                    '''
                else:
                    # Without subheadings, no toggle icon is shown
                    node_html = f'''
                    <li class="list-group-item level-1">
                        {content}
                    </li>
                    '''
            elif level == 2:
                node_html = f'<li class="list-group-item level-2">{content}</li>'
            elif level == 3:
                # Level-3 headings render directly; nesting was handled at level 2
                node_html = f'<li class="list-group-item level-3">{content}</li>'
            return node_html

        for section in sections:
            html += generate_node_html(section)

        return html

    html += generate_html_from_sections(sections)

    html += '''
            </ul>
        </div>
    </div>
    <!-- Bootstrap v3.3.0 JavaScript for the collapse behavior -->
    <script>
    $(document).ready(function(){
        // Toggle the arrow direction
        $('.collapsed').click(function(){
            $(this).toggleClass('collapsed');
        });
    });
    </script>
    </div>
    '''
    return html

def insert_section(content, section_header, section_content):
    """
    Find the line in content that starts with section_header and insert
    section_content right after it.
    section_header: the heading name, e.g. "Abstract" or "Conclusion"
    section_content: the content to insert (a string)
    """
    # The regex makes the dot after the section number optional
    pattern = re.compile(
        r'(^#\s+\d+\.?\s+' + re.escape(section_header) + r'\s*$)',
        re.MULTILINE | re.IGNORECASE
    )
    replacement = r'\1\n\n' + section_content + '\n'
    new_content, count = pattern.subn(replacement, content)
    if count == 0:
        print(f"Warning: heading '{section_header}' not found; content was not inserted.")
    return new_content

def generateSurvey(survey_id, title, collection_list, pipeline):
    outline = parseOutline(survey_id)
    default_items = [[1, '1 Abstract'], [1, '2 Introduction'], [1, '3 Overview']]
    outline = str(default_items + outline)

    client = getQwenClient()

    context_list = generate_context_list(outline, collection_list)

    temp = {
        "survey_id": survey_id,
        "outline": str(default_items),
        "survey_title": title,
        "context": context_list,
        "abstract": "",
        "introduction": "",
        "content": "",
        "conclusion": "",
        "references": ""
    }

    generated_survey_paper = generate_survey_paper_new(outline, context_list, client)

    generated_introduction = generate_introduction(generated_survey_paper, client)
    # print("\nGenerated Introduction:\n", generated_introduction)

    abs_generator = AbstractGenerator(pipeline)
    abstract = abs_generator.generate(title, generated_introduction)
    con_generator = ConclusionGenerator(pipeline)
    conclusion = con_generator.generate(title, generated_introduction)

    abstract = abstract.replace("Abstract:", "")
    conclusion = conclusion.replace("Conclusion:", "")

    temp["abstract"] = abstract
    temp["introduction"] = generated_introduction
    temp["content"] = generated_survey_paper
    temp["conclusion"] = conclusion

    temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
    temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])

    output_path = f'./src/static/data/txt/{survey_id}/generated_result.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(temp, f, ensure_ascii=False, indent=4)
    print(f"Survey has been saved to {output_path}.")

    return

def generate_future_directions_qwen(client, title, intro):
    system_prompt = f'''You are a helpful assistant that helps to generate the future directions of a survey paper given the survey title and survey introduction.'''
    # user_prompt = {"survey_title":survey_title, "claims":cluster_with_claims}
    user_prompt = f'''Help me to generate the future directions of a survey paper given the title: *{title}*, and the introduction: {intro} within 300 words.'''

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": "Future Directions:"}
    ]
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )
    chat_response = client.chat.completions.create(
        model=os.environ.get("MODEL"),
        max_tokens=768,
        temperature=0.5,
        stop="<|im_end|>",
        stream=True,
        messages=messages
    )
    # Stream the response into a single string
    text = ""
    for chunk in chat_response:
        if chunk.choices[0].delta.content:
            text += chunk.choices[0].delta.content
    return text

def generateSurvey_qwen(survey_id, title, collection_list, pipeline):
    outline = str(parseOutline(survey_id))

    client = getQwenClient()

    context_list = generate_context_list(outline, collection_list)

    temp = {
        "survey_id": survey_id,
        "outline": outline,
        "survey_title": title,
        "context": context_list,
        "abstract": "",
        "introduction": "",
        "content": "",
        "future_directions": "",
        "conclusion": "",
        "references": ""
    }

    generated_survey_paper = generate_survey_paper_new(title, outline, context_list, client)
    # print("Generated Survey Paper:\n", generated_survey_paper)

    generated_introduction = generate_introduction(generated_survey_paper, client)
    # print("\nGenerated Introduction:\n", generated_introduction)
    abs_generator = AbstractGenerator(pipeline)
    abstract = abs_generator.generate(title, generated_introduction)
    con_generator = ConclusionGenerator(pipeline)
    # conclusion = con_generator.generate(title, generated_introduction)
    # New version: 12/03
    conclusion = generate_conclusion(generated_survey_paper, client)
    abstract = abstract.replace("Abstract:", "")
    conclusion = conclusion.replace("Conclusion:", "")
    # future_directions = generate_future_directions_qwen(client, title, generated_introduction).replace("Future Directions:","")
    # New version: 12/03
    future_directions = generate_future_work(generated_survey_paper, client)
    # references = generate_references_dir('./src/static/data/txt/'+survey_id)
    temp["abstract"] = abstract
    temp["introduction"] = generated_introduction
    temp["content"] = generated_survey_paper
    temp["conclusion"] = conclusion
    temp["future_directions"] = future_directions
    # temp["references"] = "\n\n".join([f"{ref}" for i, ref in enumerate(references)])
    temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
    temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
    temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
    output_path = f'./src/static/data/txt/{survey_id}/generated_result.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(temp, f, ensure_ascii=False, indent=4)
    print(f"Survey has been saved to {output_path}.")
    return

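# Sketch (not in the original file) of how the saved result can be read back;
# the JSON fields mirror the `temp` dict populated above ("test" is the
# survey_id used in the __main__ example at the bottom of this file):
#
#   with open('./src/static/data/txt/test/generated_result.json', encoding='utf-8') as f:
#       result = json.load(f)
#   print(result["survey_title"])    # title passed to generateSurvey_qwen
#   print(result["abstract"][:200])  # generated abstract, "Abstract:" prefix stripped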
# wza
def generateSurvey_qwen_new(survey_id, title, collection_list, pipeline, citation_data_list):
    outline = str(parseOutline(survey_id))
    client = getQwenClient()
    context_list = generate_context_list(outline, collection_list)

    temp = {
        "survey_id": survey_id,
        "outline": outline,
        "survey_title": title,
        "context": context_list,
        "abstract": "",
        "introduction": "",
        "content": "",
        "future_directions": "",
        "conclusion": "",
        "references": ""
    }

    # Pass citation_data_list into generate_survey_paper_new
    generated_survey_paper = generate_survey_paper_new(title, outline, context_list, client, citation_data_list)

    generated_introduction = generate_introduction_alternate(title, generated_survey_paper, client)
    # generated_introduction = introduction_with_citations(generated_introduction, citation_data_list)
    # print("\nGenerated Introduction:\n", generated_introduction)
    # abs_generator = AbstractGenerator(pipeline)
    # abstract = abs_generator.generate(title, generated_introduction)
    abstract = generate_abstract(generated_survey_paper, client)
    # con_generator = ConclusionGenerator(pipeline)
    # conclusion = con_generator.generate(title, generated_introduction)
    conclusion = generate_conclusion(generated_survey_paper, client)
    abstract = abstract.replace("Abstract:", "")
    conclusion = conclusion.replace("Conclusion:", "")
    # future_directions = generate_future_directions_qwen(client, title, generated_introduction).replace("Future Directions:","")
    # New version: 12/03
    future_directions = generate_future_work(generated_survey_paper, client)
    # references = generate_references_dir('./src/static/data/txt/'+survey_id)
    temp["abstract"] = abstract
    temp["introduction"] = generated_introduction
    temp["content"] = generated_survey_paper
    temp["conclusion"] = conclusion
    temp["future_directions"] = future_directions
    # temp["references"] = "\n\n".join([f"{ref}" for i, ref in enumerate(references)])
    temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
    temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
    temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
    output_path = f'./src/static/data/txt/{survey_id}/generated_result.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(temp, f, ensure_ascii=False, indent=4)
    print(f"Survey has been saved to {output_path}.")
    return


def generate_references_dir(dir):
    client = getQwenClient()
    papers_info = []
    for file in os.listdir(dir):
        if file.endswith(".json"):
            file_path = os.path.join(dir, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            papers_info.append({
                "file_path": file_path,
                "title": data.get("title", "Unknown Title"),
                "authors": data.get("authors", "Unknown Author")
            })
    print("The length of papers_info is: ", len(papers_info))
    references = generate_references(papers_info, client)
    return references

if __name__ == '__main__':
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

    context = '''
Many paradigms have been proposed to assess informativeness of data samples for active learning. One of the popular approaches is selecting the most uncertain data sample, i.e. the data sample in which the current classifier is least confident. Some other approaches are selecting the sample which yields a model with minimum risk or the data sample which yields fastest convergence in gradient based methods.//
An active under-sampling approach is presented in this paper to change the data distribution of training datasets, and improve the classification accuracy of minority classes while maintaining overall classification performance.//
In this paper, we propose an uncertainty-based active learning algorithm which requires only samples of one class and a set of unlabeled data in order to operate.//
The principal contribution of our work is twofold: First, we use Bayes’ rule and density estimation to avoid the need to have a model of all classes for computing the uncertainty measure.//
This technique reduces the number of input parameters of the problem. At the rest of this paper, we first review recent related works in the fields of active learning and active one-class learning (section II).//
The classifier predicts that all the samples are non-fraud, it will have a quite high accuracy. However, for problems like fraud detection, minority class classification accuracy is more critical.//
The algorithm used and the features selected are always the key points at design time, and many experiments are needed to select the final algorithm and the best suited feature set.//
Active learning works by selecting among unlabeled data, the most informative data sample. The informativeness of a sample is the amount of accuracy gain achieved after adding it to the training set.//
Some other approaches are selecting the sample which yields a model with minimum risk or the data sample which yields fastest convergence in gradient based methods.//
In this paper, we propose a novel approach reducing each within group error, BABoost, that is a variant of AdaBoost.//
Simulations on different unbalanced distribution data and experiments performed on several real datasets show that the new method is able to achieve a lower within group error.//
Active learning with early stopping can achieve a faster and scalable solution without sacrificing prediction performance.//
We also propose an efficient Support Vector Machine (SVM) active learning strategy which queries a small pool of data at each iterative step instead of querying the entire dataset.//
The second part consists of applying a treatment method and inducing a classifier for each class distribution.//
This time we measured the percentage of the performance loss that was recovered by the treatment method.//
We used two well-known over-sampling methods, random over-sampling and SMOTE.//
We tested our proposed technique on a sample of three representative functional genomic problems: splice site, protein subcellular localization and phosphorylation site prediction problems.//
Among the possible PTMs, phosphorylation is the most studied and perhaps the most important.//
The second part consists of applying a treatment method and inducing a classifier for each class distribution.//
We show that Active Learning (AL) strategy can be a more efficient alternative to resampling methods to form a balanced training set for the learner in early stages of the learning.//
    '''

    collection_list = ['activelearningfrompositiveandunlabeleddata', ]

    Global_pipeline = transformers.pipeline(
        "text-generation",
        model=model_id,
        model_kwargs={"torch_dtype": torch.bfloat16},
        token=os.getenv('HF_API_KEY'),
        device_map="auto",
    )
    Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_outline_generation", adapter_name="outline")
    Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_conclusion_generation", adapter_name="conclusion")
    Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_abstract_generation", adapter_name="abstract")

    generateSurvey("test", "Predictive modeling of imbalanced data", collection_list, Global_pipeline)
src/demo/asg_query.py
ADDED
@@ -0,0 +1,326 @@
import os
from openai import OpenAI
from datetime import datetime, timedelta
import re

def generate_abstract_qwen(topic):

    # Initialize the OpenAI client using environment variables
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    ###########################
    # Step 1: Generate a survey abstract for the given topic.
    ###########################
    system_prompt_abstract = """
    You are a skilled research survey writer. Your task is to generate a survey abstract on the given topic. The abstract should cover the main challenges, key concepts, and research directions associated with the topic. Write in clear, concise academic English.
    """
    user_prompt_abstract = f"""
    Topic: {topic}

    Please generate a comprehensive survey abstract for this topic. Include discussion of core challenges, key terminologies, and emerging methodologies that are critical in the field. The total length of the abstract should be around 300–500 words.
    """
    messages_abstract = [
        {"role": "system", "content": system_prompt_abstract},
        {"role": "user", "content": user_prompt_abstract}
    ]

    abstract_response = client.chat.completions.create(
        model=os.environ.get("MODEL"),
        max_tokens=2048,
        temperature=0.5,
        stop="<|im_end|>",
        stream=True,
        messages=messages_abstract
    )

    abstract_text = ""
    for chunk in abstract_response:
        if chunk.choices[0].delta.content:
            abstract_text += chunk.choices[0].delta.content
    abstract_text = abstract_text.strip()
    # print("The abstract is:", abstract_text)

    return abstract_text

def generate_entity_lists_qwen(topic, abstract_text):
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )
    system_prompt_abstract = f"""
    You are an AI assistant specializing in natural language processing and entity recognition. Your task is to extract key entities and core concepts from a given abstract based on a specified topic.

    You should return two distinct lists:
    1. **Entity list**: 5 entities that are synonymous or closely related to the given topic (nouns only). These should be concise (no more than two words) and simplified to their root forms (e.g., removing suffixes like "-ing", "-ed"), such as llm for large language model.
    2. **Concept list**: Core concepts from the abstract that are highly relevant to the topic. These should also be concise (no more than two words) and in their simplest form; one single word is preferred unless the term is inseparable.

    Ensure that your response follows this exact format:
    Entity list: [entity1, entity2, entity3, entity4, entity5]
    Concept list: [concept1, concept2, concept3, ...concept n]
    Do not include any explanations or additional text.

    ### **Example**
    #### **Input:**
    Topic: Large Language Models
    Abstract: Ever since the Turing Test was proposed in the 1950s, humans have explored the mastering of language intelligence by machine. Language is essentially a complex, intricate system of human expressions governed by grammatical rules. It poses a significant challenge to develop capable artificial intelligence (AI) algorithms for comprehending and grasping a language. As a major approach, language modeling has been widely studied for language understanding and generation in the past two decades, evolving from statistical language models to neural language models. Recently, pre-trained language models (PLMs) have been proposed by pretraining Transformer models over large-scale corpora, showing strong capabilities in solving various natural language processing (NLP) tasks. Since the researchers have found that model scaling can lead to an improved model capacity, they further investigate the scaling effect by increasing the parameter scale to an even larger size. Interestingly, when the parameter scale exceeds a certain level, these enlarged language models not only achieve a significant performance improvement, but also exhibit some special abilities (e.g., in-context learning) that are not present in small-scale language models (e.g., BERT). To discriminate the language models in different parameter scales, the research community has coined the term large language models (LLM) for the PLMs of significant size (e.g., containing tens or hundreds of billions of parameters). Recently, the research on LLMs has been largely advanced by both academia and industry, and a remarkable progress is the launch of ChatGPT (a powerful AI chatbot developed based on LLMs), which has attracted widespread attention from society. The technical evolution of LLMs has been making an important impact on the entire AI community, which would revolutionize the way how we develop and use AI algorithms. Considering this rapid technical progress, in this survey, we review the recent advances of LLMs by introducing the background, key findings, and mainstream techniques. In particular, we focus on four major aspects of LLMs, namely pre-training, adaptation tuning, utilization, and capacity evaluation. Furthermore, we also summarize the available resources for developing LLMs and discuss the remaining issues for future directions. This survey provides an up-to-date review of the literature on LLMs, which can be a useful resource for both researchers and engineers.

    #### **Expected Output:**
    "entity list": ["language model", "plm", "large language", "llm", "llms"]
    "concept list": ["turing", "language intelligence", "ai", "generation", "statistical", "neural", "pre-train", "transformer", "corpora", "nlp", "in-context", "bert", "chatgpt", "adaptation", "utilization"]
    Make sure to strictly follow this format in your response.
    """

    user_prompt_abstract = f"""
    Topic: {topic}
    Abstract: {abstract_text}

    Based on the given topic and abstract, extract the following:
    1. A **list of the 5 most key entities (nouns)** that are synonymous or closely related to the topic. Keep each entity under two words and in its simplest form.
    2. A **list of core concepts (terms), as many as possible,** from the abstract that are highly relevant to the topic. Keep each concept under two words and in its simplest form.
    """

    messages_abstract = [
        {"role": "system", "content": system_prompt_abstract},
        {"role": "user", "content": user_prompt_abstract}
    ]

    entity_response = client.chat.completions.create(
        model=os.environ.get("MODEL"),
        max_tokens=2048,
        temperature=0.5,
        stop="<|im_end|>",
        stream=True,
        messages=messages_abstract
    )

    entity_list = ""
    for chunk in entity_response:
        if chunk.choices[0].delta.content:
            entity_list += chunk.choices[0].delta.content
    entity_list = entity_list.strip()
    # print("The entity lists are:", entity_list)

    return entity_list

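# Illustrative sketch (not in the original file): the formatted response above
# can be split back into Python lists with a hypothetical helper such as:
#
#   def parse_entity_lists(raw: str):
#       # grab the bracketed payload after each label
#       entities = re.search(r'[Ee]ntity list.*?\[(.*?)\]', raw, re.DOTALL)
#       concepts = re.search(r'[Cc]oncept list.*?\[(.*?)\]', raw, re.DOTALL)
#       split = lambda m: [t.strip(' "\'') for t in m.group(1).split(',')] if m else []
#       return split(entities), split(concepts)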
def generate_query_qwen(topic):
    # Calculate the date range for the arXiv query (approximately the last 10 years)
    abstract_text = generate_abstract_qwen(topic)
    entity_list = generate_entity_lists_qwen(topic, abstract_text)
    today = datetime.now()
    ten_years_ago = today - timedelta(days=10 * 365)  # approximate calculation
    start_date = ten_years_ago.strftime('%Y%m%d')
    end_date = today.strftime('%Y%m%d')

    # System prompt: focus on how to build the query from the extracted keywords.
    system_prompt_query = """
    You are a research assistant specializing in constructing effective arXiv search queries. Your task is to generate a structured search query using **pre-extracted entity and concept lists** from a given abstract and the topic. Follow these instructions exactly:

    1. **Input Data:**
    - **Entity List:** A list of entities that are synonymous or closely related to the given topic.
    - **Concept List:** A list of core concepts from the abstract that are highly relevant to the topic.

    2. **Ensure Minimum Keyword Count:**
    - **Entity List** must contain at least **3** nouns of entities. If there are fewer, intelligently supplement additional relevant terms, ensuring that entities are synonyms or closely related to the key entity in the topic (e.g., "LLM" for "Large Language Model").
    - **Concept List** must contain **12-15** domain-specific terms. If there are fewer, intelligently supplement additional relevant terms. Avoid broad terms like "combine" or "introduce."

    3. **Standardize Formatting:**
    - Convert all terms to their **base form** without adding any wildcard (`*`).
    - All terms must be **in lowercase**.

    4. **Construct the Final Query:**
    - The query must follow this exact structure:
      (abs:"<Term1>" AND abs:"<Term2>") AND
      (abs:"<Entity1>" OR abs:"<Entity2>" OR abs:"<Entity3>" OR abs:"<Entity4>" OR abs:"<Entity5>") AND
      (abs:"<Concept1>" OR abs:"<Concept2>" OR ... OR abs:"<Concept12>")
    - **Terms are 2 or 3 keywords or phrases extracted from the topic that you think **must** occur in the abstract of the search results; they are grouped together using `AND` in the first part.** (most important)
    - **Entities are grouped together using `OR` in the second part.**
    - **Concepts are grouped together using `OR` in the third part.**
    - **The groups are combined using `AND`.**
    - **For compound words with hyphens (e.g., "in-context"), replace `-` with a space, resulting in `"in context"`.**
    - **Do not include any explanations or extra text. Output only the final query.**
    """

    # User prompt: uses the pre-extracted entities and concepts, enforces minimum
    # counts, and provides example topics with corresponding query formats.
    user_prompt_query = f"""
    Below are the pre-extracted keywords for constructing the final arXiv query.

    **Topic:** {topic}
    **Entity list and Concept list:** {entity_list}

    ### **Processing Rules Applied:**
    - **Ensure the key terms in the topic are included**.
    - **Ensure at least 5 entities** (if fewer, supplement additional relevant terms).
    - **Ensure 12-15 concepts** (if fewer, supplement additional relevant terms).
    - **Convert all terms to lowercase.**
    - **For compound words with hyphens (e.g., "in-context"), replace `-` with a space, resulting in `"in context"`**.
    - **Output only the final query with no extra text.**

    ### **Example Query Format:**

    1. **Topic:** Large Language Models in Recommendation Systems
       **Transformed Entity List:** ["language model", "plm", "large language", "llm", "deep model"]
       **Transformed Concept List:** ["tur", "language intelligence", "ai", "generation", "statistical", "neural", "pretraining", "transformer", "corpora", "nlp", "in context", "bert", "chatgpt", "adaptation", "utilization"]
       **Query:**
       (abs:"large language model" AND abs:"recommendation") AND (abs:"language model" OR abs:"plm" OR abs:"large language" OR abs:"llm" OR abs:"deep model") AND (abs:"tur" OR abs:"language intelligence" OR abs:"ai" OR abs:"generation" OR abs:"statistical" OR abs:"neural" OR abs:"pretraining" OR abs:"transformer" OR abs:"corpora" OR abs:"nlp" OR abs:"in context" OR abs:"bert" OR abs:"chatgpt" OR abs:"adaptation" OR abs:"utilization")

    2. **Topic:** Quantum Computing in Physics
       **Transformed Entity List:** ["quantum computing", "qubit", "qc", "quantum device", "topological computing"]
       **Transformed Concept List:** ["decoherence", "entanglement", "error", "topology", "annealing", "photon", "superconducting", "algorithm", "optimization", "verification", "fault tolerance", "noise", "circuit", "quantum machine", "measurement"]
       **Query:**
       (abs:"quantum computing" AND abs:"physics") AND (abs:"quantum computing" OR abs:"qubit" OR abs:"qc" OR abs:"quantum device" OR abs:"topological computing") AND (abs:"decoherence" OR abs:"entanglement" OR abs:"error" OR abs:"topology" OR abs:"annealing" OR abs:"photon" OR abs:"superconducting" OR abs:"algorithm" OR abs:"optimization" OR abs:"verification" OR abs:"fault tolerance" OR abs:"noise" OR abs:"circuit" OR abs:"quantum machine" OR abs:"measurement")

    ---

    ### **Now Generate the Query for This Topic:**
    **Topic:** {topic}
    Using the provided **Entity List** and **Concept List**, apply the following steps:
    1. **Ensure the Entity List contains at least 5 items.** If fewer, supplement additional relevant terms.
    2. **Ensure the Concept List contains 12-15 items.** If fewer, supplement additional relevant terms.
    3. **Convert all terms to lowercase.**
    4. **For compound words with hyphens (`-`), replace `-` with a space, e.g., `"in-context"` → `"in context"`.**
    5. **Construct the arXiv search query in the same format as the examples above.**
    6. **Return only the final query. Do not include explanations or additional text.**
    All the terms in the query should not exceed 2 words!
    """

    # Initialize the OpenAI API client
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    messages = [
        {"role": "system", "content": system_prompt_query},
        {"role": "user", "content": user_prompt_query}
    ]

    response = client.chat.completions.create(
        model=os.environ.get("MODEL"),
        max_tokens=512,
        temperature=0.5,
        stop="<|im_end|>",
        stream=True,
        messages=messages
    )

    output_query = ""
    for chunk in response:
        if chunk.choices[0].delta.content:
            output_query += chunk.choices[0].delta.content
    match = re.search(r'\(.*\)', output_query, re.DOTALL)

    if match:
        extracted_query = match.group(0)  # keep the whole parenthesized match
    else:
        extracted_query = output_query.strip()  # fall back to the raw output if no match

    # Optionally re-append the submittedDate range:
    # updated_query = f"{extracted_query} AND submittedDate:[{start_date} TO {end_date}]"
    updated_query = f"{extracted_query}"
    print('The response is :', updated_query)
    return updated_query.strip()

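# Usage sketch (illustrative; assumes the OPENAI_API_KEY, OPENAI_API_BASE and
# MODEL environment variables are set). The returned boolean query targets the
# abstract field and can be URL-encoded into an arXiv API request:
#
#   query = generate_query_qwen("large language models in recommendation")
#   # e.g. http://export.arxiv.org/api/query?search_query=<url-encoded query>&max_results=50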
def generate_generic_query_qwen(original_query, topic):
|
238 |
+
"""
|
239 |
+
Transforms an overly strict arXiv query into a simplified, more generic version.
|
240 |
+
|
241 |
+
The new query must be in the format:
|
242 |
+
(abs:"<GenericTerm1>" AND abs:"<GenericTerm2>") OR (abs:"<GenericTerm3>" AND abs:"<GenericTerm4>")
|
243 |
+
|
244 |
+
Here, <GenericTerm1> and <GenericTerm2> represent two generic and common keywords,
|
245 |
+
while <GenericTerm3> and <GenericTerm4> are synonyms or closely related terms to the first two.
|
246 |
+
related to the given topic. If the terms in the original query are too strict,
|
247 |
+
replace them with broader terms that improve matching against arXiv articles.
|
248 |
+
|
249 |
+
Parameters:
|
250 |
+
original_query (str): The output query from generate_query_qwen() which is too strict.
|
251 |
+
topic (str): The research topic.
|
252 |
+
|
253 |
+
Returns:
|
254 |
+
str: The simplified arXiv query.
|
255 |
+
"""
|
256 |
+
|
257 |
+
system_prompt = """
|
258 |
+
You are a research assistant specializing in constructing effective and broad arXiv search queries.
|
259 |
+
Your job is to transform an overly strict query into a simplified, generic one.
|
260 |
+
|
261 |
+
Instructions:
|
262 |
+
1. Input:
|
263 |
+
- A strict query that might be too specific.
|
264 |
+
- A topic which the query intends to capture.
|
265 |
+
|
266 |
+
2. Requirements:
|
267 |
+
- Create a new query that only has the structure:
|
268 |
+
(abs:"<GenericTerm1>" AND abs:"<GenericTerm2>") OR (abs:"<GenericTerm3>" AND abs:"<GenericTerm4>")
|
269 |
+
- Replace <GenericTerm1> and <GenericTerm2> with two generic and common keywords for the topic.
|
270 |
+
- Replace <GenericTerm3> and <GenericTerm4> with the synonyms or closely related terms to the <GenericTerm1> and <GenericTerm2>.
|
271 |
+
- If the terms from the original query are too narrow, modify them to more broadly represent the given topic.
|
272 |
+
- All keywords must be in lowercase and in their base form.
|
273 |
+
- Each term should be one or two words.
|
274 |
+
|
275 |
+
3. Output:
|
276 |
+
- Return only the final query in the exact format with no extra explanations.
|
277 |
+
"""
|
278 |
+
|
279 |
+
user_prompt = f"""
|
280 |
+
Original Query: {original_query}
|
281 |
+
Topic: {topic}
|
282 |
+
|
283 |
+
The original query may be too strict and fails to match a broad range of arXiv articles.
|
284 |
+
Please generate a new query in the format:
|
285 |
+
(abs:"<GenericTerm1>" AND abs:"<GenericTerm2>") OR (abs:"<GenericTerm3>" AND abs:"<GenericTerm4>")
|
286 |
+
Replace <GenericTerm1> and <GenericTerm2> with more generic and commonly used terms that represent the topic.
|
287 |
+
Output only the final query.
|
288 |
+
"""
|
289 |
+
|
290 |
+
openai_api_key = os.getenv("OPENAI_API_KEY")
|
291 |
+
openai_api_base = os.getenv("OPENAI_API_BASE")
|
292 |
+
|
293 |
+
# Initialize the OpenAI API client (assuming a similar interface as before)
|
294 |
+
client = OpenAI(
|
295 |
+
api_key=openai_api_key,
|
296 |
+
base_url=openai_api_base,
|
297 |
+
)
|
298 |
+
|
299 |
+
messages = [
|
300 |
+
{"role": "system", "content": system_prompt},
|
301 |
+
{"role": "user", "content": user_prompt},
|
302 |
+
]
|
303 |
+
|
304 |
+
response = client.chat.completions.create(
|
305 |
+
model=os.environ.get("MODEL"),
|
306 |
+
max_tokens=512,
|
307 |
+
temperature=0.5,
|
308 |
+
stop="<|im_end|>",
|
309 |
+
stream=True,
|
310 |
+
messages=messages
|
311 |
+
)
|
312 |
+
|
313 |
+
output_query = ""
|
314 |
+
for chunk in response:
|
315 |
+
if chunk.choices[0].delta.content:
|
316 |
+
output_query += chunk.choices[0].delta.content
|
317 |
+
|
318 |
+
# Use regex to extract the new simplified query in the exact required format
|
319 |
+
match = re.search(r'\(.*\)', output_query, re.DOTALL)
|
320 |
+
if match:
|
321 |
+
extracted_query = match.group(0)
|
322 |
+
else:
|
323 |
+
extracted_query = output_query.strip()
|
324 |
+
|
325 |
+
print('The response is :', extracted_query)
|
326 |
+
return extracted_query.strip()
|
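A minimal usage sketch of the two query builders above (not part of the commit; the topic string is illustrative, and the call assumes the OPENAI_API_KEY, OPENAI_API_BASE, and MODEL environment variables are set):

    # Hypothetical usage: broaden a strict query when it matches too few papers.
    from asg_query import generate_query_qwen, generate_generic_query_qwen

    topic = "in-context learning"              # example topic
    strict_query = generate_query_qwen(topic)  # strict arXiv query from the LLM
    generic_query = generate_generic_query_qwen(strict_query, topic)
    print(generic_query)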
src/demo/asg_retriever.py
ADDED
@@ -0,0 +1,364 @@
+import torch
+import uuid
+import re
+import os
+import json
+import chromadb
+from .asg_splitter import TextSplitting
+from langchain_huggingface import HuggingFaceEmbeddings
+import time
+import concurrent.futures
+
+class Retriever:
+    client = None
+    cur_dir = os.getcwd()
+    chromadb_path = os.path.join(cur_dir, "chromadb")
+
+    def __init__(self):
+        self.client = chromadb.PersistentClient(path=self.chromadb_path)
+
+    def create_collection_chroma(self, collection_name: str):
+        """
+        The collection will be created with collection_name; the name must follow these rules:
+        0. Collection names must be unique; if the name already exists, the existing collection is fetched instead.
+        1. The length of the name must be between 3 and 63 characters.
+        2. The name must start and end with a lowercase letter or a digit, and it can contain dots, dashes, and underscores in between.
+        3. The name must not contain two consecutive dots.
+        4. The name must not be a valid IP address.
+        """
+        try:
+            self.client.create_collection(name=collection_name)
+        except chromadb.db.base.UniqueConstraintError:
+            self.get_collection_chroma(collection_name)
+        return collection_name
+
+    def get_collection_chroma(self, collection_name: str):
+        collection = self.client.get_collection(name=collection_name)
+        return collection
+
+    def add_documents_chroma(self, collection_name: str, embeddings_list: list[list[float]], documents_list: list[str], metadata_list: list[dict]):
+        """
+        Make sure that embeddings_list and metadata_list are aligned with documents_list.
+        Example of one metadata entry: {"doc_name": "Test2.pdf", "page": "9"}
+        Ids are created automatically as UUID v4.
+        The chunk contents and metadata are logged (appended) to ./logs/<collection_name>.json.
+        """
+        collection = self.get_collection_chroma(collection_name)
+        num = len(documents_list)
+        ids = [str(uuid.uuid4()) for _ in range(num)]
+
+        collection.add(
+            documents=documents_list,
+            metadatas=metadata_list,
+            embeddings=embeddings_list,
+            ids=ids
+        )
+        logpath = os.path.join(self.cur_dir, "logs", f"{collection_name}.json")
+        os.makedirs(os.path.dirname(logpath), exist_ok=True)
+        logs = []
+        try:
+            with open(logpath, 'r', encoding="utf-8") as chunklog:
+                logs = json.load(chunklog)
+        except (FileNotFoundError, json.decoder.JSONDecodeError):
+            logs = []
+
+        added_log = [{"chunk_id": ids[i], "metadata": metadata_list[i], "page_content": documents_list[i]}
+                     for i in range(num)]
+
+        logs.extend(added_log)
+
+        # write back
+        with open(logpath, "w", encoding="utf-8") as chunklog:
+            json.dump(logs, chunklog, indent=4)
+        print(f"Logged document information to '{logpath}'.")
+
+    def query_chroma(self, collection_name: str, query_embeddings: list[list[float]], n_results: int = 5) -> dict:
+        # return the n closest results (chunks and metadatas) in order
+        collection = self.get_collection_chroma(collection_name)
+        result = collection.query(
+            query_embeddings=query_embeddings,
+            n_results=n_results,
+        )
+        return result
+
+    def update_chroma(self, collection_name: str, id_list: list[str], embeddings_list: list[list[float]], documents_list: list[str], metadata_list: list[dict]):
+        collection = self.get_collection_chroma(collection_name)
+        num = len(documents_list)
+        collection.update(
+            ids=id_list,
+            embeddings=embeddings_list,
+            metadatas=metadata_list,
+            documents=documents_list,
+        )
+        update_list = [{"chunk_id": id_list[i], "metadata": metadata_list[i], "page_content": documents_list[i]} for i in range(num)]
+
+        # update the chunk log
+        logs = []
+
+        logpath = os.path.join(self.cur_dir, "logs", f"{collection_name}.json")
+        try:
+            with open(logpath, 'r', encoding="utf-8") as chunklog:
+                logs = json.load(chunklog)
+        except (FileNotFoundError, json.decoder.JSONDecodeError):
+            logs = []  # the old log does not exist or is empty, so there is nothing to update
+        else:
+            for i in range(num):
+                for log in logs:
+                    if log["chunk_id"] == update_list[i]["chunk_id"]:
+                        log["metadata"] = update_list[i]["metadata"]
+                        log["page_content"] = update_list[i]["page_content"]
+                        break
+
+            with open(logpath, "w", encoding="utf-8") as chunklog:
+                json.dump(logs, chunklog, indent=4)
+            print(f"Updated log file at '{logpath}'.")
+
+    def delete_collection_entries_chroma(self, collection_name: str, id_list: list[str]):
+        collection = self.get_collection_chroma(collection_name)
+        collection.delete(ids=id_list)
+        print(f"Deleted entries with ids: {id_list} from collection '{collection_name}'.")
+
+    def delete_collection_chroma(self, collection_name: str):
+        print(f"The collection {collection_name} will be deleted forever!")
+        self.client.delete_collection(collection_name)
+        try:
+            logpath = os.path.join(self.cur_dir, "logs", f"{collection_name}.json")
+            print(f"Collection {collection_name} has been removed, deleting the log file of this collection")
+            os.remove(logpath)
+        except FileNotFoundError:
+            print("The log of this collection did not exist!")
+
+    def list_collections_chroma(self):
+        collections = self.client.list_collections()
+        return collections
+
+# Generate a legal collection name from a PDF filename
+def legal_pdf(filename: str) -> str:
+    pdf_index = filename.lower().rfind('.pdf')
+    if pdf_index != -1:
+        name_before_pdf = filename[:pdf_index]
+    else:
+        name_before_pdf = filename
+    name_before_pdf = name_before_pdf.strip()
+    name = re.sub(r'[^a-zA-Z0-9._-]', '', name_before_pdf)
+    name = name.lower()
+    while '..' in name:
+        name = name.replace('..', '.')
+    name = name[:63]
+    if len(name) < 3:
+        name = name.ljust(3, '0')  # pad with '0' if the length is less than 3
+    if not re.match(r'^[a-z0-9]', name):
+        name = 'a' + name[1:]
+    if not re.match(r'[a-z0-9]$', name):
+        name = name[:-1] + 'a'
+    ip_pattern = re.compile(r'^(\d{1,3}\.){3}\d{1,3}$')
+    if ip_pattern.match(name):
+        name = 'ip_' + name
+    return name
+
+def process_pdf(file_path: str, survey_id: str, embedder: HuggingFaceEmbeddings, mode: str):
+    # Load and split the PDF
+    split_start_time = time.time()
+    splitters = TextSplitting().mineru_recursive_splitter(file_path, survey_id, mode)
+
+    documents_list = [document.page_content for document in splitters]
+    for i in range(len(documents_list)):
+        documents_list[i] = documents_list[i].replace('\n', ' ')
+    print(f"Splitting took {time.time() - split_start_time} seconds.")
+
+    # Embed the documents
+    # embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    embed_start_time = time.time()
+    doc_results = embedder.embed_documents(documents_list)
+    if isinstance(doc_results, torch.Tensor):
+        embeddings_list = doc_results.tolist()
+    else:
+        embeddings_list = doc_results
+    print(f"Embedding took {time.time() - embed_start_time} seconds.")
+
+    # Prepare metadata
+    metadata_list = [{"doc_name": os.path.basename(file_path)} for _ in range(len(documents_list))]
+
+    title = os.path.splitext(os.path.basename(file_path))[0]
+
+    title_new = title.strip()
+    invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
+    for char in invalid_chars:
+        title_new = title_new.replace(char, ' ')
+    collection_name = legal_pdf(title_new)
+
+    retriever = Retriever()
+    retriever.list_collections_chroma()
+    retriever.create_collection_chroma(collection_name)
+    retriever.add_documents_chroma(
+        collection_name=collection_name,
+        embeddings_list=embeddings_list,
+        documents_list=documents_list,
+        metadata_list=metadata_list
+    )
+
+    return collection_name, embeddings_list, documents_list, metadata_list, title_new
+
+def query_embeddings(collection_name: str, query_list: list):
+    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    retriever = Retriever()
+
+    final_context = ""
+
+    seen_chunks = set()
+    for query_text in query_list:
+        query_embeddings = embedder.embed_query(query_text)
+        query_result = retriever.query_chroma(collection_name=collection_name, query_embeddings=[query_embeddings], n_results=2)
+
+        query_result_chunks = query_result["documents"][0]
+        # query_result_ids = query_result["ids"][0]
+
+        for chunk in query_result_chunks:
+            if chunk not in seen_chunks:
+                final_context += chunk.strip() + "//\n"
+                seen_chunks.add(chunk)
+    return final_context
+
+# new version; queries may run in parallel
+def query_embeddings_new(collection_name: str, query_list: list):
+    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    retriever = Retriever()
+
+    final_context = ""
+
+    seen_chunks = set()
+    def process_query(query_text):
+        query_embeddings = embedder.embed_query(query_text)
+        query_result = retriever.query_chroma(
+            collection_name=collection_name,
+            query_embeddings=[query_embeddings],
+            n_results=2
+        )
+        query_result_chunks = query_result["documents"][0]
+        return query_result_chunks
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures = {executor.submit(process_query, query_text): query_text for query_text in query_list}
+        for future in concurrent.futures.as_completed(futures):
+            query_result_chunks = future.result()
+            for chunk in query_result_chunks:
+                if chunk not in seen_chunks:
+                    final_context += chunk.strip() + "//\n"
+                    seen_chunks.add(chunk)
+    return final_context
+
+# wza
+def query_embeddings_new_new(collection_name: str, query_list: list):
+    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    retriever = Retriever()
+
+    final_context = ""       # Stores concatenated context
+    citation_data_list = []  # Stores chunk content and collection name as source
+    seen_chunks = set()      # Ensures unique chunks are added
+
+    def process_query(query_text):
+        # Embed the query text and retrieve relevant chunks
+        query_embeddings = embedder.embed_query(query_text)
+        query_result = retriever.query_chroma(
+            collection_name=collection_name,
+            query_embeddings=[query_embeddings],
+            n_results=5  # Fixed number of results
+        )
+        return query_result
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        future_to_query = {executor.submit(process_query, q): q for q in query_list}
+        for future in concurrent.futures.as_completed(future_to_query):
+            query_text = future_to_query[future]
+            try:
+                query_result = future.result()
+            except Exception as e:
+                print(f"Query '{query_text}' failed with exception: {e}")
+                continue
+
+            if "documents" not in query_result or "distances" not in query_result:
+                continue
+            if not query_result["documents"] or not query_result["distances"]:
+                continue
+            docs_list = query_result["documents"][0] if query_result["documents"] else []
+            dist_list = query_result["distances"][0] if query_result["distances"] else []
+
+            if len(docs_list) != len(dist_list):
+                continue
+
+            for chunk, distance in zip(docs_list, dist_list):
+                processed_chunk = chunk.strip()
+                if processed_chunk not in seen_chunks:
+                    final_context += processed_chunk + "//\n"
+                    seen_chunks.add(processed_chunk)
+                    citation_data_list.append({
+                        "source": collection_name,
+                        "distance": distance,
+                        "content": processed_chunk,
+                    })
+
+    return final_context, citation_data_list
+
+# concurrent version for both collection names and queries
+def query_multiple_collections(collection_names: list[str], query_list: list[str], survey_id: str) -> dict:
+    """
+    Query multiple collections in parallel and return the combined results.
+
+    Args:
+        collection_names (list[str]): List of collection names to query.
+        query_list (list[str]): List of queries to execute on each collection.
+
+    Returns:
+        dict: Combined results from all collections, grouped by collection.
+    """
+    # Define the embedder inside the function
+    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    retriever = Retriever()
+
+    def query_single_collection(collection_name: str):
+        """
+        Query a single collection for all queries in the query_list.
+        """
+        final_context = ""
+        seen_chunks = set()
+
+        def process_query(query_text):
+            # Embed the query
+            query_embeddings = embedder.embed_query(query_text)
+            # Query the collection
+            query_result = retriever.query_chroma(
+                collection_name=collection_name,
+                query_embeddings=[query_embeddings],
+                n_results=5
+            )
+            query_result_chunks = query_result["documents"][0]
+            return query_result_chunks
+
+        # Process all queries in parallel for the given collection
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = {executor.submit(process_query, query_text): query_text for query_text in query_list}
+            for future in concurrent.futures.as_completed(futures):
+                query_result_chunks = future.result()
+                for chunk in query_result_chunks:
+                    if chunk not in seen_chunks:
+                        final_context += chunk.strip() + "//\n"
+                        seen_chunks.add(chunk)
+
+        return final_context
+
+    # Outer parallelism across collections
+    results = {}
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures = {executor.submit(query_single_collection, collection_name): collection_name for collection_name in collection_names}
+        for future in concurrent.futures.as_completed(futures):
+            collection_name = futures[future]
+            results[collection_name] = future.result()
+
+    # Automatically save the results to a JSON file
+    file_path = f'./src/static/data/info/{survey_id}/retrieved_context.json'
+    with open(file_path, 'w', encoding='utf-8') as f:
+        json.dump(results, f, ensure_ascii=False, indent=4)
+
+    print(f"Results saved to {file_path}")
+
+    return results
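A minimal end-to-end sketch of this retriever module (not part of the commit; the PDF path, survey id, import path, and mode value are assumptions for illustration):

    # Hypothetical: ingest one PDF into Chroma, then retrieve context for a question.
    from langchain_huggingface import HuggingFaceEmbeddings
    from demo.asg_retriever import process_pdf, query_embeddings

    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    collection_name, _, _, _, title = process_pdf("papers/example.pdf", "survey_001", embedder, mode="mineru")
    context = query_embeddings(collection_name, ["What datasets are used?"])
    print(title, context[:200])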
src/demo/asg_splitter.py
ADDED
@@ -0,0 +1,25 @@
+from .asg_loader import DocumentLoading
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+class TextSplitting:
+    def mineru_recursive_splitter(self, file_path, survey_id, mode):
+        docs = DocumentLoading().load_pdf(file_path, survey_id, mode)
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=400,
+            chunk_overlap=30,
+            length_function=len,
+            is_separator_regex=False,
+        )
+        texts = text_splitter.create_documents([docs])
+        return texts
+
+    def pypdf_recursive_splitter(self, file_path, survey_id):
+        docs = DocumentLoading().pypdf_loader(file_path, survey_id)
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=300,
+            chunk_overlap=20,
+            length_function=len,
+            is_separator_regex=False,
+        )
+        texts = text_splitter.create_documents([docs])
+        return texts
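To see what these parameters do in isolation, here is a self-contained sketch of the same chunking configuration on a toy string (not part of the commit):

    from langchain_text_splitters import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,            # same settings as mineru_recursive_splitter above
        chunk_overlap=30,
        length_function=len,
        is_separator_regex=False,
    )
    docs = splitter.create_documents(["word " * 300])   # ~1500-character toy input
    print(len(docs), len(docs[0].page_content))         # several chunks of at most 400 chars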
src/demo/category_and_tsne.py
ADDED
@@ -0,0 +1,231 @@
+from sklearn.metrics import silhouette_score
+
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.manifold import TSNE
+from sklearn.cluster import AgglomerativeClustering
+import json
+
+IMG_PATH = './src/static/img/'
+
+plt.switch_backend('agg')
+device = 0
+# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", model_max_length=128)
+# model = AutoModel.from_pretrained("bert-base-uncased").to(device)
+
+from sentence_transformers import SentenceTransformer
+from umap import UMAP
+from sklearn.decomposition import PCA
+from sklearn.feature_extraction.text import CountVectorizer
+from bertopic.vectorizers import ClassTfidfTransformer
+from bertopic.representation import KeyBERTInspired
+from bertopic import BERTopic
+
+class DimensionalityReduction:
+    def fit(self, X):
+        return self
+
+    def transform(self, X):
+        return X
+
+class ClusteringWithTopic:
+    def __init__(self, df, n_topics=3):
+        embedding_model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
+        # umap_model = DimensionalityReduction()
+        umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', init='pca')
+        hdbscan_model = AgglomerativeClustering(n_clusters=n_topics)
+        vectorizer_model = CountVectorizer(stop_words="english", min_df=1, ngram_range=(1, 2))
+        ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=False)  # True
+        keybert_model = KeyBERTInspired()
+
+        self.df = df
+        self.embeddings = embedding_model.encode(df, show_progress_bar=True)
+
+        representation_model = {
+            "KeyBERT": keybert_model,
+            # "OpenAI": openai_model,  # Uncomment if you will use OpenAI
+            # "MMR": mmr_model,
+            # "POS": pos_model
+        }
+        self.topic_model = BERTopic(
+            # Pipeline models
+            embedding_model=embedding_model,
+            umap_model=umap_model,
+            hdbscan_model=hdbscan_model,
+            vectorizer_model=vectorizer_model,
+            ctfidf_model=ctfidf_model,
+            representation_model=representation_model,
+
+            # Hyperparameters
+            top_n_words=10,
+            verbose=True
+        )
+
+    # NOTE: this second __init__ supersedes the one above; the list-based
+    # constructor is the one in effect when the module is imported.
+    def __init__(self, df, n_topics_list):
+        """
+        Initialize ClusteringWithTopic with an n_topics_list containing several
+        candidate cluster counts; the result with the highest silhouette_score is kept.
+        """
+        embedding_model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
+        self.embeddings = embedding_model.encode(df, show_progress_bar=True)
+
+        self.df = df
+        self.n_topics_list = n_topics_list
+
+        self.embedding_model = embedding_model
+        self.umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', init='pca')
+        self.vectorizer_model = CountVectorizer(stop_words="english", min_df=1, ngram_range=(1, 2))
+        self.ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=False)
+        self.keybert_model = KeyBERTInspired()
+        self.representation_model = {"KeyBERT": self.keybert_model}
+
+        # Track the best result across the candidate cluster counts
+        self.best_n_topics = None
+        self.best_labels = None
+        self.best_score = -1
+    # def fit_and_get_labels(self, X):
+    #     topics, probs = self.topic_model.fit_transform(self.df, self.embeddings)
+    #     return topics
+    def fit_and_get_labels(self):
+        """
+        Cluster with each candidate n_topics, compute the silhouette_score,
+        and keep the best n_topics for the subsequent steps.
+        """
+        for n_topics in self.n_topics_list:
+            hdbscan_model = AgglomerativeClustering(n_clusters=n_topics)
+
+            topic_model = BERTopic(
+                embedding_model=self.embedding_model,
+                umap_model=self.umap_model,
+                hdbscan_model=hdbscan_model,
+                vectorizer_model=self.vectorizer_model,
+                ctfidf_model=self.ctfidf_model,
+                representation_model=self.representation_model,
+                top_n_words=10,
+                verbose=False
+            )
+
+            topics, _ = topic_model.fit_transform(self.df, self.embeddings)
+
+            # Compute the silhouette score
+            if len(set(topics)) > 1:  # silhouette_score requires at least 2 clusters
+                score = silhouette_score(self.embeddings, topics)
+            else:
+                score = -1  # silhouette_score is meaningless for a single cluster
+
+            print(f"n_topics={n_topics}, silhouette_score={score}")
+
+            # Track the best n_topics
+            if score > self.best_score:
+                self.best_score = score
+                self.best_n_topics = n_topics
+                self.best_labels = topics
+                self.best_topic_model = topic_model
+
+        print(f"Best n_topics={self.best_n_topics}, Best silhouette_score={self.best_score}")
+        return self.best_labels, self.best_topic_model, self.best_n_topics
+
+def clustering(df, n_cluster, survey_id):
+    text = df['retrieval_result'].astype(str)
+    clustering = ClusteringWithTopic(text, n_cluster)
+    df['label'] = clustering.fit_and_get_labels(text)
+
+    print("The clustering result is: ")
+    for col in df.columns:
+        print(f"{col}: {df.iloc[0][col]}")
+
+    # Save topic model information as JSON
+    topic_json = clustering.topic_model.get_topic_info().to_json()
+    with open(f'./src/static/data/info/{survey_id}/topic.json', 'w', encoding="utf-8") as file:
+        file.write(topic_json)
+
+    # Create a dictionary from the 'ref_title' and 'retrieval_result' columns
+    description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
+
+    # Save the dictionary to description.json
+    with open(f'./src/static/data/info/{survey_id}/description.json', 'w', encoding="utf-8") as file:
+        json.dump(description_dict, file, ensure_ascii=False, indent=4)
+    # df['top_n_words'] = clustering.topic_model.get_topic_info()['Representation'].tolist()
+    # df['topic_word'] = clustering.topic_model.get_topic_info()['KeyBERT'].tolist()
+
+    X = np.array(clustering.embeddings)
+    perplexity = 10
+    if X.shape[0] <= perplexity:
+        perplexity = max(1, X.shape[0] // 2)
+
+    tsne = TSNE(n_components=2, init='pca', perplexity=perplexity, random_state=42)
+    X_tsne = tsne.fit_transform(X)
+    colors = scatter(X_tsne, df['label'])
+
+    plt.savefig(IMG_PATH + 'tsne_' + survey_id + '.png', dpi=800, transparent=True)
+
+    plt.close()
+    output_tsv_filename = "./src/static/data/tsv/" + survey_id + '.tsv'
+    df.to_csv(output_tsv_filename, sep='\t')
+    return df, colors
+
+# NOTE: this second clustering() supersedes the one above; the list-based
+# variant is the one in effect at import time.
+def clustering(df, n_topics_list, survey_id):
+    text = df['retrieval_result'].astype(str)
+    clustering = ClusteringWithTopic(text, n_topics_list)
+    df['label'], topic_model, best_n_topics = clustering.fit_and_get_labels()
+
+    print("The clustering result is: ")
+    for col in df.columns:
+        print(f"{col}: {df.iloc[0][col]}")
+
+    # Save topic model information
+    topic_json = topic_model.get_topic_info().to_json()
+    with open(f'./src/static/data/info/{survey_id}/topic.json', 'w', encoding="utf-8") as file:
+        file.write(topic_json)
+
+    # Build the description mapping
+    description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
+    with open(f'./src/static/data/info/{survey_id}/description.json', 'w', encoding="utf-8") as file:
+        json.dump(description_dict, file, ensure_ascii=False, indent=4)
+
+    # t-SNE dimensionality reduction for visualization
+    X = np.array(clustering.embeddings)
+    perplexity = min(10, max(1, X.shape[0] // 2))  # keep perplexity below the sample count
+
+    tsne = TSNE(n_components=2, init='pca', perplexity=perplexity, random_state=42)
+    X_tsne = tsne.fit_transform(X)
+
+    colors = scatter(X_tsne, df['label'])  # compute colors
+
+    plt.savefig(IMG_PATH + 'tsne_' + survey_id + '.png', dpi=800, transparent=True)
+
+    plt.close()
+    output_tsv_filename = "./src/static/data/tsv/" + survey_id + '.tsv'
+    df.to_csv(output_tsv_filename, sep='\t')
+    return df, colors, best_n_topics
+
+def scatter(x, colors):
+    sns.set_style('whitegrid')
+    sns.set_palette('Set1')
+    sns.set_context("notebook", font_scale=1.5,
+                    rc={"lines.linewidth": 2.5})
+    # We choose a color palette with seaborn.
+    palette = np.array(sns.hls_palette(8, l=0.4, s=.8))
+    color_hex = sns.color_palette(sns.hls_palette(8, l=0.4, s=.8)).as_hex()
+    # We create a scatter plot.
+    f = plt.figure(figsize=(8, 8))
+    ax = plt.subplot(aspect='equal')
+    sc = ax.scatter(x[:, 0], x[:, 1], lw=0, s=1,
+                    c=palette[colors.astype(np.int32)])
+    c = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in colors]
+    for i in range(x.shape[0]):
+        ax.text(x[i, 0], x[i, 1], '[' + str(i) + ']', fontsize=20, color=c[i], weight='1000')
+    plt.xlim(-25, 25)
+    plt.ylim(-25, 25)
+    ax.axis('off')
+    ax.axis('tight')
+    return color_hex[:colors.nunique()]
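The silhouette-based selection in fit_and_get_labels can be illustrated without the full BERTopic pipeline; this sketch uses plain AgglomerativeClustering on random vectors standing in for the sentence embeddings (not part of the commit):

    import numpy as np
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.metrics import silhouette_score

    X = np.random.default_rng(0).random((40, 5))   # stand-in for the embeddings
    best_score, best_k = -1, None
    for k in [3, 4, 5]:                            # candidate counts, as in n_topics_list
        labels = AgglomerativeClustering(n_clusters=k).fit_predict(X)
        score = silhouette_score(X, labels) if len(set(labels)) > 1 else -1
        if score > best_score:
            best_score, best_k = score, k
    print(best_k, round(best_score, 3))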
src/demo/count_files.py
ADDED
@@ -0,0 +1,20 @@
+import os
+
+def count_files_in_folders(parent_folder):
+    folder_counts = {}
+
+    for folder in os.listdir(parent_folder):
+        folder_path = os.path.join(parent_folder, folder)
+        if os.path.isdir(folder_path):
+            num_files = len([f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))])
+            folder_counts[folder] = num_files
+
+    return folder_counts
+
+# Path to the arxiv_downloads folder
+parent_folder = "arxiv_downloads"
+counts = count_files_in_folders(parent_folder)
+
+# Print the per-folder counts
+for folder, num_files in counts.items():
+    print(f"{folder}: {num_files} files")
src/demo/download.py
ADDED
@@ -0,0 +1,225 @@
+import os
+import requests
+import xml.etree.ElementTree as ET
+import urllib.parse
+from tqdm import tqdm
+import time
+import re
+
+from asg_query import generate_generic_query_qwen, generate_query_qwen
+from dotenv import load_dotenv
+
+load_dotenv()
+
+PARENT_FOLDER = "arxiv_downloads_new_new_new"
+os.makedirs(PARENT_FOLDER, exist_ok=True)
+
+def sanitize_filename(filename):
+    filename = filename.replace("\n", "").strip()
+    filename = re.sub(r'[\/:*?"<>|]', '_', filename)
+    return filename[:100] + ".pdf"
+
+def search_arxiv_papers(topic, max_results=50):
+    query_qwen = generate_query_qwen(topic)
+    encoded_query = urllib.parse.quote_plus(query_qwen)
+    url = f"https://export.arxiv.org/api/query?search_query={encoded_query}&start=0&max_results={max_results}&sortBy=submittedDate"
+
+    # base_url = "http://export.arxiv.org/api/query?"
+    # query = f"search_query=all:{topic.replace(' ', '+')}&start=0&max_results={max_results}&sortBy=relevance&sortOrder=descending"
+    # url = base_url + query
+
+    response = requests.get(url)
+    if response.status_code != 200:
+        print(f"Error fetching data for {topic}: {response.status_code}")
+        return []
+
+    root = ET.fromstring(response.text)
+    entries = root.findall("{http://www.w3.org/2005/Atom}entry")
+
+    papers = []
+    for entry in entries:
+        title = entry.find("{http://www.w3.org/2005/Atom}title").text
+        pdf_link = entry.find("{http://www.w3.org/2005/Atom}id").text.replace("abs", "pdf")
+        papers.append({"title": title, "pdf_link": pdf_link})
+
+    return papers
+
+def download_pdf(url, folder, filename):
+    file_path = os.path.join(folder, filename)
+
+    response = requests.get(url, stream=True)
+    if response.status_code == 200:
+        with open(file_path, 'wb') as file:
+            for chunk in response.iter_content(chunk_size=1024):
+                file.write(chunk)
+    else:
+        print(f"Failed to download {url}")
+
+def download_arxiv_papers(topic, max_results=50):
+    folder_name = os.path.join(PARENT_FOLDER, topic.replace(" ", "_"))
+    os.makedirs(folder_name, exist_ok=True)
+
+    papers = search_arxiv_papers(topic, max_results)
+
+    if not papers:
+        print(f"No papers found for topic: {topic}")
+        return
+
+    print(f"Downloading {len(papers)} papers for topic: {topic}")
+
+    for paper in tqdm(papers, total=len(papers)):
+        filename = sanitize_filename(paper['title'])
+        pdf_link = paper["pdf_link"]
+        download_pdf(pdf_link, folder_name, filename)
+        time.sleep(2)
+
+    print(f"Download complete. Papers saved in: {folder_name}")
+
+
+def search_arxiv_with_query(query, max_results=50):
+    """
+    Query the arXiv API with a given query string.
+
+    Parameters:
+        query (str): The query string (URL-unencoded).
+        max_results (int): Maximum number of results to request.
+
+    Returns:
+        list: A list of dictionaries containing paper metadata.
+    """
+    encoded_query = urllib.parse.quote_plus(query)
+    url = f"https://export.arxiv.org/api/query?search_query={encoded_query}&start=0&max_results={max_results}&sortBy=submittedDate"
+
+    response = requests.get(url)
+    if response.status_code != 200:
+        print(f"Error fetching data with query: {query} | status code: {response.status_code}")
+        return []
+
+    try:
+        root = ET.fromstring(response.text)
+    except Exception as e:
+        print("Error parsing XML:", e)
+        return []
+
+    entries = root.findall("{http://www.w3.org/2005/Atom}entry")
+    papers = []
+    for entry in entries:
+        title = entry.find("{http://www.w3.org/2005/Atom}title").text.strip()
+        pdf_link = entry.find("{http://www.w3.org/2005/Atom}id").text.replace("abs", "pdf")
+        papers.append({"title": title, "pdf_link": pdf_link})
+    return papers
+
+def download_arxiv_papers_new(topic, max_results=50, min_results=10):
+    """
+    Download arXiv papers for a given topic.
+
+    Process:
+    1. Use a strict query generated by generate_query_qwen(topic) to query arXiv.
+    2. If the number of results is fewer than `min_results`, generate a more generic query
+       using generate_generic_query_qwen() and run additional searches.
+    3. Combine non-duplicate papers (filtered by title) until reaching max_results or exhausting attempts.
+    4. Download the PDF of each paper.
+
+    Parameters:
+        topic (str): The research topic.
+        max_results (int): Total maximum number of papers to download (default is 50).
+        min_results (int): Minimum acceptable number of papers from the first query (default is 10).
+    """
+    folder_name = os.path.join(PARENT_FOLDER, topic.replace(" ", "_"))
+    os.makedirs(folder_name, exist_ok=True)
+
+    # 1. Initial strict query.
+    strict_query = generate_query_qwen(topic)
+    papers = search_arxiv_with_query(strict_query, max_results=max_results)
+
+    # Use a dict keyed by title to avoid duplicates.
+    total_papers = {paper["title"]: paper for paper in papers}
+    print(f"[Strict Query] Found {len(total_papers)} papers for topic: {topic}")
+
+    # 2. If the strict query returns fewer than min_results papers,
+    #    use the generic query to broaden the search.
+    attempts = 0
+    MAX_ATTEMPTS = 5  # Limit attempts to avoid infinite loops.
+    while len(total_papers) < max_results and len(total_papers) < min_results and attempts < MAX_ATTEMPTS:
+        # Generate a less strict (generic) query
+        generic_query = generate_generic_query_qwen(strict_query, topic)
+        print(f"[Generic Query Attempt {attempts + 1}] Using generic query: {generic_query}")
+        generic_papers = search_arxiv_with_query(generic_query, max_results=max_results)
+
+        new_count = 0
+        for paper in generic_papers:
+            if paper["title"] not in total_papers:
+                total_papers[paper["title"]] = paper
+                new_count += 1
+            if len(total_papers) >= max_results:
+                break
+
+        attempts += 1
+        strict_query = generic_query  # Update the query for the next iteration.
+
+    total_paper_list = list(total_papers.values())[:max_results]
+
+    if not total_paper_list:
+        print(f"No papers found for topic: {topic}")
+        return
+
+    print(f"Downloading {len(total_paper_list)} papers for topic: {topic}")
+    for paper in tqdm(total_paper_list, total=len(total_paper_list)):
+        filename = sanitize_filename(paper['title'])
+        pdf_link = paper["pdf_link"]
+        download_pdf(pdf_link, folder_name, filename)
+        time.sleep(2)  # Delay to avoid overwhelming the arXiv API
+
+    print(f"Download complete. Papers saved in: {folder_name}")
+
+first_topics = [
+    "quantum computing: bqp, quantum supremacy, and related concepts",
+    "fixed-parameter tractability and related concepts in computational complexity",
+    "fundamental concepts in computational complexity theory",
+    "pcp theorem and its implications in approximation and complexity theory",
+    "interconnections in theoretical computer science: seth, 3sum, apsp, and related concepts",
+    "nosql database systems for flexible and scalable data management",
+    "temporal databases, real-time databases, and data management systems",
+    "large language model integration with databases for enhanced data management and survey analysis",
+    "ai-driven database management",
+    "distributed systems and databases: key concepts and technologies",
+    "graph databases and query languages: traversal, indexing, and analytics",
+    "graph databases: models, data modeling, and applications",
+    "multi-model databases: mongodb, arangodb, and jsonb",
+    "time-series data management and analytics",
+    "advanced data management and retrieval techniques",
+    "vector databases and their role in modern data management and retrieval",
+    "content delivery networks: technologies and strategies for optimization",
+    "lpwan technologies: lora, zigbee 3.0, 6lowpan, and related protocols in iot",
+    "network slicing and emerging technologies in 6g networks",
+    "advanced concepts and technologies in software-defined networking and network function virtualization",
+    "battery electrolyte formulation in lithium-ion batteries",
+    "flow batteries as energy storage systems",
+    "internal consistency, self-feedback, and reliability in large language models",
+    "attention mechanisms in large language models",
+    "controlled text generation with large language models in natural language processing",
+    "domain adaptation and specialized nlp applications",
+    "evaluation of large language models for natural language processing",
+    "information extraction and large language models in natural language processing",
+    "techniques for low-resource natural language processing",
+    "model compression techniques for transformer models",
+    "multi-agent offline policy reinforcement learning: decentralized learning and cooperative policy optimization",
+    "multimodal learning and its applications",
+    "reasoning capabilities of large language models",
+    "transformer models in natural language processing"
+]
+
+second_topics = [
+    "semi-supervised learning",
+    "out-of-distribution detection",
+    "in-context learning"
+]
+
+if __name__ == '__main__':
+    for topic in first_topics:
+        print(f"\nProcessing topic (first list): {topic}")
+        download_arxiv_papers_new(topic, max_results=50, min_results=20)
+    for topic in second_topics:
+        print(f"\nProcessing topic (second list): {topic}")
+        download_arxiv_papers_new(topic, max_results=50, min_results=20)
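For reference, the raw arXiv API call that search_arxiv_papers and search_arxiv_with_query wrap looks like this (not part of the commit; the query string is illustrative):

    import urllib.parse
    import requests
    import xml.etree.ElementTree as ET

    query = '(abs:"in-context learning" AND abs:"language model")'  # illustrative
    url = ("https://export.arxiv.org/api/query?search_query="
           f"{urllib.parse.quote_plus(query)}&start=0&max_results=5&sortBy=submittedDate")
    root = ET.fromstring(requests.get(url, timeout=30).text)
    for entry in root.findall("{http://www.w3.org/2005/Atom}entry"):
        print(entry.find("{http://www.w3.org/2005/Atom}title").text.strip())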
src/demo/latex_template/acl.sty
ADDED
@@ -0,0 +1,312 @@
1 |
+
% This is the LaTex style file for *ACL.
|
2 |
+
% The official sources can be found at
|
3 |
+
%
|
4 |
+
% https://github.com/acl-org/acl-style-files/
|
5 |
+
%
|
6 |
+
% This package is activated by adding
|
7 |
+
%
|
8 |
+
% \usepackage{acl}
|
9 |
+
%
|
10 |
+
% to your LaTeX file. When submitting your paper for review, add the "review" option:
|
11 |
+
%
|
12 |
+
% \usepackage[review]{acl}
|
13 |
+
|
14 |
+
\newif\ifacl@finalcopy
|
15 |
+
\newif\ifacl@anonymize
|
16 |
+
\newif\ifacl@linenumbers
|
17 |
+
\newif\ifacl@pagenumbers
|
18 |
+
\DeclareOption{final}{\acl@finalcopytrue\acl@anonymizefalse\acl@linenumbersfalse\acl@pagenumbersfalse}
|
19 |
+
\DeclareOption{review}{\acl@finalcopyfalse\acl@anonymizetrue\acl@linenumberstrue\acl@pagenumberstrue}
|
20 |
+
\DeclareOption{preprint}{\acl@finalcopytrue\acl@anonymizefalse\acl@linenumbersfalse\acl@pagenumberstrue}
|
21 |
+
\ExecuteOptions{final} % final copy is the default
|
22 |
+
|
23 |
+
% include hyperref, unless user specifies nohyperref option like this:
|
24 |
+
% \usepackage[nohyperref]{acl}
|
25 |
+
\newif\ifacl@hyperref
|
26 |
+
\DeclareOption{hyperref}{\acl@hyperreftrue}
|
27 |
+
\DeclareOption{nohyperref}{\acl@hyperreffalse}
|
28 |
+
\ExecuteOptions{hyperref} % default is to use hyperref
|
29 |
+
\ProcessOptions\relax
|
30 |
+
|
31 |
+
\typeout{Conference Style for ACL}
|
32 |
+
|
33 |
+
\usepackage{xcolor}
|
34 |
+
|
35 |
+
\ifacl@linenumbers
|
36 |
+
% Add draft line numbering via the lineno package
|
37 |
+
% https://texblog.org/2012/02/08/adding-line-numbers-to-documents/
|
38 |
+
\usepackage[switch,mathlines]{lineno}
|
39 |
+
|
40 |
+
% Line numbers in gray Helvetica 8pt
|
41 |
+
\font\aclhv = phvb at 8pt
|
42 |
+
\renewcommand\linenumberfont{\aclhv\color{lightgray}}
|
43 |
+
|
44 |
+
% Zero-fill line numbers
|
45 |
+
% NUMBER with left flushed zeros \fillzeros[<WIDTH>]<NUMBER>
|
46 |
+
\newcount\cv@tmpc@ \newcount\cv@tmpc
|
47 |
+
\def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi
|
48 |
+
\cv@tmpc=1 %
|
49 |
+
\loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi
|
50 |
+
\ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat
|
51 |
+
\ifnum#2<0\advance\cv@tmpc1\relax-\fi
|
52 |
+
\loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat
|
53 |
+
\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}%
|
54 |
+
\renewcommand\thelinenumber{\fillzeros[3]{\arabic{linenumber}}}
|
55 |
+
\linenumbers
|
56 |
+
|
57 |
+
\setlength{\linenumbersep}{1.6cm}
|
58 |
+
|
59 |
+
% Bug: An equation with $$ ... $$ isn't numbered, nor is the previous line.
|
60 |
+
|
61 |
+
% Patch amsmath commands so that the previous line and the equation itself
|
62 |
+
% are numbered. Bug: multline has an extra line number.
|
63 |
+
% https://tex.stackexchange.com/questions/461186/how-to-use-lineno-with-amsmath-align
|
64 |
+
\usepackage{etoolbox} %% <- for \pretocmd, \apptocmd and \patchcmd
|
65 |
+
|
66 |
+
\newcommand*\linenomathpatch[1]{%
|
67 |
+
\expandafter\pretocmd\csname #1\endcsname {\linenomath}{}{}%
|
68 |
+
\expandafter\pretocmd\csname #1*\endcsname {\linenomath}{}{}%
|
69 |
+
\expandafter\apptocmd\csname end#1\endcsname {\endlinenomath}{}{}%
|
70 |
+
\expandafter\apptocmd\csname end#1*\endcsname {\endlinenomath}{}{}%
|
71 |
+
}
|
72 |
+
\newcommand*\linenomathpatchAMS[1]{%
|
73 |
+
\expandafter\pretocmd\csname #1\endcsname {\linenomathAMS}{}{}%
|
74 |
+
\expandafter\pretocmd\csname #1*\endcsname {\linenomathAMS}{}{}%
|
75 |
+
\expandafter\apptocmd\csname end#1\endcsname {\endlinenomath}{}{}%
|
76 |
+
\expandafter\apptocmd\csname end#1*\endcsname {\endlinenomath}{}{}%
|
77 |
+
}
|
78 |
+
|
79 |
+
%% Definition of \linenomathAMS depends on whether the mathlines option is provided
|
80 |
+
\expandafter\ifx\linenomath\linenomathWithnumbers
|
81 |
+
\let\linenomathAMS\linenomathWithnumbers
|
82 |
+
%% The following line gets rid of an extra line numbers at the bottom:
|
83 |
+
\patchcmd\linenomathAMS{\advance\postdisplaypenalty\linenopenalty}{}{}{}
|
84 |
+
\else
|
85 |
+
\let\linenomathAMS\linenomathNonumbers
|
86 |
+
\fi
|
87 |
+
|
88 |
+
\AtBeginDocument{%
|
89 |
+
\linenomathpatch{equation}%
|
90 |
+
\linenomathpatchAMS{gather}%
|
91 |
+
\linenomathpatchAMS{multline}%
|
92 |
+
\linenomathpatchAMS{align}%
|
93 |
+
\linenomathpatchAMS{alignat}%
|
94 |
+
\linenomathpatchAMS{flalign}%
|
95 |
+
}
|
96 |
+
\else
|
97 |
+
% Hack to ignore these commands, which review mode puts into the .aux file.
|
98 |
+
\newcommand{\@LN@col}[1]{}
|
99 |
+
\newcommand{\@LN}[2]{}
|
100 |
+
\newcommand{\nolinenumbers}{}
|
101 |
+
\fi
|
102 |
+
|
103 |
+
\PassOptionsToPackage{a4paper,margin=2.5cm,heightrounded=true}{geometry}
|
104 |
+
\RequirePackage{geometry}
|
105 |
+
|
106 |
+
\setlength\columnsep{0.6cm}
|
107 |
+
\newlength\titlebox
|
108 |
+
\setlength\titlebox{11\baselineskip}
|
109 |
+
% \titlebox should be a multiple of \baselineskip so that
|
110 |
+
% column height remaining fits an exact number of lines of text
|
111 |
+
|
112 |
+
\flushbottom \twocolumn \sloppy
|
113 |
+
|
114 |
+
% We're never going to need a table of contents, so just flush it to
|
115 |
+
% save space --- suggested by drstrip@sandia-2
|
116 |
+
\def\addcontentsline#1#2#3{}
|
117 |
+
|
118 |
+
\ifacl@pagenumbers
|
119 |
+
\pagenumbering{arabic}
|
120 |
+
\else
|
121 |
+
\thispagestyle{empty}
|
122 |
+
\pagestyle{empty}
|
123 |
+
\fi
|
124 |
+
|
125 |
+
%% Title and Authors %%
|
126 |
+
|
127 |
+
\let\Thanks\thanks % \Thanks and \thanks used to be different, but keep this for backwards compatibility.
|
128 |
+
|
129 |
+
\newcommand\outauthor{%
|
130 |
+
\begin{tabular}[t]{c}
|
131 |
+
\ifacl@anonymize
|
132 |
+
\bfseries Anonymous ACL submission
|
133 |
+
\else
|
134 |
+
\bfseries\@author
|
135 |
+
\fi
|
136 |
+
\end{tabular}}
|
137 |
+
|
138 |
+
% Mostly taken from deproc.
|
139 |
+
\AtBeginDocument{
|
140 |
+
\def\maketitle{\par
|
141 |
+
\begingroup
|
142 |
+
\def\thefootnote{\fnsymbol{footnote}}
|
143 |
+
\twocolumn[\@maketitle]
|
144 |
+
\@thanks
|
145 |
+
\endgroup
|
146 |
+
\setcounter{footnote}{0}
|
147 |
+
\let\maketitle\relax
|
148 |
+
\let\@maketitle\relax
|
149 |
+
\gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax}
|
150 |
+
\def\@maketitle{\vbox to \titlebox{\hsize\textwidth
|
151 |
+
\linewidth\hsize \vskip 0.125in minus 0.125in \centering
|
152 |
+
{\Large\bfseries \@title \par} \vskip 0.2in plus 1fil minus 0.1in
|
153 |
+
{\def\and{\unskip\enspace{\rmfamily and}\enspace}%
|
154 |
+
\def\And{\end{tabular}\hss \egroup \hskip 1in plus 2fil
|
155 |
+
\hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\bfseries}%
|
156 |
+
\def\AND{\end{tabular}\hss\egroup \hfil\hfil\egroup
|
157 |
+
\vskip 0.25in plus 1fil minus 0.125in
|
158 |
+
\hbox to \linewidth\bgroup\large \hfil\hfil
|
159 |
+
\hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\bfseries}
|
160 |
+
\hbox to \linewidth\bgroup\large \hfil\hfil
|
161 |
+
\hbox to 0pt\bgroup\hss
|
162 |
+
\outauthor
|
163 |
+
\hss\egroup
|
164 |
+
\hfil\hfil\egroup}
|
165 |
+
\vskip 0.3in plus 2fil minus 0.1in
|
166 |
+
}}
|
167 |
+
}
|
168 |
+
|
169 |
+
% margins and font size for abstract
|
170 |
+
\renewenvironment{abstract}%
|
171 |
+
{\begin{center}\large\textbf{\abstractname}\end{center}%
|
172 |
+
\begin{list}{}%
|
173 |
+
{\setlength{\rightmargin}{0.6cm}%
|
174 |
+
\setlength{\leftmargin}{0.6cm}}%
|
175 |
+
\item[]\ignorespaces%
|
176 |
+
\@setsize\normalsize{12pt}\xpt\@xpt
|
177 |
+
}%
|
178 |
+
{\unskip\end{list}}
|
179 |
+
|
180 |
+
% Resizing figure and table captions - SL
|
181 |
+
% Support for interacting with the caption, subfigure, and subcaption packages - SL
|
182 |
+
\RequirePackage{caption}
|
183 |
+
\DeclareCaptionFont{10pt}{\fontsize{10pt}{12pt}\selectfont}
|
184 |
+
\captionsetup{font=10pt}
|
185 |
+
|
186 |
+
\RequirePackage{natbib}
|
187 |
+
% for citation commands in the .tex, authors can use:
|
188 |
+
% \citep, \citet, and \citeyearpar for compatibility with natbib, or
|
189 |
+
% \cite, \newcite, and \shortcite for compatibility with older ACL .sty files
|
190 |
+
\renewcommand\cite{\citep} % to get "(Author Year)" with natbib
|
191 |
+
\newcommand\shortcite{\citeyearpar}% to get "(Year)" with natbib
|
192 |
+
\newcommand\newcite{\citet} % to get "Author (Year)" with natbib
\newcommand{\citeposs}[1]{\citeauthor{#1}'s (\citeyear{#1})} % to get "Author's (Year)"

\bibliographystyle{acl_natbib}

% Bibliography

% Don't put a label in the bibliography at all. Just use the unlabeled format
% instead.
\def\thebibliography#1{\vskip\parskip%
  \vskip\baselineskip%
  \def\baselinestretch{1}%
  \ifx\@currsize\normalsize\@normalsize\else\@currsize\fi%
  \vskip-\parskip%
  \vskip-\baselineskip%
  \section*{References\@mkboth
    {References}{References}}\list
    {}{\setlength{\labelwidth}{0pt}\setlength{\leftmargin}{\parindent}
    \setlength{\itemindent}{-\parindent}}
  \def\newblock{\hskip .11em plus .33em minus -.07em}
  \sloppy\clubpenalty4000\widowpenalty4000
  \sfcode`\.=1000\relax}
\let\endthebibliography=\endlist


% Allow for a bibliography of sources of attested examples
\def\thesourcebibliography#1{\vskip\parskip%
  \vskip\baselineskip%
  \def\baselinestretch{1}%
  \ifx\@currsize\normalsize\@normalsize\else\@currsize\fi%
  \vskip-\parskip%
  \vskip-\baselineskip%
  \section*{Sources of Attested Examples\@mkboth
    {Sources of Attested Examples}{Sources of Attested Examples}}\list
    {}{\setlength{\labelwidth}{0pt}\setlength{\leftmargin}{\parindent}
    \setlength{\itemindent}{-\parindent}}
  \def\newblock{\hskip .11em plus .33em minus -.07em}
  \sloppy\clubpenalty4000\widowpenalty4000
  \sfcode`\.=1000\relax}
\let\endthesourcebibliography=\endlist

% sections with less space
\def\section{\@startsection {section}{1}{\z@}{-2.0ex plus
  -0.5ex minus -.2ex}{1.5ex plus 0.3ex minus .2ex}{\large\bfseries\raggedright}}
\def\subsection{\@startsection{subsection}{2}{\z@}{-1.8ex plus
  -0.5ex minus -.2ex}{0.8ex plus .2ex}{\normalsize\bfseries\raggedright}}
%% changed by KO to - values to get the initial parindent right
\def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-1.5ex plus
  -0.5ex minus -.2ex}{0.5ex plus .2ex}{\normalsize\bfseries\raggedright}}
\def\paragraph{\@startsection{paragraph}{4}{\z@}{1.5ex plus
  0.5ex minus .2ex}{-1em}{\normalsize\bfseries}}
\def\subparagraph{\@startsection{subparagraph}{5}{\parindent}{1.5ex plus
  0.5ex minus .2ex}{-1em}{\normalsize\bfseries}}

% Footnotes
\footnotesep 6.65pt %
\skip\footins 9pt plus 4pt minus 2pt
\def\footnoterule{\kern-3pt \hrule width 5pc \kern 2.6pt }
\setcounter{footnote}{0}

% Lists and paragraphs
\parindent 1em
\topsep 4pt plus 1pt minus 2pt
\partopsep 1pt plus 0.5pt minus 0.5pt
\itemsep 2pt plus 1pt minus 0.5pt
\parsep 2pt plus 1pt minus 0.5pt

\leftmargin 2em \leftmargini\leftmargin \leftmarginii 2em
\leftmarginiii 1.5em \leftmarginiv 1.0em \leftmarginv .5em \leftmarginvi .5em
\labelwidth\leftmargini\advance\labelwidth-\labelsep \labelsep 5pt

\def\@listi{\leftmargin\leftmargini}
\def\@listii{\leftmargin\leftmarginii
  \labelwidth\leftmarginii\advance\labelwidth-\labelsep
  \topsep 2pt plus 1pt minus 0.5pt
  \parsep 1pt plus 0.5pt minus 0.5pt
  \itemsep \parsep}
\def\@listiii{\leftmargin\leftmarginiii
  \labelwidth\leftmarginiii\advance\labelwidth-\labelsep
  \topsep 1pt plus 0.5pt minus 0.5pt
  \parsep \z@ \partopsep 0.5pt plus 0pt minus 0.5pt
  \itemsep \topsep}
\def\@listiv{\leftmargin\leftmarginiv
  \labelwidth\leftmarginiv\advance\labelwidth-\labelsep}
\def\@listv{\leftmargin\leftmarginv
  \labelwidth\leftmarginv\advance\labelwidth-\labelsep}
\def\@listvi{\leftmargin\leftmarginvi
  \labelwidth\leftmarginvi\advance\labelwidth-\labelsep}

\abovedisplayskip 7pt plus2pt minus5pt%
\belowdisplayskip \abovedisplayskip
\abovedisplayshortskip 0pt plus3pt%
\belowdisplayshortskip 4pt plus3pt minus3pt%

% Less leading in most fonts (due to the narrow columns)
% The choices were between 1-pt and 1.5-pt leading
\def\@normalsize{\@setsize\normalsize{11pt}\xpt\@xpt}
\def\small{\@setsize\small{10pt}\ixpt\@ixpt}
\def\footnotesize{\@setsize\footnotesize{10pt}\ixpt\@ixpt}
\def\scriptsize{\@setsize\scriptsize{8pt}\viipt\@viipt}
\def\tiny{\@setsize\tiny{7pt}\vipt\@vipt}
\def\large{\@setsize\large{14pt}\xiipt\@xiipt}
\def\Large{\@setsize\Large{16pt}\xivpt\@xivpt}
\def\LARGE{\@setsize\LARGE{20pt}\xviipt\@xviipt}
\def\huge{\@setsize\huge{23pt}\xxpt\@xxpt}
\def\Huge{\@setsize\Huge{28pt}\xxvpt\@xxvpt}

% The hyperref manual (section 9) says hyperref should be loaded after natbib
\ifacl@hyperref
  \PassOptionsToPackage{breaklinks}{hyperref}
  \RequirePackage{hyperref}
  % make links dark blue
  \definecolor{darkblue}{rgb}{0, 0, 0.5}
  \hypersetup{colorlinks=true, citecolor=darkblue, linkcolor=darkblue, urlcolor=darkblue}
\else
  % This definition is used if the hyperref package is not loaded.
  % It provides a backup, no-op definition of \href.
  % This is necessary because the \href command is used in the acl_natbib.bst file.
  \def\href#1#2{{#2}}
  \usepackage{url}
\fi
src/demo/latex_template/template.tex
ADDED
@@ -0,0 +1,22 @@
\documentclass[11pt]{article}

\usepackage[final]{acl}
\usepackage{times}
\usepackage{latexsym}
\usepackage[T1]{fontenc}
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{amsmath}

\author{
  InteractiveSurvey \\
  Affiliation Line 1 \\
  Affiliation Line 2 \\
  \texttt{InteractiveSurvey@domain} \\
}

\begin{document}
\maketitle


\end{document}
src/demo/main.py
ADDED
@@ -0,0 +1,448 @@
import csv
import json
import os

import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings
from asg_retriever import legal_pdf
from asg_loader import DocumentLoading
from asg_retriever import Retriever, query_embeddings_new_new
from asg_generator import generate_sentence_patterns, generate
from category_and_tsne import clustering
from langchain_text_splitters import RecursiveCharacterTextSplitter
import time
import torch
import re
import transformers
from dotenv import load_dotenv
from asg_clustername import generate_cluster_name_new
from asg_outline import OutlineGenerator, generateSurvey_qwen_new
from markdown_pdf import MarkdownPdf, Section  # Assuming you are using markdown_pdf
from typing import Any

def clean_str(input_str):
    input_str = str(input_str).strip().lower()
    if input_str == "none" or input_str == "nan" or len(input_str) == 0:
        return ""
    input_str = input_str.replace('\\n',' ').replace('\n',' ').replace('\r',' ').replace('——',' ').replace('——',' ').replace('__',' ').replace('__',' ').replace('........','.').replace('....','.').replace('....','.').replace('..','.').replace('..','.').replace('..','.').replace('. . . . . . . . ','. ').replace('. . . . ','. ').replace('. . . . ','. ').replace('. . ','. ').replace('. . ','. ')
    input_str = re.sub(r'\\u[0-9a-z]{4}', ' ', input_str).replace('  ', ' ').replace('  ', ' ')
    return input_str

def remove_invalid_citations(text, valid_collection_names):
    """
    Keep only citation markers of the form [xxx\] whose xxx appears in
    valid_collection_names; strip every other citation marker.
    """
    pattern = r"\[(.*?)\\\]"  # match markers of the form [xxx\]
    all_matches = re.findall(pattern, text)

    new_text = text
    for match in all_matches:
        cleaned_match = match.rstrip('\\')  # drop the trailing backslash
        if cleaned_match not in valid_collection_names:
            new_text = new_text.replace(f"[{match}\\]", "")
    return new_text

def normalize_citations_with_mapping(paper_text):
    # Match every citation marker of the form [citation1]
    citations = re.findall(r'\[.*?\]', paper_text)
    # Deduplicate while preserving order
    unique_citations = list(dict.fromkeys(citations))
    # Build a mapping from the original markers to numeric citations
    citation_mapping = {citation: f'[{i + 1}]' for i, citation in enumerate(unique_citations)}

    # Replace the old markers in the text with the new numeric ones
    normalized_text = paper_text
    for old_citation, new_citation in citation_mapping.items():
        normalized_text = normalized_text.replace(old_citation, new_citation)

    # Build the reverse mapping from number back to the original marker,
    # using rstrip('\\') to drop the trailing backslash
    reverse_mapping = {
        i + 1: unique_citations[i].strip('[]').rstrip('\\')
        for i in range(len(unique_citations))
    }

    return normalized_text, reverse_mapping

def generate_references_section(citation_mapping, collection_pdf_mapping):

    references = ["# References"]  # build the references section
    for num in sorted(citation_mapping.keys()):
        collection_name = citation_mapping[num]
        pdf_name = collection_pdf_mapping.get(collection_name, "Unknown PDF")
        if pdf_name.endswith(".pdf"):
            pdf_name = pdf_name[:-4]
        # Two trailing spaces force a Markdown line break
        references.append(f"[{num}] {pdf_name}  ")

    return "\n".join(references)

def fix_citation_punctuation_md(text):
    """
    Turn patterns like 'some text. \[1]' or 'some text. \[2]' into
    'some text \[1].'.
    Only effective once citations are already numeric Markdown references
    such as \[1], \[2]; run normalize_citations_with_mapping first otherwise.
    """
    # Match a period, optional whitespace, then \[number]
    pattern = r'\.\s*(\\\[\d+\])'
    replacement = r' \1.'
    fixed_text = re.sub(pattern, replacement, text)
    return fixed_text

def finalize_survey_paper(paper_text,
                          Global_collection_names,
                          Global_file_names):

    # 1) Drop every unwanted legacy citation (including [1], [Sewon, 2021], etc.)
    paper_text = remove_invalid_citations(paper_text, Global_collection_names)

    # 2) Normalize citations => [1][2]...
    normalized_text, citation_mapping = normalize_citations_with_mapping(paper_text)

    # 3) Fix punctuation, e.g. .[1] => [1].
    normalized_text = fix_citation_punctuation_md(normalized_text)

    # 4) Build the {collection_name: pdf_file_name} dictionary
    collection_pdf_mapping = dict(zip(Global_collection_names, Global_file_names))

    # 5) Generate the References section
    references_section = generate_references_section(citation_mapping, collection_pdf_mapping)

    # 6) Concatenate the body and the References
    final_paper = normalized_text.strip() + "\n\n" + references_section
    return final_paper

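# A minimal, hypothetical sketch of how the citation helpers above compose
# (the collection/file names below are illustrative, not from the real pipeline):
#
#   text = "RAG helps surveys [surveygen\\]. See also [unknown\\]."
#   text = remove_invalid_citations(text, ["surveygen"])     # drops [unknown\]
#   text, mapping = normalize_citations_with_mapping(text)   # [surveygen\] -> [1]
#   refs = generate_references_section(mapping, {"surveygen": "surveygen.pdf"})
#   print(text + "\n\n" + refs)                              # body plus "# References"
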
class ASG_system:
    def __init__(self, root_path: str, survey_id: str, pdf_path: str, survey_title: str, cluster_standard: str) -> None:
        load_dotenv()
        self.pdf_path = pdf_path
        self.txt_path = root_path + "/txt"
        self.tsv_path = root_path + "/tsv"
        self.md_path = root_path + "/md"
        self.info_path = root_path + "/info"
        self.result_path = root_path + "/result"

        self.survey_id = survey_id
        self.survey_title = survey_title
        self.cluster_standard = cluster_standard

        self.collection_names = []
        self.file_names = []
        self.citation_data = []
        self.description_list = []
        self.ref_list = []
        self.cluster_names = []
        self.collection_names_clustered = []
        self.df_selected = ''  # replaced by a DataFrame after clustering

        model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
        self.embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        self.pipeline = transformers.pipeline(
            "text-generation",
            model=model_id,
            model_kwargs={"torch_dtype": torch.bfloat16},
            token=os.getenv('HF_API_KEY'),
            device_map="auto",
        )
        self.pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_outline_generation", adapter_name="outline")
        self.pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_abstract_generation", adapter_name="abstract")
        self.pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_conclusion_generation", adapter_name="conclusion")

        os.makedirs(self.txt_path, exist_ok=True)
        os.makedirs(f'{self.txt_path}/{self.survey_id}', exist_ok=True)

        os.makedirs(self.tsv_path, exist_ok=True)

        os.makedirs(self.md_path, exist_ok=True)
        os.makedirs(f'{self.md_path}/{self.survey_id}', exist_ok=True)

        os.makedirs(self.info_path, exist_ok=True)
        os.makedirs(f'{self.info_path}/{self.survey_id}', exist_ok=True)

        os.makedirs(self.result_path, exist_ok=True)
        os.makedirs(f'{self.result_path}/{self.survey_id}', exist_ok=True)

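    # Note: downstream generation code is expected to switch between the three
    # LoRA adapters loaded above, e.g. (illustrative, mirroring asg_abstract.py):
    #   self.pipeline.model.set_adapter("abstract")
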
    def parsing_pdfs(self, mode="intro") -> None:
        pdf_files = os.listdir(self.pdf_path)
        loader = DocumentLoading()

        for pdf_file in pdf_files:

            pdf_file = os.path.join(self.pdf_path, pdf_file)

            split_start_time = time.time()

            base_name = os.path.splitext(os.path.basename(pdf_file))[0]
            target_dir = os.path.join(self.md_path, self.survey_id, base_name, "auto")
            md_dir = os.path.join(self.md_path, self.survey_id)

            loader.convert_pdf_to_md(pdf_file, md_dir)

            md_file_path = os.path.join(target_dir, f"{base_name}.md")
            print(md_file_path)
            print("*" * 24)
            if not os.path.exists(md_file_path):
                raise FileNotFoundError(f"Markdown file {md_file_path} does not exist. Conversion might have failed.")

            if mode == "intro":
                doc = loader.process_md_file(md_file_path, self.survey_id, self.txt_path)
            elif mode == "full":
                doc = loader.process_md_file_full(md_file_path, self.survey_id, self.txt_path)

            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=400,
                chunk_overlap=30,
                length_function=len,
                is_separator_regex=False,
            )
            splitters = text_splitter.create_documents([doc])
            documents_list = [document.page_content for document in splitters]
            for i in range(len(documents_list)):
                documents_list[i] = documents_list[i].replace('\n', ' ')
            print(f"Splitting took {time.time() - split_start_time} seconds.")

            embed_start_time = time.time()

            doc_results = self.embedder.embed_documents(documents_list)
            if isinstance(doc_results, torch.Tensor):
                embeddings_list = doc_results.tolist()
            else:
                embeddings_list = doc_results
            print(f"Embedding took {time.time() - embed_start_time} seconds.")

            # Prepare metadata
            metadata_list = [{"doc_name": os.path.basename(pdf_file)} for _ in range(len(documents_list))]

            title = os.path.splitext(os.path.basename(pdf_file))[0]

            title_new = title.strip()
            invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
            for char in invalid_chars:
                title_new = title_new.replace(char, ' ')
            print("============================")
            print(title_new)

            # New logic to create collection_name
            # filename = os.path.basename(file_path)
            collection_name = legal_pdf(title_new)

            retriever = Retriever()
            retriever.list_collections_chroma()
            retriever.create_collection_chroma(collection_name)
            retriever.add_documents_chroma(
                collection_name=collection_name,
                embeddings_list=embeddings_list,
                documents_list=documents_list,
                metadata_list=metadata_list
            )

            self.collection_names.append(collection_name)
            self.file_names.append(title_new)
            print(self.collection_names)
            print(self.file_names)

        json_files = os.listdir(os.path.join(self.txt_path, self.survey_id))
        ref_paper_num = len(json_files)
        print(f'The length of the json files is {ref_paper_num}')

        json_data_pd = pd.DataFrame()
        for _ in json_files:
            file_path = os.path.join(self.txt_path, self.survey_id, _)

            with open(file_path, 'r', encoding="utf-8") as file:
                data = json.load(file)

            # Extract necessary information
            title = data.get("title", "")
            abstract = data.get("abstract", "")
            authors = data.get("authors", "")
            introduction = data.get("introduction", "")
            new_data = {
                "reference paper title": title,
                "reference paper citation information (can be collected from Google scholar/DBLP)": authors,
                "reference paper abstract (Please copy the text AND paste here)": abstract,
                "reference paper introduction (Please copy the text AND paste here)": introduction,
                "reference paper doi link (optional)": "",
                "reference paper category label (optional)": ""
            }

            # Convert the new record into a DataFrame
            new_data_df = pd.DataFrame([new_data])

            # Use pd.concat instead of the deprecated append
            json_data_pd = pd.concat([json_data_pd, new_data_df], ignore_index=True)

        # Save the DataFrame to a variable for further use
        input_pd = json_data_pd

        if ref_paper_num > 0:

            ## change col name
            input_pd['ref_title'] = [filename for filename in self.file_names]
            input_pd["ref_context"] = [""] * ref_paper_num
            input_pd["ref_entry"] = input_pd["reference paper citation information (can be collected from Google scholar/DBLP)"]
            input_pd["abstract"] = input_pd["reference paper abstract (Please copy the text AND paste here)"].apply(lambda x: clean_str(x) if len(str(x)) > 0 else 'Invalid abstract')
            input_pd["intro"] = input_pd["reference paper introduction (Please copy the text AND paste here)"].apply(lambda x: clean_str(x) if len(str(x)) > 0 else 'Invalid introduction')

            # optional columns
            input_pd["label"] = input_pd["reference paper category label (optional)"].apply(lambda x: str(x) if len(str(x)) > 0 else '')
            #input_pd["label"] = input_pd["reference paper category id (optional)"].apply(lambda x: str(x) if len(str(x))>0 else '')
            ## output tsv
            # output_tsv_filename = self.tsv_path + self.survey_id + '.tsv'
            output_tsv_filename = os.path.join(self.tsv_path, self.survey_id + '.tsv')

            #output_df = input_pd[["ref_title","ref_context","ref_entry","abstract","intro","description"]]
            output_df = input_pd[["ref_title", "ref_context", "ref_entry", "abstract", "intro", 'label']]
            # print(output_df)

            #pdb.set_trace()
            output_df.to_csv(output_tsv_filename, sep='\t')

    def description_generation(self) -> None:
        query = self.cluster_standard
        query_list = generate_sentence_patterns(query)
        for name in self.collection_names:
            context, citation_data = query_embeddings_new_new(name, query_list)
            self.citation_data.extend(citation_data)

            description = generate(context, query, name)
            self.description_list.append(description)

        citation_path = f'{self.info_path}/{self.survey_id}/citation_data.json'
        os.makedirs(os.path.dirname(citation_path), exist_ok=True)
        with open(citation_path, 'w', encoding="utf-8") as outfile:
            json.dump(self.citation_data, outfile, indent=4, ensure_ascii=False)

        file_path = f'{self.tsv_path}/{self.survey_id}.tsv'

        with open(file_path, 'r', newline='', encoding='utf-8') as infile:
            reader = csv.reader(infile, delimiter='\t')
            rows = list(reader)
        if rows:
            headers = rows[0]
            headers.append('retrieval_result')

            updated_rows = [headers]
            for row, description in zip(rows[1:], self.description_list):
                row.append(description)
                updated_rows.append(row)

            with open(file_path, 'w', newline='', encoding='utf-8') as outfile:
                writer = csv.writer(outfile, delimiter='\t')
                writer.writerows(updated_rows)

            print('Updated file has been saved to', file_path)
        else:
            print('Input file is empty.')

    def agglomerative_clustering(self) -> None:
        df = pd.read_csv(f'{self.tsv_path}/{self.survey_id}.tsv', sep='\t', index_col=0, encoding='utf-8')
        df_selected = df

        df_selected, _ = clustering(df_selected, 3, self.survey_id, self.info_path, self.tsv_path)
        self.df_selected = df_selected

        df_tmp = df_selected.reset_index()
        df_tmp['index'] = df_tmp.index
        ref_titles = list(df_tmp.groupby(df_tmp['label'])['ref_title'].apply(list))
        # ref_indexs = list(df_tmp.groupby(df_tmp['label'])['index'].apply(list))

        category_label_summarized = generate_cluster_name_new(f"{self.tsv_path}/{self.survey_id}.tsv", self.survey_title)
        self.cluster_names = category_label_summarized

        cluster_info = {category_label_summarized[i]: ref_titles[i] for i in range(len(category_label_summarized))}
        for key, value in cluster_info.items():
            temp = [legal_pdf(i) for i in value]
            cluster_info[key] = temp
            self.collection_names_clustered.append(temp)
        cluster_info_path = f'{self.info_path}/{self.survey_id}/cluster_info.json'
        with open(cluster_info_path, 'w', encoding="utf-8") as outfile:
            json.dump(cluster_info, outfile, indent=4, ensure_ascii=False)

    def outline_generation(self) -> None:
        print(self.df_selected)
        print(self.cluster_names)
        outline_generator = OutlineGenerator(self.pipeline, self.df_selected, self.cluster_names)
        outline_generator.get_cluster_info()
        messages, outline = outline_generator.generate_outline_qwen(self.survey_title)
        outline_json = {'messages': messages, 'outline': outline}
        output_path = f'{self.info_path}/{self.survey_id}/outline.json'
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, 'w', encoding="utf-8") as outfile:
            json.dump(outline_json, outfile, indent=4, ensure_ascii=False)


    def section_generation(self) -> None:
        generateSurvey_qwen_new(self.survey_id, self.survey_title, self.collection_names_clustered, self.pipeline, self.citation_data, './txt', './info')

    def citation_generation(self) -> None:
        """
        Generate citation Markdown and PDF files from JSON and store them in the specified result path.
        """

        json_filepath = os.path.join(self.info_path, self.survey_id, "generated_result.json")

        markdown_dir = f'{self.result_path}/{self.survey_id}'
        markdown_filename = f'survey_{self.survey_id}.md'
        markdown_filepath = os.path.join(markdown_dir, markdown_filename)
        pdf_filename = f'survey_{self.survey_id}.pdf'
        pdf_filepath = os.path.join(markdown_dir, pdf_filename)

        markdown_content = self.get_markdown_content(json_filepath)
        if not markdown_content:
            raise ValueError("Markdown content is empty. Cannot generate citation files.")

        try:
            with open(markdown_filepath, 'w', encoding='utf-8') as markdown_file:
                markdown_file.write(markdown_content)
            print(f"Markdown content saved to: {markdown_filepath}")
        except Exception as e:
            raise RuntimeError(f"Failed to save Markdown file: {e}")

        try:
            pdf = MarkdownPdf()
            pdf.meta["title"] = "Citation Results"
            pdf.add_section(Section(markdown_content, toc=False))
            pdf.save(pdf_filepath)
            print(f"PDF content saved to: {pdf_filepath}")
        except Exception as e:
            raise RuntimeError(f"Failed to generate PDF file: {e}")
        print(f"Files generated successfully: \nMarkdown: {markdown_filepath}\nPDF: {pdf_filepath}")

    def get_markdown_content(self, json_filepath: str) -> str:
        """
        Read a JSON file and generate Markdown content based on its data.

        :param json_filepath: Path to the JSON file containing survey data.
        :return: A string containing the generated Markdown content.
        """
        try:
            with open(json_filepath, 'r', encoding='utf-8') as json_file:
                survey_data = json.load(json_file)
        except Exception as e:
            raise RuntimeError(f"Failed to read JSON file: {e}")

        topic = survey_data.get('survey_title', 'Default Topic')
        content = survey_data.get('content', 'No content available.')

        survey_title_markdown = f"# A Survey of {topic}\n\n"
        survey_content_markdown = content + "\n\n"

        markdown_content = survey_title_markdown + survey_content_markdown
        markdown_content = finalize_survey_paper(markdown_content, self.collection_names, self.file_names)
        return markdown_content

if __name__ == "__main__":
    root_path = "."
    pdf_path = "./pdfs/test"
    survey_title = "Automating Literature Review Generation with LLM"
    cluster_standard = "method"
    asg_system = ASG_system(root_path, 'test', pdf_path, survey_title, cluster_standard)
    asg_system.parsing_pdfs()
    asg_system.description_generation()
    asg_system.agglomerative_clustering()
    asg_system.outline_generation()
    asg_system.section_generation()
    asg_system.citation_generation()
src/demo/migrations/__init__.py
ADDED
File without changes
src/demo/models.py
ADDED
@@ -0,0 +1,14 @@
from django.db import models

# Create your models here.

# class Choose_Topic(models.Model):
#     question_text = models.CharField(max_length=200)
#     pub_date = models.DateTimeField('date published')
#
#
# class Choice(models.Model):
#     question = models.ForeignKey(Question, on_delete=models.CASCADE)
#     choice_text = models.CharField(max_length=200)
#     votes = models.IntegerField(default=0)
src/demo/postprocess.py
ADDED
@@ -0,0 +1,39 @@
import re

def reindex_citations(content):
    """
    Renumber every citation marker of the form [collection_name] in `content`
    globally as [1], [2], [3], ...
    Returns:
        new_content: the text after replacement
        source_map: {collection_name: index, ...}
    """
    pattern = r"\[([^\[\]]+)\]"
    source_map = {}
    current_index = 1

    def replace_func(match):
        nonlocal current_index
        source = match.group(1)
        if source not in source_map:
            source_map[source] = current_index
            current_index += 1
        return f"[{source_map[source]}]"

    new_content = re.sub(pattern, replace_func, content)
    return new_content, source_map

def generate_references_section(source_map):
    """
    Build the References section from source_map.
    source_map: {collection_name: index, ...}

    Returns:
        str:
            "References\n1 collection_name_1\n2 collection_name_2\n..."
    """
    # Sort the source_map entries by index
    index_to_source = sorted(source_map.items(), key=lambda x: x[1])
    refs_lines = ["References"]
    for source, idx in index_to_source:
        refs_lines.append(f"{idx} {source}")
    return "\n".join(refs_lines)
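
# Illustrative example of the two helpers above (hypothetical citation keys):
#
#   content = "RAG [paper_a] builds on attention [paper_b]; see [paper_a]."
#   new_content, source_map = reindex_citations(content)
#   # new_content == "RAG [1] builds on attention [2]; see [1]."
#   print(generate_references_section(source_map))
#   # References
#   # 1 paper_a
#   # 2 paper_b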
src/demo/query1.py
ADDED
@@ -0,0 +1,234 @@
import os
from openai import OpenAI
from datetime import datetime, timedelta
import re

def generate_abstract_qwen(topic):

    # Initialize the OpenAI client using environment variables
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    ###########################
    # Step 1: Generate a survey abstract for the given topic.
    ###########################
    system_prompt_abstract = """
You are a skilled research survey writer. Your task is to generate a survey abstract on the given topic. The abstract should cover the main challenges, key concepts, and research directions associated with the topic. Write in clear, concise academic English.
"""
    user_prompt_abstract = f"""
Topic: {topic}

Please generate a comprehensive survey abstract for this topic. Include discussion of core challenges, key terminologies, and emerging methodologies that are critical in the field. The total length of the abstract should be around 300–500 words.
"""
    messages_abstract = [
        {"role": "system", "content": system_prompt_abstract},
        {"role": "user", "content": user_prompt_abstract}
    ]

    abstract_response = client.chat.completions.create(
        model="Qwen2.5-72B-Instruct",
        max_tokens=2048,
        temperature=0.5,
        stop="<|im_end|>",
        stream=True,
        messages=messages_abstract
    )

    abstract_text = ""
    for chunk in abstract_response:
        if chunk.choices[0].delta.content:
            abstract_text += chunk.choices[0].delta.content
    abstract_text = abstract_text.strip()
    print("The abstract is:", abstract_text)

    return abstract_text

def generate_entity_lists_qwen(topic, abstract_text):
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )
    system_prompt_abstract = f"""
You are an AI assistant specializing in natural language processing and entity recognition. Your task is to extract key entities and core concepts from a given abstract based on a specified topic.

You should return two distinct lists:
1. **Entity list**: Entities that are synonymous or closely related to the given topic. These should be concise (no more than two words) and simplified to their root forms (e.g., removing suffixes like "-ing", "-ed").
2. **Concept list**: Core concepts from the abstract that are highly relevant to the topic. These should also be concise (no more than two words) and in their simplest form.

Ensure that your response follows this exact format:
Entity list: [entity1, entity2, entity3, ...]
Concept list: [concept1, concept2, concept3, ...]
Do not include any explanations or additional text.

### **Example**
#### **Input:**
Topic: Large Language Models
Abstract: Ever since the Turing Test was proposed in the 1950s, humans have explored the mastering of language intelligence by machine. Language is essentially a complex, intricate system of human expressions governed by grammatical rules. It poses a significant challenge to develop capable artificial intelligence (AI) algorithms for comprehending and grasping a language. As a major approach, language modeling has been widely studied for language understanding and generation in the past two decades, evolving from statistical language models to neural language models. Recently, pre-trained language models (PLMs) have been proposed by pretraining Transformer models over large-scale corpora, showing strong capabilities in solving various natural language processing (NLP) tasks. Since the researchers have found that model scaling can lead to an improved model capacity, they further investigate the scaling effect by increasing the parameter scale to an even larger size. Interestingly, when the parameter scale exceeds a certain level, these enlarged language models not only achieve a significant performance improvement, but also exhibit some special abilities (e.g., in-context learning) that are not present in small-scale language models (e.g., BERT). To discriminate the language models in different parameter scales, the research community has coined the term large language models (LLM) for the PLMs of significant size (e.g., containing tens or hundreds of billions of parameters). Recently, the research on LLMs has been largely advanced by both academia and industry, and a remarkable progress is the launch of ChatGPT (a powerful AI chatbot developed based on LLMs), which has attracted widespread attention from society. The technical evolution of LLMs has been making an important impact on the entire AI community, which would revolutionize the way how we develop and use AI algorithms. Considering this rapid technical progress, in this survey, we review the recent advances of LLMs by introducing the background, key findings, and mainstream techniques. In particular, we focus on four major aspects of LLMs, namely pre-training, adaptation tuning, utilization, and capacity evaluation. Furthermore, we also summarize the available resources for developing LLMs and discuss the remaining issues for future directions. This survey provides an up-to-date review of the literature on LLMs, which can be a useful resource for both researchers and engineers.

#### **Expected Output:**
"entity list": ["language model", "plm", "large language", "llm"]
"concept list": ["turing", "language intelligence", "ai", "generation", "statistical", "neural", "pre-train", "transformer", "corpora", "nlp", "in-context", "bert", "chatgpt", "adaptation", "utilization"]
Make sure to strictly follow this format in your response.
"""

    user_prompt_abstract = f"""
Topic: {topic}
Abstract: {abstract_text}

Based on the given topic and abstract, extract the following:
1. A **list of entities** that are synonymous or closely related to the topic. Keep each entity under two words and in its simplest form.
2. A **list of core concepts** from the abstract that are highly relevant to the topic. Keep each concept under two words and in its simplest form.
"""

    messages_abstract = [
        {"role": "system", "content": system_prompt_abstract},
        {"role": "user", "content": user_prompt_abstract}
    ]

    entity_response = client.chat.completions.create(
        model="Qwen2.5-72B-Instruct",
        max_tokens=2048,
        temperature=0.5,
        stop="<|im_end|>",
        stream=True,
        messages=messages_abstract
    )

    entity_list = ""
    for chunk in entity_response:
        if chunk.choices[0].delta.content:
            entity_list += chunk.choices[0].delta.content
    entity_list = entity_list.strip()
    print("The entity lists are:", entity_list)

    return entity_list


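# Per the few-shot example in the system prompt above, the model's reply is
# expected to look like this (illustrative values, not actual model output):
#   "entity list": ["language model", "plm", "large language", "llm"]
#   "concept list": ["turing", "ai", "pre-train", "transformer", ...]
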
def generate_query_qwen(topic):
    # Calculate the date range for the arXiv query (roughly the last 10 years)
    abstract_text = generate_abstract_qwen(topic)
    entity_list = generate_entity_lists_qwen(topic, abstract_text)
    today = datetime.now()
    ten_years_ago = today - timedelta(days=10 * 365)  # approximate calculation
    start_date = ten_years_ago.strftime('%Y%m%d')
    end_date = today.strftime('%Y%m%d')

    # System prompt: how to turn the pre-extracted keywords into an arXiv query.
    system_prompt_query = """
You are a research assistant specializing in constructing effective arXiv search queries. Your task is to generate a structured search query using **pre-extracted entity and concept lists** from a given abstract. Follow these instructions exactly:

1. **Input Data:**
   - **Entity List:** A list of entities that are synonymous or closely related to the given topic.
   - **Concept List:** A list of core concepts from the abstract that are highly relevant to the topic.

2. **Ensure Minimum Keyword Count:**
   - **Entity List** must contain at least **5** terms. If there are fewer, intelligently supplement additional relevant terms.
   - **Concept List** must contain **12-15** terms. If there are fewer, intelligently supplement additional relevant terms.

3. **Standardize Formatting:**
   - Convert all terms to their **base form** and ensure they end with a wildcard `*`.
   - Examples: `verification → verif*`, `optimization → optim*`, `retrieval → retriev*`, `embedding → embed*`
   - All terms must be **in lowercase**.

4. **Construct the Final Query:**
   - The query must follow this exact structure:
     (abs:"<Entity1*>" OR abs:"<Entity2*>" OR abs:"<Entity3*>" OR abs:"<Entity4*>" OR abs:"<Entity5*>") AND
     (abs:"<Concept1*>" OR abs:"<Concept2*>" OR ... OR abs:"<Concept12*>")
   - **Entities are grouped together using `OR` in the first part.**
   - **Concepts are grouped together using `OR` in the second part.**
   - **The two groups are combined using `AND`.**
   - **Do not include any explanations or extra text. Output only the final query.**
"""

    # User prompt: uses the pre-extracted entities and concepts, enforces the
    # minimum keyword counts, and applies stemming plus wildcards.
    user_prompt_query = f"""
Below are the pre-extracted keywords for constructing the final arXiv query.

**Topic:** {topic}
**Entity list and Concept list:** {entity_list}

### **Processing Rules Applied:**
- **Ensure at least 5 entities** (if fewer, supplement additional relevant terms).
- **Ensure 12-15 concepts** (if fewer, supplement additional relevant terms).
- **Convert all terms to base form and append wildcard `*`.**
- **Output only the final query with no extra text.**

### **Example Query Format:**

1. **Topic:** Large Language Models
   **Transformed Entity List:** ["languag model*", "plm*", "larg languag*", "llm*", "deep model*"]
   **Transformed Concept List:** ["tur*", "languag intellig*", "ai*", "gener*", "statist*", "neural*", "pre-train*", "transform*", "corpora*", "nlp*", "in-context*", "bert*", "chatgpt*", "adapt*", "utiliz*"]
   **Query:**
   (abs:"languag model*" OR abs:"plm*" OR abs:"larg languag*" OR abs:"llm*" OR abs:"deep model*") AND (abs:"tur*" OR abs:"languag intellig*" OR abs:"ai*" OR abs:"gener*" OR abs:"statist*" OR abs:"neural*" OR abs:"pre-train*" OR abs:"transform*" OR abs:"corpora*" OR abs:"nlp*" OR abs:"in-context*" OR abs:"bert*" OR abs:"chatgpt*" OR abs:"adapt*" OR abs:"utiliz*")
2. **Topic:** Quantum Computing
   **Transformed Entity List:** ["quant comput*", "qubit*", "qc*", "quant devic*", "topolog comput*"]
   **Transformed Concept List:** ["decoheren*", "entangl*", "error*", "topolog*", "anneal*", "photon*", "superconduct*", "algorithm*", "optim*", "verif*", "fault-toler*", "nois*", "circuit*", "quant machin*", "measur*"]
   **Query:**
   (abs:"quant comput*" OR abs:"qubit*" OR abs:"qc*" OR abs:"quant devic*" OR abs:"topolog comput*") AND (abs:"decoheren*" OR abs:"entangl*" OR abs:"error*" OR abs:"topolog*" OR abs:"anneal*" OR abs:"photon*" OR abs:"superconduct*" OR abs:"algorithm*" OR abs:"optim*" OR abs:"verif*" OR abs:"fault-toler*" OR abs:"nois*" OR abs:"circuit*" OR abs:"quant machin*" OR abs:"measur*")
---

### **Now Generate the Query for This Topic:**
Using the provided **Entity List** and **Concept List**, apply the following steps:
1. **Ensure Entity List contains at least 5 items.** If fewer, supplement additional relevant terms.
2. **Ensure Concept List contains 12-15 items.** If fewer, supplement additional relevant terms.
3. **Convert all terms to their base form and append `*`.**
4. **Construct the arXiv search query in the same format as the examples above.**
5. **Return only the final query. Do not include explanations or additional text.**
"""

    # Initialize the OpenAI API client
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    messages = [
        {"role": "system", "content": system_prompt_query},
        {"role": "user", "content": user_prompt_query}
    ]

    response = client.chat.completions.create(
        model="Qwen2.5-72B-Instruct",
        max_tokens=512,
        temperature=0.5,
        stop="<|im_end|>",
        stream=True,
        messages=messages
    )

    output_query = ""
    for chunk in response:
        if chunk.choices[0].delta.content:
            output_query += chunk.choices[0].delta.content
    match = re.search(r'\(.*\)', output_query, re.DOTALL)

    if match:
        extracted_query = match.group(0)  # keep the whole parenthesized expression
    else:
        extracted_query = output_query.strip()  # fall back to the raw output if no match

    # Re-append the submittedDate range
    updated_query = f"{extracted_query} AND submittedDate:[{start_date} TO {end_date}]"
    print('The response is :', updated_query)
    return updated_query.strip()


# Example usage:
if __name__ == "__main__":
    topic = "Quantum Computing"
    final_query = generate_query_qwen(topic)
    print("\nFinal Query Returned:")
    print(final_query)
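
# For reference, the returned query has this overall shape (dates come from the
# rolling ten-year window computed above, shown here only as placeholders):
#   (abs:"entity1*" OR ...) AND (abs:"concept1*" OR ...)
#   AND submittedDate:[YYYYMMDD TO YYYYMMDD]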
src/demo/references.py
ADDED
@@ -0,0 +1,83 @@
import os
import re
from openai import OpenAI

def getQwenClient():
    openai_api_key = os.environ.get("OPENAI_API_KEY")
    openai_api_base = os.environ.get("OPENAI_API_BASE")

    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )
    return client

def generateResponse(client, prompt):
    chat_response = client.chat.completions.create(
        model=os.environ.get("MODEL"),
        max_tokens=1536,
        temperature=0.5,
        stop="<|im_end|>",
        stream=True,
        messages=[{"role": "user", "content": prompt}]
    )
    # Accumulate the streamed response into a single string
    text = ""
    for chunk in chat_response:
        if chunk.choices[0].delta.content:
            text += chunk.choices[0].delta.content
    return text

def generate_references(papers_info, client):

    # In-Context Learning
    examples = '''
Example1:
Authors: Armen Aghajanyan, Armen Aghajanyan, Anchit Gupta, Akshat Shrivastava, Xilun Chen, Luke Zettlemoyer, and Sonal Gupta
Title: Muppet: Massive multi-task representations with pre-finetuning
Reference: Armen Aghajanyan, Anchit Gupta, Akshat Shrivastava, Xilun Chen, Luke Zettlemoyer, and Sonal Gupta. Muppet: Massive multi-task representations with pre-finetuning

Example2:
Authors: Ari Holtzman1, Peter West222, Vered Shwartz3, Yejin Choi4, Luke Zettlemoyer12001
Title: Surface form competition: Why the highest probability answer isn't always right.
Reference: Ari Holtzman, Peter West, Vered Shwartz, Yejin Choi, Luke Zettlemoyer. Surface form competition: Why the highest probability answer isn't always right.

Example3:
Authors: Mikel Artetxe, Shruti Bhosale, Naman Goyal, Todor Mihaylov, Myle Ott, Sam Shleifer, Xi Victoria Lin, Jingfei Du, Srinivasan Iyer, Ramakanth Pasunuru, Giri Anantharaman, Xian Li, Shuohui Chen, Halil Akin, Mandeep Baines, Louis Martin, Xing Zhou, Punit Singh Koura, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Mona Diab, Zornitsa Kozareva, Ves Stoyanov
Title: Efficient large scale language modeling with mixtures of experts.
Reference: Mikel Artetxe, Shruti Bhosale, Naman Goyal, Todor Mihaylov, Myle Ott, Sam Shleifer, Xi Victoria Lin, Jingfei Du, Srinivasan Iyer, Ramakanth Pasunuru, et al. Efficient large scale language modeling with mixtures of experts.
'''

    prompt = f'''
Based on the following examples, generate the references based on the provided paper information.
The generated references should be clear, legal and properly formatted.
If the authors are many, list the first few authors followed by "et al.".

Please include the "Reference:" label before each reference as shown in the examples.

{examples}
Now, please generate the references:

'''

    for idx, paper in enumerate(papers_info):
        authors = paper['authors']
        title = paper['title']
        prompt += f'''
Paper{idx+1}:
Authors: {authors}
Title: {title}
Reference:'''

    response = generateResponse(client, prompt)
    references = []
    pattern = r'Reference:(.*?)(?=\n\n|$)'
    matches = re.findall(pattern, response, re.S)

    for match in matches:
        reference = match.strip()
        if reference:
            references.append(reference)

    return references
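
# Minimal usage sketch (assumes OPENAI_API_KEY, OPENAI_API_BASE and MODEL are
# set in the environment; the paper info below is made up for illustration):
#
#   client = getQwenClient()
#   papers = [{"authors": "Ada Lovelace1, Alan Turing22", "title": "A Made-Up Paper"}]
#   for ref in generate_references(papers, client):
#       print(ref)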
src/demo/survey_generation_pipeline/asg_abstract.py
ADDED
@@ -0,0 +1,247 @@
import os


class AbstractGenerator:
    def __init__(self, pipeline):
        self.pipeline = pipeline

    def generate(self, title, intro, mode='lora'):
        if mode == 'lora' or mode == 'test':
            if mode == 'lora':
                self.pipeline.model.set_adapter("abstract")

            system_prompt = f'''You are a helpful assistant that helps to generate the abstract of the survey paper given the survey title and survey introduction.'''
            # user_prompt = {"survey_title":survey_title, "claims":cluster_with_claims}
            user_prompt = f'''Help me to generate the abstract of a survey paper given the title: *{title}*, and the introduction:{intro}'''

            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
                {"role": "assistant", "content": "Abstract: This survey "}
            ]

            outputs = self.pipeline(
                messages,
                max_new_tokens=4096,
            )
            result = outputs[0]["generated_text"][-1]['content']
            return result
        else:
            raise ValueError('mode not supported')

+
from transformers import pipeline
|
34 |
+
import torch
|
35 |
+
import transformers
|
36 |
+
|
37 |
+
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
38 |
+
Global_pipeline = transformers.pipeline(
|
39 |
+
"text-generation",
|
40 |
+
model=model_id,
|
41 |
+
model_kwargs={"torch_dtype": torch.bfloat16},
|
42 |
+
token = os.getenv('HF_API_KEY'),
|
43 |
+
device_map="auto",
|
44 |
+
)
|
45 |
+
Global_pipeline.model.load_adapter(peft_model_id = "technicolor/llama3.1_8b_outline_generation", adapter_name="outline")
|
46 |
+
Global_pipeline.model.load_adapter(peft_model_id ="technicolor/llama3.1_8b_abstract_generation", adapter_name="abstract")
|
47 |
+
Global_pipeline.model.load_adapter(peft_model_id ="technicolor/llama3.1_8b_conclusion_generation", adapter_name="conclusion")
|
48 |
+
title = "A Survey of Large Language Models"
|
49 |
+
intro = '''L
|
50 |
+
ANGUAGE is a prominent ability in human beings to
|
51 |
+
express and communicate, which develops in early
|
52 |
+
childhood and evolves over a lifetime [3, 4]. Machines,
|
53 |
+
however, cannot naturally grasp the abilities of understanding and communicating in the form of human language,
|
54 |
+
unless equipped with powerful artificial intelligence (AI)
|
55 |
+
algorithms. It has been a longstanding research challenge
|
56 |
+
to achieve this goal, to enable machines to read, write, and
|
57 |
+
communicate like humans [5].
|
58 |
+
Technically, language modeling (LM) is one of the major
|
59 |
+
approaches to advancing language intelligence of machines.
|
60 |
+
In general, LM aims to model the generative likelihood
|
61 |
+
of word sequences, so as to predict the probabilities of
|
62 |
+
future (or missing) tokens. The research of LM has received
|
63 |
+
extensive attention in the literature, which can be divided
|
64 |
+
into four major development stages:
|
65 |
+
• Statistical language models (SLM). SLMs [6–9] are developed based on statistical learning methods that rose in
|
66 |
+
the 1990s. The basic idea is to build the word prediction
|
67 |
+
model based on the Markov assumption, e.g., predicting the
|
68 |
+
next word based on the most recent context. The SLMs with
|
69 |
+
a fixed context length n are also called n-gram language
|
70 |
+
models, e.g., bigram and trigram language models. SLMs
|
71 |
+
have been widely applied to enhance task performance
|
72 |
+
in information retrieval (IR) [10, 11] and natural language
|
73 |
+
processing (NLP) [12–14]. However, they often suffer from
|
74 |
+
the curse of dimensionality: it is difficult to accurately
|
75 |
+
estimate high-order language models since an exponential
|
76 |
+
number of transition probabilities need to be estimated.
|
77 |
+
Thus, specially designed smoothing strategies such as backoff estimation [15] and Good–Turing estimation [16] have
|
78 |
+
been introduced to alleviate the data sparsity problem.
|
79 |
+
• Neural language models (NLM). NLMs [1, 17, 18] characterize the probability of word sequences by neural networks,
|
80 |
+
e.g., multi-layer perceptron (MLP) and recurrent neural networks (RNNs). As a remarkable contribution, the work in
|
81 |
+
[1] introduced the concept of distributed representation of
|
82 |
+
words and built the word prediction function conditioned
|
83 |
+
on the aggregated context features (i.e., the distributed
|
84 |
+
word vectors). By extending the idea of learning effective
|
85 |
+
features for text data, a general neural network approach
|
86 |
+
was developed to build a unified, end-to-end solution for
|
87 |
+
various NLP tasks [2]. Furthermore, word2vec [19, 20] was
|
88 |
+
proposed to build a simplified shallow neural network
|
89 |
+
for learning distributed word representations, which were
|
90 |
+
demonstrated to be very effective across a variety of NLP
|
91 |
+
tasks. These studies have initiated the use of language
|
92 |
+
models for representation learning (beyond word sequence
|
93 |
+
modeling), having an important impact on the field of NLP.
|
94 |
+
• Pre-trained language models (PLM). As an early attempt, ELMo [21] was proposed to capture context-aware
|
95 |
+
word representations by first pre-training a bidirectional
|
96 |
+
LSTM (biLSTM) network (instead of learning fixed word
|
97 |
+
representations) and then fine-tuning the biLSTM network
|
98 |
+
according to specific downstream tasks. Furthermore, based
|
99 |
+
on the highly parallelizable Transformer architecture [22]
|
100 |
+
with self-attention mechanisms, BERT [23] was proposed by
|
101 |
+
pre-training bidirectional language models with specially
|
102 |
+
designed pre-training tasks on large-scale unlabeled corpora. These pre-trained context-aware word representations
|
103 |
+
are very effective as general-purpose semantic features,
|
104 |
+
which have largely raised the performance bar of NLP
|
105 |
+
tasks. This study has inspired a large number of follow-up
|
106 |
+
work, which sets the “pre-training and fine-tuning” learning
|
107 |
+
paradigm. Following this paradigm, a great number of studies on PLMs have been developed, introducing either different architectures [24, 25] (e.g., GPT-2 [26] and BART [24]) or
|
108 |
+
improved pre-training strategies [27–29]. In this paradigm, it
|
109 |
+
often requires fine-tuning the PLM for adapting to different
|
110 |
+
downstream tasks.
|
111 |
+
• Large language models (LLM). Researchers find that
|
112 |
+
scaling PLM (e.g., scaling model size or data size) often
|
113 |
+
leads to an improved model capacity on downstream tasks
|
114 |
+
(i.e., following the scaling law [30]). A number of studies
|
115 |
+
have explored the performance limit by training an ever
|
116 |
+
larger PLM (e.g., the 175B-parameter GPT-3 and the 540Bparameter PaLM). Although scaling is mainly conducted
|
117 |
+
in model size (with similar architectures and pre-training
|
118 |
+
tasks), these large-sized PLMs display different behaviors
|
119 |
+
from smaller PLMs (e.g., 330M-parameter BERT and 1.5Bparameter GPT-2) and show surprising abilities (called emergent abilities [31]) in solving a series of complex tasks. For
|
120 |
+
example, GPT-3 can solve few-shot tasks through in-context
|
121 |
+
learning, whereas GPT-2 cannot do well. Thus, the research
|
122 |
+
community coins the term “large language models (LLM)”
|
123 |
+
1
|
124 |
+
for these large-sized PLMs [32–35], which attract increasing
|
125 |
+
research attention (See Figure 1). A remarkable application
|
126 |
+
of LLMs is ChatGPT2
|
127 |
+
that adapts the LLMs from the GPT
|
128 |
+
series for dialogue, which presents an amazing conversation
|
129 |
+
ability with humans. We can observe a sharp increase of the
|
130 |
+
arXiv papers that are related to LLMs after the release of
|
131 |
+
ChatGPT in Figure 1.
|
132 |
+
As discussed before, language modeling is not a new technical concept special to LLMs, but has evolved with the advance of artificial intelligence over the decades. Early language models mainly aimed to model and generate text data, while the latest language models (e.g., GPT-4) focus on complex task solving. From language modeling to task solving, this is an important leap in scientific thinking, and it is the key to understanding the development of language models through the research history. From the perspective of task solving, the four generations of language models have exhibited different levels of model capacity. In Figure 2, we describe the evolution process of language models in terms of task-solving capacity. At first, statistical language models mainly assisted in some specific tasks (e.g., retrieval or speech tasks), in which the predicted or estimated probabilities can enhance the performance of task-specific approaches. Subsequently, neural language models focused on learning task-agnostic representations (e.g., features), aiming to reduce the effort of human feature engineering. Furthermore, pre-trained language models learned context-aware representations that can be optimized according to downstream tasks. For the latest generation of language models, LLMs are enhanced by exploring the scaling effect on model capacity, and they can be considered general-purpose task solvers. To summarize, in the evolution process, the scope of tasks that can be solved by language models has been greatly extended, and the task performance attained by language models has been significantly enhanced.
In the existing literature, PLMs have been widely discussed and surveyed [36–39], while LLMs are seldom reviewed in a systematic way. To motivate our survey, we first highlight three major differences between LLMs and PLMs. First, LLMs display some surprising emergent abilities that may not be observed in previous smaller PLMs. These abilities are key to the performance of language models on complex tasks, making AI algorithms unprecedentedly powerful and effective. Second, LLMs would revolutionize the way that humans develop and use AI algorithms. Unlike small PLMs, the major approach to accessing LLMs is through the prompting interface (e.g., the GPT-4 API). Humans have to understand how LLMs work and format their tasks in a way that LLMs can follow. Third, the development of LLMs no longer draws a clear distinction between research and engineering. The training of LLMs requires extensive practical experience in large-scale data processing and distributed parallel training. To develop capable LLMs, researchers have to solve complicated engineering issues, working with engineers or being engineers.
Nowadays, LLMs are having a significant impact on the AI community, and the advent of ChatGPT and GPT-4 leads to the rethinking of the possibilities of artificial general intelligence (AGI). OpenAI has published a technical article entitled "Planning for AGI and beyond", which discusses the short-term and long-term plans to approach AGI [40], and a more recent paper has argued that GPT-4 might be considered an early version of an AGI system [41]. The research areas of AI are being revolutionized by the rapid progress of LLMs. In the field of NLP, LLMs can serve as a general-purpose language task solver (to some extent), and the research paradigm has been shifting towards the use of LLMs. In the field of IR, traditional search engines are challenged by the new information-seeking paradigm of AI chatbots (i.e., ChatGPT), and New Bing presents an initial attempt at enhancing search results with LLMs. In the field of CV, researchers are trying to develop ChatGPT-like vision-language models that can better serve multimodal dialogues [42–45], and GPT-4 [46] has supported multimodal input by integrating visual information. This new wave of technology could lead to a prosperous ecosystem of real-world applications based on LLMs. For instance, Microsoft 365 is being empowered by LLMs (i.e., Copilot) to automate office work, and OpenAI supports the use of plugins in ChatGPT for implementing special functions.
Despite the progress and impact, the underlying principles of LLMs are still not well explored. Firstly, it is mysterious why emergent abilities occur in LLMs instead of smaller PLMs. As a more general issue, there lacks a deep, detailed investigation of the key factors that contribute to the superior abilities of LLMs. It is important to study when and how LLMs obtain such abilities [47]. Although there are some meaningful discussions about this problem [31, 47], more principled investigations are needed to uncover the "secrets" of LLMs. Secondly, it is difficult for the research community to train capable LLMs. Due to the huge demand for computation resources, it is very costly to carry out repetitive, ablating studies to investigate the effect of various strategies for training LLMs. Indeed, LLMs are mainly trained by industry, where many important training details (e.g., data collection and cleaning) are not revealed to the public. Thirdly, it is challenging to align LLMs with human values or preferences. Despite their capacities, LLMs are also likely to produce toxic, fictitious, or harmful content. Effective and efficient control approaches are required to eliminate the potential risks of using LLMs [46].
Faced with both opportunities and challenges, the research and development of LLMs needs more attention. In order to provide a basic understanding of LLMs, this survey conducts a literature review of the recent advances in LLMs from four major aspects, including pre-training (how to pre-train a capable LLM), adaptation (how to effectively adapt pre-trained LLMs for better use), utilization (how to use LLMs for solving various downstream tasks), and capability evaluation (how to evaluate the abilities of LLMs and existing empirical findings). We thoroughly comb the literature and summarize the key findings, techniques, and methods of LLMs. For this survey, we also create a GitHub project website by collecting the supporting resources for LLMs, at the link https://github.com/RUCAIBox/LLMSurvey. We are also aware of several related review articles on PLMs or LLMs [32, 36, 38, 39, 43, 48–54]. These papers either discuss PLMs or some specific (or general) aspects of LLMs. Compared with them, we focus on the techniques and methods to develop and use LLMs and provide a relatively comprehensive reference to important aspects of LLMs.
The remainder of this survey is organized as follows: Section 2 introduces the background for LLMs and the evolution of GPT-series models, followed by a summary of the available resources for developing LLMs in Section 3. Sections 4, 5, 6, and 7 review and summarize the recent progress from the four aspects of pre-training, adaptation, utilization, and capacity evaluation, respectively. Then, Section 8 discusses the practical guide for prompt design, and Section 9 reviews the applications of LLMs in several representative domains. Finally, we conclude the survey in Section 10 by summarizing the major findings and discussing the remaining issues for future work.
'''

abstract_generator = AbstractGenerator(Global_pipeline)
with_lora = abstract_generator.generate(title, intro, mode='lora')
with_test = abstract_generator.generate(title, intro, mode='test')
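The survey excerpt above notes that GPT-3 can solve few-shot tasks through in-context learning. As a minimal sketch of what such few-shot prompting looks like against the OpenAI-compatible endpoint used throughout this pipeline (the classification task, demonstrations, and expected output below are illustrative assumptions, not part of the pipeline):

import os
from openai import OpenAI

# Reuses the OPENAI_API_KEY / OPENAI_API_BASE / MODEL environment variables read elsewhere.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_API_BASE"))

# A few-shot prompt: two labeled demonstrations followed by the query to complete.
few_shot_prompt = (
    "Classify the sentiment of each review as positive or negative.\n"
    "Review: 'Great survey, very thorough.' Sentiment: positive\n"
    "Review: 'Poorly organized and hard to follow.' Sentiment: negative\n"
    "Review: 'A clear and useful overview.' Sentiment:"
)

response = client.chat.completions.create(
    model=os.environ.get("MODEL"),
    messages=[{"role": "user", "content": few_shot_prompt}],
)
print(response.choices[0].message.content)  # Likely "positive"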
src/demo/survey_generation_pipeline/asg_clustername.py
ADDED
@@ -0,0 +1,228 @@
import os
import pandas as pd
import re  # Regular expressions, used to extract the bracketed list from model output
from openai import OpenAI
import ast

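# Note: every function in this module reads OPENAI_API_KEY, OPENAI_API_BASE, and MODEL
# from the process environment. If those values live in a .env file instead, they can be
# loaded before any call, e.g. (a sketch assuming the python-dotenv package is installed):
#   from dotenv import load_dotenv
#   load_dotenv()
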
def generate_cluster_name_qwen_sep(tsv_path, survey_title):
    data = pd.read_csv(tsv_path, sep='\t')

    # Define the system prompt once, outside the loop
    system_prompt = f'''You are a research assistant working on a survey paper. The survey paper is about "{survey_title}". \
'''

    result = []  # Initialize the result list

    for i in range(3):  # Assuming labels are 0, 1, 2
        sentence_list = []  # Reset sentence_list for each label
        for j in range(len(data)):
            if data['label'][j] == i:
                sentence_list.append(data['retrieval_result'][j])

        # Embed the sentence list into the user prompt
        user_prompt = f'''
Given a list of descriptions of sentences about an aspect of the survey, you need to use one phrase (within 8 words) to summarize it and treat it as a section title of your survey paper. \
Your response should be a list with only one element and without any other information, for example, ["Post-training of LLMs"] \
Your response must contain one keyword of the survey title; unspecified or irrelevant results are not allowed. \
The description list is: {sentence_list}'''

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        openai_api_key = os.getenv("OPENAI_API_KEY")
        openai_api_base = os.getenv("OPENAI_API_BASE")
        client = OpenAI(
            api_key=openai_api_key,
            base_url=openai_api_base,
        )

        chat_response = client.chat.completions.create(
            model=os.environ.get("MODEL"),
            max_tokens=768,
            temperature=0.5,
            stop="<|im_end|>",
            stream=True,
            messages=messages
        )

        # Stream the response into a single text string
        text = ""
        for chunk in chat_response:
            if chunk.choices[0].delta.content:
                text += chunk.choices[0].delta.content

        # Use regex to extract the first bracketed content in the response
        match = re.search(r'\[(.*?)\]', text)
        if match:
            cluster_name = match.group(1).strip()  # Extract and clean the cluster name
            # Strip surrounding quotes from the cluster name, if present
            cluster_name = cluster_name.strip('"').strip("'")
            result.append(cluster_name)
        else:
            result.append("No Cluster Name Found")  # Handle cases where the pattern isn't found

    return result  # A list with three elements, one name per cluster

# Example usage:
# result = generate_cluster_name_qwen_sep('path_to_your_file.tsv', 'Your Survey Title')
# print(result)  # Output might look like ["Cluster One", "Cluster Two", "Cluster Three"]

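# The TSV consumed by generate_cluster_name_qwen_sep must provide at least the columns
# 'label' (integer cluster id) and 'retrieval_result' (one descriptive sentence per
# paper). A sketch of building a minimal valid input (the file name and rows are
# illustrative only):
#   pd.DataFrame({
#       "label": [0, 0, 1, 2],
#       "retrieval_result": ["pre-training objectives", "scaling-law study",
#                            "instruction tuning", "benchmark evaluation"],
#   }).to_csv("clusters.tsv", sep="\t", index=False)
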
def refine_cluster_name(cluster_names, survey_title):
    # cluster_names is expected to be a list of strings; the f-string below renders it
    # directly into the prompt.
    # Define the system prompt to set the context
    system_prompt = f'''You are a research assistant tasked with optimizing and refining a set of section titles for a survey paper. The survey paper is about "{survey_title}".
'''

    # Construct the user prompt, including all cluster names
    user_prompt = f'''
Here is a set of section titles generated for the survey topic "{survey_title}":
{cluster_names}
Please ensure that all cluster names are coherent and consistent with each other, and that each name is clear, concise, and accurately reflects the corresponding section.
Be sure to remove overlapping information between the cluster names.
Each cluster name should be within 8 words and include a keyword from the survey title.
Respond with a list of section titles in the following format without any other irrelevant information,
For example, ["Refined Title 1", "Refined Title 2", "Refined Title 3"]
'''

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Initialize the OpenAI client
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    try:
        chat_response = client.chat.completions.create(
            model=os.environ.get("MODEL"),
            max_tokens=256,
            temperature=0.5,
            stop="<|im_end|>",
            stream=True,
            messages=messages
        )

        # Stream the response and concatenate into a complete text
        text = ""
        for chunk in chat_response:
            if chunk.choices[0].delta.content:
                text += chunk.choices[0].delta.content

        # Extract the content within square brackets and parse it into a Python list
        match = re.search(r'\[(.*?)\]', text)
        if match:
            refined_cluster_names = ast.literal_eval('[' + match.group(1) + ']')
        else:
            refined_cluster_names = [
                survey_title + ": Definition",
                survey_title + ": Methods",
                survey_title + ": Evaluation"
            ]  # Handle cases where the pattern isn't found

    except Exception as e:
        print(f"An error occurred while refining cluster names: {e}")
        refined_cluster_names = ["Refinement Error"] * len(cluster_names)

    return refined_cluster_names  # Returns a list with the refined cluster names

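# Typical flow (a sketch; the file name and title are illustrative): generate raw names
# per cluster, then refine them jointly so the section titles stay mutually consistent:
#   raw_names = generate_cluster_name_qwen_sep("clusters.tsv", "Survey of LLMs")
#   section_titles = refine_cluster_name(raw_names, "Survey of LLMs")
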
def generate_cluster_name_new(tsv_path, survey_title, cluster_num=3):
    data = pd.read_csv(tsv_path, sep='\t')
    desp = []  # One list of descriptions per cluster

    for i in range(cluster_num):  # Labels are assumed to be 0 .. cluster_num - 1
        sentence_list = []  # Initialize the sentence list
        for j in range(len(data)):
            if data['label'][j] == i:
                sentence_list.append(data['retrieval_result'][j])
        desp.append(sentence_list)

    system_prompt = f'''
You are a research assistant working on a survey paper. The survey paper is about "{survey_title}". '''

    cluster_info = "\n".join([f'Cluster {i+1}: "{desp[i]}"' for i in range(cluster_num)])

    user_prompt = f'''
Your task is to generate {cluster_num} distinctive cluster names (e.g., "Pre-training of LLMs") for the given clusters of reference papers; each reference paper is described by a sentence.

The clusters of reference papers are:
{cluster_info}

Your output should be a single list of {cluster_num} cluster names, e.g., ["Pre-training of LLMs", "Fine-tuning of LLMs", "Evaluation of LLMs"]
Do not output any other text or information.
'''

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    chat_response = client.chat.completions.create(
        model=os.environ.get("MODEL"),
        max_tokens=768,
        temperature=0.5,
        stop="<|im_end|>",
        stream=True,
        messages=messages
    )

    # Stream the response into a single text string
    text = ""
    for chunk in chat_response:
        if chunk.choices[0].delta.content:
            text += chunk.choices[0].delta.content

    # Extract the content within square brackets and parse it into a Python list
    match = re.search(r'\[(.*?)\]', text)
    if match:
        refined_cluster_names = ast.literal_eval('[' + match.group(1) + ']')
    else:
        predefined_sections = [
            "Definition", "Methods", "Evaluation", "Applications",
            "Challenges", "Future Directions", "Comparisons", "Case Studies"
        ]
        # Fall back to the first cluster_num predefined section names
        refined_cluster_names = [
            f"{survey_title}: {predefined_sections[i]}" for i in range(cluster_num)
        ]

    return refined_cluster_names  # Returns a list with the generated cluster names

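# Both generators above repeat the same "extract the bracketed span, then literal_eval"
# step. A shared helper like the sketch below (not wired into the functions above) could
# centralize that parsing and its fallback handling:
def _parse_title_list(text, fallback):
    # Extract the first [...] span from the model output and parse it into a list.
    match = re.search(r'\[(.*?)\]', text)
    if not match:
        return fallback
    try:
        return list(ast.literal_eval('[' + match.group(1) + ']'))
    except (ValueError, SyntaxError):
        return fallback
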
if __name__ == "__main__":
    refined_result = refine_cluster_name(["Pre-training of LLMs", "Fine-tuning of LLMs", "Evaluation of LLMs"], 'Survey of LLMs')
    print(refined_result)