technicolor committed
Commit a97d040 · 1 parent: 8d551fa

Add Django InteractiveSurvey project

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .env +3 -0
  2. .gitattributes +3 -0
  3. .gitattributes copy +35 -0
  4. .gitignore +19 -0
  5. Dockerfile +34 -0
  6. README copy.md +12 -0
  7. flowchart_classifier.pth +3 -0
  8. scripts/additional_scripts.py +40 -0
  9. scripts/setup_env.py +48 -0
  10. src/.idea/asg.iml +12 -0
  11. src/.idea/misc.xml +6 -0
  12. src/.idea/modules.xml +8 -0
  13. src/.idea/vcs.xml +6 -0
  14. src/.idea/workspace.xml +271 -0
  15. src/DATA_PATH +0 -0
  16. src/__init__.py +0 -0
  17. src/asg/__init__.py +0 -0
  18. src/asg/asgi.py +16 -0
  19. src/asg/settings.py +126 -0
  20. src/asg/urls.py +22 -0
  21. src/asg/wsgi.py +16 -0
  22. src/db.sqlite3 +3 -0
  23. src/demo/__init__.py +0 -0
  24. src/demo/admin.py +3 -0
  25. src/demo/apps.py +5 -0
  26. src/demo/asg_abstract.py +247 -0
  27. src/demo/asg_add_flowchart.py +313 -0
  28. src/demo/asg_clustername.py +228 -0
  29. src/demo/asg_conclusion.py +253 -0
  30. src/demo/asg_generator.py +90 -0
  31. src/demo/asg_latex.py +816 -0
  32. src/demo/asg_loader.py +256 -0
  33. src/demo/asg_mindmap.py +302 -0
  34. src/demo/asg_outline.py +1029 -0
  35. src/demo/asg_query.py +326 -0
  36. src/demo/asg_retriever.py +364 -0
  37. src/demo/asg_splitter.py +25 -0
  38. src/demo/category_and_tsne.py +231 -0
  39. src/demo/count_files.py +20 -0
  40. src/demo/download.py +225 -0
  41. src/demo/latex_template/acl.sty +312 -0
  42. src/demo/latex_template/template.tex +22 -0
  43. src/demo/main.py +448 -0
  44. src/demo/migrations/__init__.py +0 -0
  45. src/demo/models.py +14 -0
  46. src/demo/postprocess.py +39 -0
  47. src/demo/query1.py +234 -0
  48. src/demo/references.py +83 -0
  49. src/demo/survey_generation_pipeline/asg_abstract.py +247 -0
  50. src/demo/survey_generation_pipeline/asg_clustername.py +228 -0
.env ADDED
@@ -0,0 +1,3 @@
+ OPENAI_API_KEY=sk-d474bcdf6cac4cceb472233d66d637bd
+ OPENAI_API_BASE=https://dashscope.aliyuncs.com/compatible-mode/v1
+ MODEL=qwen-plus-latest
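These three variables point an OpenAI-compatible client at Alibaba Cloud's DashScope endpoint. A minimal sketch of how they would typically be consumed (an assumption for illustration: the loading code is not part of this commit, and the sketch presumes the python-dotenv and openai packages):

    import os
    from dotenv import load_dotenv  # assumed dependency: python-dotenv
    from openai import OpenAI

    load_dotenv()  # reads OPENAI_API_KEY, OPENAI_API_BASE, and MODEL from .env

    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        base_url=os.getenv("OPENAI_API_BASE"),  # DashScope's OpenAI-compatible endpoint
    )
    response = client.chat.completions.create(
        model=os.getenv("MODEL"),  # qwen-plus-latest
        messages=[{"role": "user", "content": "Say hello."}],
    )
    print(response.choices[0].message.content)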
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ src/static/img/paper_with_arrow.png filter=lfs diff=lfs merge=lfs -text
+ src/static/img/papers.png filter=lfs diff=lfs merge=lfs -text
+ src/db.sqlite3 filter=lfs diff=lfs merge=lfs -text
.gitattributes copy ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,19 @@
+ # Ignore Python cache files
+ __pycache__/
+ *.pyc
+
+ src/static/data
+ chromadb
+ resources
+ logs
+ examples
+
+ src/demo/survey_generation_pipeline/info
+ src/demo/survey_generation_pipeline/txt
+ src/demo/survey_generation_pipeline/logs
+ src/demo/survey_generation_pipeline/md
+ src/demo/survey_generation_pipeline/pdfs
+ src/demo/survey_generation_pipeline/result
+ src/demo/survey_generation_pipeline/tsv
+ src/demo/survey_generation_pipeline/txt
+ InteractiveSurvey-default-report.pdf
Dockerfile ADDED
@@ -0,0 +1,34 @@
+ # Use the official slim image
+ FROM python:3.10-slim
+
+ # Set the working directory
+ WORKDIR /app
+
+ # Update the system and install dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     git wget curl build-essential \
+     graphviz texlive-xetex texlive-fonts-recommended texlive-latex-recommended libgl1 && \
+     apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ # RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
+ # Copy the dependency scripts and set up the environment
+ COPY scripts/ /app/
+ RUN python setup_env.py
+
+ # Download the model files and adjust their storage paths
+ RUN wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py && \
+     python download_models_hf.py && \
+     rm -rf /root/.cache/pip && \
+     python additional_scripts.py
+
+ # Copy the project code
+ COPY . /app/
+
+ # Expose the service port
+ EXPOSE 7860
+
+ # Health check
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s CMD curl -f http://localhost:7860/ || exit 1
+
+ # Start command
+ CMD ["python", "src/manage.py", "runserver", "0.0.0.0:7860"]
README copy.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: InteractiveSurvey
+ emoji: 🦀
+ colorFrom: yellow
+ colorTo: purple
+ sdk: docker
+ pinned: false
+ license: apache-2.0
+ short_description: 'InteractiveSurvey: An LLM-based Personalized and Interactive'
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
flowchart_classifier.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a69d2097e1bc09e3e2b69272c5afade1087acebead83c30f1c6dc6561804badd
+ size 16348318
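The three lines above are a Git LFS pointer rather than the model weights themselves: the spec version, the SHA-256 of the stored file, and its size in bytes (~16 MB). A minimal sketch of reading such a pointer (illustrative only; git-lfs resolves pointers to the real files transparently):

    def parse_lfs_pointer(path: str) -> dict:
        """Parse a Git LFS pointer file into its key/value fields."""
        fields = {}
        with open(path, encoding="utf-8") as f:
            for line in f:
                key, _, value = line.strip().partition(" ")
                fields[key] = value
        return fields

    ptr = parse_lfs_pointer("flowchart_classifier.pth")
    assert ptr["version"] == "https://git-lfs.github.com/spec/v1"
    print(ptr["oid"], ptr["size"])  # sha256:a69d2097..., 16348318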
scripts/additional_scripts.py ADDED
@@ -0,0 +1,40 @@
+ import nltk
+ nltk.download('averaged_perceptron_tagger', download_dir='/usr/local/nltk_data')
+
+ import json
+ import os
+
+ file_path = "/root/magic-pdf.json"
+
+ new_config = {
+     "device-mode": "cuda",
+     "layout-config": {
+         "model": "layoutlmv3"
+     },
+     "formula-config": {
+         "mfd_model": "yolo_v8_mfd",
+         "mfr_model": "unimernet_small",
+         "enable": False
+     },
+     "table-config": {
+         "model": "tablemaster",
+         "enable": False,
+         "max_time": 400
+     }
+ }
+
+ if os.path.exists(file_path):
+     with open(file_path, "r", encoding="utf-8") as file:
+         try:
+             data = json.load(file)
+         except json.JSONDecodeError:
+             data = {}
+ else:
+     data = {}
+
+ data.update(new_config)
+
+ with open(file_path, "w", encoding="utf-8") as file:
+     json.dump(data, file, indent=4)
+
+ print(f"File '{file_path}' has been updated.")
scripts/setup_env.py ADDED
@@ -0,0 +1,48 @@
+ #!/usr/bin/env python3
+ import subprocess
+ import sys
+ import shutil
+ import os
+
+ def clear_pip_cache():
+     print("🧹 Cleaning...")
+     try:
+         # Get the pip cache directory
+         result = subprocess.run([sys.executable, "-m", "pip", "cache", "dir"], stdout=subprocess.PIPE, check=True, text=True)
+         cache_dir = result.stdout.strip()
+         if os.path.exists(cache_dir):
+             shutil.rmtree(cache_dir)
+             print(f"✅ Removed: {cache_dir}")
+         else:
+             print("No cache dir found.")
+     except Exception as e:
+         print(f"❌ {e}")
+
+ # Build the list of pip install commands in order (all with --no-cache-dir)
+ commands = [
+     [sys.executable, "-m", "pip", "install", "--no-cache-dir", "unstructured==0.16.10"],
+     [sys.executable, "-m", "pip", "install", "--no-cache-dir", "requests==2.32.3"],
+     [sys.executable, "-m", "pip", "install", "--no-cache-dir", "chromadb==0.5.4"],
+     [sys.executable, "-m", "pip", "install", "--no-cache-dir", "langchain-huggingface==0.1.2"],
+     [sys.executable, "-m", "pip", "install", "--no-cache-dir", "markdown_pdf==1.3"],
+     [sys.executable, "-m", "pip", "install", "--no-cache-dir", "bertopic==0.16.3"],
+     [sys.executable, "-m", "pip", "install", "--no-cache-dir", "-U", "langchain-community"],
+     [sys.executable, "-m", "pip", "install", "--no-cache-dir", "--force-reinstall", "torch==2.3.1", "torchvision==0.18.1", "numpy<2.0.0", "--index-url", "https://download.pytorch.org/whl/cu118"],
+     [sys.executable, "-m", "pip", "install", "--no-cache-dir", "-U", "magic-pdf[full]", "--extra-index-url", "https://wheels.myhloli.com"],
+     [sys.executable, "-m", "pip", "install", "--no-cache-dir", "Django==2.2.5"],
+     [sys.executable, "-m", "pip", "install", "--no-cache-dir", "graphviz"]
+ ]
+
+ def run_commands(cmds):
+     for cmd in cmds:
+         cmd_str = " ".join(cmd)
+         print(f"🚀 {cmd_str}")
+         try:
+             subprocess.run(cmd, check=True)
+         except subprocess.CalledProcessError:
+             print(f"❌ {cmd_str}")
+             sys.exit(1)
+
+ if __name__ == "__main__":
+     clear_pip_cache()
+     run_commands(commands)
src/.idea/asg.iml ADDED
@@ -0,0 +1,12 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="WEB_MODULE" version="4">
+   <component name="NewModuleRootManager">
+     <content url="file://$MODULE_DIR$">
+       <excludeFolder url="file://$MODULE_DIR$/.tmp" />
+       <excludeFolder url="file://$MODULE_DIR$/temp" />
+       <excludeFolder url="file://$MODULE_DIR$/tmp" />
+     </content>
+     <orderEntry type="inheritedJdk" />
+     <orderEntry type="sourceFolder" forTests="false" />
+   </component>
+ </module>
src/.idea/misc.xml ADDED
@@ -0,0 +1,6 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="JavaScriptSettings">
+     <option name="languageLevel" value="ES6" />
+   </component>
+ </project>
src/.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectModuleManager">
+     <modules>
+       <module fileurl="file://$PROJECT_DIR$/.idea/asg.iml" filepath="$PROJECT_DIR$/.idea/asg.iml" />
+     </modules>
+   </component>
+ </project>
src/.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="VcsDirectoryMappings">
+     <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
+   </component>
+ </project>
src/.idea/workspace.xml ADDED
@@ -0,0 +1,271 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ChangeListManager">
+     <list default="true" id="cf214f68-be48-41a9-bfbc-51edc15fe9c5" name="Default Changelist" comment="">
+       <change afterPath="$PROJECT_DIR$/demo/templates/demo/index_beta.html" afterDir="false" />
+       <change beforePath="$PROJECT_DIR$/../.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/../.idea/workspace.xml" afterDir="false" />
+       <change beforePath="$PROJECT_DIR$/demo/category_and_tsne.py" beforeDir="false" afterPath="$PROJECT_DIR$/demo/category_and_tsne.py" afterDir="false" />
+       <change beforePath="$PROJECT_DIR$/demo/taskDes.py" beforeDir="false" afterPath="$PROJECT_DIR$/demo/taskDes.py" afterDir="false" />
+       <change beforePath="$PROJECT_DIR$/demo/templates/demo/index.html" beforeDir="false" afterPath="$PROJECT_DIR$/demo/templates/demo/index.html" afterDir="false" />
+       <change beforePath="$PROJECT_DIR$/demo/views.py" beforeDir="false" afterPath="$PROJECT_DIR$/demo/views.py" afterDir="false" />
+       <change beforePath="$PROJECT_DIR$/model_dm" beforeDir="false" afterPath="$PROJECT_DIR$/model_dm" afterDir="false" />
+       <change beforePath="$PROJECT_DIR$/static/data/001.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/static/data/001.tsv" afterDir="false" />
+       <change beforePath="$PROJECT_DIR$/static/img/tsne_2907070.png" beforeDir="false" afterPath="$PROJECT_DIR$/static/img/tsne_2907070.png" afterDir="false" />
+       <change beforePath="$PROJECT_DIR$/static/img/tsne_3274658.png" beforeDir="false" afterPath="$PROJECT_DIR$/static/img/tsne_3274658.png" afterDir="false" />
+       <change beforePath="$PROJECT_DIR$/test.tsv" beforeDir="false" afterPath="$PROJECT_DIR$/test.tsv" afterDir="false" />
+     </list>
+     <ignored path="$PROJECT_DIR$/.tmp/" />
+     <ignored path="$PROJECT_DIR$/temp/" />
+     <ignored path="$PROJECT_DIR$/tmp/" />
+     <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
+     <option name="SHOW_DIALOG" value="false" />
+     <option name="HIGHLIGHT_CONFLICTS" value="true" />
+     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+     <option name="LAST_RESOLUTION" value="IGNORE" />
+   </component>
+   <component name="FUSProjectUsageTrigger">
+     <session id="-1703485925">
+       <usages-collector id="statistics.lifecycle.project">
+         <counts>
+           <entry key="project.closed" value="6" />
+           <entry key="project.open.time.1" value="1" />
+           <entry key="project.open.time.3" value="4" />
+           <entry key="project.open.time.4" value="1" />
+           <entry key="project.opened" value="6" />
+         </counts>
+       </usages-collector>
+       <usages-collector id="statistics.file.extensions.open">
+         <counts>
+           <entry key="html" value="2" />
+         </counts>
+       </usages-collector>
+       <usages-collector id="statistics.file.types.open">
+         <counts>
+           <entry key="HTML" value="2" />
+         </counts>
+       </usages-collector>
+       <usages-collector id="statistics.file.extensions.edit">
+         <counts>
+           <entry key="html" value="1590" />
+         </counts>
+       </usages-collector>
+       <usages-collector id="statistics.file.types.edit">
+         <counts>
+           <entry key="HTML" value="1590" />
+         </counts>
+       </usages-collector>
+     </session>
+   </component>
+   <component name="FileEditorManager">
+     <leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
+       <file pinned="false" current-in-tab="true">
+         <entry file="file://$PROJECT_DIR$/demo/templates/demo/index_beta.html">
+           <provider selected="true" editor-type-id="text-editor">
+             <state relative-caret-position="858">
+               <caret line="370" column="5" lean-forward="true" selection-start-line="322" selection-start-column="4" selection-end-line="370" selection-end-column="5" />
+               <folding>
+                 <element signature="n#div#1;n#div#0;n#div#0;n#body#0;n#html#0;n#!!top" />
+                 <element signature="n#div#2;n#div#0;n#div#0;n#body#0;n#html#0;n#!!top" />
+                 <element signature="n#div#0;n#div#2;n#div#0;n#div#0;n#body#0;n#html#0;n#!!top" />
+                 <element signature="n#style#0;n#li#0;n#!!top" expanded="true" />
+                 <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                 <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                 <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                 <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                 <element signature="n#style#0;n#h3#0;n#!!top" expanded="true" />
+                 <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                 <element signature="n#style#0;n#p#0;n#!!top" expanded="true" />
+                 <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                 <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                 <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                 <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+                 <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+               </folding>
+             </state>
+           </provider>
+         </entry>
+       </file>
+     </leaf>
+   </component>
+   <component name="Git.Settings">
+     <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$/.." />
+   </component>
+   <component name="IdeDocumentHistory">
+     <option name="CHANGED_PATHS">
+       <list>
+         <option value="$PROJECT_DIR$/demo/templates/demo/index.html" />
+         <option value="$PROJECT_DIR$/demo/templates/demo/index_beta.html" />
+       </list>
+     </option>
+   </component>
+   <component name="JsBuildToolGruntFileManager" detection-done="true" sorting="DEFINITION_ORDER" />
+   <component name="JsBuildToolPackageJson" detection-done="true" sorting="DEFINITION_ORDER" />
+   <component name="JsGulpfileManager">
+     <detection-done>true</detection-done>
+     <sorting>DEFINITION_ORDER</sorting>
+   </component>
+   <component name="ProjectFrameBounds" fullScreen="true">
+     <option name="x" value="937" />
+     <option name="y" value="1440" />
+     <option name="width" value="1680" />
+     <option name="height" value="1050" />
+   </component>
+   <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
+   <component name="ProjectView">
+     <navigator proportions="" version="1">
+       <foldersAlwaysOnTop value="true" />
+     </navigator>
+     <panes>
+       <pane id="ProjectPane">
+         <subPane>
+           <expand>
+             <path>
+               <item name="asg" type="b2602c69:ProjectViewProjectNode" />
+               <item name="asg" type="462c0819:PsiDirectoryNode" />
+             </path>
+             <path>
+               <item name="asg" type="b2602c69:ProjectViewProjectNode" />
+               <item name="asg" type="462c0819:PsiDirectoryNode" />
+               <item name="demo" type="462c0819:PsiDirectoryNode" />
+             </path>
+             <path>
+               <item name="asg" type="b2602c69:ProjectViewProjectNode" />
+               <item name="asg" type="462c0819:PsiDirectoryNode" />
+               <item name="demo" type="462c0819:PsiDirectoryNode" />
+               <item name="templates" type="462c0819:PsiDirectoryNode" />
+             </path>
+             <path>
+               <item name="asg" type="b2602c69:ProjectViewProjectNode" />
+               <item name="asg" type="462c0819:PsiDirectoryNode" />
+               <item name="demo" type="462c0819:PsiDirectoryNode" />
+               <item name="templates" type="462c0819:PsiDirectoryNode" />
+               <item name="demo" type="462c0819:PsiDirectoryNode" />
+             </path>
+             <path>
+               <item name="asg" type="b2602c69:ProjectViewProjectNode" />
+               <item name="asg" type="462c0819:PsiDirectoryNode" />
+               <item name="static" type="462c0819:PsiDirectoryNode" />
+             </path>
+             <path>
+               <item name="asg" type="b2602c69:ProjectViewProjectNode" />
+               <item name="asg" type="462c0819:PsiDirectoryNode" />
+               <item name="static" type="462c0819:PsiDirectoryNode" />
+               <item name="img" type="462c0819:PsiDirectoryNode" />
+             </path>
+           </expand>
+           <select />
+         </subPane>
+       </pane>
+       <pane id="Scope" />
+     </panes>
+   </component>
+   <component name="PropertiesComponent">
+     <property name="WebServerToolWindowFactoryState" value="false" />
+     <property name="last_opened_file_path" value="$PROJECT_DIR$" />
+     <property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
+     <property name="nodejs_npm_path_reset_for_default_project" value="true" />
+   </component>
+   <component name="RecentsManager">
+     <key name="CopyFile.RECENT_KEYS">
+       <recent name="$PROJECT_DIR$/demo/templates/demo" />
+       <recent name="$PROJECT_DIR$/static/img" />
+     </key>
+   </component>
+   <component name="RunDashboard">
+     <option name="ruleStates">
+       <list>
+         <RuleState>
+           <option name="name" value="ConfigurationTypeDashboardGroupingRule" />
+         </RuleState>
+         <RuleState>
+           <option name="name" value="StatusDashboardGroupingRule" />
+         </RuleState>
+       </list>
+     </option>
+   </component>
+   <component name="SvnConfiguration">
+     <configuration />
+   </component>
+   <component name="TaskManager">
+     <task active="true" id="Default" summary="Default task">
+       <changelist id="cf214f68-be48-41a9-bfbc-51edc15fe9c5" name="Default Changelist" comment="" />
+       <created>1655724718572</created>
+       <option name="number" value="Default" />
+       <option name="presentableId" value="Default" />
+       <updated>1655724718572</updated>
+       <workItem from="1655724720238" duration="1478000" />
+       <workItem from="1657850351467" duration="27000" />
+       <workItem from="1657851735564" duration="5134000" />
+       <workItem from="1657960829388" duration="4835000" />
+       <workItem from="1658119972542" duration="6469000" />
+       <workItem from="1658150456661" duration="1476000" />
+     </task>
+     <servers />
+   </component>
+   <component name="TimeTrackingManager">
+     <option name="totallyTimeSpent" value="19419000" />
+   </component>
+   <component name="ToolWindowManager">
+     <frame x="937" y="1440" width="1680" height="1050" extended-state="0" />
+     <editor active="true" />
+     <layout>
+       <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.16696805" />
+       <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
+       <window_info id="Favorites" order="2" side_tool="true" />
+       <window_info anchor="bottom" id="Message" order="0" />
+       <window_info anchor="bottom" id="Find" order="1" />
+       <window_info anchor="bottom" id="Run" order="2" />
+       <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
+       <window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
+       <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
+       <window_info anchor="bottom" id="TODO" order="6" />
+       <window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
+       <window_info anchor="bottom" id="Version Control" order="8" show_stripe_button="false" />
+       <window_info anchor="bottom" id="Terminal" order="9" />
+       <window_info anchor="bottom" id="Event Log" order="10" side_tool="true" />
+       <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
+       <window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
+       <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
+     </layout>
+   </component>
+   <component name="TypeScriptGeneratedFilesManager">
+     <option name="version" value="1" />
+   </component>
+   <component name="VcsContentAnnotationSettings">
+     <option name="myLimit" value="2678400000" />
+   </component>
+   <component name="editorHistoryManager">
+     <entry file="file://$PROJECT_DIR$/demo/templates/demo/index.html">
+       <provider selected="true" editor-type-id="text-editor">
+         <state relative-caret-position="983">
+           <caret line="121" lean-forward="true" selection-start-line="121" selection-end-line="121" />
+         </state>
+       </provider>
+     </entry>
+     <entry file="file://$PROJECT_DIR$/demo/templates/demo/index_beta.html">
+       <provider selected="true" editor-type-id="text-editor">
+         <state relative-caret-position="858">
+           <caret line="370" column="5" lean-forward="true" selection-start-line="322" selection-start-column="4" selection-end-line="370" selection-end-column="5" />
+           <folding>
+             <element signature="n#div#1;n#div#0;n#div#0;n#body#0;n#html#0;n#!!top" />
+             <element signature="n#div#2;n#div#0;n#div#0;n#body#0;n#html#0;n#!!top" />
+             <element signature="n#div#0;n#div#2;n#div#0;n#div#0;n#body#0;n#html#0;n#!!top" />
+             <element signature="n#style#0;n#li#0;n#!!top" expanded="true" />
+             <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+             <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+             <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+             <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+             <element signature="n#style#0;n#h3#0;n#!!top" expanded="true" />
+             <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+             <element signature="n#style#0;n#p#0;n#!!top" expanded="true" />
+             <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+             <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+             <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+             <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+             <element signature="n#style#0;n#h2#0;n#!!top" expanded="true" />
+           </folding>
+         </state>
+       </provider>
+     </entry>
+   </component>
+ </project>
src/DATA_PATH ADDED
Binary file (17.2 kB)
src/__init__.py ADDED
File without changes
src/asg/__init__.py ADDED
File without changes
src/asg/asgi.py ADDED
@@ -0,0 +1,16 @@
+ """
+ ASGI config for asg project.
+
+ It exposes the ASGI callable as a module-level variable named ``application``.
+
+ For more information on this file, see
+ https://docs.djangoproject.com/en/3.0/howto/deployment/asgi/
+ """
+
+ import os
+
+ from django.core.asgi import get_asgi_application
+
+ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'asg.settings')
+
+ application = get_asgi_application()
@@ -0,0 +1,126 @@
 
+ """
+ Django settings for asg project.
+
+ Generated by 'django-admin startproject' using Django 3.0.7.
+
+ For more information on this file, see
+ https://docs.djangoproject.com/en/3.0/topics/settings/
+
+ For the full list of settings and their values, see
+ https://docs.djangoproject.com/en/3.0/ref/settings/
+ """
+
+ import os
+
+ # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+
+ # Quick-start development settings - unsuitable for production
+ # See https://docs.djangoproject.com/en/3.0/howto/deployment/checklist/
+
+ # SECURITY WARNING: keep the secret key used in production secret!
+ SECRET_KEY = 'g#7@+76ha9hwc1hl8!tnvr6rh1k_z8_@sleb=*8i@nh=h3_oi#'
+
+ # SECURITY WARNING: don't run with debug turned on in production!
+ DEBUG = True
+
+
+ ALLOWED_HOSTS = ['*',]
+
+
+ # Application definition
+
+ INSTALLED_APPS = [
+     'django.contrib.admin',
+     'django.contrib.auth',
+     'django.contrib.contenttypes',
+     'django.contrib.sessions',
+     'django.contrib.messages',
+     'django.contrib.staticfiles',
+     'demo',
+ ]
+
+ MIDDLEWARE = [
+     'django.middleware.security.SecurityMiddleware',
+     'django.contrib.sessions.middleware.SessionMiddleware',
+     'django.middleware.common.CommonMiddleware',
+     'django.middleware.csrf.CsrfViewMiddleware',
+     'django.contrib.auth.middleware.AuthenticationMiddleware',
+     'django.contrib.messages.middleware.MessageMiddleware',
+     'django.middleware.clickjacking.XFrameOptionsMiddleware',
+ ]
+
+ ROOT_URLCONF = 'asg.urls'
+
+ TEMPLATES = [
+     {
+         'BACKEND': 'django.template.backends.django.DjangoTemplates',
+         'DIRS': ['demo/templates'],
+         'APP_DIRS': True,
+         'OPTIONS': {
+             'context_processors': [
+                 'django.template.context_processors.debug',
+                 'django.template.context_processors.request',
+                 'django.contrib.auth.context_processors.auth',
+                 'django.contrib.messages.context_processors.messages',
+             ],
+         },
+     },
+ ]
+
+ WSGI_APPLICATION = 'asg.wsgi.application'
+
+
+ # Database
+ # https://docs.djangoproject.com/en/3.0/ref/settings/#databases
+
+ DATABASES = {
+     'default': {
+         'ENGINE': 'django.db.backends.sqlite3',
+         'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
+     }
+ }
+
+
+ # Password validation
+ # https://docs.djangoproject.com/en/3.0/ref/settings/#auth-password-validators
+
+ AUTH_PASSWORD_VALIDATORS = [
+     {
+         'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
+     },
+     {
+         'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
+     },
+     {
+         'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
+     },
+     {
+         'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
+     },
+ ]
+
+
+ # Internationalization
+ # https://docs.djangoproject.com/en/3.0/topics/i18n/
+
+ LANGUAGE_CODE = 'en-us'
+
+ TIME_ZONE = 'UTC'
+
+ USE_I18N = True
+
+ USE_L10N = True
+
+ USE_TZ = True
+
+
+ # Static files (CSS, JavaScript, Images)
+ # https://docs.djangoproject.com/en/3.0/howto/static-files/
+
+ STATIC_URL = '/static/'
+
+ STATICFILES_DIRS = (
+     os.path.join(BASE_DIR, "static"),
+ )
@@ -0,0 +1,22 @@
 
+ """asg URL Configuration
+
+ The `urlpatterns` list routes URLs to views. For more information please see:
+     https://docs.djangoproject.com/en/3.0/topics/http/urls/
+ Examples:
+ Function views
+     1. Add an import:  from my_app import views
+     2. Add a URL to urlpatterns:  path('', views.home, name='home')
+ Class-based views
+     1. Add an import:  from other_app.views import Home
+     2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
+ Including another URLconf
+     1. Import the include() function: from django.urls import include, path
+     2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
+ """
+ from django.contrib import admin
+ from django.urls import path, include
+
+ urlpatterns = [
+     path('admin/', admin.site.urls),
+     path('', include('demo.urls')),
+ ]
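The root URLconf delegates everything outside /admin/ to demo.urls, which falls outside this 50-file view. A hypothetical sketch of the shape that file would need to have (the view name is an assumption, not taken from this commit):

    # demo/urls.py -- hypothetical sketch; the real file is not shown in this view
    from django.urls import path

    from . import views  # assumed: the demo app's views module

    urlpatterns = [
        path('', views.index, name='index'),  # assumed entry view
    ]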
src/asg/wsgi.py ADDED
@@ -0,0 +1,16 @@
+ """
+ WSGI config for asg project.
+
+ It exposes the WSGI callable as a module-level variable named ``application``.
+
+ For more information on this file, see
+ https://docs.djangoproject.com/en/3.0/howto/deployment/wsgi/
+ """
+
+ import os
+
+ from django.core.wsgi import get_wsgi_application
+
+ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'asg.settings')
+
+ application = get_wsgi_application()
src/db.sqlite3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1424881b6d73f9e786d8ed6e666f3ac3e9e3159cab7bfe62ed52b1b7259b3eae
+ size 131072
src/demo/__init__.py ADDED
File without changes
src/demo/admin.py ADDED
@@ -0,0 +1,3 @@
+ from django.contrib import admin
+
+ # Register your models here.
src/demo/apps.py ADDED
@@ -0,0 +1,5 @@
+ from django.apps import AppConfig
+
+
+ class DemoConfig(AppConfig):
+     name = 'demo'
src/demo/asg_abstract.py ADDED
@@ -0,0 +1,247 @@
+ import os
+
+
+ class AbstractGenerator:
+     def __init__(self, pipeline):
+         self.pipeline = pipeline
+
+     def generate(self, title, intro, mode='lora'):
+         if mode == 'lora' or mode == 'test':
+             if mode == 'lora':
+                 self.pipeline.model.set_adapter("abstract")
+
+             system_prompt = f'''You are a helpful assistant that helps to generate the abstract of the survey paper given the survey title and survey introduction.'''
+             # user_prompt = {"survey_title":survey_title, "claims":cluster_with_claims}
+             user_prompt = f'''Help me to generate the abstract of a survey paper given the title: *{title}*, and the introduction:{intro}'''
+
+             messages = [
+                 {"role": "system", "content": system_prompt},
+                 {"role": "user", "content": user_prompt},
+                 {"role": "assistant", "content": "Abstract: This survey "}
+             ]
+
+             outputs = self.pipeline(
+                 messages,
+                 max_new_tokens=4096,
+             )
+             result = outputs[0]["generated_text"][-1]['content']
+             return result
+         else:
+             raise ValueError('mode not supported')
+
+ if __name__ == '__main__':
+     from transformers import pipeline
+     import torch
+     import transformers
+
+     model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+     Global_pipeline = transformers.pipeline(
+         "text-generation",
+         model=model_id,
+         model_kwargs={"torch_dtype": torch.bfloat16},
+         token=os.getenv('HF_API_KEY'),
+         device_map="auto",
+     )
+     Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_outline_generation", adapter_name="outline")
+     Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_abstract_generation", adapter_name="abstract")
+     Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_conclusion_generation", adapter_name="conclusion")
+     title = "A Survey of Large Language Models"
+     intro = '''LANGUAGE is a prominent ability in human beings to
+ express and communicate, which develops in early
+ childhood and evolves over a lifetime [3, 4]. Machines,
+ however, cannot naturally grasp the abilities of understanding and communicating in the form of human language,
+ unless equipped with powerful artificial intelligence (AI)
+ algorithms. It has been a longstanding research challenge
+ to achieve this goal, to enable machines to read, write, and
+ communicate like humans [5].
+ Technically, language modeling (LM) is one of the major
+ approaches to advancing language intelligence of machines.
+ In general, LM aims to model the generative likelihood
+ of word sequences, so as to predict the probabilities of
+ future (or missing) tokens. The research of LM has received
+ extensive attention in the literature, which can be divided
+ into four major development stages:
+ • Statistical language models (SLM). SLMs [6–9] are developed based on statistical learning methods that rose in
+ the 1990s. The basic idea is to build the word prediction
+ model based on the Markov assumption, e.g., predicting the
+ next word based on the most recent context. The SLMs with
+ a fixed context length n are also called n-gram language
+ models, e.g., bigram and trigram language models. SLMs
+ have been widely applied to enhance task performance
+ in information retrieval (IR) [10, 11] and natural language
+ processing (NLP) [12–14]. However, they often suffer from
+ the curse of dimensionality: it is difficult to accurately
+ estimate high-order language models since an exponential
+ number of transition probabilities need to be estimated.
+ Thus, specially designed smoothing strategies such as backoff estimation [15] and Good–Turing estimation [16] have
+ been introduced to alleviate the data sparsity problem.
+ • Neural language models (NLM). NLMs [1, 17, 18] characterize the probability of word sequences by neural networks,
+ e.g., multi-layer perceptron (MLP) and recurrent neural networks (RNNs). As a remarkable contribution, the work in
+ [1] introduced the concept of distributed representation of
+ words and built the word prediction function conditioned
+ on the aggregated context features (i.e., the distributed
+ word vectors). By extending the idea of learning effective
+ features for text data, a general neural network approach
+ was developed to build a unified, end-to-end solution for
+ various NLP tasks [2]. Furthermore, word2vec [19, 20] was
+ proposed to build a simplified shallow neural network
+ for learning distributed word representations, which were
+ demonstrated to be very effective across a variety of NLP
+ tasks. These studies have initiated the use of language
+ models for representation learning (beyond word sequence
+ modeling), having an important impact on the field of NLP.
+ • Pre-trained language models (PLM). As an early attempt, ELMo [21] was proposed to capture context-aware
+ word representations by first pre-training a bidirectional
+ LSTM (biLSTM) network (instead of learning fixed word
+ representations) and then fine-tuning the biLSTM network
+ according to specific downstream tasks. Furthermore, based
+ on the highly parallelizable Transformer architecture [22]
+ with self-attention mechanisms, BERT [23] was proposed by
+ pre-training bidirectional language models with specially
+ designed pre-training tasks on large-scale unlabeled corpora. These pre-trained context-aware word representations
+ are very effective as general-purpose semantic features,
+ which have largely raised the performance bar of NLP
+ tasks. This study has inspired a large number of follow-up
+ work, which sets the “pre-training and fine-tuning” learning
+ paradigm. Following this paradigm, a great number of studies on PLMs have been developed, introducing either different architectures [24, 25] (e.g., GPT-2 [26] and BART [24]) or
+ improved pre-training strategies [27–29]. In this paradigm, it
+ often requires fine-tuning the PLM for adapting to different
+ downstream tasks.
+ • Large language models (LLM). Researchers find that
+ scaling PLM (e.g., scaling model size or data size) often
+ leads to an improved model capacity on downstream tasks
+ (i.e., following the scaling law [30]). A number of studies
+ have explored the performance limit by training an ever
+ larger PLM (e.g., the 175B-parameter GPT-3 and the 540B-parameter PaLM). Although scaling is mainly conducted
+ in model size (with similar architectures and pre-training
+ tasks), these large-sized PLMs display different behaviors
+ from smaller PLMs (e.g., 330M-parameter BERT and 1.5B-parameter GPT-2) and show surprising abilities (called emergent abilities [31]) in solving a series of complex tasks. For
+ example, GPT-3 can solve few-shot tasks through in-context
+ learning, whereas GPT-2 cannot do well. Thus, the research
+ community coins the term “large language models (LLM)”
+ for these large-sized PLMs [32–35], which attract increasing
+ research attention (See Figure 1). A remarkable application
+ of LLMs is ChatGPT
+ that adapts the LLMs from the GPT
+ series for dialogue, which presents an amazing conversation
+ ability with humans. We can observe a sharp increase of the
+ arXiv papers that are related to LLMs after the release of
+ ChatGPT in Figure 1.
+ As discussed before, language model is not a new technical concept specially for LLMs, but has evolved with the
+ advance of artificial intelligence over the decades. Early language models mainly aim to model and generate text data,
+ while latest language models (e.g., GPT-4) focus on complex
+ task solving. From language modeling to task solving, it is an
+ important leap in scientific thinking, which is the key to
+ understand the development of language models in the research history. From the perspective of task solving, the four
+ generations of language models have exhibited different levels of model capacities. In Figure 2, we describe the evolution process of language models in terms of the task solving
+ capacity. At first, statistical language models mainly assisted
+ in some specific tasks (e.g., retrieval or speech tasks), in
+ which the predicted or estimated probabilities can enhance
+ the performance of task-specific approaches. Subsequently,
+ neural language models focused on learning task-agnostic
+ representations (e.g., features), aiming to reduce the efforts
+ for human feature engineering. Furthermore, pre-trained
+ language models learned context-aware representations that
+ can be optimized according to downstream tasks. For the
+ latest generation of language model, LLMs are enhanced by
+ exploring the scaling effect on model capacity, which can be
+ considered as general-purpose task solvers. To summarize,
+ in the evolution process, the task scope that can be solved
+ by language models have been greatly extended, and the
+ task performance attained by language models have been
+ significantly enhanced.
+ In the existing literature, PLMs have been widely discussed and surveyed [36–39], while LLMs are seldom reviewed in a systematic way. To motivate our survey, we first
+ highlight three major differences between LLMs and PLMs.
+ First, LLMs display some surprising emergent abilities that
+ may not be observed in previous smaller PLMs. These abilities are key to the performance of language models on complex tasks, making AI algorithms unprecedentedly powerful
+ and effective. Second, LLMs would revolutionize the way
+ that humans develop and use AI algorithms. Unlike small
+ PLMs, the major approach to accessing LLMs is through
+ the prompting interface (e.g., GPT-4 API). Humans have to
+ understand how LLMs work and format their tasks in a way
+ that LLMs can follow. Third, the development of LLMs no
+ longer draws a clear distinction between research and engineering. The training of LLMs requires extensive practical
+ experiences in large-scale data processing and distributed
+ parallel training. To develop capable LLMs, researchers
+ have to solve complicated engineering issues, working with
+ engineers or being engineers.
+ Nowadays, LLMs are posing a significant impact on
+ the AI community, and the advent of ChatGPT and GPT-4
+ leads to the rethinking of the possibilities of artificial general
+ intelligence (AGI). OpenAI has published a technical article
+ entitled “Planning for AGI and beyond”, which discusses
+ the short-term and long-term plans to approach AGI [40],
+ and a more recent paper has argued that GPT-4 might be
+ considered as an early version of an AGI system [41]. The
+ research areas of AI are being revolutionized by the rapid
+ progress of LLMs. In the field of NLP, LLMs can serve as a
+ general-purpose language task solver (to some extent), and
+ the research paradigm has been shifting towards the use
+ of LLMs. In the field of IR, traditional search engines are
+ challenged by the new information seeking way through AI
+ chatbots (i.e., ChatGPT), and New Bing presents an initial
+ attempt that enhances the search results based on LLMs. In
+ the field of CV, the researchers try to develop ChatGPT-like
+ vision-language models that can better serve multimodal
+ dialogues [42–45], and GPT-4 [46] has supported multimodal input by integrating the visual information. This new
+ wave of technology would potentially lead to a prosperous
+ ecosystem of real-world applications based on LLMs. For
+ instance, Microsoft 365 is being empowered by LLMs (i.e.,
+ Copilot) to automate the office work, and OpenAI supports
+ the use of plugins in ChatGPT for implementing special
+ functions.
+ Despite the progress and impact, the underlying principles of LLMs are still not well explored. Firstly, it is
+ mysterious why emergent abilities occur in LLMs, instead of
+ smaller PLMs. As a more general issue, there lacks a deep,
+ detailed investigation of the key factors that contribute to
+ the superior abilities of LLMs. It is important to study when
+ and how LLMs obtain such abilities [47]. Although there are
+ some meaningful discussions about this problem [31, 47],
+ more principled investigations are needed to uncover the
+ “secrets” of LLMs. Secondly, it is difficult for the research
+ community to train capable LLMs. Due to the huge demand of computation resources, it is very costly to carry
+ out repetitive, ablating studies for investigating the effect
+ of various strategies for training LLMs. Indeed, LLMs are
+ mainly trained by industry, where many important training
+ details (e.g., data collection and cleaning) are not revealed
+ to the public. Thirdly, it is challenging to align LLMs with
+ human values or preferences. Despite the capacities, LLMs
+ are also likely to produce toxic, fictitious, or harmful contents. It requires effective and efficient control approaches
+ to eliminating the potential risk of the use of LLMs [46].
+ Faced with both opportunities and challenges, it needs
+ more attention on the research and development of LLMs. In
+ order to provide a basic understanding of LLMs, this survey
+ conducts a literature review of the recent advances in LLMs
+ from four major aspects, including pre-training (how to pretrain a capable LLM), adaptation (how to effectively adapt
+ pre-trained LLMs for better use), utilization (how to use
+ LLMs for solving various downstream tasks) and capability
+ evaluation (how to evaluate the abilities of LLMs and existing
+ empirical findings). We thoroughly comb the literature and
+ summarize the key findings, techniques, and methods of
+ LLMs. For this survey, we also create a GitHub project
+ website by collecting the supporting resources for LLMs, at
+ the link https://github.com/RUCAIBox/LLMSurvey. We
+ are also aware of several related review articles on PLMs
+ or LLMs [32, 36, 38, 39, 43, 48–54]. These papers either
+ discuss PLMs or some specific (or general) aspects of LLMs.
+ Compared with them, we focus on the techniques and
+ methods to develop and use LLMs and provide a relatively
+ comprehensive reference to important aspects of LLMs.
+ The remainder of this survey is organized as follows:
+ Section 2 introduces the background for LLMs and the evolution of GPT-series models, followed by the summarization
+ of available resources for developing LLMs in Section 3.
+ Sections 4, 5, 6, and 7 review and summarize the recent
+ progress from the four aspects of pre-training, adaptation,
+ utilization, and capacity evaluation, respectively. Then, Section 8 discusses the practical guide for prompt design,
+ and Section 9 reviews the applications of LLMs in several
+ representative domains. Finally, we conclude the survey in
+ Section 10 by summarizing the major findings and discuss
+ the remaining issues for future work.
+ '''
+
+
+     abstract_generator = AbstractGenerator(Global_pipeline)
+     with_lora = abstract_generator.generate(title, intro, mode='lora')
+     with_test = abstract_generator.generate(title, intro, mode='test')
src/demo/asg_add_flowchart.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from urllib.parse import quote
5
+
6
+ import os
7
+ import json
8
+ import torch
9
+ import torchvision.transforms as transforms
10
+ from torchvision import models
11
+ from PIL import Image
12
+
13
+ # 常量定义
14
+ BASE_DIR = os.path.normpath("src/static/data/md") # 根目录
15
+ INFO_DIR = os.path.normpath("src/static/data/info") # 存放 JSON 结果的目录
16
+
17
+ # 加载 PyTorch EfficientNet 训练好的 3 类分类模型
18
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
19
+ model = models.efficientnet_b0(pretrained=False)
20
+
21
+ # 修改最后一层,适应 3 类(flowchart, non-flowchart, other)
22
+ num_features = model.classifier[1].in_features
23
+ model.classifier[1] = torch.nn.Linear(num_features, 3) # 3 类
24
+ model.load_state_dict(torch.load("flowchart_classifier.pth", map_location=device))
25
+ model.to(device) # 确保模型移动到正确的设备
26
+ model.eval()
27
+
28
+ # 预处理图片
29
+ transform = transforms.Compose([
30
+ transforms.Resize((224, 224)),
31
+ transforms.ToTensor(),
32
+ ])
33
+
34
+ def detect_flowcharts(survey_id):
35
+ """ 在指定 survey_id 目录下查找 flowchart,并保存 JSON 结果 """
36
+ survey_path = os.path.join(BASE_DIR, survey_id) # 该 survey_id 的目录
37
+ if not os.path.exists(survey_path):
38
+ print(f"❌ 目录 {survey_path} 不存在!")
39
+ return
40
+
41
+ flowchart_dict = {} # 存储 flowchart 结果
42
+
43
+ # 遍历该 survey 目录下的所有 PDF 文件夹
44
+ for pdf_folder in os.listdir(survey_path):
45
+ pdf_folder_path = os.path.join(survey_path, pdf_folder)
46
+
47
+ if not os.path.isdir(pdf_folder_path):
48
+ continue # 只处理文件夹
49
+
50
+ print(f"🔍 处理 PDF 文件夹: {pdf_folder}")
51
+
52
+ # 遍历所有 `xxx/auto/images` 目录
53
+ for root, dirs, files in os.walk(pdf_folder_path):
54
+ if "auto/images" in root.replace("\\", "/"): # 兼容 Windows 和 Linux
55
+ for filename in sorted(files): # 按文件名排序,保证第一个找到的 Flowchart 被选用
56
+ if not filename.lower().endswith(".jpg"): # 只处理 JPG
57
+ continue
58
+
59
+ image_path = os.path.join(root, filename)
60
+ img = Image.open(image_path).convert("RGB") # 打开图片并转换为 RGB
61
+
62
+ # 预处理图片并转换为张量
63
+ img_tensor = transform(img).unsqueeze(0).to(device)
64
+
65
+ # 运行分类模型
66
+ with torch.no_grad():
67
+ output = model(img_tensor)
68
+ predicted_class = torch.argmax(output).item()
69
+
70
+ # **确保 predicted_class == 0 表示 flowchart**
71
+ if predicted_class == 2: # `0` 代表 Flowchart 类别
72
+ print(f"✅ Flowchart detected: {image_path}")
73
+ flowchart_dict[pdf_folder] = image_path
74
+ break # **只存当前 PDF 文件夹的第一张 flowchart**
75
+
76
+ # 只有检测到 Flowchart 时才保存 JSON
77
+ if flowchart_dict:
78
+ os.makedirs(os.path.join(INFO_DIR, survey_id), exist_ok=True) # 确保目录存在
79
+ json_path = os.path.join(INFO_DIR, survey_id, "flowchart_results.json")
80
+ with open(json_path, "w", encoding="utf-8") as f:
81
+ json.dump(flowchart_dict, f, indent=4, ensure_ascii=False)
82
+
83
+ print(f"📁 Flowchart 结果已保存: {json_path}")
84
+ else:
85
+ print(f"⚠️ 没有检测到 Flowchart,未生成 JSON")
86
+
87
+ # 示例调用
88
+ # survey_id = "test" # 例如 "test"
89
+ # detect_flowcharts(survey_id)
90
+
91
+ def insert_ref_images(json_path, ref_names, text):
92
+ """
93
+ 参数:
94
+ json_path: JSON 文件路径,其内容格式例如:
95
+ {
96
+ "Accelerating federated learning with data and model parallelism in edge computing":
97
+ "src/static/data/md/test/Accelerating federated learning with data and model parallelism in edge computing/auto/images/xxx.jpg",
98
+ ...
99
+ }
100
+ ref_names: 引用名称列表,其中第 1 个元素对应 [1],第 2 个对应 [2],以此类推。
101
+ text: 包含类似 [1]、[2] 等引用的 Markdown 文本。
102
+
103
+ 返回:
104
+ 修改后的文本字符串。在每个引用标记首次出现行的下方插入对应的 HTML 代码块,
105
+ 格式如下:
106
+
107
+ <div style="text-align:center">
108
+ <img src="image_path" alt="the flow chart of [ref_name]" style="width:50%;"/>
109
+ </div>
110
+ <div style="text-align:center">
111
+ Fig [ref_num]: The flow chart of [ref_name]
112
+ </div>
113
+
114
+ 其中 [ref_num] 为引用编号(ref_names 中的 1-based index),[ref_name] 为引用名称。
115
+
116
+ 说明:
117
+ 1. JSON 中存储的路径已是目标路径,但可能混合了正斜杠和反斜杠。
118
+ 2. 代码将先拆分路径字符串,再利用 os.path.join 拼接生成当前系统的标准路径,
119
+ 最后转换为统一的正斜杠格式并进行 URL ��码,以适配所有系统。
120
+ """
121
+ # 加载 JSON 文件内容
122
+ try:
123
+ with open(json_path, 'r', encoding='utf-8') as f:
124
+ img_mapping = json.load(f)
125
+ except Exception as e:
126
+ raise Exception(f"加载 JSON 文件出错: {e}")
127
+
128
+     inserted_refs = {}  # track whether an image has already been inserted for each reference marker
+     lines = text.splitlines()
+     new_lines = []
+     # match reference markers such as [1], [2]
+     ref_pattern = re.compile(r'\[(\d+)\]')
+     img_index = 2
+     for line in lines:
+         new_lines.append(line)
+         matches = ref_pattern.findall(line)
+         for ref_num_str in matches:
+             try:
+                 ref_num = int(ref_num_str)
+             except ValueError:
+                 continue
+ 
+             # insert the HTML block only on the first occurrence of a reference marker
+             if ref_num not in inserted_refs:
+                 inserted_refs[ref_num] = True
+ 
+                 if 1 <= ref_num <= len(ref_names):
+                     ref_name = ref_names[ref_num - 1]
+                     jpg_path = img_mapping.get(ref_name, "")
+                 else:
+                     ref_name = f"ref_{ref_num}"
+                     jpg_path = ""
+ 
+                 if jpg_path:
+                     # split the path, which may mix forward and backslashes, into its parts
+                     parts = re.split(r'[\\/]+', jpg_path)
+                     # join the parts into a canonical path for the current OS
+                     normalized_jpg_path = os.path.join(*parts)
+                     # convert to an HTML-friendly path (forward slashes only)
+                     normalized_jpg_path = normalized_jpg_path.replace(os.sep, '/')
+                     # URL-encode the path, keeping slashes
+                     normalized_jpg_path_url = quote(normalized_jpg_path, safe="/")
+ 
+                     html_block = (
+                         f"<div style=\"text-align:center\">\n"
+                         f"  <img src=\"{normalized_jpg_path_url}\" alt=\"the chart of {ref_name}\" style=\"width:60%;\"/>\n"
+                         f"</div>\n"
+                         f"<div style=\"text-align:center;font-size:smaller;\">\n"
+                         f"  Fig {img_index}: Chart from '{ref_name}'\n"
+                         f"</div>"
+                     )
+                     new_lines.append(html_block)
+                     new_lines.append("")  # add a blank line as a separator
+                     img_index += 1
+ 
+     return "\n".join(new_lines)
+ 
+ def insert_tex_images(json_path, ref_names, text):
+     r"""
+     Replace numeric citations in Markdown text (e.g. [1], \[1], \[1\]) with LaTeX figure
+     environments. An image is inserted only where a citation number first appears; later
+     occurrences of the same number are left untouched.
+ 
+     Args:
+         json_path: path to a JSON file whose content looks like:
+             {
+               "Accelerating federated learning with data and model parallelism in edge computing":
+                   "src/static/data/md/test/Accelerating federated learning with data and model parallelism in edge computing/auto/images/xxx.jpg",
+               ...
+             }
+         ref_names: list of reference names; the 1st element corresponds to [1], the 2nd to [2], and so on.
+         text: Markdown text containing citations of the form [1], \[1], \[1\], etc.
+ 
+     Returns:
+         The modified text. Below the line where each citation first appears, a LaTeX figure
+         environment is inserted:
+ 
+             \begin{figure}[htbp]
+                 \centering
+                 \includegraphics[width=0.5\textwidth]{image_path}
+                 \caption{Chart from \textit{ref_name}}
+             \end{figure}
+ 
+     Notes:
+         1. The paths stored in the JSON may mix forward and backslashes.
+         2. The path is joined for the current OS, then converted to forward slashes.
+         3. The figure counter starts at 1 (adjust as needed).
+         4. If a citation number has no matching image in the JSON, no figure is inserted.
+     """
+ 
+     # load the JSON mapping
+     try:
+         with open(json_path, 'r', encoding='utf-8') as f:
+             img_mapping = json.load(f)
+     except Exception as e:
+         raise Exception(f"Failed to load JSON file: {e}")
+ 
+     # track which citation numbers have already received an image
+     inserted_refs = {}
+ 
+     # process the text line by line
+     lines = text.splitlines()
+     new_lines = []
+ 
+     # --------------------------
+     # match numeric citations: [1], \[1], \[1\], ...
+     # --------------------------
+     # Pattern breakdown:
+     #   (?:\\)?  -> optional backslash (0 or 1)
+     #   \[       -> literal '[' (must be escaped in the regex)
+     #   (\d+)    -> capture one or more digits
+     #   (?:\\)?  -> optional backslash (0 or 1)
+     #   \]       -> literal ']' (escaped)
+     # This matches forms such as [1], \[1], \[1\], [12], \[12].
+     ref_pattern = re.compile(r'(?:\\)?\[(\d+)(?:\\)?\]')
+ 
+     # figure counter
+     figure_index = 1
+ 
+     for line in lines:
+         new_lines.append(line)  # copy the current line into the output first
+ 
+         # find all citations on this line
+         matches = ref_pattern.findall(line)
+         for ref_num_str in matches:
+             try:
+                 ref_num = int(ref_num_str)
+             except ValueError:
+                 continue
+ 
+             # insert an image only if this citation number has not been handled yet
+             if ref_num not in inserted_refs:
+                 inserted_refs[ref_num] = True
+ 
+                 # check whether the number falls inside the ref_names range
+                 if 1 <= ref_num <= len(ref_names):
+                     ref_name = ref_names[ref_num - 1]
+                     jpg_path = img_mapping.get(ref_name, "")
+                 else:
+                     ref_name = f"ref_{ref_num}"
+                     jpg_path = ""
+ 
+                 if jpg_path:
+                     # normalize the path
+                     parts = re.split(r'[\\/]+', jpg_path)
+                     normalized_jpg_path = os.path.join(*parts)
+                     normalized_jpg_path = normalized_jpg_path.replace(os.sep, '/')
+                     # URL encoding (keeping '/') is intentionally skipped; LaTeX needs the raw path
+                     # normalized_jpg_path_url = quote(normalized_jpg_path, safe="/")
+                     normalized_jpg_path_url = normalized_jpg_path
+ 
+                     # build the LaTeX figure block
+                     tex_block = (
+                         r"\begin{figure}[htbp]" "\n"
+                         r"    \centering" "\n"
+                         f"    \\includegraphics[width=0.5\\textwidth]{{{normalized_jpg_path_url}}}\n"
+                         f"    \\caption{{Chart from \\textit{{{ref_name}}}}}\n"
+                         r"\end{figure}"
+                     )
+ 
+                     # append to the output, followed by a blank separator line
+                     new_lines.append(tex_block)
+                     new_lines.append("")
+                     figure_index += 1
+ 
+     return "\n".join(new_lines)
+ 
+ 
+ # example usage
+ if __name__ == "__main__":
+     # Markdown file path
+     md_file_path = "src/static/data/info/test/survey_test_processed.md"
+     # JSON file path
+     json_file_path = "src/static/data/info/test/flowchart_results.json"
+ 
+     try:
+         with open(md_file_path, "r", encoding="utf-8") as f:
+             text = f.read()
+     except FileNotFoundError:
+         print(f"Error: Markdown file {md_file_path} not found!")
+         text = ""
+ 
+     ref_names = [
+         "An explainable federated learning and blockchain based secure credit modeling method",
+         "Bafl a blockchain based asynchronous",
+         "Biscotti a blockchain system for private and secure federated learning",
+         "Blockdfl a blockchain based fully decentralized peer to peer",
+         "Accelerating blockchain enabled federated learning with clustered clients",
+         "A fast blockchain based federated learning framework with compressed communications"
+     ]
+ 
+     result = insert_ref_images(json_file_path, ref_names, text)
+     print("Modified text:\n")
+     print(result)
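The citation regex above accepts an optional backslash on either bracket, so Markdown-escaped citations are caught as well. A minimal standalone check, assuming only the Python standard library (the sample string is made up):

import re

ref_pattern = re.compile(r'(?:\\)?\[(\d+)(?:\\)?\]')
sample = r"As shown in [1] and \[2], the setup follows \[1\]."
print(ref_pattern.findall(sample))  # ['1', '2', '1'] -- the duplicate '1' is filtered later via inserted_refs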
src/demo/asg_clustername.py ADDED
@@ -0,0 +1,228 @@
+ import os
+ import pandas as pd
+ import re  # regular expressions module
+ from openai import OpenAI
+ import ast
+ 
+ def generate_cluster_name_qwen_sep(tsv_path, survey_title):
+     data = pd.read_csv(tsv_path, sep='\t')
+ 
+     # Define the system prompt once, outside the loop
+     system_prompt = f'''You are a research assistant working on a survey paper. The survey paper is about "{survey_title}". \
+ '''
+ 
+     result = []  # Initialize the result list
+ 
+     for i in range(3):  # Assuming labels are 0, 1, 2
+         sentence_list = []  # Reset sentence_list for each label
+         for j in range(len(data)):
+             if data['label'][j] == i:
+                 sentence_list.append(data['retrieval_result'][j])
+ 
+         # Embed the sentence list in the user prompt
+         user_prompt = f'''
+ Given a list of descriptions of sentences about an aspect of the survey, you need to use one phrase (within 8 words) to summarize it and treat it as a section title of your survey paper. \
+ Your response should be a list with only one element and without any other information, for example, ["Post-training of LLMs"] \
+ Your response must contain one keyword of the survey title, unspecified or irrelevant results are not allowed. \
+ The description list is:{sentence_list}'''
+ 
+         messages = [
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": user_prompt},
+         ]
+ 
+         openai_api_key = os.getenv("OPENAI_API_KEY")
+         openai_api_base = os.getenv("OPENAI_API_BASE")
+         client = OpenAI(
+             api_key=openai_api_key,
+             base_url=openai_api_base,
+         )
+ 
+         chat_response = client.chat.completions.create(
+             model=os.environ.get("MODEL"),
+             max_tokens=768,
+             temperature=0.5,
+             stop="<|im_end|>",
+             stream=True,
+             messages=messages
+         )
+ 
+         # Stream the response into a single text string
+         text = ""
+         for chunk in chat_response:
+             if chunk.choices[0].delta.content:
+                 text += chunk.choices[0].delta.content
+ 
+         # Use regex to extract the first content within []
+         match = re.search(r'\[(.*?)\]', text)
+         if match:
+             cluster_name = match.group(1).strip()  # Extract and clean the cluster name
+             # strip surrounding quotes from the cluster name, if any
+             cluster_name = cluster_name.strip('"').strip("'")
+             result.append(cluster_name)
+         else:
+             result.append("No Cluster Name Found")  # Handle cases where the pattern isn't found
+     # print("The generated cluster names are:")
+     # print(result)
+     return result  # This will be a list with three elements
+ 
+ # Example usage:
+ # result = generate_cluster_name_qwen_sep('path_to_your_file.tsv', 'Your Survey Title')
+ # print(result)  # Output might look like ["Cluster One", "Cluster Two", "Cluster Three"]
+ 
+ def refine_cluster_name(cluster_names, survey_title):
+     num_names = len(cluster_names) if isinstance(cluster_names, (list, tuple)) else 1
+     cluster_names = str(cluster_names)  # Convert to string so list input can be embedded in the prompt
+     # Define the system prompt to set the context
+     system_prompt = f'''You are a research assistant tasked with optimizing and refining a set of section titles for a survey paper. The survey paper is about "{survey_title}".
+ '''
+ 
+     # Construct the user prompt, including all cluster names
+     user_prompt = f'''
+ Here is a set of section titles generated for the survey topic "{survey_title}":
+ {cluster_names}
+ Please ensure that all cluster names are coherent and consistent with each other, and that each name is clear, concise, and accurately reflects the corresponding section.
+ Be sure to remove overlapping information between the cluster names.
+ Each cluster name should be within 8 words and include a keyword from the survey title.
+ Respond with a list of section titles in the following format without any other irrelevant information,
+ For example, ["Refined Title 1", "Refined Title 2", "Refined Title 3"]
+ '''
+ 
+     messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": user_prompt},
+     ]
+ 
+     # Initialize OpenAI client
+     openai_api_key = os.getenv("OPENAI_API_KEY")
+     openai_api_base = os.getenv("OPENAI_API_BASE")
+     client = OpenAI(
+         api_key=openai_api_key,
+         base_url=openai_api_base,
+     )
+ 
+     try:
+         chat_response = client.chat.completions.create(
+             model=os.environ.get("MODEL"),
+             max_tokens=256,
+             temperature=0.5,
+             stop="<|im_end|>",
+             stream=True,
+             messages=messages
+         )
+ 
+         # Stream the response and concatenate it into a complete text
+         text = ""
+         for chunk in chat_response:
+             if chunk.choices[0].delta.content:
+                 text += chunk.choices[0].delta.content
+ 
+         # print("The raw response text is:")
+         # print(text)
+ 
+         # Use regex to extract the content within square brackets, then parse it into a list
+         match = re.search(r'\[(.*?)\]', text)
+         if match:
+             refined_cluster_names = ast.literal_eval('[' + match.group(1).strip() + ']')
+         else:
+             refined_cluster_names = [
+                 survey_title + ": Definition",
+                 survey_title + ": Methods",
+                 survey_title + ": Evaluation"
+             ]  # Handle cases where the pattern isn't found
+ 
+     except Exception as e:
+         print(f"An error occurred while refining cluster names: {e}")
+         refined_cluster_names = ["Refinement Error"] * num_names
+ 
+     # print("The refined cluster names are:")
+     # print(refined_cluster_names)
+     return refined_cluster_names  # Returns a list with the refined cluster names
+ 
+ 
+ 
+ 
+ def generate_cluster_name_new(tsv_path, survey_title, cluster_num=3):
+     data = pd.read_csv(tsv_path, sep='\t')
+     desp = []
+ 
+     for i in range(cluster_num):  # labels run from 0 to cluster_num - 1
+         sentence_list = []  # Initialize the sentence list
+         for j in range(len(data)):
+             if data['label'][j] == i:
+                 sentence_list.append(data['retrieval_result'][j])
+         desp.append(sentence_list)
+ 
+     system_prompt = f'''
+ You are a research assistant working on a survey paper. The survey paper is about "{survey_title}". '''
+ 
+     cluster_info = "\n".join([f'Cluster {i+1}: "{desp[i]}"' for i in range(cluster_num)])
+ 
+     user_prompt = f'''
+ Your task is to generate {cluster_num} distinctive cluster names (e.g., "Pre-training of LLMs") of the given clusters of reference papers, each reference paper is described by a sentence.
+ 
+ The clusters of reference papers are:
+ {cluster_info}
+ 
+ Your output should be a single list of {cluster_num} cluster names, e.g., ["Pre-training of LLMs", "Fine-tuning of LLMs", "Evaluation of LLMs"]
+ Do not output any other text or information.
+ '''
+ 
+     messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": user_prompt},
+     ]
+ 
+     openai_api_key = os.getenv("OPENAI_API_KEY")
+     openai_api_base = os.getenv("OPENAI_API_BASE")
+     client = OpenAI(
+         api_key=openai_api_key,
+         base_url=openai_api_base,
+     )
+ 
+     chat_response = client.chat.completions.create(
+         model=os.environ.get("MODEL"),
+         max_tokens=768,
+         temperature=0.5,
+         stop="<|im_end|>",
+         stream=True,
+         messages=messages
+     )
+ 
+     # Stream the response into a single text string
+     text = ""
+     for chunk in chat_response:
+         if chunk.choices[0].delta.content:
+             text += chunk.choices[0].delta.content
+     # print("The raw response text is:")
+     # print(text)
+ 
+     # Use regex to extract the content within square brackets, then parse it into a list
+     match = re.search(r'\[(.*?)\]', text)
+     if match:
+         refined_cluster_names = ast.literal_eval('[' + match.group(1).strip() + ']')
+     else:
+         predefined_sections = [
+             "Definition", "Methods", "Evaluation", "Applications",
+             "Challenges", "Future Directions", "Comparisons", "Case Studies"
+         ]
+ 
+         # Fall back to the first cluster_num predefined categories
+         refined_cluster_names = [
+             f"{survey_title}: {predefined_sections[i]}" for i in range(cluster_num)
+         ]
+ 
+     # print("The refined cluster names are:")
+     # print(refined_cluster_names)
+     return refined_cluster_names  # Returns a list with the refined cluster names
+ 
+ 
+ if __name__ == "__main__":
+     refined_result = refine_cluster_name(["Pre-training of LLMs", "Fine-tuning of LLMs", "Evaluation of LLMs"], 'Survey of LLMs')
+     # print(refined_result)
+ 
+ 
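Both generators above recover the model's list reply with a bracket regex followed by ast.literal_eval. A minimal sketch of that parsing step in isolation, using a hypothetical model reply:

import ast
import re

reply = 'Sure! ["Pre-training of LLMs", "Fine-tuning of LLMs", "Evaluation of LLMs"]'
match = re.search(r'\[(.*?)\]', reply)
# rebuild the bracketed literal before parsing, mirroring refine_cluster_name
names = ast.literal_eval('[' + match.group(1).strip() + ']') if match else []
print(names)  # ['Pre-training of LLMs', 'Fine-tuning of LLMs', 'Evaluation of LLMs']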
src/demo/asg_conclusion.py ADDED
@@ -0,0 +1,253 @@
+ import os
+ 
+ 
+ class ConclusionGenerator:
+     def __init__(self, pipeline):
+         self.pipeline = pipeline
+ 
+     def generate(self, title, intro, mode='lora'):
+         if mode == 'lora' or mode == 'test':
+             if mode == 'lora':
+                 self.pipeline.model.set_adapter("conclusion")
+ 
+             system_prompt = f'''You are a helpful assistant that helps to generate the conclusion of the survey paper given the survey title and survey introduction.'''
+             # user_prompt = {"survey_title":survey_title, "claims":cluster_with_claims}
+             user_prompt = f'''Help me to generate the conclusion of a survey paper given the title: *{title}*, and the introduction:{intro}'''
+ 
+             messages = [
+                 {"role": "system", "content": system_prompt},
+                 {"role": "user", "content": user_prompt},
+                 {"role": "assistant", "content": "Conclusion: This survey "}
+             ]
+ 
+             outputs = self.pipeline(
+                 messages,
+                 max_new_tokens=4096,
+             )
+             result = outputs[0]["generated_text"][-1]['content']
+             return result
+         else:
+             raise ValueError('mode not supported')
+ 
+ if __name__ == '__main__':
+     from transformers import pipeline
+     import torch
+     import transformers
+ 
+     model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+     Global_pipeline = transformers.pipeline(
+         "text-generation",
+         model=model_id,
+         model_kwargs={"torch_dtype": torch.bfloat16},
+         token=os.getenv('HF_API_KEY'),
+         device_map="auto",
+     )
+     Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_outline_generation", adapter_name="outline")
+     Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_conclusion_generation", adapter_name="conclusion")
+     Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_abstract_generation", adapter_name="abstract")
+ 
+ 
+     title = "A Survey of Large Language Models"
+     intro = '''LANGUAGE is a prominent ability in human beings to
+ express and communicate, which develops in early
+ childhood and evolves over a lifetime [3, 4]. Machines,
+ however, cannot naturally grasp the abilities of understanding and communicating in the form of human language,
+ unless equipped with powerful artificial intelligence (AI)
+ algorithms. It has been a longstanding research challenge
+ to achieve this goal, to enable machines to read, write, and
+ communicate like humans [5].
+ Technically, language modeling (LM) is one of the major
+ approaches to advancing language intelligence of machines.
+ In general, LM aims to model the generative likelihood
+ of word sequences, so as to predict the probabilities of
+ future (or missing) tokens. The research of LM has received
+ extensive attention in the literature, which can be divided
+ into four major development stages:
+ • Statistical language models (SLM). SLMs [6–9] are developed based on statistical learning methods that rose in
+ the 1990s. The basic idea is to build the word prediction
+ model based on the Markov assumption, e.g., predicting the
+ next word based on the most recent context. The SLMs with
+ a fixed context length n are also called n-gram language
+ models, e.g., bigram and trigram language models. SLMs
+ have been widely applied to enhance task performance
+ in information retrieval (IR) [10, 11] and natural language
+ processing (NLP) [12–14]. However, they often suffer from
+ the curse of dimensionality: it is difficult to accurately
+ estimate high-order language models since an exponential
+ number of transition probabilities need to be estimated.
+ Thus, specially designed smoothing strategies such as backoff estimation [15] and Good–Turing estimation [16] have
+ been introduced to alleviate the data sparsity problem.
+ • Neural language models (NLM). NLMs [1, 17, 18] characterize the probability of word sequences by neural networks,
+ e.g., multi-layer perceptron (MLP) and recurrent neural networks (RNNs). As a remarkable contribution, the work in
+ [1] introduced the concept of distributed representation of
+ words and built the word prediction function conditioned
+ on the aggregated context features (i.e., the distributed
+ word vectors). By extending the idea of learning effective
+ features for text data, a general neural network approach
+ was developed to build a unified, end-to-end solution for
+ various NLP tasks [2]. Furthermore, word2vec [19, 20] was
+ proposed to build a simplified shallow neural network
+ for learning distributed word representations, which were
+ demonstrated to be very effective across a variety of NLP
+ tasks. These studies have initiated the use of language
+ models for representation learning (beyond word sequence
+ modeling), having an important impact on the field of NLP.
+ • Pre-trained language models (PLM). As an early attempt, ELMo [21] was proposed to capture context-aware
+ word representations by first pre-training a bidirectional
+ LSTM (biLSTM) network (instead of learning fixed word
+ representations) and then fine-tuning the biLSTM network
+ according to specific downstream tasks. Furthermore, based
+ on the highly parallelizable Transformer architecture [22]
+ with self-attention mechanisms, BERT [23] was proposed by
+ pre-training bidirectional language models with specially
+ designed pre-training tasks on large-scale unlabeled corpora. These pre-trained context-aware word representations
+ are very effective as general-purpose semantic features,
+ which have largely raised the performance bar of NLP
+ tasks. This study has inspired a large number of follow-up
+ work, which sets the “pre-training and fine-tuning” learning
+ paradigm. Following this paradigm, a great number of studies on PLMs have been developed, introducing either different architectures [24, 25] (e.g., GPT-2 [26] and BART [24]) or
+ improved pre-training strategies [27–29]. In this paradigm, it
+ often requires fine-tuning the PLM for adapting to different
+ downstream tasks.
+ • Large language models (LLM). Researchers find that
+ scaling PLM (e.g., scaling model size or data size) often
+ leads to an improved model capacity on downstream tasks
+ (i.e., following the scaling law [30]). A number of studies
+ have explored the performance limit by training an ever
+ larger PLM (e.g., the 175B-parameter GPT-3 and the 540B-parameter PaLM). Although scaling is mainly conducted
+ in model size (with similar architectures and pre-training
+ tasks), these large-sized PLMs display different behaviors
+ from smaller PLMs (e.g., 330M-parameter BERT and 1.5B-parameter GPT-2) and show surprising abilities (called emergent abilities [31]) in solving a series of complex tasks. For
+ example, GPT-3 can solve few-shot tasks through in-context
+ learning, whereas GPT-2 cannot do well. Thus, the research
+ community coins the term “large language models (LLM)”
+ for these large-sized PLMs [32–35], which attract increasing
+ research attention (See Figure 1). A remarkable application
+ of LLMs is ChatGPT that adapts the LLMs from the GPT
+ series for dialogue, which presents an amazing conversation
+ ability with humans. We can observe a sharp increase of the
+ arXiv papers that are related to LLMs after the release of
+ ChatGPT in Figure 1.
+ As discussed before, language model is not a new technical concept specially for LLMs, but has evolved with the
+ advance of artificial intelligence over the decades. Early language models mainly aim to model and generate text data,
+ while latest language models (e.g., GPT-4) focus on complex
+ task solving. From language modeling to task solving, it is an
+ important leap in scientific thinking, which is the key to
+ understand the development of language models in the research history. From the perspective of task solving, the four
+ generations of language models have exhibited different levels of model capacities. In Figure 2, we describe the evolution process of language models in terms of the task solving
+ capacity. At first, statistical language models mainly assisted
+ in some specific tasks (e.g., retrieval or speech tasks), in
+ which the predicted or estimated probabilities can enhance
+ the performance of task-specific approaches. Subsequently,
+ neural language models focused on learning task-agnostic
+ representations (e.g., features), aiming to reduce the efforts
+ for human feature engineering. Furthermore, pre-trained
+ language models learned context-aware representations that
+ can be optimized according to downstream tasks. For the
+ latest generation of language model, LLMs are enhanced by
+ exploring the scaling effect on model capacity, which can be
+ considered as general-purpose task solvers. To summarize,
+ in the evolution process, the task scope that can be solved
+ by language models have been greatly extended, and the
+ task performance attained by language models have been
+ significantly enhanced.
+ In the existing literature, PLMs have been widely discussed and surveyed [36–39], while LLMs are seldom reviewed in a systematic way. To motivate our survey, we first
+ highlight three major differences between LLMs and PLMs.
+ First, LLMs display some surprising emergent abilities that
+ may not be observed in previous smaller PLMs. These abilities are key to the performance of language models on complex tasks, making AI algorithms unprecedentedly powerful
+ and effective. Second, LLMs would revolutionize the way
+ that humans develop and use AI algorithms. Unlike small
+ PLMs, the major approach to accessing LLMs is through
+ the prompting interface (e.g., GPT-4 API). Humans have to
+ understand how LLMs work and format their tasks in a way
+ that LLMs can follow. Third, the development of LLMs no
+ longer draws a clear distinction between research and engineering. The training of LLMs requires extensive practical
+ experiences in large-scale data processing and distributed
+ parallel training. To develop capable LLMs, researchers
+ have to solve complicated engineering issues, working with
+ engineers or being engineers.
+ Nowadays, LLMs are posing a significant impact on
+ the AI community, and the advent of ChatGPT and GPT-4
+ leads to the rethinking of the possibilities of artificial general
+ intelligence (AGI). OpenAI has published a technical article
+ entitled “Planning for AGI and beyond”, which discusses
+ the short-term and long-term plans to approach AGI [40],
+ and a more recent paper has argued that GPT-4 might be
+ considered as an early version of an AGI system [41]. The
+ research areas of AI are being revolutionized by the rapid
+ progress of LLMs. In the field of NLP, LLMs can serve as a
+ general-purpose language task solver (to some extent), and
+ the research paradigm has been shifting towards the use
+ of LLMs. In the field of IR, traditional search engines are
+ challenged by the new information seeking way through AI
+ chatbots (i.e., ChatGPT), and New Bing presents an initial
+ attempt that enhances the search results based on LLMs. In
+ the field of CV, the researchers try to develop ChatGPT-like
+ vision-language models that can better serve multimodal
+ dialogues [42–45], and GPT-4 [46] has supported multimodal input by integrating the visual information. This new
+ wave of technology would potentially lead to a prosperous
+ ecosystem of real-world applications based on LLMs. For
+ instance, Microsoft 365 is being empowered by LLMs (i.e.,
+ Copilot) to automate the office work, and OpenAI supports
+ the use of plugins in ChatGPT for implementing special
+ functions.
+ Despite the progress and impact, the underlying principles of LLMs are still not well explored. Firstly, it is
+ mysterious why emergent abilities occur in LLMs, instead of
+ smaller PLMs. As a more general issue, there lacks a deep,
+ detailed investigation of the key factors that contribute to
+ the superior abilities of LLMs. It is important to study when
+ and how LLMs obtain such abilities [47]. Although there are
+ some meaningful discussions about this problem [31, 47],
+ more principled investigations are needed to uncover the
+ “secrets” of LLMs. Secondly, it is difficult for the research
+ community to train capable LLMs. Due to the huge demand of computation resources, it is very costly to carry
+ out repetitive, ablating studies for investigating the effect
+ of various strategies for training LLMs. Indeed, LLMs are
+ mainly trained by industry, where many important training
+ details (e.g., data collection and cleaning) are not revealed
+ to the public. Thirdly, it is challenging to align LLMs with
+ human values or preferences. Despite the capacities, LLMs
+ are also likely to produce toxic, fictitious, or harmful contents. It requires effective and efficient control approaches
+ to eliminating the potential risk of the use of LLMs [46].
+ Faced with both opportunities and challenges, it needs
+ more attention on the research and development of LLMs. In
+ order to provide a basic understanding of LLMs, this survey
+ conducts a literature review of the recent advances in LLMs
+ from four major aspects, including pre-training (how to pretrain a capable LLM), adaptation (how to effectively adapt
+ pre-trained LLMs for better use), utilization (how to use
+ LLMs for solving various downstream tasks) and capability
+ evaluation (how to evaluate the abilities of LLMs and existing
+ empirical findings). We thoroughly comb the literature and
+ summarize the key findings, techniques, and methods of
+ LLMs. For this survey, we also create a GitHub project
+ website by collecting the supporting resources for LLMs, at
+ the link https://github.com/RUCAIBox/LLMSurvey. We
+ are also aware of several related review articles on PLMs
+ or LLMs [32, 36, 38, 39, 43, 48–54]. These papers either
+ discuss PLMs or some specific (or general) aspects of LLMs.
+ Compared with them, we focus on the techniques and
+ methods to develop and use LLMs and provide a relatively
+ comprehensive reference to important aspects of LLMs.
+ The remainder of this survey is organized as follows:
+ Section 2 introduces the background for LLMs and the evolution of GPT-series models, followed by the summarization
+ of available resources for developing LLMs in Section 3.
+ Sections 4, 5, 6, and 7 review and summarize the recent
+ progress from the four aspects of pre-training, adaptation,
+ utilization, and capacity evaluation, respectively. Then, Section 8 discusses the practical guide for prompt design,
+ and Section 9 reviews the applications of LLMs in several
+ representative domains. Finally, we conclude the survey in
+ Section 10 by summarizing the major findings and discuss
+ the remaining issues for future work.
+ '''
+ 
+ 
+     conclusion_generator = ConclusionGenerator(Global_pipeline)
+     with_lora = conclusion_generator.generate(title, intro, mode='lora')
+     # print("The conclusion generated with LORA is: \n", with_lora)
+     # print("=============================================================")
+     with_test = conclusion_generator.generate(title, intro, mode='test')
+     # print("The conclusion generated with test is: \n", with_test)
+     # print("=============================================================")
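ConclusionGenerator only relies on the pipeline being callable with a message list and returning the transformers text-generation output shape. That makes it easy to exercise without a GPU; the stub below is a hypothetical test double, not part of the project:

class StubPipeline:
    """Mimics a transformers text-generation pipeline just enough for mode='test'."""

    class _Model:
        def set_adapter(self, name):
            pass  # only called in mode='lora'

    model = _Model()

    def __call__(self, messages, max_new_tokens=None):
        # transformers returns the chat history with the new assistant turn appended
        reply = {"role": "assistant", "content": "Conclusion: This survey reviewed ..."}
        return [{"generated_text": messages + [reply]}]


gen = ConclusionGenerator(StubPipeline())
print(gen.generate("A Survey of LLMs", "Some introduction text.", mode='test'))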
src/demo/asg_generator.py ADDED
@@ -0,0 +1,90 @@
 
1
+ from openai import OpenAI
2
+ import transformers
3
+ import os
4
+ import re
5
+ import ast
6
+ import json
7
+ import base64
8
+
9
+ def getQwenClient():
10
+ openai_api_key = os.environ.get("OPENAI_API_KEY")
11
+ openai_api_base = os.environ.get("OPENAI_API_BASE")
12
+
13
+ client = OpenAI(
14
+ api_key=openai_api_key,
15
+ base_url=openai_api_base,
16
+ )
17
+ return client
18
+
19
+ def generateResponse(client, prompt):
20
+ chat_response = client.chat.completions.create(
21
+ model=os.environ.get("MODEL"),
22
+ max_tokens=768,
23
+ temperature=0.5,
24
+ stop="<|im_end|>",
25
+ stream=True,
26
+ messages=[{"role": "user", "content": prompt}]
27
+ )
28
+
29
+ text = ""
30
+ for chunk in chat_response:
31
+ if chunk.choices[0].delta.content:
32
+ text += chunk.choices[0].delta.content
33
+ return text
34
+
35
+ def generate_sentence_patterns(keyword, num_patterns=5, temp=0.7):
36
+ template = f"""
37
+ You are a helpful assistant that provides only the output requested, without any additional text.
38
+
39
+ Please generate {num_patterns} commonly used sentence templates in academic papers to describe the '{keyword}'.
40
+ - Do not include any explanations, sign-offs, or additional text.
41
+ - The list should be in the following format:
42
+ [
43
+ "First template should be here",
44
+ "Second template should be here",
45
+ ...
46
+ ]
47
+
48
+ Begin your response immediately with the list, and do not include any other text.
49
+ """
50
+ openai_api_key = os.getenv("OPENAI_API_KEY")
51
+ openai_api_base = os.getenv("OPENAI_API_BASE")
52
+ client = OpenAI(
53
+ api_key = openai_api_key,
54
+ base_url = openai_api_base,
55
+ )
56
+ response = generateResponse(client, template)
57
+ return response
58
+
59
+ def generate(context, keyword, paper_title, temp=0.7):
60
+ template = f"""
61
+ Context:
62
+ {context}
63
+ ------------------------------------------------------------
64
+ Based on the above context, answer the question: What {keyword} are mentioned in the paper {paper_title}?
65
+ Please provide a direct answer in one paragraph, no longer than 100 words.
66
+
67
+ If the context provides enough information, answer strictly based on it.
68
+ If the context provided does not contain any specified {keyword}, deduce and integrate your own opinion as if the {keyword} were described in the context.
69
+ Ensure that your answer remains consistent with the style and format of the provided context, as if the information you provide is naturally part of it.
70
+ ------------------------------------------------------------
71
+ Answer:
72
+ The {keyword} mentioned in this paper discuss [Your oberservation or opinion]...
73
+ """
74
+ openai_api_key = os.getenv("OPENAI_API_KEY")
75
+ openai_api_base = os.getenv("OPENAI_API_BASE")
76
+ client = OpenAI(
77
+ api_key = openai_api_key,
78
+ base_url = openai_api_base,
79
+ )
80
+ response = generateResponse(client, template)
81
+ return response
82
+
83
+ def extract_query_list(text):
84
+ pattern = re.compile(
85
+ r'\[\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"[^"]+"\s*,\s*"[^"]+"\s*\]'
86
+ )
87
+ match = pattern.search(text)
88
+ if match:
89
+ return match.group(0)
90
+ return None
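A short usage sketch for the helpers above, assuming OPENAI_API_KEY, OPENAI_API_BASE, and MODEL are set in the environment (as in the .env); the keyword is illustrative, and the parse only succeeds if the model honors the ten-item list format:

import json

reply = generate_sentence_patterns("evaluation metrics", num_patterns=10)
query_list = extract_query_list(reply)  # None unless the reply contains a ten-item quoted list
queries = json.loads(query_list) if query_list else []
print(len(queries), "templates extracted")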
src/demo/asg_latex.py ADDED
@@ -0,0 +1,816 @@
1
+ import re
2
+ import subprocess
3
+ import os
4
+
5
+ from openai import OpenAI
6
+ import dotenv
7
+ from .asg_add_flowchart import insert_tex_images
8
+ from .asg_mindmap import insert_outline_figure
9
+
10
+
11
+ def _remove_div_blocks(lines):
12
+ """
13
+ 从给定的行列表中,移除所有形如:
14
+ <div style="...">
15
+ ... (若干行)
16
+ </div>
17
+ 的 HTML 块(含首尾 <div> ... </div>)整段跳过。
18
+ 返回处理后的新行列表。
19
+ """
20
+ new_lines = []
21
+ i = 0
22
+ n = len(lines)
23
+
24
+ while i < n:
25
+ line = lines[i]
26
+ # 如果该行以 <div style= 开头,则进入跳过模式
27
+ if line.strip().startswith("<div style="):
28
+ # 跳过本行
29
+ i += 1
30
+ # 一直向后找,直到遇到 '</div>' 行
31
+ while i < n and not lines[i].strip().startswith("</div>"):
32
+ i += 1
33
+ # 这里再跳过 '</div>' 那一行
34
+ i += 1
35
+ else:
36
+ new_lines.append(line)
37
+ i += 1
38
+
39
+ return new_lines
40
+
41
+ def _convert_setext_to_atx(lines):
42
+ """
43
+ 将形如:
44
+
45
+ 标题文字
46
+ ===
47
+
48
+ 转换为:
49
+
50
+ # 标题文字
51
+
52
+ 将形如:
53
+
54
+ 标题文字
55
+ ---
56
+
57
+ 转换为:
58
+
59
+ ## 标题文字
60
+ """
61
+ setext_equal_pattern = re.compile(r'^\s*=+\s*$') # 匹配全 `===`
62
+ setext_dash_pattern = re.compile(r'^\s*-+\s*$') # 匹配全 `---`
63
+
64
+ new_lines = []
65
+ i = 0
66
+ n = len(lines)
67
+
68
+ while i < n:
69
+ line = lines[i]
70
+ if i < n - 1:
71
+ next_line = lines[i + 1].strip()
72
+ # 若下一行是 ===
73
+ if setext_equal_pattern.match(next_line):
74
+ heading_text = line.strip()
75
+ new_lines.append(f"# {heading_text}")
76
+ i += 2 # 跳过下一行
77
+ continue
78
+ # 若下一行是 ---
79
+ if setext_dash_pattern.match(next_line):
80
+ heading_text = line.strip()
81
+ new_lines.append(f"## {heading_text}")
82
+ i += 2
83
+ continue
84
+ # 否则不改动
85
+ new_lines.append(line)
86
+ i += 1
87
+
88
+ return new_lines
89
+
90
+ def preprocess_md(md_input_path: str, md_output_path: str = None) -> str:
91
+ """
92
+ 预处理一个 Markdown 文件:
93
+ 1. 移除所有 <div style="..."> ... </div> 这类 HTML 块
94
+ 2. 将 setext 标题 (===, ---) 转为 ATX 标题 (#, ##)
95
+ 3. 覆盖写回或输出到新文件
96
+
97
+ 参数:
98
+ md_input_path: 原始 Markdown 文件路径
99
+ md_output_path: 处理后要写出的 Markdown 文件路径; 若为 None 则覆盖原始文件.
100
+
101
+ 返回:
102
+ str: 返回处理后 Markdown 文件的实际写出路径 (md_output_path).
103
+ """
104
+ if md_output_path is None:
105
+ md_output_path = md_input_path
106
+
107
+ # 1) 读入行
108
+ with open(md_input_path, 'r', encoding='utf-8') as f:
109
+ lines = f.read().splitlines()
110
+
111
+ # 2) 移除 <div style="..."> ... </div> 片段
112
+ lines_no_div = _remove_div_blocks(lines)
113
+
114
+ # 3) 将 setext 标题转换为 ATX
115
+ lines_atx = _convert_setext_to_atx(lines_no_div)
116
+
117
+ # 4) 写出
118
+ with open(md_output_path, 'w', encoding='utf-8') as f:
119
+ for ln in lines_atx:
120
+ f.write(ln + "\n")
121
+
122
+ return md_output_path
123
+
124
+ def search_sections(md_path: str):
125
+ """
126
+ 解析仅含 ATX 风格标题的 Markdown 文件,返回一个列表,
127
+ 每个元素是一个三元组: (level, heading_text, content_string)
128
+
129
+ 说明:
130
+ - 标题行形如 "# 标题"、"## 标题"、"### 标题" 等(在井号后有一个空格)。
131
+ - level = (井号个数 - 1),即 "# -> level=0"、"## -> level=1"、"### -> level=2" ...
132
+ - 移除类似 "3.1.3 "、"2.10.1 " 这类数字点前缀(含其后空格)。
133
+ - content_string 为该标题之后、直到下一个标题行或文件结束为止的所有文本(换行拼接)。
134
+ """
135
+
136
+ # 用于匹配 ATX 标题(如 "# 标题", "## 3.1.3 标题" 等)
137
+ atx_pattern = re.compile(r'^(#+)\s+(.*)$')
138
+
139
+ # 用于去除标题前缀的数字.数字.数字... (可能有空格)
140
+ # 示例匹配: "3.1.3 "、"2.10.1 " 等
141
+ leading_numbers_pattern = re.compile(r'^\d+(\.\d+)*\s*')
142
+
143
+ # 读入行
144
+ with open(md_path, "r", encoding="utf-8") as f:
145
+ lines = f.read().splitlines()
146
+
147
+ sections = []
148
+ i = 0
149
+ n = len(lines)
150
+
151
+ def gather_content(start_idx: int):
152
+ """
153
+ 从 start_idx 开始,收集正文,直到遇到下一个 ATX 标题或文档末尾。
154
+ 返回 (content_string, end_idx).
155
+ """
156
+ content_lines = []
157
+ idx = start_idx
158
+ while idx < n:
159
+ line = lines[idx].rstrip()
160
+ # 如果此行匹配到 ATX 标题模式,则停止收集正文
161
+ if atx_pattern.match(line):
162
+ break
163
+ content_lines.append(lines[idx])
164
+ idx += 1
165
+ return "\n".join(content_lines), idx
166
+
167
+ while i < n:
168
+ line = lines[i].rstrip()
169
+
170
+ # 判断是否为 ATX 标题
171
+ match_atx = atx_pattern.match(line)
172
+ if match_atx:
173
+ # group(1) 例如 "##"
174
+ # group(2) 例如 "3.1 Introduction"
175
+ hashes = match_atx.group(1)
176
+ heading_text_raw = match_atx.group(2).strip()
177
+
178
+ # 计算标题层级: "# -> level=0, ## -> level=1, ### -> level=2"
179
+ heading_level = len(hashes) - 1
180
+
181
+ # 移除类似 "3.1.3 " 的前缀
182
+ heading_text = leading_numbers_pattern.sub('', heading_text_raw).strip()
183
+
184
+ i += 1 # 跳过标题行,准备收集正文
185
+ content_string, new_idx = gather_content(i)
186
+
187
+ sections.append((heading_level, heading_text, content_string))
188
+ i = new_idx
189
+ print(heading_level, heading_text)
190
+ else:
191
+ # 否则跳到下一行
192
+ i += 1
193
+
194
+ # [可选调试输出] 打印当前标题层级及其文本
195
+
196
+
197
+ return sections[1:]
198
+
199
+ def abstract_to_tex(section):
200
+ """
201
+ 将 Markdown 中的 abstract 段落转化为 LaTeX 片段。
202
+
203
+ 参数:
204
+ section: (level, heading_text, content_string)
205
+ level: 0 表示一级标题, 1 表示二级标题, etc.
206
+ heading_text: 当前标题文字
207
+ content_string: 该标题下的 Markdown 文本
208
+
209
+ 返回:
210
+ 一个字符串,包含对应的 LaTeX abstract 环境。
211
+ """
212
+ level, heading_text, content_string = section
213
+
214
+ # 如果标题不是 "Abstract",则直接返回空字符串
215
+ if heading_text.lower() != "abstract":
216
+ return ""
217
+
218
+ # 生成 LaTeX abstract 环境
219
+ latex_abstract = (
220
+ "\\begin{abstract}\n"
221
+ f"{content_string}\n"
222
+ "\\end{abstract}"
223
+ )
224
+ return latex_abstract
225
+
226
+ def references_to_tex(section):
227
+ """
228
+ 将 Markdown 中的 references 段落转化为 LaTeX 片段。
229
+
230
+ 参数:
231
+ section: (level, heading_text, content_string)
232
+ level: 0 表示一级标题, 1 表示二级标题, etc.
233
+ heading_text: 当前标题文字
234
+ content_string: 该标题下的 Markdown 文本
235
+
236
+ 返回:
237
+ 一个字符串,包含对应的 LaTeX references 环境。
238
+ """
239
+ level, heading_text, content_string = section
240
+
241
+ # 如果标题不是 "References",则直接返回空字符串
242
+ if heading_text.lower() != "references":
243
+ return ""
244
+
245
+ # 在每一行的末尾添加 \\ 以实现换行
246
+ lines = content_string.splitlines()
247
+ latex_content = " \\\\{}\n".join(line.strip() for line in lines if line.strip())
248
+
249
+ # 生成 LaTeX 片段,使用 \section* 创建不带编号的标题
250
+ latex_references = (
251
+ "\\section*{References}\n" # 不带编号的 section
252
+ f"{latex_content}"
253
+ )
254
+ return latex_references
255
+
256
+ def md_to_tex_section(section):
257
+
258
+ """
259
+ 将单个 Markdown 分段 (level, heading, content) 转化为 LaTeX 片段。
260
+ 会根据标题的深度生成 \\section, \\subsection, 或 \\subsubsection 等。
261
+ 同时对 markdown 中的图片 div 进行正则替换,转化为 LaTeX figure 环境。
262
+
263
+ 参数:
264
+ section: (level, heading_text, content_string)
265
+ level: 0 表示一级标题, 1 表示二级标题, etc.
266
+ heading_text: 当前标题文字
267
+ content_string: 该标题下的 Markdown 文本
268
+
269
+ 返回:
270
+ 一个字符串,包含对应的 LaTeX 标题以及内容。
271
+ 内容由 OpenAI 模型将 Markdown 转为 LaTeX,并将图片 div 转为 LaTeX figure。
272
+ """
273
+ level, heading_text, content_string = section
274
+
275
+ # 根据 heading level 生成对应的 LaTeX 命令
276
+ if level == 0:
277
+ latex_heading = f"\\section{{{heading_text}}}"
278
+ elif level == 1:
279
+ latex_heading = f"\\subsection{{{heading_text}}}"
280
+ elif level == 2:
281
+ latex_heading = f"\\subsubsection{{{heading_text}}}"
282
+ else:
283
+ # 更深入的层级可自行添加
284
+ latex_heading = f"\\paragraph{{{heading_text}}}"
285
+
286
+ # 先粗略替换图片 div 为占位符,后续交由 OpenAI 模型或自身再做处理
287
+ # 这里我们先把 <div style="text-align:center">...<img ...>...</div><div ...>Fig x: ...</div> 转换为一个自定义标记 [IMG_BLOCK] ... [END_IMG_BLOCK]
288
+ # 这样后面可以更好控制让 OpenAI 转成正确的 LaTeX 也行,或在本地处理也行。
289
+ # 这里我们本地进行处理,将它直接转换为 LaTeX figure。
290
+
291
+ def replace_img_div(match):
292
+ """
293
+ 将 <div style="text-align:center"> <img src="..." alt="..." style="width:60%;"/> </div>
294
+ <div style="text-align:center;font-size:smaller;">Fig x: ...</div>
295
+ 这种模式转换为标准 LaTeX figure 环境
296
+ """
297
+ whole_block = match.group(0)
298
+
299
+ # 提取 src
300
+ src_match = re.search(r'<img.*?src="(.*?)".*?>', whole_block, re.DOTALL)
301
+ src_path = src_match.group(1) if src_match else "image_not_found"
302
+
303
+ # 提取 alt
304
+ alt_match = re.search(r'<img.*?alt="(.*?)".*?>', whole_block, re.DOTALL)
305
+ alt_text = alt_match.group(1) if alt_match else ""
306
+
307
+ # 提取 caption (Fig x: ...)
308
+ fig_match = re.search(r'Fig\s*\d+:\s*(.*?)<\/div>', whole_block, re.DOTALL)
309
+ fig_caption = fig_match.group(1).strip() if fig_match else ""
310
+
311
+ # 生成 LaTeX figure
312
+ latex_figure = (
313
+ "\\begin{figure}[htbp]\n"
314
+ " \\centering\n"
315
+ f" \\includegraphics[width=0.6\\textwidth]{{{src_path}}}\n"
316
+ f" \\caption{{{alt_text if alt_text else fig_caption}}}\n"
317
+ # 也可以根据需求决定是否加 label
318
+ "\\end{figure}\n"
319
+ )
320
+
321
+ return latex_figure
322
+
323
+ # 用正则定位该模式并转换为 latex figure
324
+ # 该模式大概是:
325
+ # <div style="text-align:center">.*?<img src="...".*?>.*?</div>\s*<div style="text-align:center;font-size:smaller;">.*?</div>
326
+ # 这里用非贪婪模式, DOTALL 允许匹配换行
327
+ pattern_img_div = re.compile(
328
+ r'<div\s+style="text-align:center".*?>.*?<img.*?>.*?</div>\s*<div\s+style="text-align:center;font-size:smaller;">.*?<\/div>',
329
+ re.DOTALL
330
+ )
331
+
332
+ content_converted_images = re.sub(pattern_img_div, replace_img_div, content_string)
333
+
334
+ # ------------------------------------------------
335
+ # 调用 OpenAI 接口,将 (转换好图片 div 的) Markdown 转为 LaTeX
336
+ # ------------------------------------------------
337
+ system_prompt = (
338
+ "You are a helpful assistant that converts Markdown text to rigorous LaTeX. "
339
+ "Maintain inline formatting like bold, italics, and code blocks when possible. "
340
+ "Simply format horizontally aligned text, lists, tables, etc. into valid LaTeX."
341
+ "Use [LaTeX] ... [/LaTeX] to wrap the final content without the \\section\{\}."
342
+ "If the content is mathematically descriptive, please insert exactly one LaTeX math equation with explaination ($...$)to describe it."
343
+ "Do not include any other irrelevant information."
344
+ "Remember to clean the refs such as \[1], \[2], \[3] inside the text to strip the backslashes to [1], [2], [3]. No any extra backslashes."
345
+ )
346
+
347
+ user_prompt = (
348
+ "Convert the following Markdown content to LaTeX. The text may already contain "
349
+ "some partial LaTeX for figures:\n\n"
350
+ f"{content_converted_images}"
351
+ )
352
+
353
+ messages = [
354
+ {"role": "system", "content": system_prompt},
355
+ {"role": "user", "content": user_prompt},
356
+ ]
357
+
358
+ # 从环境变量中获取 openai key 和 base url
359
+ openai_api_key = os.getenv("OPENAI_API_KEY")
360
+ openai_api_base = os.getenv("OPENAI_API_BASE")
361
+
362
+ # 初始化 Client
363
+ client = OpenAI(
364
+ api_key=openai_api_key,
365
+ base_url=openai_api_base,
366
+ )
367
+
368
+ chat_response = client.chat.completions.create(
369
+ model=os.environ.get("MODEL"),
370
+ max_tokens=2048,
371
+ temperature=0.5,
372
+ stop="<|im_end|>",
373
+ stream=True,
374
+ messages=messages
375
+ )
376
+
377
+ # Stream the response
378
+ tex_body = ""
379
+ for chunk in chat_response:
380
+ if chunk.choices[0].delta.content:
381
+ tex_body += chunk.choices[0].delta.content
382
+
383
+ # 假设我们想在聊天回复中,使用 [LaTeX] ... [/LaTeX] 包裹最终内容,类似:
384
+ # [LaTeX]
385
+ # 你要的 tex body ...
386
+ # [/LaTeX]
387
+
388
+ # 可以用正则截取中间内容:
389
+ pattern = r'\[LaTeX\](.*?)\[/LaTeX\]'
390
+ match = re.search(pattern, tex_body, re.DOTALL)
391
+ if match:
392
+ # 如果拿到中间的内容 就用它, 否则就用全部
393
+ tex_body = match.group(1).strip()
394
+
395
+ # 去掉多余的空白
396
+ tex_body = re.sub(r'\s+', ' ', tex_body).strip()
397
+
398
+ # 整合 LaTeX 标题和转好的正文
399
+ final_tex_snippet = latex_heading + "\n\n" + tex_body + "\n"
400
+ print("Tex snippet:")
401
+ print(final_tex_snippet)
402
+ return final_tex_snippet
403
+
404
+ def md_to_tex_section_without_jpg(section):
405
+ """
406
+ 将单个 Markdown 分段 (level, heading_text, content_string) 转化为 LaTeX 片段,
407
+ 不处理任何 HTML 或图片 div,仅调用 OpenAI 模型将普通 Markdown 转为 LaTeX。
408
+
409
+ 参数:
410
+ section: (level, heading_text, content_string)
411
+ - level: 0 表示一级标题, 1 表示二级标题, 2 表示三级标题等
412
+ - heading_text: 当前标题文字
413
+ - content_string: 该标题下的 Markdown 文本
414
+
415
+ 返回:
416
+ 一个字符串,包含对应的 LaTeX 标题以及转换后的正文。
417
+ """
418
+
419
+ level, heading_text, content_string = section
420
+
421
+ # 1) 根据 level 生成对应的 LaTeX 命令
422
+ # 你也可以改成更灵活的逻辑,比如多级。
423
+ if level == 0:
424
+ latex_heading = f"\\section{{{heading_text}}}"
425
+ elif level == 1:
426
+ latex_heading = f"\\subsection{{{heading_text}}}"
427
+ elif level == 2:
428
+ latex_heading = f"\\subsubsection{{{heading_text}}}"
429
+ else:
430
+ latex_heading = f"\\paragraph{{{heading_text}}}"
431
+
432
+ # 2) 判断是否要跳过 LLM 转换
433
+ # 这里给出几种常见原因:
434
+ # - 内容字符串为空或全是空白 (content_string.strip() == "")
435
+ # - 标题看起来只是一个段落号, 形如"3"、"3.1"、"3.1.1" 等 (可根据需要调宽或调窄判断规则)
436
+
437
+ # 例:用一个正则匹配 `数字(.数字)*`,可带可不带后缀空格
438
+ # 如果 heading_text 完全匹配这个模式,就认为它是个“纯编号标题”,不必调用 LLM
439
+ pure_number_pattern = re.compile(r'^\d+(\.\d+)*$')
440
+
441
+ # 先去一下两端空格
442
+ ht_stripped = heading_text.strip()
443
+ # 若正文为空,或标题是纯数字/编号,就跳过 LLM
444
+ skip_llm = (not content_string.strip()) or bool(pure_number_pattern.match(ht_stripped))
445
+
446
+ if skip_llm:
447
+ # 直接返回标题 + 原始正文 (若有也可保留)
448
+ # 如果你只想输出标题,就让正文为空
449
+ tex_body = content_string
450
+ # 也可以选择把正文丢弃,比如:
451
+ # tex_body = ""
452
+ else:
453
+ # 3) 需要调用 LLM 的情况
454
+ # system_prompt = (
455
+ # "You are a helpful assistant that converts Markdown text to rigorous LaTeX. "
456
+ # "Maintain inline formatting like bold, italics, and code blocks when possible. "
457
+ # "Simply format horizontally aligned text, lists, tables, etc. into valid LaTeX."
458
+ # "Use [LaTeX] ... [/LaTeX] to wrap the final content without the \\section\{\}."
459
+ # "If the content is mathematically descriptive, please insert exactly one LaTeX math equation with explaination (\\[...\\])to describe it."
460
+ # "You are forced to use \\begin{dmath} and \\end{dmath} to replace the origin square brackets and wrap the equation"
461
+ # "Do not include any other irrelevant information."
462
+ # "Remember to clean the refs such as \[1], \[2], \[3] inside the text to strip the backslashes to [1], [2], [3]. No any extra backslashes."
463
+ # )
464
+ system_prompt = (
465
+ "You are a helpful assistant that converts Markdown text to rigorous LaTeX. "
466
+ "Maintain inline formatting like bold, italics, and code blocks when possible. "
467
+ "Format horizontally aligned text, lists, and tables into valid LaTeX.\n\n"
468
+
469
+ "Use [LaTeX] ... [/LaTeX] to wrap the final content without the \\section{}.\n\n"
470
+ "If the content is mathematically descriptive, please insert exactly one LaTeX math equation to describe it."
471
+ "For mathematical content, strictly follow the **standard equation format** below:\n\n"
472
+
473
+ "1. **Wrap equations inside `equation`**:\n"
474
+ " ```latex\n"
475
+ " \\begin{equation}\n"
476
+ " \\resizebox{0.95\\columnwidth}{!}{$\n"
477
+ " ... % (Insert the equation here)\n"
478
+ " $}\n"
479
+ " \\end{equation}\n"
480
+ " ```\n"
481
+ " - **All equations must be enclosed in `\\resizebox{0.95\\columnwidth}{!}{...}`**.\n"
482
+ " - **Ensure the equation fits within `\\columnwidth`** in two-column layouts.\n\n"
483
+
484
+ "2. **For descriptions, simply use plain text with double backslashes, for example:\n"
485
+ "$f_i(x)$ is the local objective function of node $i$.\\"
486
+ "$\mathcal{N}_i$ is the set of in-neighbors of node $i$.\\"
487
+
488
+ "3. **Ensure proper formatting**:\n"
489
+ " - **DO NOT use `align`, `multline`, or `split`**—only `equation` with `resizebox`.\n"
490
+ " - **DO NOT allow formulas to exceed column width**.\n"
491
+ " - **DO NOT allow any other latex syntax such as"
492
+ " \\documentclass{article} \\usepackage{amsmath} \\usepackage{graphicx} \\begin{document}** use the plain content with formula.\n"
493
+
494
+ " - **Maintain the original refs and ensure that references like [1], [2], [3], do not contain unnecessary backslashes**.\n\n"
495
+
496
+ "All generated LaTeX content **must strictly adhere to this structure**."
497
+ )
498
+
499
+ user_prompt = (
500
+ "Convert the following Markdown content to LaTeX. "
501
+ f"{content_string}"
502
+ )
503
+
504
+ messages = [
505
+ {"role": "system", "content": system_prompt},
506
+ {"role": "user", "content": user_prompt},
507
+ ]
508
+
509
+ # 从环境变量中获取 openai key 和 base url
510
+ openai_api_key = os.getenv("OPENAI_API_KEY")
511
+ openai_api_base = os.getenv("OPENAI_API_BASE")
512
+
513
+ # 初始化 Client
514
+ client = OpenAI(
515
+ api_key=openai_api_key,
516
+ base_url=openai_api_base,
517
+ )
518
+
519
+ chat_response = client.chat.completions.create(
520
+ model=os.environ.get("MODEL"),
521
+ max_tokens=2048,
522
+ temperature=0.5,
523
+ stop="<|im_end|>",
524
+ stream=True,
525
+ messages=messages
526
+ )
527
+
528
+ # 流式读取返���
529
+ tex_body = ""
530
+ for chunk in chat_response:
531
+ if chunk.choices[0].delta.content:
532
+ tex_body += chunk.choices[0].delta.content
533
+
534
+ # 提取 [LaTeX] ... [/LaTeX] 中间的内容
535
+ pattern = r'\[LaTeX\](.*?)\[/LaTeX\]'
536
+ match = re.search(pattern, tex_body, re.DOTALL)
537
+ if match:
538
+ tex_body = match.group(1).strip()
539
+
540
+ # 去掉多余的空白
541
+ tex_body = re.sub(r'\s+', ' ', tex_body).strip()
542
+
543
+ # 4) 最终拼接
544
+ final_tex_snippet = latex_heading + "\n\n" + tex_body + "\n"
545
+ print("Tex snippet:")
546
+ print(final_tex_snippet)
547
+ return final_tex_snippet
548
+
549
+ def insert_section(tex_path: str, section_content: str):
550
+ """
551
+ 将 section_content 追加到 .tex 文件“最后一个 section(或子节)的正文末尾”。
552
+ 具体逻辑如下:
553
+ 1. 如果文件内找不到任何 \section{...}、\subsection{...}、\subsubsection{...},
554
+ 那么就将 section_content 插入到 \end{abstract} 之后。
555
+ 2. 如果在全文中能找到若干标题 (\section、\subsection、\subsubsection),
556
+ 则将 section_content 插入到最后出现的那个标题对应正文的末尾(即它和下一个标题/文件结束之间)。
557
+ 3. 如果既没有 abstract 环境,也没有任何标题,则在 \end{document} 前插入。
558
+
559
+ 参数:
560
+ tex_path: str
561
+ .tex 文件的路径。
562
+ section_content: str
563
+ 需要插入的段落字符串(LaTeX 格式)。
564
+
565
+ 注意:
566
+ - 这段逻辑会将新的内容**追加**到最后一个标题所对应正文的末尾,
567
+ 这样可以避免把之前的内容“分割”或“顶开”。
568
+ """
569
+
570
+ if not os.path.exists(tex_path):
571
+ print(f"TeX 文件不存在: {tex_path}")
572
+ return
573
+
574
+ with open(tex_path, 'r', encoding='utf-8') as f:
575
+ lines = f.readlines()
576
+
577
+ # 正则匹配标题、abstract、document
578
+ # 注意 \section、\subsection、\subsubsection 都做单独分组,这样获取行号时好区分
579
+ title_pattern = re.compile(r'^(\\section|\\subsection|\\subsubsection)\{[^}]*\}')
580
+ end_abstract_pattern = re.compile(r'^\\end\{abstract\}')
581
+ end_document_pattern = re.compile(r'^\\end\{document\}')
582
+
583
+ # 找到所有标题行号,保存到列表
584
+ title_lines = []
585
+ end_abstract_line = None
586
+ end_document_line = None
587
+
588
+ for i, line in enumerate(lines):
589
+ if title_pattern.match(line.strip()):
590
+ title_lines.append(i)
591
+ elif end_abstract_pattern.match(line.strip()):
592
+ end_abstract_line = i
593
+ elif end_document_pattern.match(line.strip()):
594
+ end_document_line = i
595
+
596
+ # 将要插入的内容行列表
597
+ insert_content_lines = section_content.strip().split('\n')
598
+
599
+ # 如果找不到任何标题
600
+ if not title_lines:
601
+ # 如果有 \end{abstract},就插在 \end{abstract} 后
602
+ if end_abstract_line is not None:
603
+ insert_idx = end_abstract_line + 1
604
+ else:
605
+ # 没有 \end{abstract},就尝试在 \end{document} 之前插入
606
+ if end_document_line is not None:
607
+ insert_idx = end_document_line
608
+ else:
609
+ # 如果也没有 \end{document},就插到文件末尾
610
+ insert_idx = len(lines)
611
+
612
+ new_lines = (
613
+ lines[:insert_idx]
614
+ + [l + "\n" for l in insert_content_lines]
615
+ + lines[insert_idx:]
616
+ )
617
+
618
+ else:
619
+ # 有标题时,将内容追加到“最后一个标题对应正文”的末尾
620
+ last_title_line = title_lines[-1]
621
+
622
+ # 找到下一个标题的行号(如果有),或 \end{document} 行号,以确定正文区间结束
623
+ # “最后标题正文”从 last_title_line+1 一直到 next_title_line-1(或结束)
624
+ next_boundaries = [end_document_line if end_document_line is not None else len(lines)]
625
+ for t_line in title_lines:
626
+ if t_line > last_title_line:
627
+ next_boundaries.append(t_line)
628
+ # next_boundary 是最后标题之后遇到的第一个 boundary(若没有, 就是文件末尾)
629
+ next_boundary = min(next_boundaries) if next_boundaries else len(lines)
630
+
631
+ # 我们希望将新的内容插在“最后标题正文的最末尾”之后,也就是说在 next_boundary 前。
632
+ # 不过若“最后标题”本身就处于全文件最终,next_boundary 可能表示文件末尾/文档结束。
633
+ # 这里为了避免把最后一行顶下去,可以先把其中的正文行都保留,再在最后插入 section_content。
634
+ new_lines = []
635
+ new_lines.extend(lines[:next_boundary]) # 保留从头到最后正文结束
636
+ new_lines.extend([l + "\n" for l in insert_content_lines])
637
+ new_lines.extend(lines[next_boundary:])
638
+
639
+ with open(tex_path, 'w', encoding='utf-8') as f:
640
+ f.writelines(new_lines)
641
+
642
+ print("成功插入 section 内容:", tex_path)
643
+
644
+ def md_to_tex(md_path, tex_path, title):
645
+ """
646
+ 将 Markdown 文件转换为 LaTeX 文件。
647
+
648
+ 参数:
649
+ md_path (str): 输入的 Markdown 文件路径。
650
+ tex_path (str): 输出的 LaTeX 文件路径。
651
+ """
652
+ sections = search_sections(md_path)
653
+ section_index = 0
654
+ while section_index < len(sections):
655
+ print(f"Converting section {section_index+1}/{len(sections)}")
656
+ if section_index == 0:
657
+ tex = abstract_to_tex(sections[section_index])
658
+ print(tex)
659
+ elif section_index == len(sections) - 1:
660
+ postprocess(tex_path, title)
661
+ tex = references_to_tex(sections[section_index])
662
+ print(tex)
663
+ else:
664
+ tex = md_to_tex_section_without_jpg(sections[section_index])
665
+ print(tex)
666
+ insert_section(tex_path, tex)
667
+ section_index += 1
668
+ # tex_to_pdf(tex_path, output_dir=os.path.dirname(tex_path), compiler="pdflatex")
669
+
670
+ def tex_to_pdf(tex_path, output_dir=None, compiler="xelatex"):
671
+ """
672
+ 将 LaTeX 文件编译为 PDF 文件。
673
+
674
+ 参数:
675
+ tex_path (str): 输入的 LaTeX 文件路径。
676
+ output_dir (str): 输出的 PDF 文件目录。
677
+ compiler (str): 编译器,默认为 "xelatex"。
678
+ """
679
+ if output_dir is None:
680
+ output_dir = os.path.dirname(tex_path)
681
+ tex_name = os.path.basename(tex_path)
682
+ tex_name_no_ext = os.path.splitext(tex_name)[0]
683
+ pdf_path = os.path.join(output_dir, f"{tex_name_no_ext}.pdf")
684
+
685
+ subprocess.run([
686
+ compiler,
687
+ "-interaction=nonstopmode",
688
+ "-output-directory",
689
+ output_dir,
690
+ tex_path
691
+ ])
692
+
693
+ print(f"PDF 文件已生成: {pdf_path}")
694
+
695
+ def insert_figures(png_path, tex_path, json_path, ref_names, survey_title, new_tex_path):
696
+ """
697
+ 读取给定的 TeX 文件 (tex_path),先调用 insert_outline_figure 在其中插入概览图片;
698
+ 然后再调用 insert_tex_images 在文中发现的引用标记位置插入 figure 环境。
699
+ 最后把处理完的文本写入 new_tex_path。
700
+
701
+ 参数:
702
+ png_path: 大纲图片的路径(会传给 insert_outline_figure)。
703
+ tex_path: 原始 TeX 文件路径。
704
+ json_path: 图片对应的 JSON(会传给 insert_tex_images,内含 引用名称 -> 图片路径 的映射)。
705
+ ref_names: 引用名称列表 (index 从 0 开始)。
706
+ survey_title: 用于大纲图片 figure 中的说明文字。
707
+ new_tex_path: 处理后新的 TeX 文件输出路径。
708
+ """
709
+ # 1. 读取原始 tex 文件内容
710
+ with open(tex_path, 'r', encoding='utf-8') as f:
711
+ tex_content = f.read()
712
+
713
+ # 2. 在 '2 Introduction' 前插入一张占满整页的描述性图片(概览图)
714
+ updated_tex = insert_outline_figure(
715
+ png_path=png_path,
716
+ tex_content=tex_content,
717
+ survey_title=survey_title
718
+ )
719
+
720
+ # 3. 在文中其他引用 [n], \[n], \[n\] 等位置插入 figure
721
+ updated_tex = insert_tex_images(
722
+ json_path=json_path,
723
+ ref_names=ref_names,
724
+ text=updated_tex
725
+ )
726
+
727
+ # 4. 将处理结果写入新路径
728
+ with open(new_tex_path, 'w', encoding='utf-8') as f:
729
+ f.write(updated_tex)
730
+
731
+ print(f"已生成新的 TeX 文件: {new_tex_path}")
732
+ return new_tex_path
733
+
734
+ def postprocess(tex_path, new_title):
735
+ """
736
+ 读取给定的 TeX 文件 (tex_path):
737
+ 1) 在第一处 \author 行的上一行插入 \title{new_title}。
738
+ 2) 将所有形如 "\[1\]"、"\[1]"、以及 "\[12\]" 等引用标记,
739
+ 以及 "[1\]" 之类的混合形式,全都去掉反斜杠,统一替换为 [1]、[12]。
740
+ 3) 将所有由 \[ \] 包裹的数学公式都替换为 \begin{dmath} \end{dmath}。
741
+ 最后将结果覆盖写回原始文件,并返回 tex_path。
742
+ """
743
+ new_title = 'A Survey of ' + new_title
744
+ # 1) Read the file lines
745
+ with open(tex_path, 'r', encoding='utf-8') as f:
746
+ lines = f.readlines()
747
+
748
+ # 2) Find the line containing "\author" and insert \title{...} above it
749
+ inserted = False
750
+ for i, line in enumerate(lines):
751
+ # If this line starts with \author (or contains \author{...})
752
+ if line.strip().startswith(r'\author'):
753
+ # Insert a \title{new_title} line before it
754
+ lines.insert(i, f'\\title{{{new_title}}}\n')
755
+ inserted = True
756
+ break
757
+
758
+ if not inserted:
759
+ # If the file contains no \author{...} at all, insert at the end or the beginning, as needed
760
+ print(f"[Warning] No '\\author' line found; could not insert '\\title{{{new_title}}}'.")
761
+ # Fall back to appending at the end of the document (adjust placement as needed)
762
+ lines.append(f'\\title{{{new_title}}}\n')
763
+
764
+ # Join the lines into one string for regex substitution
765
+ text_joined = ''.join(lines)
766
+
767
+ # 3) Replace forms such as "\[1\]", "\[12]", "[12\]" with "[1]", "[12]", etc.
768
+ # Core regex: '(?:\\)?\[(\d+)(?:\\)?\]'
769
+ # (?:\\)? ---- an optional backslash
770
+ # \[ ---- the opening bracket '['
771
+ # (\d+) ---- capture one or more digits
772
+ # (?:\\)? ---- an optional backslash
773
+ # \] ---- the closing bracket ']'
774
+ ref_pattern = re.compile(r'(?:\\)?\[(\d+)(?:\\)?\]')
775
+ text_processed = ref_pattern.sub(r'[\1]', text_joined)
776
+
777
+ # 4) Rewrite display math wrapped in \[ \] as \begin{dmath} \end{dmath}
778
+ # Example regex: match anything between \[ and \] (non-greedy),
779
+ # with the DOTALL flag so '.' also matches newlines. Currently disabled:
780
+ # eq_pattern = re.compile(r'\\\[(.*?)\\\]', re.DOTALL)
781
+ # text_processed = eq_pattern.sub(r'\\begin{dmath}\1\\end{dmath}', text_processed)
782
+
783
+ # 5) Write back to the original file
784
+ with open(tex_path, 'w', encoding='utf-8') as f:
785
+ f.write(text_processed)
786
+
787
+ print(f"[完成] 已在 '{tex_path}' 中插入/追加 \\title{{{new_title}}},替换引用标记并将公式转为 dmath 格式。")
788
+ return tex_path
789
+
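A quick illustration of what the citation-marker normalization in postprocess() does:

    import re
    ref_pattern = re.compile(r'(?:\\)?\[(\d+)(?:\\)?\]')
    sample = r"as shown in \[1\] and [12\]"
    print(ref_pattern.sub(r'[\1]', sample))  # -> as shown in [1] and [12]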
790
+ def md_to_tex_to_pdf(md_path, tex_path, pdf_path, png_path, json_path, ref_names, survey_title):
791
+ """
792
+ Convert a Markdown file into LaTeX, insert the figures, then (optionally) compile to PDF.
793
+
794
+ Args:
795
+ md_path (str): Path of the input Markdown file.
796
+ tex_path (str): Path of the output LaTeX file.
797
+ pdf_path (str): Path of the output PDF file.
798
+ """
799
+ md_to_tex(md_path, tex_path, survey_title) # md_to_tex requires the title argument
800
+ new_tex_path = insert_figures(png_path, tex_path, json_path, ref_names, survey_title, tex_path)
801
+ # tex_to_pdf(new_tex_path, output_dir=os.path.dirname(tex_path), compiler="pdflatex")
802
+
803
+ if __name__ == "__main__":
804
+ # Load environment variables
805
+ dotenv.load_dotenv()
806
+ # md_path = preprocess_md("src/demo/latex_template/test copy.md", "src/demo/latex_template/test_preprocessed.md")
807
+ md_path = 'src/static/data/info/undefined/survey_undefined_preprocessed.md'
808
+ tex_path = "src/static/data/info/undefined/template.tex"
809
+ md_to_tex(md_path, tex_path, title="A Comprehensive Review of ADMM On Consensus Distributed Optimization")
810
+ # insert_figures('src/static/data/info/undefined/outline.png',
811
+ # 'src/demo/latex_template/template.tex',
812
+ # 'src/static/data/info/undefined/flowchart_results.json',
813
+ # ['A comprehensive review of recommender systems transitioning from theory to practice', 'A large language model enhanced conversational recommender system'],
814
+ # 'Survey Title',
815
+ # 'src/demo/latex_template/template_with_figures.tex')
816
+ tex_to_pdf(tex_path, output_dir=os.path.dirname(tex_path), compiler="xelatex")
src/demo/asg_loader.py ADDED
@@ -0,0 +1,256 @@
1
+ import os
2
+ import re
3
+ import json
4
+ import subprocess
5
+ from langchain_community.document_loaders import UnstructuredMarkdownLoader
6
+ from langchain_core.documents import Document
7
+ import shutil
+ import glob
+ from concurrent.futures import ProcessPoolExecutor # used by the batch/parallel helpers below
8
+
9
+ class DocumentLoading:
10
+ def convert_pdf_to_md(self, pdf_file, output_dir="output", method="auto"):
11
+ base_name = os.path.splitext(os.path.basename(pdf_file))[0]
12
+ target_dir = os.path.join(output_dir, base_name)
13
+ md_file_path = os.path.join(target_dir, method, f"{base_name}.md")
14
+ print("The md file path is: ", md_file_path)
15
+
16
+ if os.path.exists(md_file_path):
17
+ print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
18
+ return
19
+
20
+ command = ["magic-pdf", "-p", pdf_file, "-o", output_dir, "-m", method]
21
+ try:
22
+ subprocess.run(command, check=True)
23
+ # Check that the Markdown file was actually produced
24
+ if not os.path.exists(md_file_path):
25
+ print(f"Conversion failed: Markdown file not found at {md_file_path}. Cleaning up folder...")
26
+ shutil.rmtree(target_dir) # remove the generated folder
27
+ else:
28
+ print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
29
+ except subprocess.CalledProcessError as e:
30
+ print(f"An error occurred during conversion: {e}")
31
+ # If conversion failed and the folder was created, clean it up
32
+ if os.path.exists(target_dir):
33
+ print(f"Cleaning up incomplete folder: {target_dir}")
34
+ shutil.rmtree(target_dir)
35
+ # New: batch variant that converts every PDF in a directory
36
+ def convert_pdf_to_md_new(self, pdf_dir, output_dir="output", method="auto"):
37
+ pdf_files = glob.glob(os.path.join(pdf_dir, "*.pdf"))
38
+
39
+ for pdf_file in pdf_files:
40
+ base_name = os.path.splitext(os.path.basename(pdf_file))[0]
41
+ target_dir = os.path.join(output_dir, base_name)
42
+
43
+ if os.path.exists(target_dir):
44
+ print(f"Folder for {pdf_file} already exists in {output_dir}. Skipping conversion.")
45
+ else:
46
+ command = ["magic-pdf", "-p", pdf_file, "-o", output_dir, "-m", method]
47
+ try:
48
+ subprocess.run(command, check=True)
49
+ print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
50
+ except subprocess.CalledProcessError as e:
51
+ print(f"An error occurred: {e}")
52
+
53
+ def batch_convert_pdfs(self, pdf_files, output_dir="output", method="auto", max_workers=None):
54
+ # Create a process pool to run the conversion in parallel
55
+ with ProcessPoolExecutor(max_workers=max_workers) as executor:
56
+ # Submit each PDF file to the process pool for conversion
57
+ futures = [executor.submit(self.convert_pdf_to_md, pdf, output_dir, method) for pdf in pdf_files]
58
+
59
+ # Optionally, you can monitor the status of each future as they complete
60
+ for future in futures:
61
+ try:
62
+ future.result() # This will raise any exceptions that occurred during the processing
63
+ except Exception as exc:
64
+ print(f"An error occurred during processing: {exc}")
65
+
66
+ def extract_information_from_md(self, md_text):
67
+ title_match = re.search(r'^(.*?)(\n\n|\Z)', md_text, re.DOTALL)
68
+ title = title_match.group(1).strip() if title_match else "N/A"
69
+
70
+ authors_match = re.search(
71
+ r'\n\n(.*?)(\n\n[aA][\s]*[bB][\s]*[sS][\s]*[tT][\s]*[rR][\s]*[aA][\s]*[cC][\s]*[tT][^\n]*\n\n)',
72
+ md_text,
73
+ re.DOTALL
74
+ )
75
+ authors = authors_match.group(1).strip() if authors_match else "N/A"
76
+
77
+ abstract_match = re.search(
78
+ r'(\n\n[aA][\s]*[bB][\s]*[sS][\s]*[tT][\s]*[rR][\s]*[aA][\s]*[cC][\s]*[tT][^\n]*\n\n)(.*?)(\n\n|\Z)',
79
+ md_text,
80
+ re.DOTALL
81
+ )
82
+ abstract = abstract_match.group(0).strip() if abstract_match else "N/A"
83
+ abstract = re.sub(r'^[aA]\s*[bB]\s*[sS]\s*[tT]\s*[rR]\s*[aA]\s*[cC]\s*[tT][^\w]*', '', abstract)
84
+ abstract = re.sub(r'^[^a-zA-Z]*', '', abstract)
85
+
86
+ introduction_match = re.search(
87
+ r'\n\n([1I][\.\- ]?\s*)?[Ii]\s*[nN]\s*[tT]\s*[rR]\s*[oO]\s*[dD]\s*[uU]\s*[cC]\s*[tT]\s*[iI]\s*[oO]\s*[nN][\.\- ]?\s*\n\n(.*?)'
88
+ r'(?=\n\n(?:([2I][I]|\s*2)[^\n]*?\n\n|\n\n(?:[2I][I][^\n]*?\n\n)))',
89
+ md_text,
90
+ re.DOTALL
91
+ )
92
+ introduction = introduction_match.group(2).strip() if introduction_match else "N/A"
93
+
94
+ main_content_match = re.search(
95
+ r'(.*?)(\n\n([3I][\.\- ]?\s*)?[Rr][Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss][^\n]*\n\n|\Z)',
96
+ md_text,
97
+ re.DOTALL
98
+ )
99
+
100
+ if main_content_match:
101
+ main_content = main_content_match.group(1).strip()
102
+ else:
103
+ main_content = "N/A"
104
+
105
+ extracted_data = {
106
+ "title": title,
107
+ "authors": authors,
108
+ "abstract": abstract,
109
+ "introduction": introduction,
110
+ "main_content": main_content
111
+ }
112
+ return extracted_data
113
+
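For orientation, the dict returned above has this shape (values are illustrative; the regexes are heuristic and fall back to "N/A" when a block is not found):

    # {
    #     "title":        "<first text block of the markdown>",
    #     "authors":      "<block between the title and the Abstract heading>",
    #     "abstract":     "<first paragraph after the Abstract heading>",
    #     "introduction": "<text between the Introduction heading and section 2>",
    #     "main_content": "<everything before the References heading>"
    # }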
114
+ def process_md_file(self, md_file_path, survey_id):
115
+ loader = UnstructuredMarkdownLoader(md_file_path)
116
+ data = loader.load()
117
+ assert len(data) == 1, "Expected exactly one document in the markdown file."
118
+ assert isinstance(data[0], Document), "The loaded data is not of type Document."
119
+ extracted_text = data[0].page_content
120
+
121
+ extracted_data = self.extract_information_from_md(extracted_text)
122
+ if len(extracted_data["abstract"]) < 10:
123
+ extracted_data["abstract"] = extracted_data['title']
124
+
125
+ title = os.path.splitext(os.path.basename(md_file_path))[0]
126
+ title_new = title.strip()
127
+ invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
128
+ for char in invalid_chars:
129
+ title_new = title_new.replace(char, ' ')
130
+
131
+ os.makedirs(f'./src/static/data/txt/{survey_id}', exist_ok=True)
132
+ with open(f'./src/static/data/txt/{survey_id}/{title_new}.json', 'w', encoding='utf-8') as f:
133
+ json.dump(extracted_data, f, ensure_ascii=False, indent=4)
134
+ return extracted_data['introduction']
135
+
136
+ def process_md_file_full(self, md_file_path, survey_id):
137
+ loader = UnstructuredMarkdownLoader(md_file_path)
138
+ data = loader.load()
139
+ assert len(data) == 1, "Expected exactly one document in the markdown file."
140
+ assert isinstance(data[0], Document), "The loaded data is not of type Document."
141
+ extracted_text = data[0].page_content
142
+
143
+ extracted_data = self.extract_information_from_md(extracted_text)
144
+ if len(extracted_data["abstract"]) < 10:
145
+ extracted_data["abstract"] = extracted_data['title']
146
+
147
+ title = os.path.splitext(os.path.basename(md_file_path))[0]
148
+ title_new = title.strip()
149
+ invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
150
+ for char in invalid_chars:
151
+ title_new = title_new.replace(char, ' ')
152
+
153
+ os.makedirs(f'./src/static/data/txt/{survey_id}', exist_ok=True)
154
+ with open(f'./src/static/data/txt/{survey_id}/{title_new}.json', 'w', encoding='utf-8') as f:
155
+ json.dump(extracted_data, f, ensure_ascii=False, indent=4)
156
+ return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']
157
+
158
+
159
+ def load_pdf(self, pdf_file, survey_id, mode):
160
+ os.makedirs(f'./src/static/data/md/{survey_id}', exist_ok=True)
161
+ output_dir = f"./src/static/data/md/{survey_id}"
162
+ base_name = os.path.splitext(os.path.basename(pdf_file))[0]
163
+ target_dir = os.path.join(output_dir, base_name, "auto")
164
+
165
+ # 1. Convert PDF to markdown if the folder doesn't exist
166
+ self.convert_pdf_to_md(pdf_file, output_dir)
167
+
168
+ # 2. Process the markdown file in the output directory
169
+ md_file_path = os.path.join(target_dir, f"{base_name}.md")
170
+ if not os.path.exists(md_file_path):
171
+ raise FileNotFoundError(f"Markdown file {md_file_path} does not exist. Conversion might have failed.")
172
+
173
+ if mode == "intro":
174
+ return self.process_md_file(md_file_path, survey_id)
175
+ elif mode == "full":
176
+ return self.process_md_file_full(md_file_path, survey_id)
177
+
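A minimal usage sketch (hypothetical paths; requires the external magic-pdf CLI on PATH):

    loader = DocumentLoading()
    intro = loader.load_pdf("downloads/example_paper.pdf", survey_id="demo", mode="intro")
    print(intro[:200])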
178
+ # wrong, still being tested
179
+ def load_pdf_new(self, pdf_dir, survey_id):
180
+ os.makedirs(f'./src/static/data/md/{survey_id}', exist_ok=True)
181
+ output_dir = f"./src/static/data/md/{survey_id}"
182
+ self.convert_pdf_to_md_new(pdf_dir, output_dir)
183
+ markdown_files = glob.glob(os.path.join(output_dir, "*", "auto", "*.md"))
184
+ all_introductions = []
185
+
186
+ for md_file_path in markdown_files:
187
+ try:
188
+ introduction = self.process_md_file(md_file_path, survey_id)
189
+ all_introductions.append(introduction)
190
+ except FileNotFoundError as e:
191
+ print(f"Markdown file {md_file_path} does not exist. Conversion might have failed.")
192
+
193
+ return all_introductions
194
+
195
+
196
+
197
+ def parallel_load_pdfs(self, pdf_files, survey_id, max_workers=4):
198
+ with ProcessPoolExecutor(max_workers=max_workers) as executor:
199
+ # Submit tasks for parallel execution
200
+ futures = [executor.submit(self.load_pdf, pdf, survey_id, "intro") for pdf in pdf_files] # load_pdf requires a mode; "intro" assumed here
201
+
202
+ # Collect results
203
+ for future in futures:
204
+ try:
205
+ result = future.result()
206
+ print(f"Processed result: {result}")
207
+ except Exception as e:
208
+ print(f"Error processing PDF: {e}")
209
+
210
+ def ensure_non_empty_introduction(self, introduction, full_text):
211
+ """
212
+ Ensure introduction is not empty. If empty, replace with full text.
213
+ """
214
+ if introduction == "N/A" or len(introduction.strip()) < 50:
215
+ return full_text.strip()
216
+ return introduction
217
+
218
+ def extract_information_from_md_new(self, md_text):
219
+ # Title extraction
220
+ title_match = re.search(r'^(.*?)(\n\n|\Z)', md_text, re.DOTALL)
221
+ title = title_match.group(1).strip() if title_match else "N/A"
222
+
223
+ # Authors extraction
224
+ authors_match = re.search(
225
+ r'\n\n(.*?)(\n\n[aA][\s]*[bB][\s]*[sS][\s]*[tT][\s]*[rR][\s]*[aA][\s]*[cC][\s]*[tT][^\n]*\n\n)',
226
+ md_text,
227
+ re.DOTALL
228
+ )
229
+ authors = authors_match.group(1).strip() if authors_match else "N/A"
230
+
231
+ # Abstract extraction
232
+ abstract_match = re.search(
233
+ r'(\n\n[aA][\s]*[bB][\s]*[sS][\s]*[tT][\s]*[rR][\s]*[aA][\s]*[cC][\s]*[tT][^\n]*\n\n)(.*?)(\n\n|\Z)',
234
+ md_text,
235
+ re.DOTALL
236
+ )
237
+ abstract = abstract_match.group(0).strip() if abstract_match else "N/A"
238
+ abstract = re.sub(r'^[aA]\s*[bB]\s*[sS]\s*[tT]\s*[rR]\s*[aA]\s*[cC]\s*[tT][^\w]*', '', abstract)
239
+ abstract = re.sub(r'^[^a-zA-Z]*', '', abstract)
240
+
241
+ # Introduction extraction
242
+ introduction_match = re.search(
243
+ r'\n\n([1I][\.\- ]?\s*)?[Ii]\s*[nN]\s*[tT]\s*[rR]\s*[oO]\s*[dD]\s*[uU]\s*[cC]\s*[tT]\s*[iI]\s*[oO]\s*[nN][\.\- ]?\s*\n\n(.*?)',
244
+ md_text, re.DOTALL
245
+ )
246
+ introduction = introduction_match.group(2).strip() if introduction_match else "N/A"
247
+
248
+ # Ensure introduction is not empty
249
+ introduction = self.ensure_non_empty_introduction(introduction, md_text)
250
+
251
+ return {
252
+ "title": title,
253
+ "authors": authors,
254
+ "abstract": abstract,
255
+ "introduction": introduction
256
+ }
src/demo/asg_mindmap.py ADDED
@@ -0,0 +1,302 @@
1
+ import json
2
+ import re
3
+ import textwrap
4
+ from graphviz import Digraph
5
+ import os
6
+
7
+ def wrap_text(text, max_chars):
8
+ """
9
+ Wrap text so that each line is at most max_chars characters.
10
+ """
11
+ return textwrap.fill(text, width=max_chars)
12
+
13
+ def parse_md_refs(md_content):
14
+ """
15
+ Parse Markdown content and extract the citations under each x.y.z-style heading.
16
+
17
+ For every section matching that format, all citations of the form [number] are
18
+ collected, de-duplicated, and sorted ascending, yielding a string like "[1,2,3]".
19
+
20
+ If an undesired header (e.g. "6 Future Directions" or "7 Conclusion") is found,
21
+ parsing stops, so the last section only covers content before that header.
22
+
23
+ Returns a dict mapping section numbers (e.g. "3.1.1") to citation strings (e.g. "[1,2,3]").
24
+ """
25
+ ref_dict = {}
26
+
27
+ # Split the Markdown content into lines
28
+ lines = md_content.split("\n") if md_content else []
29
+
30
+ # Match leaf headings starting with x.y.z (e.g. "5.1.1 Neural Topic...")
31
+ section_header_regex = re.compile(r'^\s*#+\s*(\d+\.\d+\.\d+).*')
32
+ # Match undesired headers such as "6 Future Directions" or "7 Conclusion"
33
+ undesired_header_regex = re.compile(r'^\s*#+\s*(6 Future Directions|7 Conclusion)\b')
34
+ # Match citations of the form [number]
35
+ ref_pattern = re.compile(r'\[(\d+)\]')
36
+
37
+ current_section = None
38
+ current_content = []
39
+
40
+ for line in lines:
41
+ # On an undesired header, stop scanning; the current section is finalized after the loop
42
+ if undesired_header_regex.match(line):
43
+ break
44
+
45
+ header_match = section_header_regex.match(line)
46
+ if header_match:
47
+ # Finalize the previous section
48
+ if current_section is not None:
49
+ all_refs = [int(num) for content_line in current_content for num in ref_pattern.findall(content_line)]
50
+ if all_refs:
51
+ ref_dict[current_section] = "[" + ",".join(map(str, sorted(set(all_refs)))) + "]"
52
+
53
+ # Start a new section
54
+ current_section = header_match.group(1)
55
+ current_content = []
56
+ else:
57
+ if current_section is not None:
58
+ current_content.append(line)
59
+
60
+ # Finalize the last section
61
+ if current_section is not None and current_content:
62
+ all_refs = [int(num) for content_line in current_content for num in ref_pattern.findall(content_line)]
63
+ if all_refs:
64
+ ref_dict[current_section] = "[" + ",".join(map(str, sorted(set(all_refs)))) + "]"
65
+
66
+ return ref_dict
67
+
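A small self-contained check of the parser above (the Markdown is made up):

    md = "### 3.1.1 Topic Models\nSee [2] and [1]; also [2].\n## 6 Future Directions\nIgnored [5]."
    print(parse_md_refs(md))  # -> {'3.1.1': '[1,2]'}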
68
+ def generate_graphviz_png(json_path, output_png_path, md_content=None, title="Document Outline", max_root_chars=20):
69
+ """
70
+ Read the outline from a JSON file, build the tree structure, and render the mindmap as a PNG.
71
+
72
+ If md_content is provided, citations extracted from x.y.z-style headings in the
73
+ Markdown are appended to the labels of leaf nodes (nodes without children whose
74
+ titles start with x.y.z): a newline is added after the original text, followed
75
+ by the citation string (e.g. "[1,2,3]"), with the numbers sorted.
76
+
77
+ Only the root node text is word-wrapped to limit the root's maximum width;
78
+ all other nodes keep their original text.
79
+
80
+ Args:
81
+ json_path: Path of the JSON file containing the outline.
82
+ output_png_path: Output PNG path (without the extension).
83
+ md_content: Markdown content string, optional.
84
+ title: Label for the mindmap root node, "Document Outline" by default.
85
+ max_root_chars: Maximum characters per line for the root node, 20 by default.
86
+ """
87
+ # Extract citations from the Markdown content
88
+ ref_dict = parse_md_refs(md_content) if md_content else {}
89
+
90
+ # Load the JSON outline
91
+ with open(json_path, "r", encoding="utf-8") as f:
92
+ data = json.load(f)
93
+
94
+ outline_str = data.get("outline", "")
95
+
96
+ # Parse items of the form [level, 'title']
97
+ pattern = re.compile(r"\[(\d+),\s*'([^']+)'\]")
98
+ items = pattern.findall(outline_str)
99
+ items = [(int(level), title) for level, title in items]
100
+
101
+ # Heading keywords to drop
102
+ undesired_keywords = {"Abstract", "Introduction", "Future Directions", "Conclusion"}
103
+ # Filter out unwanted entries
104
+ filtered_items = [
105
+ (lvl, title) for lvl, title in items
106
+ if not re.match(r"^\d+\s+(.+)", title) or re.match(r"^\d+\s+(.+)", title).group(1) not in undesired_keywords
107
+ ]
108
+
109
+ # Build the tree structure
110
+ tree = []
111
+ stack = []
112
+ for lvl, title_item in filtered_items:
113
+ node = {"title": title_item, "children": []}
114
+ while stack and lvl <= stack[-1][0]:
115
+ stack.pop()
116
+ if stack:
117
+ stack[-1][1]["children"].append(node)
118
+ else:
119
+ tree.append(node)
120
+ stack.append((lvl, node))
121
+
122
+ # Render the mindmap
123
+ dot = Digraph(comment=title, format='png', engine='dot')
124
+ dot.graph_attr.update(rankdir='LR', splines='ortho', bgcolor='white', dpi="150")
125
+ dot.attr('node', shape='box', style='rounded,filled', fillcolor='white', color='gray')
126
+ dot.edge_attr.update(arrowhead='none', color="black")
127
+
128
+ # Render the root node
129
+ wrapped_title = wrap_text(title, max_root_chars)
130
+ dot.node('root', label=wrapped_title, shape='ellipse', style='filled', fillcolor='lightgray')
131
+
132
+ node_counter = [0]
133
+ section_pattern = re.compile(r'^(\d+\.\d+\.\d+)')
134
+
135
+ def add_nodes(node, parent_id):
136
+ current_id = f'node_{node_counter[0]}'
137
+ node_counter[0] += 1
138
+ safe_label = node['title'].replace('"', r'\"')
139
+
140
+ # For leaf nodes whose titles start with x.y.z, append citation info if present
141
+ if not node["children"]:
142
+ m = section_pattern.match(safe_label)
143
+ if m:
144
+ section_id = m.group(1)
145
+ if section_id in ref_dict:
146
+ safe_label += "\n" + ref_dict[section_id]
147
+
148
+ dot.node(current_id, label=safe_label)
149
+ dot.edge(parent_id, current_id)
150
+ for child in node.get("children", []):
151
+ add_nodes(child, current_id)
152
+
153
+ for top_node in tree:
154
+ add_nodes(top_node, "root")
155
+
156
+ dot.render(output_png_path, cleanup=True)
157
+ print("生成 PNG 文件:", output_png_path + ".png")
158
+ return output_png_path + ".png"
159
+
160
+
161
+
162
+ def insert_outline_image(png_path, md_content, survey_title):
163
+ """
164
+ Find the "2 Introduction" line in the given Markdown content string and insert
165
+ an HTML block with the outline image just before it, with enough blank lines
166
+ so the HTML block renders separately from the following Markdown content.
167
+
168
+ Args:
169
+ png_path: Path of the PNG image to insert, used as the img src attribute.
170
+ md_content: Markdown content string.
171
+ survey_title: Survey title used in the figure caption.
172
+
173
+ The inserted HTML looks like:
174
+
175
+ <div style="text-align:center">
176
+ <img src="{png_path}" alt="Outline" style="width:100%;"/>
177
+ </div>
178
+ <div style="text-align:center">
179
+ Fig 1. The outline of the {survey_title}
180
+ </div>
181
+
182
+ Returns the updated Markdown content string.
183
+ """
184
+
185
+ # Split the Markdown content into lines (keeping line endings)
186
+ lines = md_content.splitlines(keepends=True)
187
+ # print(lines) # debug output, very noisy on long documents
188
+
189
+ # 查找包含 "2 Introduction" 的行的索引
190
+ intro_index = None
191
+ for i, line in enumerate(lines):
192
+ if '2 Introduction' in line:
193
+ intro_index = i
194
+ break
195
+
196
+ if intro_index is None:
197
+ print("没有找到 '2 Introduction' 这一行!")
198
+ return md_content
199
+
200
+ # Normalize backslashes in the path to forward slashes
201
+ png_path_fixed = png_path.replace("\\", "/")
202
+
203
+ # Build the HTML block to insert, padded with blank lines
204
+ html_snippet = (
205
+ "\n\n" # 添加换行确保与上文/下文分隔
206
+ f'<div style="text-align:center">\n'
207
+ f' <img src="{png_path_fixed}" alt="Outline" style="width:100%;"/>\n'
208
+ f'</div>\n'
209
+ f'<div style="text-align:center">\n'
210
+ f' Fig 1. The outline of the {survey_title}\n'
211
+ f'</div>\n'
212
+ "\n" # 再添加一个空行确保与下方内容分隔
213
+ )
214
+
215
+ print(f"将在第 {intro_index} 行插入如下 HTML 代码块(插入在 '2 Introduction' 之前):\n{html_snippet}")
216
+
217
+ # 在找到的 "2 Introduction" 这一行之前插入 html_snippet
218
+ lines.insert(intro_index, html_snippet)
219
+
220
+ # Join the lines into the updated Markdown content
221
+ updated_md = "".join(lines)
222
+
223
+ print("已在 Markdown 内容中插入 outline 图片。")
224
+ return updated_md
225
+
226
+ def insert_outline_figure(png_path, tex_content, survey_title):
227
+ """
228
+ Find the "Introduction" line in the given TeX content string and insert a
229
+ two-column-spanning figure* environment with a full-width image before it.
230
+ It generates a LaTeX snippet like:
231
+
232
+ \begin{figure*}[htbp]
233
+ \centering
234
+ \includegraphics[width=\textwidth]{path/to/xxx.png}
235
+ \caption{Fig 1. The outline of the XXX}
236
+ \end{figure*}
237
+
238
+ Args:
239
+ png_path: Path of the PNG image to insert.
240
+ tex_content: TeX content string.
241
+ survey_title: Survey title used in the figure caption.
242
+
243
+ Returns:
244
+ The updated TeX string.
245
+ """
246
+
247
+ # Split the TeX content into lines (keeping line endings)
248
+ lines = tex_content.splitlines(keepends=True)
249
+
250
+ # 查找包含 "2 Introduction" 的行索引
251
+ intro_index = None
252
+ for i, line in enumerate(lines):
253
+ if 'Introduction' in line:
254
+ intro_index = i
255
+ break
256
+
257
+ # If not found, return the original text unchanged
258
+ if intro_index is None:
259
+ print("没有找到 'Introduction' 这一行,未执行插入。")
260
+ return tex_content
261
+
262
+ # Build the TeX figure* block.
264
+ # For a full page use [p] or [htbp]; adjust the placement option to the layout as needed.
265
+ # Replace with a plain \begin{figure} ... \end{figure} if spanning both columns is not wanted.
265
+ figure_block = (
266
+ "\n" # 加一个空行,确保与上文分隔
267
+ "\\begin{figure*}[htbp]\n"
268
+ " \\centering\n"
269
+ f" \\includegraphics[width=\\textwidth]{{{png_path}}}\n"
270
+ f" \\caption{{The outline of our survey: {survey_title}}}\n"
271
+ "\\end{figure*}\n\n" # 再留一个空行分隔
272
+ )
273
+
274
+ # 在找到的 "2 Introduction" 所在行之前插入 figure 环境
275
+ lines.insert(intro_index, figure_block)
276
+
277
+ # Re-join all lines
278
+ updated_tex = "".join(lines)
279
+ return updated_tex
280
+ # Usage example:
281
+ # if __name__ == "__main__":
282
+ # png_path = 'src/static/data/info/test_4/outline.png'
283
+ # md_content = ''
284
+ # survey_title = "My Survey Title"
285
+ # updated_md = insert_outline_image(png_path, md_content, survey_title)
286
+ # --------------------------
287
+ # Usage example
288
+ # --------------------------
289
+ if __name__ == "__main__":
290
+ json_path = os.path.join("src", "static", "data", "txt", 'test_2', "outline.json")
291
+ output_png_path = os.path.join("src", "static", "data", "info", 'test_2', "outline")
292
+ md_path = os.path.join("src", "static", "data", "info", 'test_2', f"survey_{'test_2'}_processed.md")
293
+ flowchart_results_path = os.path.join("src", "static", "data", "info", 'test_2', "flowchart_results.json")
294
+ png_path = generate_graphviz_png(
295
+ json_path=json_path,
296
+ output_png_path=output_png_path,
297
+ md_content=open(md_path, encoding="utf-8").read(), # the function expects md_content, not md_path
298
+ title='test',
299
+ max_root_chars=30
300
+ )
301
+
302
+ # generate_graphviz_png(json_file_path, output_png_file, md_file_path, title=mindmap_title, max_root_chars=20)
src/demo/asg_outline.py ADDED
@@ -0,0 +1,1029 @@
1
+ import transformers
2
+ import torch
3
+ import os
4
+ import json
5
+ import re
6
+ import ast
7
+ from .survey_generator_api import *
8
+ from .asg_abstract import AbstractGenerator
9
+ from .asg_conclusion import ConclusionGenerator
10
+ from .asg_retriever import *
11
+ import pandas as pd
12
+ from .references import generate_references
13
+
14
+
15
+ class OutlineGenerator():
16
+ def __init__(self, df, cluster_names, mode='desp', pipeline=None):
+ # pipeline is optional: pass a local LLM pipeline to use the "outline"
+ # adapter; otherwise the Qwen API methods below are used instead.
+ self.pipeline = pipeline
+ if self.pipeline is not None:
+ # self.pipeline.model.load_adapter("technicolor/llama3.1_8b_outline_generation")
+ self.pipeline.model.set_adapter("outline")
+ self.df = df
+ self.cluster = [{'label': i, 'name': cluster_names[i]} for i in range(len(cluster_names))]
+ self._add_cluster_info()
+ self.mode = mode
30
+
31
+ def _add_cluster_info(self):
32
+ label_to_info = {label: self.df[self.df['label'] == label] for label in range(len(self.cluster))}
33
+ for cluster in self.cluster:
34
+ cluster['info'] = label_to_info[cluster['label']]
35
+
36
+ def get_cluster_info(self):
37
+ return self.cluster
38
+
39
+ def generate_claims(self):
40
+ result = []
41
+ if self.mode == 'desp':
42
+ for i in range(len(self.cluster)):
43
+ cluster = self.cluster[i]
44
+ claims = ''
45
+ for j in range(len(cluster['info'])):
46
+ claims = cluster['info'].iloc[j]['retrieval_result'] + '\n' + claims
47
+ # claims = cluster['info'].iloc[j]['ref_title'] + '\n' + claims
48
+ result.append(claims)
49
+ else:
50
+ for i in range(len(self.cluster)):
51
+ cluster = self.cluster[i]
52
+ claims = ''
53
+ data = cluster['info']
54
+ for j in range(len(data)):
55
+ entry = data.iloc[j]
56
+ title = entry['title']
57
+ abstract = entry['abstract']
58
+ prompt = f'''
59
+ Title:
60
+ {title}
61
+ Abstract:
62
+ {abstract}
63
+ Task:
64
+ Conclude new findings and null findings from the abstract in one sentence in the atomic format. Do not separate
65
+ new findings and null findings. The finding must be relevant to the title. Do not include any other information.
66
+ Definition:
67
+ A scientific claim is an atomic verifiable statement expressing a finding about one aspect of a scientific entity or
68
+ process, which can be verified from a single source.'''
69
+
70
+ messages = [
71
+ {"role": "system", "content": "You are a helpful assistant."},
72
+ {"role": "user", "content": prompt},
73
+ ]
74
+
75
+ outputs = self.pipeline(
76
+ messages,
77
+ max_new_tokens=256,
78
+ )
79
+ claim = outputs[0]["generated_text"][-1]['content']
80
+ # print(claim)
81
+ # print('+++++++++++++++++++++++++++++++++')
82
+ claims = claims + '\n' + claim
83
+ result.append(claims)
84
+ return result
85
+
86
+
87
+ def generate_claims_qwen(self):
88
+ """
89
+ Generate claims for each cluster using Qwen API.
90
+
91
+ Returns:
92
+ list: A list of strings, where each string contains the claims generated
93
+ for a cluster.
94
+ """
95
+ result = []
96
+ openai_api_key = os.getenv("OPENAI_API_KEY")
97
+ openai_api_base = os.getenv("OPENAI_API_BASE")
98
+ client = OpenAI(
99
+ api_key=openai_api_key,
100
+ base_url=openai_api_base,
101
+ )
102
+
103
+ for i in range(len(self.cluster)):
104
+ cluster = self.cluster[i]
105
+ claims = ''
106
+ data = cluster['info']
107
+
108
+ for j in range(len(data)):
109
+ entry = data.iloc[j]
110
+ title = entry['title']
111
+ abstract = entry['abstract']
112
+
113
+ # Construct the prompt for Qwen
114
+ prompt = f'''
115
+ Title:
116
+ {title}
117
+ Abstract:
118
+ {abstract}
119
+ Task:
120
+ Conclude new findings and null findings from the abstract in one sentence in the atomic format. Do not separate
121
+ new findings and null findings. The finding must be relevant to the title. Do not include any other information.
122
+ Definition:
123
+ A scientific claim is an atomic verifiable statement expressing a finding about one aspect of a scientific entity or
124
+ process, which can be verified from a single source.
125
+ '''
126
+
127
+ # Define the input for Qwen
128
+ messages = [
129
+ {"role": "system", "content": "You are a helpful assistant."},
130
+ {"role": "user", "content": prompt},
131
+ ]
132
+
133
+ try:
134
+ # Call Qwen API
135
+ chat_response = client.chat.completions.create(
136
+ model=os.environ.get("MODEL"),
137
+ max_tokens=512,
138
+ temperature=0.5,
139
+ messages=messages
140
+ )
141
+
142
+ # Extract the generated claim from the response (the call above is not
143
+ # streaming, so the content is read directly rather than chunk by chunk)
144
+ claim = chat_response.choices[0].message.content or ""
147
+
148
+ # Clean and append the claim
149
+ claims = claims + '\n' + claim.strip()
150
+ # print("Generated claim:", claim)
151
+ # print("+++++++++++++++++++++++++++++++++")
152
+
153
+ except Exception as e:
154
+ print(f"Error generating claim for entry {j} in cluster {i}: {e}")
155
+ continue
156
+
157
+ result.append(claims)
158
+
159
+ return result
160
+
161
+ def generate_outline(self, survey_title):
162
+ claims = self.generate_claims()
163
+ cluster_with_claims = ""
164
+ for i in range(len(self.cluster)):
165
+ cluster = self.cluster[i]
166
+ cluster_with_claims = cluster_with_claims + f'Cluster {i}: {cluster["name"]}\n' + "Descriptions for entities in this cluster: \n" + claims[i] + '\n\n'
167
+ # system_prompt = f'''
168
+ # You are a helpful assistant who is helping a researcher to generate an outline for a survey paper.
169
+ # The references used by this survey paper have been clustered into different categories.
170
+ # The researcher will provides you with the title of the survey paper
171
+ # together with the cluster names and the descriptions for entities in each cluster.
172
+ # '''
173
+ system_prompt = f'''Generate the outline of the survey paper following the format of the example : [[1, '1 Introduction'], [1, '2 Perturbations of (co)differentials'], [2, '2.1 Derivations of the tensor algebra'], [more sections...]].\
174
+ The first element in the sub-list refers to the hierarchy of the section name (from 1 to 3). Sections like Introduction and Conclusion should have the highest level (1)\
175
+ The second element in the sub-list refers to the section name.
176
+ '''
177
+
178
+ example_json = {"title":"A Survey of Huebschmann and Stasheff's Paper: Formal Solution of the Master Equation via HPT and Deformation Theory","outline":[{"title":"1 Introduction","outline":[]},{"title":"2 Perturbations of (co)differentials","outline":[{"title":"2.1 Derivations of the tensor algebra","outline":[]},{"title":"2.2 Coderivations of the tensor coalgebra","outline":[]},{"title":"2.3 Coderivations of the symmetric coalgebra","outline":[]},{"title":"2.4 DGLA\u2019s and perturbations of the codifferential","outline":[]},{"title":"2.5 Strongly homotopy Lie algebras","outline":[]},{"title":"2.6 The Hochschild chain complex and DGA\u2019s","outline":[]},{"title":"2.7 Strongly homotopy associative algebras","outline":[]}]},{"title":"3 Master equation","outline":[]},{"title":"4 Twisting cochain","outline":[{"title":"4.1 Differential on Hom","outline":[]},{"title":"4.2 Cup product and cup bracket","outline":[]},{"title":"4.3 Twisting cochain","outline":[]}]},{"title":"5 Homological perturbation theory (HPT)","outline":[{"title":"5.1 Contraction","outline":[]},{"title":"5.2 The first main theorem.","outline":[]}]},{"title":"6 Corollaries and the second main theorem","outline":[{"title":"6.1 Other corollaries of Theorem\u00a01.","outline":[]},{"title":"6.2 The second main theorem","outline":[]}]},{"title":"7 Differential Gerstenhaber and BV algebras","outline":[{"title":"7.1 Differential Gerstenhaber algebras","outline":[]},{"title":"7.2 Differential BV algebras","outline":[]},{"title":"7.3 Formality","outline":[{"title":"7.3.1 Formality of differential graded P\ud835\udc43Pitalic_P-algebras","outline":[]},{"title":"7.3.2 Examples","outline":[]}]},{"title":"7.4 Differential BV algebras and formality","outline":[]}]},{"title":"8 Deformation theory","outline":[]},{"title":"References","outline":[]}]}
179
+ # user_prompt = {"survey_title":survey_title, "claims":cluster_with_claims}
180
+ user_prompt = f'''Generate the outline of the survey paper given the title:{survey_title}, and three lists of sentences describing each cluster of the references used by this survey:{cluster_with_claims}'''
181
+
182
+ messages = [
183
+ {"role": "system", "content": system_prompt},
184
+ {"role": "user", "content": user_prompt},
185
+ {"role": "assistant", "content":"[[1, '1 Abstract'], [1, '2 Introduction'], "}
186
+ ]
187
+
188
+ outputs = self.pipeline(
189
+ messages,
190
+ max_new_tokens=9192,
191
+ )
192
+ result = outputs[0]["generated_text"][-1]['content']
193
+
194
+ self.pipeline.model.disable_adapters()
195
+
196
+ return messages, result
197
+
198
+ def generate_outline_qwen(self, survey_title, cluster_num = 3):
199
+ claims = self.generate_claims()
200
+ cluster_with_claims = ""
201
+ cluster_names = []
202
+ for i in range(cluster_num): # only iterate over the first cluster_num clusters
203
+ cluster = self.cluster[i]
204
+ cluster_with_claims += f'Cluster {i}: {cluster["name"]}\nDescriptions for reference papers in this cluster:\n{claims[i]}\n\n'
205
+ cluster_names.append(cluster["name"])
206
+ # system_prompt = f'''
207
+ # You are a helpful assistant who is helping a researcher to generate an outline for a survey paper.
208
+ # The references used by this survey paper have been clustered into different categories.
209
+ # The researcher will provides you with the title of the survey paper
210
+ # together with the cluster names and the descriptions for entities in each cluster.
211
+ # '''
212
+ system_prompt = f'''Finish the outline of the survey paper following the format of the example : [[1, '1 Introduction'], [1, '2 Perturbations of (co)differentials'], [2, '2.1 Derivations of the tensor algebra'], [3, '2.2.1 ...']......].\
213
+ The first element in the sub-list refers to the hierarchy of the section name (from 1 to 3). Sections like Introduction and Conclusion should have the highest level (1)\
214
+ The second element in the sub-list refers to the section name.
215
+ You are required to finish the second and third level subsections name under [1, '3 <Cluster 0's name>'], [1, '4 <Cluster 1's name>'] and [1, '5 <Cluster 2's name>']
216
+ You must not generate more than *3* third-level subsections under each second-level subsection; for example, [3, '3.1.4 xxx'], [3, '3.1.5 xxx'] are not allowed.
217
+ *Try to conclude the main findings of each cluster in the second and third level subsections, use highly abstract terms and phrases to describe*
218
+ *Do not include colons, e.g. AutoSurvey: Large Language Models Can Automatically Write Surveys should be written in Large Language Models in Writing Surveys*
219
+ '''
220
+ # user_prompt = {"survey_title":survey_title, "claims":cluster_with_claims}
221
+ cluster_sections = "\n".join([f"[1, '{i+3} {cluster_names[i]}'], [level 2 and 3 sections to finish...]" for i in range(cluster_num)])
222
+
223
+ user_prompt = f'''Finish the outline of the survey paper given the title: {survey_title}, and lists of sentences describing each cluster of the references used by this survey:\n{cluster_with_claims}
224
+ The first level sections' hierarchy is given: [[1, '1 Abstract'], [1, '2 Introduction'], {cluster_sections}, [1, '{cluster_num+3} Future Directions'], [1, '{cluster_num+4} Conclusion']].
225
+ You are required to finish the second and third level subsections under each cluster section with [2, 'a.b xxx'] and [3, 'a.b.c xxx'].
226
+ You must not generate more than *3* third-level subsections under each second-level subsection; for example, [3, '3.1.4 xxx'], [3, '3.1.5 xxx'] are not allowed.
227
+ *Try to conclude the main findings of each cluster in the second and third level subsections, use highly abstract terms and phrases to describe*
228
+ *Do not include colons, e.g. AutoSurvey: Large Language Models Can Automatically Write Surveys should be written in Large Language Models in Writing Surveys*
229
+ '''
230
+
231
+ messages = [
232
+ {"role": "system", "content": system_prompt},
233
+ {"role": "user", "content": user_prompt},
234
+ ]
235
+ openai_api_key = os.getenv("OPENAI_API_KEY")
236
+ openai_api_base = os.getenv("OPENAI_API_BASE")
237
+ client = OpenAI(
238
+ # defaults to os.environ.get("OPENAI_API_KEY")
239
+ api_key = openai_api_key,
240
+ base_url = openai_api_base,
241
+ )
242
+ chat_response = client.chat.completions.create(
243
+ model=os.environ.get("MODEL"),
244
+ max_tokens=2048,
245
+ temperature=0.5,
246
+ stop="<|im_end|>",
247
+ stream=True,
248
+ messages= messages
249
+ )
250
+ # Stream the response to console
251
+ text = ""
252
+ for chunk in chat_response:
253
+ if chunk.choices[0].delta.content:
254
+ text += chunk.choices[0].delta.content
255
+ # print('The response is :', text)
256
+ pattern = r'\[(.*)\]'
257
+ match = re.search(pattern, text, re.DOTALL) # re.DOTALL lets '.' match newlines
258
+ if match: # fall back to the raw text when no bracketed list is found
+ text = match.group(1)
259
+ clean_text = re.sub(r'\s+', ' ', text).strip()
260
+ return messages, clean_text
261
+
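For reference, clean_text as returned above is the flattened inner-list string, roughly (illustrative):

    # "[1, '1 Abstract'], [1, '2 Introduction'], [2, '3.1 ...'], ..."
    # parseOutline() below re-wraps it in brackets and parses it with ast.literal_eval.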
262
+ def parseOutline(survey_id):
263
+ file_path = f'./src/static/data/txt/{survey_id}/outline.json'
264
+ try:
265
+ with open(file_path, 'r', encoding='utf-8') as file:
266
+ data = json.load(file)
267
+ except Exception as e:
268
+ print(f"Error loading JSON file {file_path}: {e}")
269
+ return []
270
+
271
+ response = data.get('outline', '')
272
+ if not response:
273
+ print("No outline content found in JSON.")
274
+ return []
275
+
276
+ # Extract the content between the first '[' and the last ']' in the text
277
+ def extract_first_last(text):
278
+ first_match = re.search(r'\[', text)
279
+ last_match = re.search(r'\](?!.*\])', text, re.DOTALL) # negative lookahead finds the last ']' (DOTALL so it spans lines)
280
+ if first_match and last_match:
281
+ return '[' + text[first_match.start() + 1:last_match.start()] + ']'
282
+ return None
283
+
284
+ response_extracted = extract_first_last(response)
285
+ if not response_extracted:
286
+ print("Failed to extract a valid list string from the outline content.")
287
+ return []
288
+
289
+ # Check that the extracted result is a list of lists (it should start with "[[")
290
+ fixed_str = response_extracted.strip()
291
+ if not fixed_str.startswith("[["):
292
+ # If not, strip the outer brackets and re-wrap as [[ ... ]].
294
+ # Note: this assumes the inner structure is several comma-separated lists, not a single list.
294
+ fixed_str = "[[" + fixed_str[1:-1] + "]]"
295
+ # Alternatively, depending on the data, simply wrap one more outer bracket:
296
+ # fixed_str = "[" + fixed_str + "]"
297
+
298
+ try:
299
+ outline_list = ast.literal_eval(fixed_str)
300
+ except Exception as e:
301
+ print(f"Error converting extracted outline to a list.\nExtracted text: {fixed_str}\nError: {e}")
302
+ return []
303
+
304
+ # If the result is not a list, convert it into one
305
+ if not isinstance(outline_list, list):
306
+ outline_list = list(outline_list)
307
+
308
+ # If the parse result is a single list (e.g. [a, b, c]) rather than a list of lists, wrap it
309
+ if outline_list and not all(isinstance(item, list) for item in outline_list):
310
+ outline_list = [outline_list]
311
+
312
+ result = []
313
+ for item in outline_list:
314
+ result.append(item)
315
+ return result
316
+
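A quick illustrative check of the extraction logic above (the sample string is made up):

    import ast
    sample = "Model output: [[1, '1 Abstract'], [2, '2.1 Background']] -- end"
    first, last = sample.find('['), sample.rfind(']')
    print(ast.literal_eval(sample[first:last + 1]))
    # -> [[1, '1 Abstract'], [2, '2.1 Background']]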
317
+
318
+ def generateOutlineHTML_qwen(survey_id):
319
+ outline_list = parseOutline(survey_id)
320
+ html = '''
321
+ <div class="container-fluid w-50 d-flex flex-column justify-content-center align-items-center">
322
+
323
+ <style>
324
+ /* Styles for each outline level */
325
+ .level-1 {
326
+ font-size: 20px;
327
+ font-weight: bold;
328
+ position: relative;
329
+ padding-right: 40px; /* leave room for the arrow */
330
+ }
331
+ .level-2 {
332
+ font-size: 18px;
333
+ padding-left: 40px;
334
+ }
335
+ .level-3 {
336
+ font-size: 16px;
337
+ padding-left: 80px;
338
+ }
339
+ .list-group-item {
340
+ border: none;
341
+ }
342
+
343
+ /* Custom card styles */
344
+ .custom-card {
345
+ background-color: #fff;
346
+ border-radius: 8px;
347
+ padding: 20px;
348
+ margin-top: 20px;
349
+ width: 100%;
350
+ max-width: 800px;
351
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1),
352
+ 0 6px 20px rgba(0, 0, 0, 0.1);
353
+ }
354
+
355
+ /* Custom card body styles */
356
+ .custom-card-body {
357
+ padding: 20px;
358
+ }
359
+
360
+ /* Collapse icon styles */
361
+ .collapse-icon {
362
+ background: none;
363
+ border: none;
364
+ padding: 0;
365
+ position: absolute;
366
+ right: 10px;
367
+ top: 50%;
368
+ transform: translateY(-50%) rotate(0deg);
369
+ cursor: pointer;
370
+ font-size: 16px;
371
+ /* rotation transition */
372
+ transition: transform 0.2s;
373
+ }
374
+ /* Remove the focus outline on the button */
375
+ .collapse-icon:focus {
376
+ outline: none;
377
+ }
378
+ /* Rotate the icon when the section is expanded */
379
+ .collapsed .collapse-icon {
380
+ transform: translateY(-50%) rotate(0deg);
381
+ }
382
+ .in .collapse-icon {
383
+ transform: translateY(-50%) rotate(90deg);
384
+ }
385
+ </style>
386
+
387
+ <div class="custom-card">
388
+ <div class="custom-card-body" id="display-outline">
389
+ <ul class="list-group list-group-flush">
390
+ '''
391
+
392
+ # Default first-level items to prepend (empty in this variant)
393
+ default_items = []
394
+
395
+ # Merge the default items with the parsed outline list
396
+ combined_list = default_items + outline_list
397
+
398
+ # Build a tree so we can tell which first-level headings have children
399
+ def build_outline_tree(outline_list):
400
+ sections = []
401
+ stack = []
402
+ for level, content in outline_list:
403
+ level = int(level)
404
+ node = {'level': level, 'content': content, 'subitems': []}
405
+ if level == 1:
406
+ sections.append(node)
407
+ stack = [node]
408
+ elif level == 2:
409
+ if stack:
410
+ parent = stack[-1]
411
+ parent['subitems'].append(node)
412
+ # stack.append(node)
413
+ else:
414
+ sections.append(node)
415
+ elif level == 3:
416
+ if stack:
417
+ parent = stack[-1]
418
+ parent['subitems'].append(node)
419
+ else:
420
+ sections.append(node)
421
+ return sections
422
+
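Note that only level-1 nodes are pushed onto the stack (the level-2 push is commented out), so level-2 and level-3 headings all become direct children of the nearest level-1 heading. A comment sketch:

    # build_outline_tree([[1, '3 Methods'], [2, '3.1 Models'], [3, '3.1.1 CNNs']])
    # -> [{'level': 1, 'content': '3 Methods', 'subitems': [
    #       {'level': 2, 'content': '3.1 Models', 'subitems': []},
    #       {'level': 3, 'content': '3.1.1 CNNs', 'subitems': []}]}]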
423
+ sections = build_outline_tree(combined_list)
424
+
425
+ # Generate the HTML
426
+ def generate_html_from_sections(sections):
427
+ html = ''
428
+ section_index = 1 # used to generate unique IDs
429
+
430
+ def generate_node_html(node):
431
+ nonlocal section_index
432
+ level = node['level']
433
+ content = node['content']
434
+ has_subitems = len(node['subitems']) > 0
435
+ if level == 1:
436
+ # First-level heading
437
+ if has_subitems:
438
+ # With children: add the collapse toggle and arrow icon
439
+ section_id = f"outline_collapseSection{section_index}"
440
+ section_index += 1
441
+ node_html = f'''
442
+ <li class="list-group-item level-1">
443
+ {content}
444
+ <a class="collapsed" data-toggle="collapse" data-target="#{section_id}" aria-expanded="true" aria-controls="{section_id}">
445
+ &#9654; <!-- 右箭头表示折叠状态 -->
446
+ </a>
447
+ <ul class="list-group collapse in" id="{section_id}">
448
+ '''
449
+ for subitem in node['subitems']:
450
+ node_html += generate_node_html(subitem)
451
+ node_html += '''
452
+ </ul>
453
+ </li>
454
+ '''
455
+ else:
456
+ # Without children: no collapse icon
457
+ node_html = f'''
458
+ <li class="list-group-item level-1">
459
+ {content}
460
+ </li>
461
+ '''
462
+ elif level == 2:
463
+ node_html = f'<li class="list-group-item level-2">{content}</li>'
464
+ elif level == 3:
465
+ # Third-level headings are rendered directly as list items
466
+ node_html = f'<li class="list-group-item level-3">{content}</li>'
467
+ return node_html
468
+
469
+ for section in sections:
470
+ html += generate_node_html(section)
471
+
472
+ return html
473
+
474
+ def generate_list_html(combined_list, editable=True):
475
+ html = '<ul class="list-group list-group-flush">\n' # 开始 <ul>
476
+ for level, content in combined_list:
477
+ # Add the class matching the level
478
+ if level == 1: # level-1 inputs are disabled
479
+ if editable:
480
+ html += f'<li class="list-group-item level-1"><input type="text" class="form-control" value="{content}" disabled></li>\n'
481
+ else:
482
+ html += f'<li class="list-group-item level-1">{content}</li>\n'
483
+ elif level == 2:
484
+ if editable:
485
+ html += f'<li class="list-group-item level-2" style="padding-left: 20px;"><input type="text" class="form-control" value="{content}"></li>\n'
486
+ else:
487
+ html += f'<li class="list-group-item level-2" style="padding-left: 20px;">{content}</li>\n'
488
+ elif level == 3:
489
+ if editable:
490
+ html += f'<li class="list-group-item level-3" style="padding-left: 40px;"><input type="text" class="form-control" value="{content}"></li>\n'
491
+ else:
492
+ html += f'<li class="list-group-item level-3" style="padding-left: 40px;">{content}</li>\n'
493
+ html += '</ul>' # 结束 </ul>
494
+ return html
495
+
496
+ # Generate the editable list HTML
497
+ list_html = generate_list_html(combined_list)
498
+ html += generate_html_from_sections(sections)
499
+
500
+ html += f'''
501
+ </ul>
502
+ </div>
503
+ <div class="custom-card-body" style="display: none" id="edit-outline">
504
+ {list_html}
505
+ </div>
506
+ <button type="button" class="btn btn-secondary btn-lg" id="edit-btn" onclick="editOutline()"><i class="bi bi-pen"></i></button>
507
+ <button type="button" class="btn btn-success btn-lg" id="confirm-btn" style="display: none;" onclick="confirmOutline()"><i class="bi bi-check"></i></button>
508
+ </div>
509
+ <!-- 添加 Bootstrap v3.3.0 的 JavaScript 来处理折叠功能 -->
510
+ <script>
511
+ $(document).ready(function(){{
512
+ $('.collapsed').click(function(){{
513
+ $(this).toggleClass('collapsed');
514
+ }});
515
+ }});
516
+ </script>
517
+
518
+ </div>
519
+ '''
520
+
521
+ html+='''
522
+ <script>
523
+ // Switch to edit mode
524
+ function editOutline() {
525
+ document.getElementById("display-outline").style.display = "none"; // hide the read-only view
526
+ document.getElementById("edit-outline").style.display = "block"; // show the editable view
527
+
528
+ // Show the "Confirm" button and hide the "Edit" button
529
+ document.getElementById("edit-btn").style.display = "none";
530
+ document.getElementById("confirm-btn").style.display = "inline-block";
531
+ }
532
+
533
+ // Confirm the edits and submit the data
534
+ function confirmOutline() {
535
+ const outlineData = []; // data to submit to the backend
536
+
537
+ // Iterate over all editable inputs
538
+ document.querySelectorAll("#edit-outline .list-group-item").forEach((item) => {
539
+ const level = item.classList.contains("level-1") ? 1 :
540
+ item.classList.contains("level-2") ? 2 : 3; // determine the level
541
+ const content = item.querySelector("input").value.trim(); // read the edited value
542
+
543
+ // Store the row as [level, content]
544
+ outlineData.push([level, content]);
545
+ });
546
+
547
+ console.log("Submitting to backend:", outlineData); // 打印提交数据以供调试
548
+
549
+ // 使用 AJAX 提交数据到后端
550
+ const csrftoken = getCookie("csrftoken"); // 获取 CSRF token
551
+ fetch("/save_outline/", {
552
+ method: "POST",
553
+ headers: {
554
+ "Content-Type": "application/json",
555
+ "X-CSRFToken": csrftoken, // Django 的 CSRF 令牌
556
+ },
557
+ body: JSON.stringify({ outline: outlineData }) // serialize the payload to JSON
558
+ })
559
+ .then((response) => response.json())
560
+ .then((data) => {
561
+ if (data.status === "success") {
562
+ $('#sections_').html(data.html);
563
+ alert("Outline updated successfully!");
564
+ } else {
565
+ alert("Error updating outline: " + data.message);
566
+ }
567
+ })
568
+ .catch((error) => {
569
+ console.error("Error:", error);
570
+ alert("Error updating outline. Please check the console for details.");
571
+ });
572
+ }
573
+ </script>
574
+ '''
575
+ return html
576
+
577
+ def insert_section(content, section_header, section_content):
578
+ """
579
+ Find the line in content that starts with section_header and insert section_content after it.
580
+ section_header: heading name, e.g. "Abstract" or "Conclusion"
581
+ section_content: the content to insert (string)
582
+ """
583
+ # The dot after the section number is optional in this regex
584
+ pattern = re.compile(
585
+ r'(^#\s+\d+\.?\s+' + re.escape(section_header) + r'\s*$)',
586
+ re.MULTILINE | re.IGNORECASE
587
+ )
588
+ replacement = r'\1\n\n' + section_content + '\n'
589
+ new_content, count = pattern.subn(replacement, content)
590
+ if count == 0:
591
+ print(f"警告: 未找到标题 '{section_header}'。无法插入内容。")
592
+ return new_content
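A short demonstration of the heading regex above (the content string is made up):

    content = "# 1 Abstract\n\n# 2 Introduction\n\nBody text.\n"
    print(insert_section(content, "Abstract", "Generated abstract."))
    # prints the content with "Generated abstract." inserted right after the "# 1 Abstract" line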
593
+
594
+ def generateOutlineHTML(survey_id):
595
+ outline_list = parseOutline(survey_id)
596
+ html = '''
597
+ <div class="container-fluid w-50 d-flex flex-column justify-content-center align-items-center">
598
+
599
+ <style>
600
+ /* Styles for each outline level */
601
+ .level-1 {
602
+ font-size: 20px;
603
+ font-weight: bold;
604
+ position: relative;
605
+ padding-right: 40px; /* leave room for the arrow */
606
+ }
607
+ .level-2 {
608
+ font-size: 18px;
609
+ padding-left: 40px;
610
+ }
611
+ .level-3 {
612
+ font-size: 16px;
613
+ padding-left: 80px;
614
+ }
615
+ .list-group-item {
616
+ border: none;
617
+ }
618
+
619
+ /* Custom card styles */
620
+ .custom-card {
621
+ background-color: #fff;
622
+ border-radius: 8px;
623
+ padding: 20px;
624
+ margin-top: 20px;
625
+ width: 100%;
626
+ max-width: 800px;
627
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1),
628
+ 0 6px 20px rgba(0, 0, 0, 0.1);
629
+ }
630
+
631
+ /* Custom card body styles */
632
+ .custom-card-body {
633
+ padding: 20px;
634
+ }
635
+
636
+ /* Collapse icon styles */
637
+ .collapse-icon {
638
+ background: none;
639
+ border: none;
640
+ padding: 0;
641
+ position: absolute;
642
+ right: 10px;
643
+ top: 50%;
644
+ transform: translateY(-50%) rotate(0deg);
645
+ cursor: pointer;
646
+ font-size: 16px;
647
+ /* rotation transition */
648
+ transition: transform 0.2s;
649
+ }
650
+ /* Remove the focus outline on the button */
651
+ .collapse-icon:focus {
652
+ outline: none;
653
+ }
654
+ /* Rotate the icon when the section is expanded */
655
+ .collapsed .collapse-icon {
656
+ transform: translateY(-50%) rotate(0deg);
657
+ }
658
+ .in .collapse-icon {
659
+ transform: translateY(-50%) rotate(90deg);
660
+ }
661
+ </style>
662
+
663
+ <div class="custom-card">
664
+ <div class="custom-card-body">
665
+ <ul class="list-group list-group-flush">
666
+ '''
667
+
668
+ # Default first-level headings to prepend
669
+ default_items = [[1, '1 Abstract'], [1, '2 Introduction']]
670
+
671
+ # Merge the default items with the parsed outline list
672
+ combined_list = default_items + outline_list
673
+
674
+ # Build a tree so we can tell which first-level headings have children
675
+ def build_outline_tree(outline_list):
676
+ sections = []
677
+ stack = []
678
+ for level, content in outline_list:
679
+ level = int(level)
680
+ node = {'level': level, 'content': content, 'subitems': []}
681
+ if level == 1:
682
+ sections.append(node)
683
+ stack = [node]
684
+ elif level == 2:
685
+ if stack:
686
+ parent = stack[-1]
687
+ parent['subitems'].append(node)
688
+ # stack.append(node)
689
+ else:
690
+ sections.append(node)
691
+ elif level == 3:
692
+ if stack:
693
+ parent = stack[-1]
694
+ parent['subitems'].append(node)
695
+ else:
696
+ sections.append(node)
697
+ return sections
698
+
699
+ sections = build_outline_tree(combined_list)
700
+
701
+ # Generate the HTML
702
+ def generate_html_from_sections(sections):
703
+ html = ''
704
+ section_index = 1 # used to generate unique IDs
705
+
706
+ def generate_node_html(node):
707
+ nonlocal section_index
708
+ level = node['level']
709
+ content = node['content']
710
+ has_subitems = len(node['subitems']) > 0
711
+ if level == 1:
712
+ # First-level heading
713
+ if has_subitems:
714
+ # With children: add the collapse toggle and arrow icon
715
+ section_id = f"outline_collapseSection{section_index}"
716
+ section_index += 1
717
+ node_html = f'''
718
+ <li class="list-group-item level-1">
719
+ {content}
720
+ <a class="collapsed" data-toggle="collapse" data-target="#{section_id}" aria-expanded="true" aria-controls="{section_id}">
721
+ &#9654; <!-- 右箭头表示折叠状态 -->
722
+ </a>
723
+ <ul class="list-group collapse in" id="{section_id}">
724
+ '''
725
+ for subitem in node['subitems']:
726
+ node_html += generate_node_html(subitem)
727
+ node_html += '''
728
+ </ul>
729
+ </li>
730
+ '''
731
+ else:
732
+ # Without children: no collapse icon
733
+ node_html = f'''
734
+ <li class="list-group-item level-1">
735
+ {content}
736
+ </li>
737
+ '''
738
+ elif level == 2:
739
+ node_html = f'<li class="list-group-item level-2">{content}</li>'
740
+ elif level == 3:
741
+ # Third-level headings are rendered directly as list items
742
+ node_html = f'<li class="list-group-item level-3">{content}</li>'
743
+ return node_html
744
+
745
+ for section in sections:
746
+ html += generate_node_html(section)
747
+
748
+ return html
749
+
750
+ html += generate_html_from_sections(sections)
751
+
752
+ html += '''
753
+ </ul>
754
+ </div>
755
+ </div>
756
+ <!-- 添加 Bootstrap v3.3.0 的 JavaScript 来处理折叠功能 -->
757
+ <script>
758
+ $(document).ready(function(){
759
+ // toggle the arrow direction
760
+ $('.collapsed').click(function(){
761
+ $(this).toggleClass('collapsed');
762
+ });
763
+ });
764
+ </script>
765
+ </div>
766
+ '''
767
+ return html
768
+
769
+ def insert_section(content, section_header, section_content):
770
+ """
771
+ Find the line in content that starts with section_header and insert section_content after it.
771
+ section_header: heading name, e.g. "Abstract" or "Conclusion"
772
+ section_content: the content to insert (string)
774
+ """
775
+ # The dot after the section number is optional in this regex
776
+ pattern = re.compile(
777
+ r'(^#\s+\d+\.?\s+' + re.escape(section_header) + r'\s*$)',
778
+ re.MULTILINE | re.IGNORECASE
779
+ )
780
+ replacement = r'\1\n\n' + section_content + '\n'
781
+ new_content, count = pattern.subn(replacement, content)
782
+ if count == 0:
783
+ print(f"警告: 未找到标题 '{section_header}'。无法插入内容。")
784
+ return new_content
785
+
786
+ def generateSurvey(survey_id, title, collection_list, pipeline):
+     outline = parseOutline(survey_id)
+     default_items = [[1, '1 Abstract'], [1, '2 Introduction'], [1, '3 Overview']]
+     outline = str(default_items + outline)
+ 
+     client = getQwenClient()
+ 
+     context_list = generate_context_list(outline, collection_list)
+ 
+     temp = {
+         "survey_id": survey_id,
+         "outline": str(default_items),
+         "survey_title": title,
+         "context": context_list,
+         "abstract": "",
+         "introduction": "",
+         "content": "",
+         "conclusion": "",
+         "references": ""
+     }
+ 
+     generated_survey_paper = generate_survey_paper_new(outline, context_list, client)
+ 
+     generated_introduction = generate_introduction(generated_survey_paper, client)
+     # print("\nGenerated Introduction:\n", generated_introduction)
+ 
+     abs_generator = AbstractGenerator(pipeline)
+     abstract = abs_generator.generate(title, generated_introduction)
+     con_generator = ConclusionGenerator(pipeline)
+     conclusion = con_generator.generate(title, generated_introduction)
+ 
+     abstract = abstract.replace("Abstract:", "")
+     conclusion = conclusion.replace("Conclusion:", "")
+ 
+     temp["abstract"] = abstract
+     temp["introduction"] = generated_introduction
+     temp["content"] = generated_survey_paper
+     temp["conclusion"] = conclusion
+ 
+     temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
+     temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
+ 
+     output_path = f'./src/static/data/txt/{survey_id}/generated_result.json'
+     with open(output_path, 'w', encoding='utf-8') as f:
+         json.dump(temp, f, ensure_ascii=False, indent=4)
+     print(f"Survey has been saved to {output_path}.")
+ 
+     return
+ 
+ def generate_future_directions_qwen(client, title, intro):
+     system_prompt = f'''You are a helpful assistant that helps to generate the future directions of the survey paper given the survey title and survey introduction.'''
+     # user_prompt = {"survey_title":survey_title, "claims":cluster_with_claims}
+     user_prompt = f'''Help me to generate the future directions of a survey paper given the title: *{title}*, and the introduction: {intro} within 300 words.'''
+ 
+     messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": user_prompt},
+         {"role": "assistant", "content": "Future Directions:"}
+     ]
+     openai_api_key = os.getenv("OPENAI_API_KEY")
+     openai_api_base = os.getenv("OPENAI_API_BASE")
+     client = OpenAI(
+         # defaults to os.environ.get("OPENAI_API_KEY")
+         api_key=openai_api_key,
+         base_url=openai_api_base,
+     )
+     chat_response = client.chat.completions.create(
+         model=os.environ.get("MODEL"),
+         max_tokens=768,
+         temperature=0.5,
+         stop="<|im_end|>",
+         stream=True,
+         messages=messages
+     )
+     # Accumulate the streamed response
+     text = ""
+     for chunk in chat_response:
+         if chunk.choices[0].delta.content:
+             text += chunk.choices[0].delta.content
+     return text
+ 
+ def generateSurvey_qwen(survey_id, title, collection_list, pipeline):
+     outline = str(parseOutline(survey_id))
+ 
+     client = getQwenClient()
+ 
+     context_list = generate_context_list(outline, collection_list)
+ 
+     temp = {
+         "survey_id": survey_id,
+         "outline": outline,
+         "survey_title": title,
+         "context": context_list,
+         "abstract": "",
+         "introduction": "",
+         "content": "",
+         "future_directions": "",
+         "conclusion": "",
+         "references": ""
+     }
+ 
+     generated_survey_paper = generate_survey_paper_new(title, outline, context_list, client)
+     # print("Generated Survey Paper:\n", generated_survey_paper)
+ 
+     generated_introduction = generate_introduction(generated_survey_paper, client)
+     # print("\nGenerated Introduction:\n", generated_introduction)
+     abs_generator = AbstractGenerator(pipeline)
+     abstract = abs_generator.generate(title, generated_introduction)
+     con_generator = ConclusionGenerator(pipeline)
+     # conclusion = con_generator.generate(title, generated_introduction)
+     # New version: 12/03
+     conclusion = generate_conclusion(generated_survey_paper, client)
+     abstract = abstract.replace("Abstract:", "")
+     conclusion = conclusion.replace("Conclusion:", "")
+     # future_directions = generate_future_directions_qwen(client, title, generated_introduction).replace("Future Directions:", "")
+     # New version: 12/03
+     future_directions = generate_future_work(generated_survey_paper, client)
+     # references = generate_references_dir('./src/static/data/txt/' + survey_id)
+     temp["abstract"] = abstract
+     temp["introduction"] = generated_introduction
+     temp["content"] = generated_survey_paper
+     temp["conclusion"] = conclusion
+     temp["future_directions"] = future_directions
+     # temp["references"] = "\n\n".join([f"{ref}" for i, ref in enumerate(references)])
+     temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
+     temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
+     temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
+     output_path = f'./src/static/data/txt/{survey_id}/generated_result.json'
+     with open(output_path, 'w', encoding='utf-8') as f:
+         json.dump(temp, f, ensure_ascii=False, indent=4)
+     print(f"Survey has been saved to {output_path}.")
+     return
+ 
+ # wza
+ def generateSurvey_qwen_new(survey_id, title, collection_list, pipeline, citation_data_list):
+     outline = str(parseOutline(survey_id))
+     client = getQwenClient()
+     context_list = generate_context_list(outline, collection_list)
+ 
+     temp = {
+         "survey_id": survey_id,
+         "outline": outline,
+         "survey_title": title,
+         "context": context_list,
+         "abstract": "",
+         "introduction": "",
+         "content": "",
+         "future_directions": "",
+         "conclusion": "",
+         "references": ""
+     }
+ 
+     # Pass citation_data_list through to generate_survey_paper_new
+     generated_survey_paper = generate_survey_paper_new(title, outline, context_list, client, citation_data_list)
+ 
+     generated_introduction = generate_introduction_alternate(title, generated_survey_paper, client)
+     # generated_introduction = introduction_with_citations(generated_introduction, citation_data_list)
+     # print("\nGenerated Introduction:\n", generated_introduction)
+     # abs_generator = AbstractGenerator(pipeline)
+     # abstract = abs_generator.generate(title, generated_introduction)
+     abstract = generate_abstract(generated_survey_paper, client)
+     # con_generator = ConclusionGenerator(pipeline)
+     # conclusion = con_generator.generate(title, generated_introduction)
+     conclusion = generate_conclusion(generated_survey_paper, client)
+     abstract = abstract.replace("Abstract:", "")
+     conclusion = conclusion.replace("Conclusion:", "")
+     # future_directions = generate_future_directions_qwen(client, title, generated_introduction).replace("Future Directions:", "")
+     # New version: 12/03
+     future_directions = generate_future_work(generated_survey_paper, client)
+     # references = generate_references_dir('./src/static/data/txt/' + survey_id)
+     temp["abstract"] = abstract
+     temp["introduction"] = generated_introduction
+     temp["content"] = generated_survey_paper
+     temp["conclusion"] = conclusion
+     temp["future_directions"] = future_directions
+     # temp["references"] = "\n\n".join([f"{ref}" for i, ref in enumerate(references)])
+     temp["content"] = insert_section(temp["content"], "Abstract", temp["abstract"])
+     temp["content"] = insert_section(temp["content"], "Conclusion", temp["conclusion"])
+     temp["content"] = insert_section(temp["content"], "Future Directions", temp["future_directions"])
+     output_path = f'./src/static/data/txt/{survey_id}/generated_result.json'
+     with open(output_path, 'w', encoding='utf-8') as f:
+         json.dump(temp, f, ensure_ascii=False, indent=4)
+     print(f"Survey has been saved to {output_path}.")
+     return
+ 
+ 
+ def generate_references_dir(dir):
+     client = getQwenClient()
+     papers_info = []
+     for file in os.listdir(dir):
+         if file.endswith(".json"):
+             file_path = os.path.join(dir, file)
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 data = json.load(f)
+ 
+             papers_info.append({
+                 "file_path": file_path,
+                 "title": data.get("title", "Unknown Title"),
+                 "authors": data.get("authors", "Unknown Author")
+             })
+     print("The length of papers_info is:", len(papers_info))
+     references = generate_references(papers_info, client)
+     return references
+ 
+ if __name__ == '__main__':
+     model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+ 
+     context = '''
+ Many paradigms have been proposed to asses informativeness of data samples for active learning. One of the popular approaches is selecting the most uncertain data sample, i.e the data sample in which current classifier is least confident. Some other approaches are selecting the sample which yields a model with minimum risk or the data sample which yields fastest convergence in gradient based methods.//
+ An active under-sampling approach is presented in this paper to change the data distribution of training datasets, and improve the classification accuracy of minority classes while maintaining overall classification performance.//
+ In this paper, we propose an uncertainty-based active learning algorithm which requires only samples of one class and a set of unlabeled data in order to operate.//
+ The principal contribution of our work is twofold: First, we use Bayes’ rule and density estimation to avoid the need to have a model of all classes for computing the uncertainty measure.//
+ This technique reduces the number of input parameters of the problem. At the rest of this paper, we first review recent related works in the fields of active learning and active one-class learning (section II).//
+ The classifier predicts that all the samples are non-fraud, it will have a quite high accuracy. However, for problems like fraud detection, minority class classification accuracy is more critical.//
+ The algorithm used and the features selected are always the key points at design time, and many experiments are needed to select the final algorithm and the best suited feature set.//
+ Active learning works by selecting among unlabeled data, the most informative data sample. The informativeness of a sample is the amount of accuracy gain achieved after adding it to the training set.//
+ Some other approaches are selecting the sample which yields a model with minimum risk or the data sample which yields fastest convergence in gradient based methods.//
+ In this paper, we propose a novel approach reducing each within group error, BABoost, that is a variant of AdaBoost.//
+ Simulations on different unbalanced distribution data and experiments performed on several real datasets show that the new method is able to achieve a lower within group error.//
+ Active learning with early stopping can achieve a faster and scalable solution without sacrificing prediction performance.//
+ We also propose an efficient Support Vector Machine (SVM) active learning strategy which queries a small pool of data at each iterative step instead of querying the entire dataset.//
+ The second part consists of applying a treatment method and inducing a classifier for each class distribution.//
+ This time we measured the percentage of the performance loss that was recovered by the treatment method.//
+ We used two well-known over-sampling methods, random over-sampling and SMOTE.//
+ We tested our proposed technique on a sample of three representative functional genomic problems: splice site, protein subcellular localization and phosphorylation site prediction problems.//
+ Among the possible PTMs, phosphorylation is the most studied and perhaps the most important.//
+ The second part consists of applying a treatment method and inducing a classifier for each class distribution.//
+ We show that Active Learning (AL) strategy can be a more efficient alternative to resampling methods to form a balanced training set for the learner in early stages of the learning.//
+     '''
+ 
+     collection_list = ['activelearningfrompositiveandunlabeleddata', ]
+ 
+     Global_pipeline = transformers.pipeline(
+         "text-generation",
+         model=model_id,
+         model_kwargs={"torch_dtype": torch.bfloat16},
+         token=os.getenv('HF_API_KEY'),
+         device_map="auto",
+     )
+     Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_outline_generation", adapter_name="outline")
+     Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_conclusion_generation", adapter_name="conclusion")
+     Global_pipeline.model.load_adapter(peft_model_id="technicolor/llama3.1_8b_abstract_generation", adapter_name="abstract")
+ 
+     generateSurvey("test", "Predictive modeling of imbalanced data", collection_list, Global_pipeline)
src/demo/asg_query.py ADDED
@@ -0,0 +1,326 @@
+ import os
+ from openai import OpenAI
+ from datetime import datetime, timedelta
+ import re
+ 
+ def generate_abstract_qwen(topic):
+ 
+     # Initialize the OpenAI client using environment variables
+     openai_api_key = os.getenv("OPENAI_API_KEY")
+     openai_api_base = os.getenv("OPENAI_API_BASE")
+     client = OpenAI(
+         api_key=openai_api_key,
+         base_url=openai_api_base,
+     )
+ 
+     ###########################
+     # Step 1: Generate a survey abstract for the given topic.
+     ###########################
+     system_prompt_abstract = """
+     You are a skilled research survey writer. Your task is to generate a survey abstract on the given topic. The abstract should cover the main challenges, key concepts, and research directions associated with the topic. Write in clear, concise academic English.
+     """
+     user_prompt_abstract = f"""
+     Topic: {topic}
+ 
+     Please generate a comprehensive survey abstract for this topic. Include discussion of core challenges, key terminologies, and emerging methodologies that are critical in the field. The total length of the abstract should be around 300–500 words.
+     """
+     messages_abstract = [
+         {"role": "system", "content": system_prompt_abstract},
+         {"role": "user", "content": user_prompt_abstract}
+     ]
+ 
+     abstract_response = client.chat.completions.create(
+         model=os.environ.get("MODEL"),
+         max_tokens=2048,
+         temperature=0.5,
+         stop="<|im_end|>",
+         stream=True,
+         messages=messages_abstract
+     )
+ 
+     abstract_text = ""
+     for chunk in abstract_response:
+         if chunk.choices[0].delta.content:
+             abstract_text += chunk.choices[0].delta.content
+     abstract_text = abstract_text.strip()
+     # print("The abstract is:", abstract_text)
+ 
+     return abstract_text
+ 
+ def generate_entity_lists_qwen(topic, abstract_text):
+     openai_api_key = os.getenv("OPENAI_API_KEY")
+     openai_api_base = os.getenv("OPENAI_API_BASE")
+     client = OpenAI(
+         api_key=openai_api_key,
+         base_url=openai_api_base,
+     )
+     system_prompt_abstract = f"""
+     You are an AI assistant specializing in natural language processing and entity recognition. Your task is to extract key entities and core concepts from a given abstract based on a specified topic.
+ 
+     You should return two distinct lists:
+     1. **Entity list**: 5 entities that are synonymous or closely related to the given topic (nouns only). These should be concise (no more than two words) and simplified to their root forms (e.g., removing suffixes like "-ing", "-ed"), such as "llm" for "large language model".
+     2. **Concept list**: Core concepts from the abstract that are highly relevant to the topic. These should also be concise (no more than two words) and in their simplest form; one single word is preferred unless the term is inseparable.
+ 
+     Ensure that your response follows this exact format:
+     Entity list: [entity1, entity2, entity3, entity4, entity5]
+     Concept list: [concept1, concept2, concept3, ...concept n]
+     Do not include any explanations or additional text.
+ 
+     ### **Example**
+     #### **Input:**
+     Topic: Large Language Models
+     Abstract: Ever since the Turing Test was proposed in the 1950s, humans have explored the mastering of language intelligence by machine. Language is essentially a complex, intricate system of human expressions governed by grammatical rules. It poses a significant challenge to develop capable artificial intelligence (AI) algorithms for comprehending and grasping a language. As a major approach, language modeling has been widely studied for language understanding and generation in the past two decades, evolving from statistical language models to neural language models. Recently, pre-trained language models (PLMs) have been proposed by pretraining Transformer models over large-scale corpora, showing strong capabilities in solving various natural language processing (NLP) tasks. Since the researchers have found that model scaling can lead to an improved model capacity, they further investigate the scaling effect by increasing the parameter scale to an even larger size. Interestingly, when the parameter scale exceeds a certain level, these enlarged language models not only achieve a significant performance improvement, but also exhibit some special abilities (e.g., in-context learning) that are not present in small-scale language models (e.g., BERT). To discriminate the language models in different parameter scales, the research community has coined the term large language models (LLM) for the PLMs of significant size (e.g., containing tens or hundreds of billions of parameters). Recently, the research on LLMs has been largely advanced by both academia and industry, and a remarkable progress is the launch of ChatGPT (a powerful AI chatbot developed based on LLMs), which has attracted widespread attention from society. The technical evolution of LLMs has been making an important impact on the entire AI community, which would revolutionize the way how we develop and use AI algorithms. Considering this rapid technical progress, in this survey, we review the recent advances of LLMs by introducing the background, key findings, and mainstream techniques. In particular, we focus on four major aspects of LLMs, namely pre-training, adaptation tuning, utilization, and capacity evaluation. Furthermore, we also summarize the available resources for developing LLMs and discuss the remaining issues for future directions. This survey provides an up-to-date review of the literature on LLMs, which can be a useful resource for both researchers and engineers.
+ 
+     #### **Expected Output:**
+     "entity list": ["language model", "plm", "large language", "llm", "llms"]
+     "concept list": ["turing", "language intelligence", "ai", "generation", "statistical", "neural", "pre-train", "transformer", "corpora", "nlp", "in-context", "bert", "chatgpt", "adaptation", "utilization"]
+     Make sure to strictly follow this format in your response.
+     """
+ 
+     user_prompt_abstract = f"""
+     Topic: {topic}
+     Abstract: {abstract_text}
+ 
+     Based on the given topic and abstract, extract the following:
+     1. A **list of the 5 key entities (nouns)** that are synonymous or closely related to the topic. Keep each entity under two words and in its simplest form.
+     2. A **list of as many core concepts (terms) as possible** from the abstract that are highly relevant to the topic. Keep each concept under two words and in its simplest form.
+     """
+ 
+     messages_abstract = [
+         {"role": "system", "content": system_prompt_abstract},
+         {"role": "user", "content": user_prompt_abstract}
+     ]
+ 
+     entity_response = client.chat.completions.create(
+         model=os.environ.get("MODEL"),
+         max_tokens=2048,
+         temperature=0.5,
+         stop="<|im_end|>",
+         stream=True,
+         messages=messages_abstract
+     )
+ 
+     entity_list = ""
+     for chunk in entity_response:
+         if chunk.choices[0].delta.content:
+             entity_list += chunk.choices[0].delta.content
+     entity_list = entity_list.strip()
+     # print("The entity lists are:", entity_list)
+ 
+     return entity_list
+ 
+ def generate_query_qwen(topic):
+     # Calculate the date range for the arXiv query (approximately the last 10 years)
+     abstract_text = generate_abstract_qwen(topic)
+     entity_list = generate_entity_lists_qwen(topic, abstract_text)
+     today = datetime.now()
+     ten_years_ago = today - timedelta(days=10 * 365)  # approximate calculation
+     start_date = ten_years_ago.strftime('%Y%m%d')
+     end_date = today.strftime('%Y%m%d')
+ 
+     # System prompt: how to build the structured query from the extracted keywords.
+     system_prompt_query = """
+     You are a research assistant specializing in constructing effective arXiv search queries. Your task is to generate a structured search query using **pre-extracted entity and concept lists** from a given abstract and the topic. Follow these instructions exactly:
+ 
+     1. **Input Data:**
+        - **Entity List:** A list of entities that are synonymous or closely related to the given topic.
+        - **Concept List:** A list of core concepts from the abstract that are highly relevant to the topic.
+ 
+     2. **Ensure Minimum Keyword Count:**
+        - The **Entity List** must contain at least **3** entity nouns. If there are fewer, intelligently supplement additional relevant terms, ensuring that entities are synonyms of or closely related to the key entity in the topic (e.g., "LLM" for "Large Language Model").
+        - The **Concept List** must contain **12-15** domain-specific terms. If there are fewer, intelligently supplement additional relevant terms. Avoid broad terms like "combine" or "introduce."
+ 
+     3. **Standardize Formatting:**
+        - Convert all terms to their **base form** without adding any wildcard (`*`).
+        - All terms must be **in lowercase**.
+ 
+     4. **Construct the Final Query:**
+        - The query must follow this exact structure:
+          ```
+          (abs:"<Term1>" AND abs:"<Term2>") AND
+          (abs:"<Entity1>" OR abs:"<Entity2>" OR abs:"<Entity3>" OR abs:"<Entity4>" OR abs:"<Entity5>") AND
+          (abs:"<Concept1>" OR abs:"<Concept2>" OR ... OR abs:"<Concept12>")
+          ```
+        - **Terms are 2 or 3 keywords or phrases extracted from the topic that you think **must** occur in the abstract of the search results; they are grouped together using `AND` in the first part.** (most important)
+        - **Entities are grouped together using `OR` in the second part.**
+        - **Concepts are grouped together using `OR` in the third part.**
+        - **The groups are combined using `AND`.**
+        - **For compound words with hyphens (e.g., "in-context"), replace `-` with a space, resulting in `"in context"`.**
+        - **Do not include any explanations or extra text. Output only the final query.**
+     """
+ 
+     # User prompt: uses the pre-extracted entities and concepts, enforces the
+     # minimum counts, and shows example topics with their query formats.
+     user_prompt_query = f"""
+     Below are the pre-extracted keywords for constructing the final arXiv query.
+ 
+     **Topic:** {topic}
+     **Entity list and Concept list:** {entity_list}
+ 
+     ### **Processing Rules Applied:**
+     - **Ensure the key terms in the topic are included**.
+     - **Ensure at least 5 entities** (if fewer, supplement additional relevant terms).
+     - **Ensure 12-15 concepts** (if fewer, supplement additional relevant terms).
+     - **Convert all terms to lowercase.**
+     - **For compound words with hyphens (e.g., "in-context"), replace `-` with a space, resulting in `"in context"`**.
+     - **Output only the final query with no extra text.**
+ 
+     ### **Example Query Format:**
+ 
+     1. **Topic:** Large Language Models in Recommendation Systems
+        **Transformed Entity List:** ["language model", "plm", "large language", "llm", "deep model"]
+        **Transformed Concept List:** ["tur", "language intelligence", "ai", "generation", "statistical", "neural", "pretraining", "transformer", "corpora", "nlp", "in context", "bert", "chatgpt", "adaptation", "utilization"]
+        **Query:**
+        (abs:"large language model" AND abs:"recommendation") AND (abs:"language model" OR abs:"plm" OR abs:"large language" OR abs:"llm" OR abs:"deep model") AND (abs:"tur" OR abs:"language intelligence" OR abs:"ai" OR abs:"generation" OR abs:"statistical" OR abs:"neural" OR abs:"pretraining" OR abs:"transformer" OR abs:"corpora" OR abs:"nlp" OR abs:"in context" OR abs:"bert" OR abs:"chatgpt" OR abs:"adaptation" OR abs:"utilization")
+ 
+     2. **Topic:** Quantum Computing in Physics
+        **Transformed Entity List:** ["quantum computing", "qubit", "qc", "quantum device", "topological computing"]
+        **Transformed Concept List:** ["decoherence", "entanglement", "error", "topology", "annealing", "photon", "superconducting", "algorithm", "optimization", "verification", "fault tolerance", "noise", "circuit", "quantum machine", "measurement"]
+        **Query:**
+        (abs:"quantum computing" AND abs:"physics") AND (abs:"quantum computing" OR abs:"qubit" OR abs:"qc" OR abs:"quantum device" OR abs:"topological computing") AND (abs:"decoherence" OR abs:"entanglement" OR abs:"error" OR abs:"topology" OR abs:"annealing" OR abs:"photon" OR abs:"superconducting" OR abs:"algorithm" OR abs:"optimization" OR abs:"verification" OR abs:"fault tolerance" OR abs:"noise" OR abs:"circuit" OR abs:"quantum machine" OR abs:"measurement")
+ 
+     ---
+ 
+     ### **Now Generate the Query for This Topic:**
+     **Topic:** {topic}
+     Using the provided **Entity List** and **Concept List**, apply the following steps:
+     1. **Ensure the Entity List contains at least 5 items.** If fewer, supplement additional relevant terms.
+     2. **Ensure the Concept List contains 12-15 items.** If fewer, supplement additional relevant terms.
+     3. **Convert all terms to lowercase.**
+     4. **For compound words with hyphens (`-`), replace `-` with a space, e.g., `"in-context"` → `"in context"`.**
+     5. **Construct the arXiv search query in the same format as the examples above.**
+     6. **Return only the final query. Do not include explanations or additional text.**
+     No term in the query should exceed 2 words!
+     """
+ 
+     # Initialize the OpenAI API client
+     openai_api_key = os.getenv("OPENAI_API_KEY")
+     openai_api_base = os.getenv("OPENAI_API_BASE")
+     client = OpenAI(
+         api_key=openai_api_key,
+         base_url=openai_api_base,
+     )
+ 
+     messages = [
+         {"role": "system", "content": system_prompt_query},
+         {"role": "user", "content": user_prompt_query}
+     ]
+ 
+     response = client.chat.completions.create(
+         model=os.environ.get("MODEL"),
+         max_tokens=512,
+         temperature=0.5,
+         stop="<|im_end|>",
+         stream=True,
+         messages=messages
+     )
+ 
+     output_query = ""
+     for chunk in response:
+         if chunk.choices[0].delta.content:
+             output_query += chunk.choices[0].delta.content
+     match = re.search(r'\(.*\)', output_query, re.DOTALL)
+ 
+     if match:
+         extracted_query = match.group(0)  # keep the entire parenthesized expression
+     else:
+         extracted_query = output_query.strip()  # fall back to the raw output if matching fails
+ 
+     # Re-append `submittedDate` here if date filtering is needed:
+     # updated_query = f"{extracted_query} AND submittedDate:[{start_date} TO {end_date}]"
+     updated_query = f"{extracted_query}"
+     print('The response is:', updated_query)
+     return updated_query.strip()
+ 
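+ # Illustrative sketch (hypothetical topic; assumes OPENAI_API_KEY, OPENAI_API_BASE
+ # and MODEL are set): the returned string is a boolean abs-field query, e.g.
+ #
+ #     q = generate_query_qwen("Large Language Models in Recommendation Systems")
+ #     # q -> '(abs:"large language model" AND abs:"recommendation") AND (abs:"llm" OR ...) AND (abs:"nlp" OR ...)'
+ 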
+ def generate_generic_query_qwen(original_query, topic):
+     """
+     Transforms an overly strict arXiv query into a simplified, more generic version.
+ 
+     The new query must be in the format:
+     (abs:"<GenericTerm1>" AND abs:"<GenericTerm2>") OR (abs:"<GenericTerm3>" AND abs:"<GenericTerm4>")
+ 
+     Here, <GenericTerm1> and <GenericTerm2> represent two generic and common keywords
+     related to the given topic, while <GenericTerm3> and <GenericTerm4> are synonyms
+     or closely related terms for the first two. If the terms in the original query
+     are too strict, they are replaced with broader terms that improve matching
+     against arXiv articles.
+ 
+     Parameters:
+         original_query (str): The output of generate_query_qwen(), which is too strict.
+         topic (str): The research topic.
+ 
+     Returns:
+         str: The simplified arXiv query.
+     """
+ 
+     system_prompt = """
+     You are a research assistant specializing in constructing effective and broad arXiv search queries.
+     Your job is to transform an overly strict query into a simplified, generic one.
+ 
+     Instructions:
+     1. Input:
+        - A strict query that might be too specific.
+        - A topic which the query intends to capture.
+ 
+     2. Requirements:
+        - Create a new query that only has the structure:
+          (abs:"<GenericTerm1>" AND abs:"<GenericTerm2>") OR (abs:"<GenericTerm3>" AND abs:"<GenericTerm4>")
+        - Replace <GenericTerm1> and <GenericTerm2> with two generic and common keywords for the topic.
+        - Replace <GenericTerm3> and <GenericTerm4> with synonyms or closely related terms for <GenericTerm1> and <GenericTerm2>.
+        - If the terms from the original query are too narrow, modify them to more broadly represent the given topic.
+        - All keywords must be in lowercase and in their base form.
+        - Each term should be one or two words.
+ 
+     3. Output:
+        - Return only the final query in the exact format with no extra explanations.
+     """
+ 
+     user_prompt = f"""
+     Original Query: {original_query}
+     Topic: {topic}
+ 
+     The original query may be too strict and fail to match a broad range of arXiv articles.
+     Please generate a new query in the format:
+     (abs:"<GenericTerm1>" AND abs:"<GenericTerm2>") OR (abs:"<GenericTerm3>" AND abs:"<GenericTerm4>")
+     Replace <GenericTerm1> and <GenericTerm2> with more generic and commonly used terms that represent the topic.
+     Output only the final query.
+     """
+ 
+     openai_api_key = os.getenv("OPENAI_API_KEY")
+     openai_api_base = os.getenv("OPENAI_API_BASE")
+ 
+     # Initialize the OpenAI API client (same interface as above)
+     client = OpenAI(
+         api_key=openai_api_key,
+         base_url=openai_api_base,
+     )
+ 
+     messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": user_prompt},
+     ]
+ 
+     response = client.chat.completions.create(
+         model=os.environ.get("MODEL"),
+         max_tokens=512,
+         temperature=0.5,
+         stop="<|im_end|>",
+         stream=True,
+         messages=messages
+     )
+ 
+     output_query = ""
+     for chunk in response:
+         if chunk.choices[0].delta.content:
+             output_query += chunk.choices[0].delta.content
+ 
+     # Use a regex to extract the simplified query in the exact required format
+     match = re.search(r'\(.*\)', output_query, re.DOTALL)
+     if match:
+         extracted_query = match.group(0)
+     else:
+         extracted_query = output_query.strip()
+ 
+     print('The response is:', extracted_query)
+     return extracted_query.strip()
src/demo/asg_retriever.py ADDED
@@ -0,0 +1,364 @@
+ import torch
+ import uuid
+ import re
+ import os
+ import json
+ import chromadb
+ from .asg_splitter import TextSplitting
+ from langchain_huggingface import HuggingFaceEmbeddings
+ import time
+ import concurrent.futures
+ 
+ class Retriever:
+     client = None
+     cur_dir = os.getcwd()
+     chromadb_path = os.path.join(cur_dir, "chromadb")
+ 
+     def __init__(self):
+         self.client = chromadb.PersistentClient(path=self.chromadb_path)
+ 
+     def create_collection_chroma(self, collection_name: str):
+         """
+         The collection will be created with collection_name; the name must follow these rules:\n
+         0. The collection name must be unique; if it already exists, the existing collection is fetched instead.\n
+         1. The length of the name must be between 3 and 63 characters.\n
+         2. The name must start and end with a lowercase letter or a digit, and it can contain dots, dashes, and underscores in between.\n
+         3. The name must not contain two consecutive dots.\n
+         4. The name must not be a valid IP address.\n
+         """
+         try:
+             self.client.create_collection(name=collection_name)
+         except chromadb.db.base.UniqueConstraintError:
+             self.get_collection_chroma(collection_name)
+         return collection_name
+ 
+     def get_collection_chroma(self, collection_name: str):
+         collection = self.client.get_collection(name=collection_name)
+         return collection
+ 
+     def add_documents_chroma(self, collection_name: str, embeddings_list: list[list[float]], documents_list: list[str], metadata_list: list[dict]):
+         """
+         Make sure that embeddings_list and metadata_list are aligned with documents_list.\n
+         Example of one metadata entry: {"doc_name": "Test2.pdf", "page": "9"}\n
+         Ids are created automatically as UUID v4.
+         The chunk contents and metadata are logged (appended) into ./logs/<collection_name>.json
+         """
+         collection = self.get_collection_chroma(collection_name)
+         num = len(documents_list)
+         ids = [str(uuid.uuid4()) for _ in range(num)]
+ 
+         collection.add(
+             documents=documents_list,
+             metadatas=metadata_list,
+             embeddings=embeddings_list,
+             ids=ids
+         )
+         logpath = os.path.join(self.cur_dir, "logs", f"{collection_name}.json")
+         os.makedirs(os.path.dirname(logpath), exist_ok=True)
+         logs = []
+         try:
+             with open(logpath, 'r', encoding="utf-8") as chunklog:
+                 logs = json.load(chunklog)
+         except (FileNotFoundError, json.decoder.JSONDecodeError):
+             logs = []
+ 
+         added_log = [{"chunk_id": ids[i], "metadata": metadata_list[i], "page_content": documents_list[i]}
+                      for i in range(num)]
+ 
+         logs.extend(added_log)
+ 
+         # write back
+         with open(logpath, "w", encoding="utf-8") as chunklog:
+             json.dump(logs, chunklog, indent=4)
+         print(f"Logged document information to '{logpath}'.")
+ 
+     def query_chroma(self, collection_name: str, query_embeddings: list[list[float]], n_results: int = 5) -> dict:
+         # return the n closest results (chunks and metadata) in order
+         collection = self.get_collection_chroma(collection_name)
+         result = collection.query(
+             query_embeddings=query_embeddings,
+             n_results=n_results,
+         )
+         return result
+ 
+     def update_chroma(self, collection_name: str, id_list: list[str], embeddings_list: list[list[float]], documents_list: list[str], metadata_list: list[dict]):
+         collection = self.get_collection_chroma(collection_name)
+         num = len(documents_list)
+         collection.update(
+             ids=id_list,
+             embeddings=embeddings_list,
+             metadatas=metadata_list,
+             documents=documents_list,
+         )
+         update_list = [{"chunk_id": id_list[i], "metadata": metadata_list[i], "page_content": documents_list[i]} for i in range(num)]
+ 
+         # update the chunk log
+         logs = []
+ 
+         logpath = os.path.join(self.cur_dir, "logs", f"{collection_name}.json")
+         try:
+             with open(logpath, 'r', encoding="utf-8") as chunklog:
+                 logs = json.load(chunklog)
+         except (FileNotFoundError, json.decoder.JSONDecodeError):
+             logs = []  # the old log does not exist or is empty, so there is nothing to update
+         else:
+             for i in range(num):
+                 for log in logs:
+                     if log["chunk_id"] == update_list[i]["chunk_id"]:
+                         log["metadata"] = update_list[i]["metadata"]
+                         log["page_content"] = update_list[i]["page_content"]
+                         break
+ 
+         with open(logpath, "w", encoding="utf-8") as chunklog:
+             json.dump(logs, chunklog, indent=4)
+         print(f"Updated log file at '{logpath}'.")
+ 
+     def delete_collection_entries_chroma(self, collection_name: str, id_list: list[str]):
+         collection = self.get_collection_chroma(collection_name)
+         collection.delete(ids=id_list)
+         print(f"Deleted entries with ids: {id_list} from collection '{collection_name}'.")
+ 
+     def delete_collection_chroma(self, collection_name: str):
+         print(f"The collection {collection_name} will be deleted forever!")
+         self.client.delete_collection(collection_name)
+         try:
+             logpath = os.path.join(self.cur_dir, "logs", f"{collection_name}.json")
+             print(f"Collection {collection_name} has been removed, deleting the log file of this collection")
+             os.remove(logpath)
+         except FileNotFoundError:
+             print("The log of this collection did not exist!")
+ 
+     def list_collections_chroma(self):
+         return self.client.list_collections()
+ 
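+ # Illustrative round trip (sketch only; the 3-d embeddings below are dummies,
+ # real callers embed with a sentence-transformer as in process_pdf below):
+ #
+ #     r = Retriever()
+ #     r.create_collection_chroma("demo_collection")
+ #     r.add_documents_chroma("demo_collection", [[0.1, 0.2, 0.3]], ["hello world"], [{"doc_name": "demo.pdf"}])
+ #     print(r.query_chroma("demo_collection", [[0.1, 0.2, 0.3]], n_results=1))
+ 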
+ # Generate a legal collection name from a PDF filename
+ def legal_pdf(filename: str) -> str:
+     pdf_index = filename.lower().rfind('.pdf')
+     if pdf_index != -1:
+         name_before_pdf = filename[:pdf_index]
+     else:
+         name_before_pdf = filename
+     name_before_pdf = name_before_pdf.strip()
+     name = re.sub(r'[^a-zA-Z0-9._-]', '', name_before_pdf)
+     name = name.lower()
+     while '..' in name:
+         name = name.replace('..', '.')
+     name = name[:63]
+     if len(name) < 3:
+         name = name.ljust(3, '0')  # pad with '0' if the length is less than 3
+     if not re.match(r'^[a-z0-9]', name):
+         name = 'a' + name[1:]
+     if not re.match(r'[a-z0-9]$', name):
+         name = name[:-1] + 'a'
+     ip_pattern = re.compile(r'^(\d{1,3}\.){3}\d{1,3}$')
+     if ip_pattern.match(name):
+         name = 'ip_' + name
+     return name
+ 
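+ # For instance: legal_pdf("An Overview of LLMs (2024).PDF") returns
+ # "anoverviewofllms2024" - spaces and parentheses are stripped, the name is
+ # lower-cased, and the result satisfies the collection-name rules above.
+ 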
+ def process_pdf(file_path: str, survey_id: str, embedder: HuggingFaceEmbeddings, mode: str):
+     # Load and split the PDF
+     split_start_time = time.time()
+     splitters = TextSplitting().mineru_recursive_splitter(file_path, survey_id, mode)
+ 
+     documents_list = [document.page_content for document in splitters]
+     for i in range(len(documents_list)):
+         documents_list[i] = documents_list[i].replace('\n', ' ')
+     print(f"Splitting took {time.time() - split_start_time} seconds.")
+ 
+     # Embed the documents
+     # embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+     embed_start_time = time.time()
+     doc_results = embedder.embed_documents(documents_list)
+     if isinstance(doc_results, torch.Tensor):
+         embeddings_list = doc_results.tolist()
+     else:
+         embeddings_list = doc_results
+     print(f"Embedding took {time.time() - embed_start_time} seconds.")
+ 
+     # Prepare metadata
+     metadata_list = [{"doc_name": os.path.basename(file_path)} for _ in range(len(documents_list))]
+ 
+     title = os.path.splitext(os.path.basename(file_path))[0]
+ 
+     title_new = title.strip()
+     invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
+     for char in invalid_chars:
+         title_new = title_new.replace(char, ' ')
+     collection_name = legal_pdf(title_new)
+ 
+     retriever = Retriever()
+     retriever.list_collections_chroma()
+     retriever.create_collection_chroma(collection_name)
+     retriever.add_documents_chroma(
+         collection_name=collection_name,
+         embeddings_list=embeddings_list,
+         documents_list=documents_list,
+         metadata_list=metadata_list
+     )
+ 
+     return collection_name, embeddings_list, documents_list, metadata_list, title_new
+ 
+ def query_embeddings(collection_name: str, query_list: list):
+     embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+     retriever = Retriever()
+ 
+     final_context = ""
+ 
+     seen_chunks = set()
+     for query_text in query_list:
+         query_embeddings = embedder.embed_query(query_text)
+         query_result = retriever.query_chroma(collection_name=collection_name, query_embeddings=[query_embeddings], n_results=2)
+ 
+         query_result_chunks = query_result["documents"][0]
+         # query_result_ids = query_result["ids"][0]
+ 
+         for chunk in query_result_chunks:
+             if chunk not in seen_chunks:
+                 final_context += chunk.strip() + "//\n"
+                 seen_chunks.add(chunk)
+     return final_context
+ 
+ # new version: runs the queries in parallel
+ def query_embeddings_new(collection_name: str, query_list: list):
+     embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+     retriever = Retriever()
+ 
+     final_context = ""
+ 
+     seen_chunks = set()
+ 
+     def process_query(query_text):
+         query_embeddings = embedder.embed_query(query_text)
+         query_result = retriever.query_chroma(
+             collection_name=collection_name,
+             query_embeddings=[query_embeddings],
+             n_results=2
+         )
+         query_result_chunks = query_result["documents"][0]
+         return query_result_chunks
+ 
+     with concurrent.futures.ThreadPoolExecutor() as executor:
+         futures = {executor.submit(process_query, query_text): query_text for query_text in query_list}
+         for future in concurrent.futures.as_completed(futures):
+             query_result_chunks = future.result()
+             for chunk in query_result_chunks:
+                 if chunk not in seen_chunks:
+                     final_context += chunk.strip() + "//\n"
+                     seen_chunks.add(chunk)
+     return final_context
+ 
+ # wza
+ def query_embeddings_new_new(collection_name: str, query_list: list):
+     embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+     retriever = Retriever()
+ 
+     final_context = ""  # Stores the concatenated context
+     citation_data_list = []  # Stores chunk content and collection name as source
+     seen_chunks = set()  # Ensures only unique chunks are added
+ 
+     def process_query(query_text):
+         # Embed the query text and retrieve relevant chunks
+         query_embeddings = embedder.embed_query(query_text)
+         query_result = retriever.query_chroma(
+             collection_name=collection_name,
+             query_embeddings=[query_embeddings],
+             n_results=5  # Fixed number of results
+         )
+         return query_result
+ 
+     with concurrent.futures.ThreadPoolExecutor() as executor:
+         future_to_query = {executor.submit(process_query, q): q for q in query_list}
+         for future in concurrent.futures.as_completed(future_to_query):
+             query_text = future_to_query[future]
+             try:
+                 query_result = future.result()
+             except Exception as e:
+                 print(f"Query '{query_text}' failed with exception: {e}")
+                 continue
+ 
+             if "documents" not in query_result or "distances" not in query_result:
+                 continue
+             if not query_result["documents"] or not query_result["distances"]:
+                 continue
+             docs_list = query_result["documents"][0] if query_result["documents"] else []
+             dist_list = query_result["distances"][0] if query_result["distances"] else []
+ 
+             if len(docs_list) != len(dist_list):
+                 continue
+ 
+             for chunk, distance in zip(docs_list, dist_list):
+                 processed_chunk = chunk.strip()
+                 if processed_chunk not in seen_chunks:
+                     final_context += processed_chunk + "//\n"
+                     seen_chunks.add(processed_chunk)
+                     citation_data_list.append({
+                         "source": collection_name,
+                         "distance": distance,
+                         "content": processed_chunk,
+                     })
+ 
+     return final_context, citation_data_list
+ 
+ # concurrent version over both collection names and queries
+ def query_multiple_collections(collection_names: list[str], query_list: list[str], survey_id: str) -> dict:
+     """
+     Query multiple collections in parallel and return the combined results.
+ 
+     Args:
+         collection_names (list[str]): List of collection names to query.
+         query_list (list[str]): List of queries to execute on each collection.
+ 
+     Returns:
+         dict: Combined results from all collections, grouped by collection.
+     """
+     # Define the embedder inside the function
+     embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+     retriever = Retriever()
+ 
+     def query_single_collection(collection_name: str):
+         """
+         Query a single collection for all queries in query_list.
+         """
+         final_context = ""
+         seen_chunks = set()
+ 
+         def process_query(query_text):
+             # Embed the query
+             query_embeddings = embedder.embed_query(query_text)
+             # Query the collection
+             query_result = retriever.query_chroma(
+                 collection_name=collection_name,
+                 query_embeddings=[query_embeddings],
+                 n_results=5
+             )
+             query_result_chunks = query_result["documents"][0]
+             return query_result_chunks
+ 
+         # Process all queries in parallel for the given collection
+         with concurrent.futures.ThreadPoolExecutor() as executor:
+             futures = {executor.submit(process_query, query_text): query_text for query_text in query_list}
+             for future in concurrent.futures.as_completed(futures):
+                 query_result_chunks = future.result()
+                 for chunk in query_result_chunks:
+                     if chunk not in seen_chunks:
+                         final_context += chunk.strip() + "//\n"
+                         seen_chunks.add(chunk)
+ 
+         return final_context
+ 
+     # Outer parallelism over the collections
+     results = {}
+     with concurrent.futures.ThreadPoolExecutor() as executor:
+         futures = {executor.submit(query_single_collection, collection_name): collection_name for collection_name in collection_names}
+         for future in concurrent.futures.as_completed(futures):
+             collection_name = futures[future]
+             results[collection_name] = future.result()
+ 
+     # Automatically save the results to a JSON file
+     file_path = f'./src/static/data/info/{survey_id}/retrieved_context.json'
+     with open(file_path, 'w', encoding='utf-8') as f:
+         json.dump(results, f, ensure_ascii=False, indent=4)
+ 
+     print(f"Results saved to {file_path}")
+ 
+     return results
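+ 
+ # Illustrative call (sketch; the queries are hypothetical, and the directory
+ # ./src/static/data/info/<survey_id>/ must already exist for the JSON dump):
+ #
+ #     ctx = query_multiple_collections(
+ #         ["activelearningfrompositiveandunlabeleddata"],
+ #         ["What is active learning?", "How is class imbalance handled?"],
+ #         "test",
+ #     )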
src/demo/asg_splitter.py ADDED
@@ -0,0 +1,25 @@
+ from .asg_loader import DocumentLoading
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ 
+ class TextSplitting:
+     def mineru_recursive_splitter(self, file_path, survey_id, mode):
+         docs = DocumentLoading().load_pdf(file_path, survey_id, mode)
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=400,
+             chunk_overlap=30,
+             length_function=len,
+             is_separator_regex=False,
+         )
+         texts = text_splitter.create_documents([docs])
+         return texts
+ 
+     def pypdf_recursive_splitter(self, file_path, survey_id):
+         docs = DocumentLoading().pypdf_loader(file_path, survey_id)
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=300,
+             chunk_overlap=20,
+             length_function=len,
+             is_separator_regex=False,
+         )
+         texts = text_splitter.create_documents([docs])
+         return texts
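+ 
+ # Illustrative effect of the 400/30 settings above (sketch; the file path and
+ # mode value are hypothetical): a 1,000-character document yields roughly three
+ # chunks, with 30 characters shared between consecutive chunks so sentences cut
+ # at a boundary stay retrievable:
+ #
+ #     texts = TextSplitting().mineru_recursive_splitter("paper.pdf", "test", "mode")
+ #     print(len(texts), texts[0].page_content[:80])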
src/demo/category_and_tsne.py ADDED
@@ -0,0 +1,231 @@
+ from sklearn.metrics import silhouette_score
+ 
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from sklearn.manifold import TSNE
+ from sklearn.cluster import AgglomerativeClustering
+ import json
+ 
+ IMG_PATH = './src/static/img/'
+ 
+ plt.switch_backend('agg')
+ device = 0
+ # tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", model_max_length = 128)
+ # model = AutoModel.from_pretrained("bert-base-uncased").to(device)
+ 
+ from sentence_transformers import SentenceTransformer
+ from umap import UMAP
+ from sklearn.decomposition import PCA
+ from sklearn.feature_extraction.text import CountVectorizer
+ from bertopic.vectorizers import ClassTfidfTransformer
+ from bertopic.representation import KeyBERTInspired
+ from bertopic import BERTopic
+ 
+ class DimensionalityReduction:
+     def fit(self, X):
+         return self
+ 
+     def transform(self, X):
+         return X
+ 
+ class ClusteringWithTopic:
+     # NOTE: this first __init__ is shadowed by the second definition below and
+     # is therefore never used; the active constructor takes n_topics_list.
+     def __init__(self, df, n_topics=3):
+         embedding_model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
+         # umap_model = DimensionalityReduction()
+         umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', init='pca')
+         hdbscan_model = AgglomerativeClustering(n_clusters=n_topics)
+         vectorizer_model = CountVectorizer(stop_words="english", min_df=1, ngram_range=(1, 2))
+         ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=False)  # True
+         keybert_model = KeyBERTInspired()
+ 
+         self.df = df
+         self.embeddings = embeddings = embedding_model.encode(df, show_progress_bar=True)
+ 
+         representation_model = {
+             "KeyBERT": keybert_model,
+             # "OpenAI": openai_model,  # Uncomment if you will use OpenAI
+             # "MMR": mmr_model,
+             # "POS": pos_model
+         }
+         self.topic_model = BERTopic(
+             # Pipeline models
+             embedding_model=embedding_model,
+             umap_model=umap_model,
+             hdbscan_model=hdbscan_model,
+             vectorizer_model=vectorizer_model,
+             ctfidf_model=ctfidf_model,
+             representation_model=representation_model,
+ 
+             # Hyperparameters
+             top_n_words=10,
+             verbose=True
+         )
+ 
+     def __init__(self, df, n_topics_list):
+         """
+         Initialize ClusteringWithTopic with an n_topics_list of several candidate
+         cluster counts; the result with the highest silhouette_score is kept.
+         """
+         embedding_model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
+         self.embeddings = embedding_model.encode(df, show_progress_bar=True)
+ 
+         self.df = df
+         self.n_topics_list = n_topics_list
+ 
+         self.embedding_model = embedding_model
+         self.umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', init='pca')
+         self.vectorizer_model = CountVectorizer(stop_words="english", min_df=1, ngram_range=(1, 2))
+         self.ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=False)
+         self.keybert_model = KeyBERTInspired()
+         self.representation_model = {"KeyBERT": self.keybert_model}
+ 
+         # Stores the best result across the candidate cluster counts
+         self.best_n_topics = None
+         self.best_labels = None
+         self.best_score = -1
+ 
+     # def fit_and_get_labels(self, X):
+     #     topics, probs = self.topic_model.fit_transform(self.df, self.embeddings)
+     #     return topics
+     def fit_and_get_labels(self):
+         """
+         Cluster with each candidate n_topics, compute the silhouette_score, and
+         keep the best n_topics for the subsequent steps.
+         """
+         for n_topics in self.n_topics_list:
+             hdbscan_model = AgglomerativeClustering(n_clusters=n_topics)
+ 
+             topic_model = BERTopic(
+                 embedding_model=self.embedding_model,
+                 umap_model=self.umap_model,
+                 hdbscan_model=hdbscan_model,
+                 vectorizer_model=self.vectorizer_model,
+                 ctfidf_model=self.ctfidf_model,
+                 representation_model=self.representation_model,
+                 top_n_words=10,
+                 verbose=False
+             )
+ 
+             topics, _ = topic_model.fit_transform(self.df, self.embeddings)
+ 
+             # Compute the silhouette_score
+             if len(set(topics)) > 1:  # silhouette_score needs at least 2 clusters
+                 score = silhouette_score(self.embeddings, topics)
+             else:
+                 score = -1  # silhouette_score is meaningless for a single cluster
+ 
+             print(f"n_topics={n_topics}, silhouette_score={score}")
+ 
+             # Keep the best n_topics
+             if score > self.best_score:
+                 self.best_score = score
+                 self.best_n_topics = n_topics
+                 self.best_labels = topics
+                 self.best_topic_model = topic_model
+ 
+         print(f"Best n_topics={self.best_n_topics}, Best silhouette_score={self.best_score}")
+         return self.best_labels, self.best_topic_model, self.best_n_topics
+ 
+ # NOTE: this first clustering() is shadowed by the second definition below
+ # (which takes a list of candidate cluster counts) and is therefore never used.
+ def clustering(df, n_cluster, survey_id):
+     text = df['retrieval_result'].astype(str)
+     clustering = ClusteringWithTopic(text, n_cluster)
+     df['label'] = clustering.fit_and_get_labels(text)
+ 
+     print("The clustering result is: ")
+     for col in df.columns:
+         print(f"{col}: {df.iloc[0][col]}")
+ 
+     # Save topic model information as JSON
+     topic_json = clustering.topic_model.get_topic_info().to_json()
+     with open(f'./src/static/data/info/{survey_id}/topic.json', 'w', encoding="utf-8") as file:
+         file.write(topic_json)
+ 
+     # Create a dictionary from the 'ref_title' and 'retrieval_result' columns
+     description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
+ 
+     # Save the dictionary to description.json
+     with open(f'./src/static/data/info/{survey_id}/description.json', 'w', encoding="utf-8") as file:
+         json.dump(description_dict, file, ensure_ascii=False, indent=4)
+     # df['top_n_words'] = clustering.topic_model.get_topic_info()['Representation'].tolist()
+     # df['topic_word'] = clustering.topic_model.get_topic_info()['KeyBERT'].tolist()
+ 
+     X = np.array(clustering.embeddings)
+     perplexity = 10
+     if X.shape[0] <= perplexity:
+         perplexity = max(1, X.shape[0] // 2)
+ 
+     tsne = TSNE(n_components=2, init='pca', perplexity=perplexity, random_state=42)
+     X_tsne = tsne.fit_transform(X)
+     colors = scatter(X_tsne, df['label'])
+ 
+     plt.savefig(IMG_PATH + 'tsne_' + survey_id + '.png', dpi=800, transparent=True)
+ 
+     plt.close()
+     output_tsv_filename = "./src/static/data/tsv/" + survey_id + '.tsv'
+     df.to_csv(output_tsv_filename, sep='\t')
+     return df, colors
+ 
+ def clustering(df, n_topics_list, survey_id):
+     text = df['retrieval_result'].astype(str)
+     clustering = ClusteringWithTopic(text, n_topics_list)
+     df['label'], topic_model, best_n_topics = clustering.fit_and_get_labels()
+ 
+     print("The clustering result is: ")
+     for col in df.columns:
+         print(f"{col}: {df.iloc[0][col]}")
+ 
+     # Save the topic model information
+     topic_json = topic_model.get_topic_info().to_json()
+     with open(f'./src/static/data/info/{survey_id}/topic.json', 'w', encoding="utf-8") as file:
+         file.write(topic_json)
+ 
+     # Build the description mapping
+     description_dict = dict(zip(df['ref_title'], df['retrieval_result']))
+     with open(f'./src/static/data/info/{survey_id}/description.json', 'w', encoding="utf-8") as file:
+         json.dump(description_dict, file, ensure_ascii=False, indent=4)
+ 
+     # t-SNE dimensionality reduction for visualization
+     X = np.array(clustering.embeddings)
+     perplexity = min(10, max(1, X.shape[0] // 2))  # keep perplexity below the sample count
+ 
+     tsne = TSNE(n_components=2, init='pca', perplexity=perplexity, random_state=42)
+     X_tsne = tsne.fit_transform(X)
+ 
+     colors = scatter(X_tsne, df['label'])  # compute the cluster colors
+ 
+     plt.savefig(IMG_PATH + 'tsne_' + survey_id + '.png', dpi=800, transparent=True)
+ 
+     plt.close()
+     output_tsv_filename = "./src/static/data/tsv/" + survey_id + '.tsv'
+     df.to_csv(output_tsv_filename, sep='\t')
+     return df, colors, best_n_topics
+ 
211
+ def scatter(x, colors):
212
+ sns.set_style('whitegrid')
213
+ sns.set_palette('Set1')
214
+ sns.set_context("notebook", font_scale=1.5,
215
+ rc={"lines.linewidth": 2.5})
216
+ # We choose a color palette with seaborn.
217
+ palette = np.array(sns.hls_palette(8, l=0.4, s=.8))
218
+ color_hex = sns.color_palette(sns.hls_palette(8, l=0.4, s=.8)).as_hex()
219
+ # We create a scatter plot.
220
+ f = plt.figure(figsize=(8, 8))
221
+ ax = plt.subplot(aspect='equal')
222
+ sc = ax.scatter(x[:, 0], x[:, 1], lw=0, s=1,
223
+ c=palette[colors.astype(np.int32)])
224
+ c = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in colors]
225
+ for i in range(x.shape[0]):
226
+ ax.text(x[i, 0], x[i, 1], '[' + str(i) + ']', fontsize=20, color=c[i], weight='1000')
227
+ plt.xlim(-25, 25)
228
+ plt.ylim(-25, 25)
229
+ ax.axis('off')
230
+ ax.axis('tight')
231
+ return color_hex[:colors.nunique()]
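
Both clustering() variants above clamp the t-SNE perplexity so it stays below the number of embedded documents (scikit-learn requires perplexity < n_samples). A minimal standalone sketch of that guard; the tsne_2d helper and the toy 384-dimensional embeddings are illustrative, not part of this module:

```python
# Sketch of the perplexity guard shared by both clustering() variants above.
import numpy as np
from sklearn.manifold import TSNE

def tsne_2d(embeddings: np.ndarray) -> np.ndarray:
    # TSNE requires perplexity < n_samples, so tiny corpora must clamp it.
    perplexity = min(10, max(1, embeddings.shape[0] // 2))
    tsne = TSNE(n_components=2, init='pca', perplexity=perplexity, random_state=42)
    return tsne.fit_transform(embeddings)

if __name__ == "__main__":
    X = np.random.rand(6, 384)  # e.g. six MiniLM sentence embeddings (toy data)
    print(tsne_2d(X).shape)     # (6, 2)
```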
src/demo/count_files.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ def count_files_in_folders(parent_folder):
4
+ folder_counts = {}
5
+
6
+ for folder in os.listdir(parent_folder):
7
+ folder_path = os.path.join(parent_folder, folder)
8
+ if os.path.isdir(folder_path):
9
+ num_files = len([f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))])
10
+ folder_counts[folder] = num_files
11
+
12
+ return folder_counts
13
+
14
+ # Path to the arxiv_downloads folder
15
+ parent_folder = "arxiv_downloads"
16
+ counts = count_files_in_folders(parent_folder)
17
+
18
+ # Print the per-folder file counts
19
+ for folder, num_files in counts.items():
20
+ print(f"{folder}: {num_files} files")
src/demo/download.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import xml.etree.ElementTree as ET
4
+ import urllib.parse
5
+ from tqdm import tqdm
6
+ import time
7
+ import re
8
+
9
+ import requests
10
+ from asg_query import generate_generic_query_qwen, generate_query_qwen
11
+ from dotenv import load_dotenv
12
+
13
+ load_dotenv()
14
+
15
+ PARENT_FOLDER = "arxiv_downloads_new_new_new"
16
+ os.makedirs(PARENT_FOLDER, exist_ok=True)
17
+
18
+ def sanitize_filename(filename):
19
+ filename = filename.replace("\n", "").strip()
20
+ filename = re.sub(r'[\/:*?"<>|]', '_', filename)
21
+ return filename[:100] + ".pdf"
22
+
23
+ def search_arxiv_papers(topic, max_results=50):
24
+ query_qwen = generate_query_qwen(topic)
25
+ encoded_query = urllib.parse.quote_plus(query_qwen)
26
+ url = f"https://export.arxiv.org/api/query?search_query={encoded_query}&start=0&max_results={max_results}&sortBy=submittedDate"
27
+
28
+ # base_url = "http://export.arxiv.org/api/query?"
29
+ # query = f"search_query=all:{topic.replace(' ', '+')}&start=0&max_results={max_results}&sortBy=relevance&sortOrder=descending"
30
+ # url = base_url + query
31
+
32
+ response = requests.get(url)
33
+ if response.status_code != 200:
34
+ print(f"Error fetching data for {topic}: {response.status_code}")
35
+ return []
36
+
37
+ root = ET.fromstring(response.text)
38
+ entries = root.findall("{http://www.w3.org/2005/Atom}entry")
39
+
40
+ papers = []
41
+ for entry in entries:
42
+ title = entry.find("{http://www.w3.org/2005/Atom}title").text
43
+ pdf_link = entry.find("{http://www.w3.org/2005/Atom}id").text.replace("abs", "pdf")
44
+ papers.append({"title": title, "pdf_link": pdf_link})
45
+
46
+ return papers
47
+
48
+ def download_pdf(url, folder, filename):
49
+ file_path = os.path.join(folder, filename)
50
+
51
+ response = requests.get(url, stream=True)
52
+ if response.status_code == 200:
53
+ with open(file_path, 'wb') as file:
54
+ for chunk in response.iter_content(chunk_size=1024):
55
+ file.write(chunk)
56
+ else:
57
+ print(f"Failed to download {url}")
58
+
59
+ def download_arxiv_papers(topic, max_results=50):
60
+ folder_name = os.path.join(PARENT_FOLDER, topic.replace(" ", "_"))
61
+ os.makedirs(folder_name, exist_ok=True)
62
+
63
+ papers = search_arxiv_papers(topic, max_results)
64
+
65
+ if not papers:
66
+ print(f"No papers found for topic: {topic}")
67
+ return
68
+
69
+ print(f"Downloading {len(papers)} papers for topic: {topic}")
70
+
71
+ for paper in tqdm(papers, total=len(papers)):
72
+ filename = sanitize_filename(paper['title'])
73
+ pdf_link = paper["pdf_link"]
74
+ download_pdf(pdf_link, folder_name, filename)
75
+ time.sleep(2)
76
+
77
+ print(f"Download complete. Papers saved in: {folder_name}")
78
+
79
+
80
+ def search_arxiv_with_query(query, max_results=50):
81
+ """
82
+ Query the arXiv API with a given query string.
83
+
84
+ Parameters:
85
+ query (str): The query string (URL-unencoded).
86
+ max_results (int): Maximum number of results to request.
87
+
88
+ Returns:
89
+ list: A list of dictionaries containing paper metadata.
90
+ """
91
+ encoded_query = urllib.parse.quote_plus(query)
92
+ url = f"https://export.arxiv.org/api/query?search_query={encoded_query}&start=0&max_results={max_results}&sortBy=submittedDate"
93
+
94
+ response = requests.get(url)
95
+ if response.status_code != 200:
96
+ print(f"Error fetching data with query: {query} | status code: {response.status_code}")
97
+ return []
98
+
99
+ try:
100
+ root = ET.fromstring(response.text)
101
+ except Exception as e:
102
+ print("Error parsing XML:", e)
103
+ return []
104
+
105
+ entries = root.findall("{http://www.w3.org/2005/Atom}entry")
106
+ papers = []
107
+ for entry in entries:
108
+ title = entry.find("{http://www.w3.org/2005/Atom}title").text.strip()
109
+ pdf_link = entry.find("{http://www.w3.org/2005/Atom}id").text.replace("abs", "pdf")
110
+ papers.append({"title": title, "pdf_link": pdf_link})
111
+ return papers
112
+
113
+ def download_arxiv_papers_new(topic, max_results=50, min_results=10):
114
+ """
115
+ Download arXiv papers for a given topic.
116
+
117
+ Process:
118
+ 1. Use a strict query generated by generate_query_qwen(topic) to query arXiv.
119
+ 2. If the number of results is fewer than `min_results`, then generate a more generic query
120
+ using generate_generic_query_qwen() and run additional searches.
121
+ 3. Combine non-duplicate papers (filtered by title) until reaching max_results or exhausting attempts.
122
+ 4. Download the PDF of each paper.
123
+
124
+ Parameters:
125
+ topic (str): The research topic.
126
+ max_results (int): Total maximum number of papers to download (default is 50).
127
+ min_results (int): Minimum acceptable number of papers from the first query (default is 10).
128
+ """
129
+ folder_name = os.path.join(PARENT_FOLDER, topic.replace(" ", "_"))
130
+ os.makedirs(folder_name, exist_ok=True)
131
+
132
+ # 1. Initial strict query.
133
+ strict_query = generate_query_qwen(topic)
134
+ papers = search_arxiv_with_query(strict_query, max_results=max_results)
135
+
136
+ # Use a dict keyed by title to avoid duplicates.
137
+ total_papers = {paper["title"]: paper for paper in papers}
138
+ print(f"[Strict Query] Found {len(total_papers)} papers for topic: {topic}")
139
+
140
+ # 2. If the strict query returns fewer than min_results papers,
141
+ # use the generic query to broaden the search.
142
+ attempts = 0
143
+ MAX_ATTEMPTS = 5 # Limit attempts to avoid infinite loops.
144
+ while len(total_papers) < max_results and len(total_papers) < min_results and attempts < MAX_ATTEMPTS:
145
+ # Generate a less strict (generic) query
146
+ generic_query = generate_generic_query_qwen(strict_query, topic)
147
+ print(f"[Generic Query Attempt {attempts + 1}] Using generic query: {generic_query}")
148
+ generic_papers = search_arxiv_with_query(generic_query, max_results=max_results)
149
+
150
+ new_count = 0
151
+ for paper in generic_papers:
152
+ if paper["title"] not in total_papers:
153
+ total_papers[paper["title"]] = paper
154
+ new_count += 1
155
+ if len(total_papers) >= max_results:
156
+ break
157
+
158
+ attempts += 1
159
+ strict_query = generic_query # Update the query for the next iteration.
160
+
161
+ total_paper_list = list(total_papers.values())[:max_results]
162
+
163
+ if not total_paper_list:
164
+ print(f"No papers found for topic: {topic}")
165
+ return
166
+
167
+ print(f"Downloading {len(total_paper_list)} papers for topic: {topic}")
168
+ for paper in tqdm(total_paper_list, total=len(total_paper_list)):
169
+ filename = sanitize_filename(paper['title'])
170
+ pdf_link = paper["pdf_link"]
171
+ download_pdf(pdf_link, folder_name, filename)
172
+ time.sleep(2) # Delay to avoid overwhelming the arXiv API
173
+
174
+ print(f"Download complete. Papers saved in: {folder_name}")
175
+
176
+ first_topics = [
177
+ "quantum computing: bqp, quantum supremacy, and related concepts",
178
+ "fixed-parameter tractability and related concepts in computational complexity",
179
+ "fundamental concepts in computational complexity theory",
180
+ "pcp theorem and its implications in approximation and complexity theory",
181
+ "interconnections in theoretical computer science: seth, 3sum, apsp, and related concepts",
182
+ "nosql database systems for flexible and scalable data management",
183
+ "temporal databases, real-time databases, and data management systems",
184
+ "large language model integration with databases for enhanced data management and survey analysis",
185
+ "ai-driven database management",
186
+ "distributed systems and databases: key concepts and technologies",
187
+ "graph databases and query languages: traversal, indexing, and analytics",
188
+ "graph databases: models, data modeling, and applications",
189
+ "multi-model databases: mongodb, arangodb, and jsonb",
190
+ "time-series data management and analytics",
191
+ "advanced data management and retrieval techniques",
192
+ "vector databases and their role in modern data management and retrieval",
193
+ "content delivery networks: technologies and strategies for optimization",
194
+ "lpwan technologies: lora, zigbee 3.0, 6lowpan, and related protocols in iot",
195
+ "network slicing and emerging technologies in 6g networks",
196
+ "advanced concepts and technologies in software-defined networking and network function virtualization",
197
+ "battery electrolyte formulation in lithium-ion batteries",
198
+ "flow batteries as energy storage systems",
199
+ "internal consistency, self-feedback, and reliability in large language models",
200
+ "attention mechanisms in large language models",
201
+ "controlled text generation with large language models in natural language processing",
202
+ "domain adaptation and specialized nlp applications",
203
+ "evaluation of large language models for natural language processing",
204
+ "information extraction and large language models in natural language processing",
205
+ "techniques for low-resource natural language processing",
206
+ "model compression techniques for transformer models",
207
+ "multi-agent offline policy reinforcement learning: decentralized learning and cooperative policy optimization",
208
+ "multimodal learning and its applications",
209
+ "reasoning capabilities of large language models",
210
+ "transformer models in natural language processing"
211
+ ]
212
+
213
+ second_topics = [
214
+ "semi-supervised learning",
215
+ "out-of-distribution detection",
216
+ "in-context learning"
217
+ ]
218
+
219
+ if __name__ == '__main__':
220
+ for topic in first_topics:
221
+ print(f"\nProcessing topic (first list): {topic}")
222
+ download_arxiv_papers_new(topic, max_results=50, min_results=20)
223
+ for topic in second_topics:
224
+ print(f"\nProcessing topic (second list): {topic}")
225
+ download_arxiv_papers_new(topic, max_results=50, min_results=20)
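
Every arXiv call in this file follows the same Atom-feed pattern: URL-encode the query, fetch, then read entries under the http://www.w3.org/2005/Atom namespace. A hedged standalone sketch of that pattern; the fetch_titles helper is illustrative (not part of the module) and passes the namespace as a dict rather than inlining it in each tag:

```python
# Standalone sketch of the arXiv Atom parsing pattern used throughout download.py.
import urllib.parse
import xml.etree.ElementTree as ET

import requests

ATOM = {"atom": "http://www.w3.org/2005/Atom"}  # Atom namespace used by arXiv

def fetch_titles(query: str, max_results: int = 5) -> list:
    url = ("https://export.arxiv.org/api/query?search_query="
           f"{urllib.parse.quote_plus(query)}&start=0&max_results={max_results}")
    root = ET.fromstring(requests.get(url, timeout=30).text)
    return [entry.find("atom:title", ATOM).text.strip()
            for entry in root.findall("atom:entry", ATOM)]

if __name__ == "__main__":
    print(fetch_titles('abs:"large language model*"'))
```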
src/demo/latex_template/acl.sty ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ % This is the LaTex style file for *ACL.
2
+ % The official sources can be found at
3
+ %
4
+ % https://github.com/acl-org/acl-style-files/
5
+ %
6
+ % This package is activated by adding
7
+ %
8
+ % \usepackage{acl}
9
+ %
10
+ % to your LaTeX file. When submitting your paper for review, add the "review" option:
11
+ %
12
+ % \usepackage[review]{acl}
13
+
14
+ \newif\ifacl@finalcopy
15
+ \newif\ifacl@anonymize
16
+ \newif\ifacl@linenumbers
17
+ \newif\ifacl@pagenumbers
18
+ \DeclareOption{final}{\acl@finalcopytrue\acl@anonymizefalse\acl@linenumbersfalse\acl@pagenumbersfalse}
19
+ \DeclareOption{review}{\acl@finalcopyfalse\acl@anonymizetrue\acl@linenumberstrue\acl@pagenumberstrue}
20
+ \DeclareOption{preprint}{\acl@finalcopytrue\acl@anonymizefalse\acl@linenumbersfalse\acl@pagenumberstrue}
21
+ \ExecuteOptions{final} % final copy is the default
22
+
23
+ % include hyperref, unless user specifies nohyperref option like this:
24
+ % \usepackage[nohyperref]{acl}
25
+ \newif\ifacl@hyperref
26
+ \DeclareOption{hyperref}{\acl@hyperreftrue}
27
+ \DeclareOption{nohyperref}{\acl@hyperreffalse}
28
+ \ExecuteOptions{hyperref} % default is to use hyperref
29
+ \ProcessOptions\relax
30
+
31
+ \typeout{Conference Style for ACL}
32
+
33
+ \usepackage{xcolor}
34
+
35
+ \ifacl@linenumbers
36
+ % Add draft line numbering via the lineno package
37
+ % https://texblog.org/2012/02/08/adding-line-numbers-to-documents/
38
+ \usepackage[switch,mathlines]{lineno}
39
+
40
+ % Line numbers in gray Helvetica 8pt
41
+ \font\aclhv = phvb at 8pt
42
+ \renewcommand\linenumberfont{\aclhv\color{lightgray}}
43
+
44
+ % Zero-fill line numbers
45
+ % NUMBER with left flushed zeros \fillzeros[<WIDTH>]<NUMBER>
46
+ \newcount\cv@tmpc@ \newcount\cv@tmpc
47
+ \def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi
48
+ \cv@tmpc=1 %
49
+ \loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi
50
+ \ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat
51
+ \ifnum#2<0\advance\cv@tmpc1\relax-\fi
52
+ \loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat
53
+ \cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}%
54
+ \renewcommand\thelinenumber{\fillzeros[3]{\arabic{linenumber}}}
55
+ \linenumbers
56
+
57
+ \setlength{\linenumbersep}{1.6cm}
58
+
59
+ % Bug: An equation with $$ ... $$ isn't numbered, nor is the previous line.
60
+
61
+ % Patch amsmath commands so that the previous line and the equation itself
62
+ % are numbered. Bug: multline has an extra line number.
63
+ % https://tex.stackexchange.com/questions/461186/how-to-use-lineno-with-amsmath-align
64
+ \usepackage{etoolbox} %% <- for \pretocmd, \apptocmd and \patchcmd
65
+
66
+ \newcommand*\linenomathpatch[1]{%
67
+ \expandafter\pretocmd\csname #1\endcsname {\linenomath}{}{}%
68
+ \expandafter\pretocmd\csname #1*\endcsname {\linenomath}{}{}%
69
+ \expandafter\apptocmd\csname end#1\endcsname {\endlinenomath}{}{}%
70
+ \expandafter\apptocmd\csname end#1*\endcsname {\endlinenomath}{}{}%
71
+ }
72
+ \newcommand*\linenomathpatchAMS[1]{%
73
+ \expandafter\pretocmd\csname #1\endcsname {\linenomathAMS}{}{}%
74
+ \expandafter\pretocmd\csname #1*\endcsname {\linenomathAMS}{}{}%
75
+ \expandafter\apptocmd\csname end#1\endcsname {\endlinenomath}{}{}%
76
+ \expandafter\apptocmd\csname end#1*\endcsname {\endlinenomath}{}{}%
77
+ }
78
+
79
+ %% Definition of \linenomathAMS depends on whether the mathlines option is provided
80
+ \expandafter\ifx\linenomath\linenomathWithnumbers
81
+ \let\linenomathAMS\linenomathWithnumbers
82
+ %% The following line gets rid of an extra line numbers at the bottom:
83
+ \patchcmd\linenomathAMS{\advance\postdisplaypenalty\linenopenalty}{}{}{}
84
+ \else
85
+ \let\linenomathAMS\linenomathNonumbers
86
+ \fi
87
+
88
+ \AtBeginDocument{%
89
+ \linenomathpatch{equation}%
90
+ \linenomathpatchAMS{gather}%
91
+ \linenomathpatchAMS{multline}%
92
+ \linenomathpatchAMS{align}%
93
+ \linenomathpatchAMS{alignat}%
94
+ \linenomathpatchAMS{flalign}%
95
+ }
96
+ \else
97
+ % Hack to ignore these commands, which review mode puts into the .aux file.
98
+ \newcommand{\@LN@col}[1]{}
99
+ \newcommand{\@LN}[2]{}
100
+ \newcommand{\nolinenumbers}{}
101
+ \fi
102
+
103
+ \PassOptionsToPackage{a4paper,margin=2.5cm,heightrounded=true}{geometry}
104
+ \RequirePackage{geometry}
105
+
106
+ \setlength\columnsep{0.6cm}
107
+ \newlength\titlebox
108
+ \setlength\titlebox{11\baselineskip}
109
+ % \titlebox should be a multiple of \baselineskip so that
110
+ % column height remaining fits an exact number of lines of text
111
+
112
+ \flushbottom \twocolumn \sloppy
113
+
114
+ % We're never going to need a table of contents, so just flush it to
115
+ % save space --- suggested by drstrip@sandia-2
116
+ \def\addcontentsline#1#2#3{}
117
+
118
+ \ifacl@pagenumbers
119
+ \pagenumbering{arabic}
120
+ \else
121
+ \thispagestyle{empty}
122
+ \pagestyle{empty}
123
+ \fi
124
+
125
+ %% Title and Authors %%
126
+
127
+ \let\Thanks\thanks % \Thanks and \thanks used to be different, but keep this for backwards compatibility.
128
+
129
+ \newcommand\outauthor{%
130
+ \begin{tabular}[t]{c}
131
+ \ifacl@anonymize
132
+ \bfseries Anonymous ACL submission
133
+ \else
134
+ \bfseries\@author
135
+ \fi
136
+ \end{tabular}}
137
+
138
+ % Mostly taken from deproc.
139
+ \AtBeginDocument{
140
+ \def\maketitle{\par
141
+ \begingroup
142
+ \def\thefootnote{\fnsymbol{footnote}}
143
+ \twocolumn[\@maketitle]
144
+ \@thanks
145
+ \endgroup
146
+ \setcounter{footnote}{0}
147
+ \let\maketitle\relax
148
+ \let\@maketitle\relax
149
+ \gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax}
150
+ \def\@maketitle{\vbox to \titlebox{\hsize\textwidth
151
+ \linewidth\hsize \vskip 0.125in minus 0.125in \centering
152
+ {\Large\bfseries \@title \par} \vskip 0.2in plus 1fil minus 0.1in
153
+ {\def\and{\unskip\enspace{\rmfamily and}\enspace}%
154
+ \def\And{\end{tabular}\hss \egroup \hskip 1in plus 2fil
155
+ \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\bfseries}%
156
+ \def\AND{\end{tabular}\hss\egroup \hfil\hfil\egroup
157
+ \vskip 0.25in plus 1fil minus 0.125in
158
+ \hbox to \linewidth\bgroup\large \hfil\hfil
159
+ \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\bfseries}
160
+ \hbox to \linewidth\bgroup\large \hfil\hfil
161
+ \hbox to 0pt\bgroup\hss
162
+ \outauthor
163
+ \hss\egroup
164
+ \hfil\hfil\egroup}
165
+ \vskip 0.3in plus 2fil minus 0.1in
166
+ }}
167
+ }
168
+
169
+ % margins and font size for abstract
170
+ \renewenvironment{abstract}%
171
+ {\begin{center}\large\textbf{\abstractname}\end{center}%
172
+ \begin{list}{}%
173
+ {\setlength{\rightmargin}{0.6cm}%
174
+ \setlength{\leftmargin}{0.6cm}}%
175
+ \item[]\ignorespaces%
176
+ \@setsize\normalsize{12pt}\xpt\@xpt
177
+ }%
178
+ {\unskip\end{list}}
179
+
180
+ % Resizing figure and table captions - SL
181
+ % Support for interacting with the caption, subfigure, and subcaption packages - SL
182
+ \RequirePackage{caption}
183
+ \DeclareCaptionFont{10pt}{\fontsize{10pt}{12pt}\selectfont}
184
+ \captionsetup{font=10pt}
185
+
186
+ \RequirePackage{natbib}
187
+ % for citation commands in the .tex, authors can use:
188
+ % \citep, \citet, and \citeyearpar for compatibility with natbib, or
189
+ % \cite, \newcite, and \shortcite for compatibility with older ACL .sty files
190
+ \renewcommand\cite{\citep} % to get "(Author Year)" with natbib
191
+ \newcommand\shortcite{\citeyearpar}% to get "(Year)" with natbib
192
+ \newcommand\newcite{\citet} % to get "Author (Year)" with natbib
193
+ \newcommand{\citeposs}[1]{\citeauthor{#1}'s (\citeyear{#1})} % to get "Author's (Year)"
194
+
195
+ \bibliographystyle{acl_natbib}
196
+
197
+ % Bibliography
198
+
199
+ % Don't put a label in the bibliography at all. Just use the unlabeled format
200
+ % instead.
201
+ \def\thebibliography#1{\vskip\parskip%
202
+ \vskip\baselineskip%
203
+ \def\baselinestretch{1}%
204
+ \ifx\@currsize\normalsize\@normalsize\else\@currsize\fi%
205
+ \vskip-\parskip%
206
+ \vskip-\baselineskip%
207
+ \section*{References\@mkboth
208
+ {References}{References}}\list
209
+ {}{\setlength{\labelwidth}{0pt}\setlength{\leftmargin}{\parindent}
210
+ \setlength{\itemindent}{-\parindent}}
211
+ \def\newblock{\hskip .11em plus .33em minus -.07em}
212
+ \sloppy\clubpenalty4000\widowpenalty4000
213
+ \sfcode`\.=1000\relax}
214
+ \let\endthebibliography=\endlist
215
+
216
+
217
+ % Allow for a bibliography of sources of attested examples
218
+ \def\thesourcebibliography#1{\vskip\parskip%
219
+ \vskip\baselineskip%
220
+ \def\baselinestretch{1}%
221
+ \ifx\@currsize\normalsize\@normalsize\else\@currsize\fi%
222
+ \vskip-\parskip%
223
+ \vskip-\baselineskip%
224
+ \section*{Sources of Attested Examples\@mkboth
225
+ {Sources of Attested Examples}{Sources of Attested Examples}}\list
226
+ {}{\setlength{\labelwidth}{0pt}\setlength{\leftmargin}{\parindent}
227
+ \setlength{\itemindent}{-\parindent}}
228
+ \def\newblock{\hskip .11em plus .33em minus -.07em}
229
+ \sloppy\clubpenalty4000\widowpenalty4000
230
+ \sfcode`\.=1000\relax}
231
+ \let\endthesourcebibliography=\endlist
232
+
233
+ % sections with less space
234
+ \def\section{\@startsection {section}{1}{\z@}{-2.0ex plus
235
+ -0.5ex minus -.2ex}{1.5ex plus 0.3ex minus .2ex}{\large\bfseries\raggedright}}
236
+ \def\subsection{\@startsection{subsection}{2}{\z@}{-1.8ex plus
237
+ -0.5ex minus -.2ex}{0.8ex plus .2ex}{\normalsize\bfseries\raggedright}}
238
+ %% changed by KO to - values to get the initial parindent right
239
+ \def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-1.5ex plus
240
+ -0.5ex minus -.2ex}{0.5ex plus .2ex}{\normalsize\bfseries\raggedright}}
241
+ \def\paragraph{\@startsection{paragraph}{4}{\z@}{1.5ex plus
242
+ 0.5ex minus .2ex}{-1em}{\normalsize\bfseries}}
243
+ \def\subparagraph{\@startsection{subparagraph}{5}{\parindent}{1.5ex plus
244
+ 0.5ex minus .2ex}{-1em}{\normalsize\bfseries}}
245
+
246
+ % Footnotes
247
+ \footnotesep 6.65pt %
248
+ \skip\footins 9pt plus 4pt minus 2pt
249
+ \def\footnoterule{\kern-3pt \hrule width 5pc \kern 2.6pt }
250
+ \setcounter{footnote}{0}
251
+
252
+ % Lists and paragraphs
253
+ \parindent 1em
254
+ \topsep 4pt plus 1pt minus 2pt
255
+ \partopsep 1pt plus 0.5pt minus 0.5pt
256
+ \itemsep 2pt plus 1pt minus 0.5pt
257
+ \parsep 2pt plus 1pt minus 0.5pt
258
+
259
+ \leftmargin 2em \leftmargini\leftmargin \leftmarginii 2em
260
+ \leftmarginiii 1.5em \leftmarginiv 1.0em \leftmarginv .5em \leftmarginvi .5em
261
+ \labelwidth\leftmargini\advance\labelwidth-\labelsep \labelsep 5pt
262
+
263
+ \def\@listi{\leftmargin\leftmargini}
264
+ \def\@listii{\leftmargin\leftmarginii
265
+ \labelwidth\leftmarginii\advance\labelwidth-\labelsep
266
+ \topsep 2pt plus 1pt minus 0.5pt
267
+ \parsep 1pt plus 0.5pt minus 0.5pt
268
+ \itemsep \parsep}
269
+ \def\@listiii{\leftmargin\leftmarginiii
270
+ \labelwidth\leftmarginiii\advance\labelwidth-\labelsep
271
+ \topsep 1pt plus 0.5pt minus 0.5pt
272
+ \parsep \z@ \partopsep 0.5pt plus 0pt minus 0.5pt
273
+ \itemsep \topsep}
274
+ \def\@listiv{\leftmargin\leftmarginiv
275
+ \labelwidth\leftmarginiv\advance\labelwidth-\labelsep}
276
+ \def\@listv{\leftmargin\leftmarginv
277
+ \labelwidth\leftmarginv\advance\labelwidth-\labelsep}
278
+ \def\@listvi{\leftmargin\leftmarginvi
279
+ \labelwidth\leftmarginvi\advance\labelwidth-\labelsep}
280
+
281
+ \abovedisplayskip 7pt plus2pt minus5pt%
282
+ \belowdisplayskip \abovedisplayskip
283
+ \abovedisplayshortskip 0pt plus3pt%
284
+ \belowdisplayshortskip 4pt plus3pt minus3pt%
285
+
286
+ % Less leading in most fonts (due to the narrow columns)
287
+ % The choices were between 1-pt and 1.5-pt leading
288
+ \def\@normalsize{\@setsize\normalsize{11pt}\xpt\@xpt}
289
+ \def\small{\@setsize\small{10pt}\ixpt\@ixpt}
290
+ \def\footnotesize{\@setsize\footnotesize{10pt}\ixpt\@ixpt}
291
+ \def\scriptsize{\@setsize\scriptsize{8pt}\viipt\@viipt}
292
+ \def\tiny{\@setsize\tiny{7pt}\vipt\@vipt}
293
+ \def\large{\@setsize\large{14pt}\xiipt\@xiipt}
294
+ \def\Large{\@setsize\Large{16pt}\xivpt\@xivpt}
295
+ \def\LARGE{\@setsize\LARGE{20pt}\xviipt\@xviipt}
296
+ \def\huge{\@setsize\huge{23pt}\xxpt\@xxpt}
297
+ \def\Huge{\@setsize\Huge{28pt}\xxvpt\@xxvpt}
298
+
299
+ % The hyperref manual (section 9) says hyperref should be loaded after natbib
300
+ \ifacl@hyperref
301
+ \PassOptionsToPackage{breaklinks}{hyperref}
302
+ \RequirePackage{hyperref}
303
+ % make links dark blue
304
+ \definecolor{darkblue}{rgb}{0, 0, 0.5}
305
+ \hypersetup{colorlinks=true, citecolor=darkblue, linkcolor=darkblue, urlcolor=darkblue}
306
+ \else
307
+ % This definition is used if the hyperref package is not loaded.
308
+ % It provides a backup, no-op definition of \href.
310
+ % This is necessary because the \href command is used in the acl_natbib.bst file.
310
+ \def\href#1#2{{#2}}
311
+ \usepackage{url}
312
+ \fi
src/demo/latex_template/template.tex ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ \documentclass[11pt]{article}
2
+
3
+ \usepackage[final]{acl}
4
+ \usepackage{times}
5
+ \usepackage{latexsym}
6
+ \usepackage[T1]{fontenc}
7
+ \usepackage{microtype}
8
+ \usepackage{graphicx}
9
+ \usepackage{amsmath}
10
+
11
+ \author{
12
+ InteractiveSurvey \\
13
+ Affiliation Line 1 \\
14
+ Affiliation Line 2 \\
15
+ \texttt{InteractiveSurvey@domain} \\
16
+ }
17
+
18
+ \begin{document}
19
+ \maketitle
20
+
21
+
22
+ \end{document}
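
The template above is deliberately a bare skeleton. The project presumably injects the generated title and body before compilation (likely in asg_latex.py, which is outside this excerpt); a hypothetical Python sketch of that fill step, shown only to illustrate how the skeleton is meant to be used:

```python
# Hypothetical template-filling step; fill_template and its string replacements
# are illustrative assumptions, not the project's actual asg_latex.py logic.
from pathlib import Path

def fill_template(template_path: str, title: str, body: str) -> str:
    tex = Path(template_path).read_text(encoding="utf-8")
    # \title must be declared before \maketitle is called
    tex = tex.replace(r"\maketitle", "\\title{%s}\n\\maketitle" % title, 1)
    # insert the generated sections just before \end{document}
    return tex.replace(r"\end{document}", body + "\n\\end{document}", 1)

if __name__ == "__main__":
    filled = fill_template("src/demo/latex_template/template.tex",
                           "Automating Literature Review Generation with LLM",
                           r"\section{Introduction} ...")
    print(filled[:200])
```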
src/demo/main.py ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import json
3
+ import os
4
+
5
+ import pandas as pd
6
+ from langchain_huggingface import HuggingFaceEmbeddings
7
+ from asg_retriever import legal_pdf
8
+ from asg_loader import DocumentLoading
9
+ from asg_retriever import Retriever, query_embeddings_new_new
10
+ from asg_generator import generate_sentence_patterns, generate
11
+ from category_and_tsne import clustering
12
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
13
+ import time
14
+ import torch
15
+ import re
16
+ import transformers
17
+ from dotenv import load_dotenv
18
+ from asg_clustername import generate_cluster_name_new
19
+ from asg_outline import OutlineGenerator, generateSurvey_qwen_new
20
+ import os
21
+ from markdown_pdf import MarkdownPdf, Section # Assuming you are using markdown_pdf
22
+ from typing import Any
23
+
24
+ def clean_str(input_str):
25
+ input_str = str(input_str).strip().lower()
26
+ if input_str == "none" or input_str == "nan" or len(input_str) == 0:
27
+ return ""
28
+ input_str = input_str.replace('\\n',' ').replace('\n',' ').replace('\r',' ').replace('——',' ').replace('——',' ').replace('__',' ').replace('__',' ').replace('........','.').replace('....','.').replace('....','.').replace('..','.').replace('..','.').replace('..','.').replace('. . . . . . . . ','. ').replace('. . . . ','. ').replace('. . . . ','. ').replace('. . ','. ').replace('. . ','. ')
29
+ input_str = re.sub(r'\\u[0-9a-z]{4}', ' ', input_str).replace(' ',' ').replace(' ',' ')
30
+ return input_str
31
+
32
+ def remove_invalid_citations(text, valid_collection_names):
33
+ """
34
+ Keep only citations of the form [xxx\] whose xxx is in valid_collection_names;
35
+ delete every other citation marker.
36
+ """
37
+ pattern = r"\[(.*?)\\\]"  # match markers of the form [xxx\]
38
+ all_matches = re.findall(pattern, text)
39
+
40
+ new_text = text
41
+ for match in all_matches:
42
+ cleaned_match = match.rstrip('\\')  # strip the trailing backslash
43
+ if cleaned_match not in valid_collection_names:
44
+ new_text = new_text.replace(f"[{match}\\]", "")
45
+ return new_text
46
+ def normalize_citations_with_mapping(paper_text):
47
+ # Match every citation marker of the form [citation1]
48
+ citations = re.findall(r'\[.*?\]', paper_text)
49
+ # Deduplicate while preserving order
50
+ unique_citations = list(dict.fromkeys(citations))
51
+ # Build the mapping from original citation markers to numeric citations
52
+ citation_mapping = {citation: f'[{i + 1}]' for i, citation in enumerate(unique_citations)}
53
+
54
+ # Replace the old markers in the text with the new numeric ones
55
+ normalized_text = paper_text
56
+ for old_citation, new_citation in citation_mapping.items():
57
+ normalized_text = normalized_text.replace(old_citation, new_citation)
58
+
59
+ # Build the reverse mapping from numbers back to the original citation markers,
61
+ # using rstrip('\\') to drop the trailing backslash
61
+ reverse_mapping = {
62
+ i + 1: unique_citations[i].strip('[]').rstrip('\\')
63
+ for i in range(len(unique_citations))
64
+ }
65
+
66
+ return normalized_text, reverse_mapping
67
+ def generate_references_section(citation_mapping, collection_pdf_mapping):
68
+
69
+ references = ["# References"]  # build the References section
70
+ for num in sorted(citation_mapping.keys()):
71
+ collection_name = citation_mapping[num]
72
+ pdf_name = collection_pdf_mapping.get(collection_name, "Unknown PDF")
73
+ if pdf_name.endswith(".pdf"):
74
+ pdf_name = pdf_name[:-4]
75
+ # Two trailing spaces at the end of each line force a Markdown line break
76
+ references.append(f"[{num}] {pdf_name} ")
77
+
78
+ return "\n".join(references)
79
+ def fix_citation_punctuation_md(text):
80
+ """
81
+ Turn patterns like 'some text. \[1]' or 'some text. \[2]' into 'some text \[1].'
82
+ This only works on Markdown whose citations are already numeric, e.g. \[1], \[2].
83
+ If they are not yet numeric, run normalize_citations_with_mapping first.
84
+ """
85
+ # Match a period, with or without following whitespace, right before a \[number] marker
86
+ pattern = r'\.\s*(\\\[\d+\])'
87
+ replacement = r' \1.'
88
+ fixed_text = re.sub(pattern, replacement, text)
89
+ return fixed_text
90
+ def finalize_survey_paper(paper_text,
91
+ Global_collection_names,
92
+ Global_file_names):
93
+
94
+ # 1) Drop all unwanted legacy citations (including [1]-style and [Sewon, 2021]-style)
95
+ paper_text = remove_invalid_citations(paper_text, Global_collection_names)
96
+
97
+ # 2) Normalize citations => [1][2]...
98
+ normalized_text, citation_mapping = normalize_citations_with_mapping(paper_text)
99
+
100
+ # 3) Fix punctuation, e.g. .[1] => [1].
101
+ normalized_text = fix_citation_punctuation_md(normalized_text)
102
+
103
+ # 4) Build the {collection_name: pdf_file_name} dict
104
+ collection_pdf_mapping = dict(zip(Global_collection_names, Global_file_names))
105
+
106
+ # 5) Generate the References section
107
+ references_section = generate_references_section(citation_mapping, collection_pdf_mapping)
108
+
109
+ # 6) Join the body text and the References
110
+ final_paper = normalized_text.strip() + "\n\n" + references_section
111
+ return final_paper
112
+
113
+ class ASG_system:
114
+ def __init__(self, root_path: str, survey_id:str, pdf_path: str, survey_title: str, cluster_standard: str) -> None:
115
+ load_dotenv()
116
+ self.pdf_path = pdf_path
117
+ self.txt_path = root_path + "/txt"
118
+ self.tsv_path = root_path + "/tsv"
119
+ self.md_path = root_path + "/md"
120
+ self.info_path = root_path + "/info"
121
+ self.result_path = root_path + "/result"
122
+
123
+ self.survey_id = survey_id
124
+ self.survey_title = survey_title
125
+ self.cluster_standard = cluster_standard
126
+
127
+ self.collection_names = []
128
+ self.file_names = []
129
+ self.citation_data = []
130
+ self.description_list = []
131
+ self.ref_list = []
132
+ self.cluster_names = []
133
+ self.collection_names_clustered = []
134
+ self.df_selected = ''
135
+
136
+
137
+ model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
138
+ self.embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
139
+ self.pipeline = transformers.pipeline(
140
+ "text-generation",
141
+ model=model_id,
142
+ model_kwargs={"torch_dtype": torch.bfloat16},
143
+ token = os.getenv('HF_API_KEY'),
144
+ device_map="auto",
145
+ )
146
+ self.pipeline.model.load_adapter(peft_model_id = "technicolor/llama3.1_8b_outline_generation", adapter_name="outline")
147
+ self.pipeline.model.load_adapter(peft_model_id ="technicolor/llama3.1_8b_abstract_generation", adapter_name="abstract")
148
+ self.pipeline.model.load_adapter(peft_model_id ="technicolor/llama3.1_8b_conclusion_generation", adapter_name="conclusion")
149
+
150
+ os.makedirs(self.txt_path, exist_ok=True)
151
+ os.makedirs(f'{self.txt_path}/{self.survey_id}', exist_ok=True)
152
+
153
+ os.makedirs(self.tsv_path, exist_ok=True)
154
+
155
+ os.makedirs(self.md_path, exist_ok=True)
156
+ os.makedirs(f'{self.md_path}/{self.survey_id}', exist_ok=True)
157
+
158
+ os.makedirs(self.info_path, exist_ok=True)
159
+ os.makedirs(f'{self.info_path}/{self.survey_id}', exist_ok=True)
160
+
161
+ os.makedirs(self.result_path, exist_ok=True)
162
+ os.makedirs(f'{self.result_path}/{self.survey_id}', exist_ok=True)
163
+
164
+ def parsing_pdfs(self, mode="intro") -> None:
165
+ pdf_files = os.listdir(self.pdf_path)
166
+ loader = DocumentLoading()
167
+
168
+
169
+ for pdf_file in pdf_files:
170
+
171
+ pdf_file = os.path.join(self.pdf_path, pdf_file)
172
+
173
+ split_start_time = time.time()
174
+
175
+ base_name = os.path.splitext(os.path.basename(pdf_file))[0]
176
+ target_dir = os.path.join(self.md_path, self.survey_id, base_name, "auto")
177
+ md_dir = os.path.join(self.md_path, self.survey_id)
178
+
179
+ loader.convert_pdf_to_md(pdf_file, md_dir)
180
+
181
+ md_file_path = os.path.join(target_dir, f"{base_name}.md")
182
+ print(md_file_path)
183
+ print("*"*24)
184
+ if not os.path.exists(md_file_path):
185
+ raise FileNotFoundError(f"Markdown file {md_file_path} does not exist. Conversion might have failed.")
186
+
187
+ if mode == "intro":
188
+ doc = loader.process_md_file(md_file_path, self.survey_id, self.txt_path)
189
+ elif mode == "full":
190
+ doc = loader.process_md_file_full(md_file_path, self.survey_id,self.txt_path)
191
+
192
+ text_splitter = RecursiveCharacterTextSplitter(
193
+ chunk_size=400,
194
+ chunk_overlap=30,
195
+ length_function=len,
196
+ is_separator_regex=False,
197
+ )
198
+ splitters = text_splitter.create_documents([doc])
199
+ documents_list = [document.page_content for document in splitters]
200
+ for i in range(len(documents_list)):
201
+ documents_list[i] = documents_list[i].replace('\n', ' ')
202
+ print(f"Splitting took {time.time() - split_start_time} seconds.")
203
+
204
+ embed_start_time = time.time()
205
+
206
+ doc_results = self.embedder.embed_documents(documents_list)
207
+ if isinstance(doc_results, torch.Tensor):
208
+ embeddings_list = doc_results.tolist()
209
+ else:
210
+ embeddings_list = doc_results
211
+ print(f"Embedding took {time.time() - embed_start_time} seconds.")
212
+
213
+ # Prepare metadata
214
+ metadata_list = [{"doc_name": os.path.basename(pdf_file)} for _ in range(len(documents_list))]
215
+
216
+ title = os.path.splitext(os.path.basename(pdf_file))[0]
217
+
218
+
219
+ title_new = title.strip()
220
+ invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*','_']
221
+ for char in invalid_chars:
222
+ title_new = title_new.replace(char, ' ')
223
+ print("============================")
224
+ print(title_new)
225
+
226
+ # New logic to create collection_name
227
+ # filename = os.path.basename(file_path)
228
+ collection_name = legal_pdf(title_new)
229
+
230
+ retriever = Retriever()
231
+ retriever.list_collections_chroma()
232
+ retriever.create_collection_chroma(collection_name)
233
+ retriever.add_documents_chroma(
234
+ collection_name=collection_name,
235
+ embeddings_list=embeddings_list,
236
+ documents_list=documents_list,
237
+ metadata_list=metadata_list
238
+ )
239
+
240
+ self.collection_names.append(collection_name)
241
+ self.file_names.append(title_new)
242
+ print(self.collection_names)
243
+ print(self.file_names)
244
+
245
+ json_files = os.listdir(os.path.join(self.txt_path, self.survey_id))
246
+ ref_paper_num = len(json_files)
247
+ print(f'The length of the json files is {ref_paper_num}')
248
+
249
+
250
+ json_data_pd = pd.DataFrame()
251
+ for _ in json_files:
252
+ file_path = os.path.join(self.txt_path, self.survey_id, _)
253
+
254
+ with open(file_path, 'r', encoding="utf-8") as file:
255
+ data = json.load(file)
256
+
257
+ # Extract necessary information
258
+ title = data.get("title", "")
259
+ abstract = data.get("abstract", "")
260
+ authors = data.get("authors", "")
261
+ introduction = data.get("introduction", "")
262
+ new_data = {
263
+ "reference paper title": title,
264
+ "reference paper citation information (can be collected from Google scholar/DBLP)": authors,
265
+ "reference paper abstract (Please copy the text AND paste here)": abstract,
266
+ "reference paper introduction (Please copy the text AND paste here)": introduction,
267
+ "reference paper doi link (optional)": "",
268
+ "reference paper category label (optional)": ""
269
+ }
270
+
271
+ # Convert the new record to a DataFrame
272
+ new_data_df = pd.DataFrame([new_data])
273
+
274
+ # Use pd.concat instead of the deprecated append
275
+ json_data_pd = pd.concat([json_data_pd, new_data_df], ignore_index=True)
276
+
277
+ # Save the DataFrame to a variable for further use
278
+ input_pd = json_data_pd
279
+
280
+ if ref_paper_num>0:
281
+
282
+ ## change col name
283
+ input_pd['ref_title'] = [filename for filename in self.file_names]
284
+ input_pd["ref_context"] = [""]*ref_paper_num
285
+ input_pd["ref_entry"] = input_pd["reference paper citation information (can be collected from Google scholar/DBLP)"]
286
+ input_pd["abstract"] = input_pd["reference paper abstract (Please copy the text AND paste here)"].apply(lambda x: clean_str(x) if len(str(x))>0 else 'Invalid abstract')
287
+ input_pd["intro"] = input_pd["reference paper introduction (Please copy the text AND paste here)"].apply(lambda x: clean_str(x) if len(str(x))>0 else 'Invalid introduction')
288
+
289
+ # optional columns
290
+ input_pd["label"] = input_pd["reference paper category label (optional)"].apply(lambda x: str(x) if len(str(x))>0 else '')
291
+ #input_pd["label"] = input_pd["reference paper category id (optional)"].apply(lambda x: str(x) if len(str(x))>0 else '')
292
+ ## output tsv
293
+ # output_tsv_filename = self.tsv_path + self.survey_id + '.tsv'
294
+ output_tsv_filename = os.path.join(self.tsv_path, self.survey_id + '.tsv')
295
+
296
+ #output_df = input_pd[["ref_title","ref_context","ref_entry","abstract","intro","description"]]
297
+ output_df = input_pd[["ref_title","ref_context","ref_entry","abstract","intro", 'label']]
298
+ # print(output_df)
299
+
300
+ #pdb.set_trace()
301
+ output_df.to_csv(output_tsv_filename, sep='\t')
302
+
303
+ def description_generation(self) -> None:
304
+ query= self.cluster_standard
305
+ query_list = generate_sentence_patterns(query)
306
+ for name in self.collection_names:
307
+ context, citation_data = query_embeddings_new_new(name, query_list)
308
+ self.citation_data.extend(citation_data)
309
+
310
+ description = generate(context, query, name)
311
+ self.description_list.append(description)
312
+
313
+ citation_path = f'{self.info_path}/{self.survey_id}/citation_data.json'
314
+ os.makedirs(os.path.dirname(citation_path), exist_ok=True)
315
+ with open(citation_path, 'w', encoding="utf-8") as outfile:
316
+ json.dump(self.citation_data, outfile, indent=4, ensure_ascii=False)
317
+
318
+ file_path = f'{self.tsv_path}/{self.survey_id}.tsv'
319
+
320
+ with open(file_path, 'r', newline='', encoding='utf-8') as infile:
321
+ reader = csv.reader(infile, delimiter='\t')
322
+ rows = list(reader)
323
+ if rows:
324
+ headers = rows[0]
325
+ headers.append('retrieval_result')
326
+
327
+ updated_rows = [headers]
328
+ for row, description in zip(rows[1:], self.description_list):
329
+ row.append(description)
330
+ updated_rows.append(row)
331
+
332
+ with open(file_path, 'w', newline='', encoding='utf-8') as outfile:
333
+ writer = csv.writer(outfile, delimiter='\t')
334
+ writer.writerows(updated_rows)
335
+
336
+ print('Updated file has been saved to', file_path)
337
+ else:
338
+ print('Input file is empty.')
339
+
340
+ def agglomerative_clustering(self) -> None:
341
+ df = pd.read_csv(f'{self.tsv_path}/{self.survey_id}.tsv', sep='\t', index_col=0, encoding='utf-8')
342
+ df_selected = df
343
+
344
+ df_selected, _, _ = clustering(df_selected, [3], self.survey_id)  # the active clustering() takes a candidate-topic list and returns (df, colors, best_n_topics)
345
+ self.df_selected = df_selected
346
+
347
+ df_tmp = df_selected.reset_index()
348
+ df_tmp['index'] = df_tmp.index
349
+ ref_titles = list(df_tmp.groupby(df_tmp['label'])['ref_title'].apply(list))
350
+ # ref_indexs = list(df_tmp.groupby(df_tmp['label'])['index'].apply(list))
351
+
352
+ category_label_summarized = generate_cluster_name_new(f"{self.tsv_path}/{self.survey_id}.tsv", self.survey_title)
353
+ self.cluster_names = category_label_summarized
354
+
355
+ cluster_info = {category_label_summarized[i]:ref_titles[i] for i in range(len(category_label_summarized))}
356
+ for key, value in cluster_info.items():
357
+ temp = [legal_pdf(i) for i in value]
358
+ cluster_info[key] = temp
359
+ self.collection_names_clustered.append(temp)
360
+ cluster_info_path = f'{self.info_path}/{self.survey_id}/cluster_info.json'
361
+ with open(cluster_info_path, 'w', encoding="utf-8") as outfile:
362
+ json.dump(cluster_info, outfile, indent=4, ensure_ascii=False)
363
+
364
+ def outline_generation(self) -> None:
365
+ print(self.df_selected)
366
+ print(self.cluster_names)
367
+ outline_generator = OutlineGenerator(self.pipeline, self.df_selected, self.cluster_names)
368
+ outline_generator.get_cluster_info()
369
+ messages, outline = outline_generator.generate_outline_qwen(self.survey_title)
370
+ outline_json = {'messages':messages, 'outline': outline}
371
+ output_path = f'{self.info_path}/{self.survey_id}/outline.json'
372
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
373
+ with open(output_path, 'w', encoding="utf-8") as outfile:
374
+ json.dump(outline_json, outfile, indent=4, ensure_ascii=False)
375
+
376
+
377
+ def section_generation(self) -> None:
378
+ generateSurvey_qwen_new(self.survey_id, self.survey_title, self.collection_names_clustered, self.pipeline, self.citation_data, './txt','./info')
379
+
380
+ def citation_generation(self) -> None:
381
+ """
382
+ Generate citation Markdown and PDF files from JSON and store them in the specified result path.
383
+ """
384
+
385
+ json_filepath = os.path.join(self.info_path, self.survey_id, "generated_result.json")
386
+
387
+ markdown_dir = f'{self.result_path}/{self.survey_id}'
388
+ markdown_filename = f'survey_{self.survey_id}.md'
389
+ markdown_filepath = os.path.join(markdown_dir, markdown_filename)
390
+ pdf_filename = f'survey_{self.survey_id}.pdf'
391
+ pdf_filepath = os.path.join(markdown_dir, pdf_filename)
392
+
393
+ markdown_content = self.get_markdown_content(json_filepath)
394
+ if not markdown_content:
395
+ raise ValueError("Markdown content is empty. Cannot generate citation files.")
396
+
397
+ try:
398
+ with open(markdown_filepath, 'w', encoding='utf-8') as markdown_file:
399
+ markdown_file.write(markdown_content)
400
+ print(f"Markdown content saved to: {markdown_filepath}")
401
+ except Exception as e:
402
+ raise RuntimeError(f"Failed to save Markdown file: {e}")
403
+
404
+ try:
405
+ pdf = MarkdownPdf()
406
+ pdf.meta["title"] = "Citation Results"
407
+ pdf.add_section(Section(markdown_content, toc=False))
408
+ pdf.save(pdf_filepath)
409
+ print(f"PDF content saved to: {pdf_filepath}")
410
+ except Exception as e:
411
+ raise RuntimeError(f"Failed to generate PDF file: {e}")
412
+ print(f"Files generated successfully: \nMarkdown: {markdown_filepath}\nPDF: {pdf_filepath}")
413
+
414
+ def get_markdown_content(self, json_filepath: str) -> str:
415
+ """
416
+ Read a JSON file and generate Markdown content based on its data.
417
+
418
+ :param json_filepath: Path to the JSON file containing survey data.
419
+ :return: A string containing the generated Markdown content.
420
+ """
421
+ try:
422
+ with open(json_filepath, 'r', encoding='utf-8') as json_file:
423
+ survey_data = json.load(json_file)
424
+ except Exception as e:
425
+ raise RuntimeError(f"Failed to read JSON file: {e}")
426
+
427
+ topic = survey_data.get('survey_title', 'Default Topic')
428
+ content = survey_data.get('content', 'No content available.')
429
+
430
+ survey_title_markdown = f"# A Survey of {topic}\n\n"
431
+ survey_content_markdown = content + "\n\n"
432
+
433
+ markdown_content = survey_title_markdown + survey_content_markdown
434
+ markdown_content = finalize_survey_paper(markdown_content, self.collection_names, self.file_names)
435
+ return markdown_content
436
+
437
+ if __name__ == "__main__":
438
+ root_path = "."
439
+ pdf_path = "./pdfs/test"
440
+ survey_title = "Automating Literature Review Generation with LLM"
441
+ cluster_standard = "method"
442
+ asg_system = ASG_system(root_path, 'test', pdf_path, survey_title, cluster_standard)
443
+ asg_system.parsing_pdfs()
444
+ asg_system.description_generation()
445
+ asg_system.agglomerative_clustering()
446
+ asg_system.outline_generation()
447
+ asg_system.section_generation()
448
+ asg_system.citation_generation()
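
To make the citation post-processing chain in finalize_survey_paper concrete, here is a hedged standalone walk-through of the same three regex steps on toy input; the collection names rag_paper, dpr_paper, and stale_ref are made up, and the in-module version additionally operates on the Markdown-escaped \[1] form:

```python
import re

text = r"RAG improves recall. [rag_paper\] Dense retrieval helps too. [dpr_paper\] [stale_ref\]"
valid = ["rag_paper", "dpr_paper"]

# 1) drop citations whose collection name is not registered (cf. remove_invalid_citations)
for m in re.findall(r"\[(.*?)\\\]", text):
    if m.rstrip("\\") not in valid:
        text = text.replace(f"[{m}\\]", "")

# 2) renumber surviving markers in order of first appearance (cf. normalize_citations_with_mapping)
unique = list(dict.fromkeys(re.findall(r"\[.*?\]", text)))
for i, old in enumerate(unique):
    text = text.replace(old, f"[{i + 1}]")

# 3) move each citation before the period (cf. fix_citation_punctuation_md)
text = re.sub(r"\.\s*(\[\d+\])", r" \1.", text)
print(text.strip())  # RAG improves recall [1]. Dense retrieval helps too [2].
```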
src/demo/migrations/__init__.py ADDED
File without changes
src/demo/models.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from django.db import models
2
+
3
+ # Create your models here.
4
+
5
+ # class Choose_Topic(models.Model):
6
+ # question_text = models.CharField(max_length=200)
7
+ # pub_date = models.DateTimeField('date published')
8
+ #
9
+ #
10
+ # class Choice(models.Model):
11
+ # question = models.ForeignKey(Question, on_delete=models.CASCADE)
12
+ # choice_text = models.CharField(max_length=200)
13
+ # votes = models.IntegerField(default=0)
14
+
src/demo/postprocess.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ def reindex_citations(content):
4
+ """
5
+ 将content中所有形如[collection_name]的引用标记,全局重编号为[1]、[2]、[3]...。
6
+ 返回:
7
+ new_content: 替换后的文本
8
+ source_map: {collection_name: index, ...}
9
+ """
10
+ pattern = r"\[([^\[\]]+)\]"
11
+ source_map = {}
12
+ current_index = 1
13
+
14
+ def replace_func(match):
15
+ source = match.group(1)
16
+ nonlocal current_index
17
+ if source not in source_map:
18
+ source_map[source] = current_index
19
+ current_index += 1
20
+ return f"[{source_map[source]}]"
21
+
22
+ new_content = re.sub(pattern, replace_func, content)
23
+ return new_content, source_map
24
+
25
+ def generate_references_section(source_map):
26
+ """
27
+ 根据source_map生成References部分的文本。
28
+ source_map: {collection_name: index, ...}
29
+
30
+ 返回值:
31
+ str:
32
+ "References\n1 collection_name_1\n2 collection_name_2\n..."
33
+ """
34
+ # Sort source_map entries by index
35
+ index_to_source = sorted(source_map.items(), key=lambda x: x[1])
36
+ refs_lines = ["References"]
37
+ for source, idx in index_to_source:
38
+ refs_lines.append(f"{idx} {source}")
39
+ return "\n".join(refs_lines)
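
A short hypothetical usage of the two helpers above; rag_paper and dpr_paper are made-up collection names, and the import assumes the module is on the path as src/demo/postprocess.py:

```python
# Hypothetical usage of reindex_citations / generate_references_section.
from postprocess import generate_references_section, reindex_citations

content = "RAG [rag_paper] builds on retrieval [dpr_paper] and reuses RAG ideas [rag_paper]."
new_content, source_map = reindex_citations(content)
print(new_content)
# RAG [1] builds on retrieval [2] and reuses RAG ideas [1].
print(generate_references_section(source_map))
# References
# 1 rag_paper
# 2 dpr_paper
```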
src/demo/query1.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from openai import OpenAI
3
+ from datetime import datetime, timedelta
4
+ import re
5
+
6
+ def generate_abstract_qwen(topic):
7
+
8
+ # Initialize the OpenAI client using environment variables
9
+ openai_api_key = os.getenv("OPENAI_API_KEY")
10
+ openai_api_base = os.getenv("OPENAI_API_BASE")
11
+ client = OpenAI(
12
+ api_key = openai_api_key,
13
+ base_url = openai_api_base,
14
+ )
15
+
16
+ ###########################
17
+ # Step 1: Generate a survey abstract for the given topic.
18
+ ###########################
19
+ system_prompt_abstract = """
20
+ You are a skilled research survey writer. Your task is to generate a survey abstract on the given topic. The abstract should cover the main challenges, key concepts, and research directions associated with the topic. Write in clear, concise academic English.
21
+ """
22
+ user_prompt_abstract = f"""
23
+ Topic: {topic}
24
+
25
+ Please generate a comprehensive survey abstract for this topic. Include discussion of core challenges, key terminologies, and emerging methodologies that are critical in the field. The total length of the abstract should be around 300–500 words.
26
+ """
27
+ messages_abstract = [
28
+ {"role": "system", "content": system_prompt_abstract},
29
+ {"role": "user", "content": user_prompt_abstract}
30
+ ]
31
+
32
+ abstract_response = client.chat.completions.create(
33
+ model="Qwen2.5-72B-Instruct",
34
+ max_tokens=2048,
35
+ temperature=0.5,
36
+ stop="<|im_end|>",
37
+ stream=True,
38
+ messages=messages_abstract
39
+ )
40
+
41
+ abstract_text = ""
42
+ for chunk in abstract_response:
43
+ if chunk.choices[0].delta.content:
44
+ abstract_text += chunk.choices[0].delta.content
45
+ abstract_text = abstract_text.strip()
46
+ print("The abstract is:", abstract_text)
47
+
48
+ return abstract_text
49
+
50
+ def generate_entity_lists_qwen(topic, abstract_text):
51
+ openai_api_key = os.getenv("OPENAI_API_KEY")
52
+ openai_api_base = os.getenv("OPENAI_API_BASE")
53
+ client = OpenAI(
54
+ api_key = openai_api_key,
55
+ base_url = openai_api_base,
56
+ )
57
+ system_prompt_abstract = f"""
58
+ You are an AI assistant specializing in natural language processing and entity recognition. Your task is to extract key entities and core concepts from a given abstract based on a specified topic.
59
+
60
+ You should return two distinct lists:
61
+ 1. **Entity list**: Entities that are synonymous or closely related to the given topic. These should be concise (no more than two words) and simplified to their root forms (e.g., removing suffixes like "-ing", "-ed").
62
+ 2. **Concept list**: Core concepts from the abstract that are highly relevant to the topic. These should also be concise (no more than two words) and in their simplest form.
63
+
64
+ Ensure that your response follows this exact format:
65
+ Entity list: [entity1, entity2, entity3, ...]
66
+ Concept list: [concept1, concept2, concept3, ...]
67
+ Do not include any explanations or additional text.
68
+
69
+ ### **Example**
70
+ #### **Input:**
71
+ Topic: Large Language Models
72
+ Abstract: Ever since the Turing Test was proposed in the 1950s, humans have explored the mastering of language intelligence by machine. Language is essentially a complex, intricate system of human expressions governed by grammatical rules. It poses a significant challenge to develop capable artificial intelligence (AI) algorithms for comprehending and grasping a language. As a major approach, language modeling has been widely studied for language understanding and generation in the past two decades, evolving from statistical language models to neural language models. Recently, pre-trained language models (PLMs) have been proposed by pretraining Transformer models over large-scale corpora, showing strong capabilities in solving various natural language processing (NLP) tasks. Since the researchers have found that model scaling can lead to an improved model capacity, they further investigate the scaling effect by increasing the parameter scale to an even larger size. Interestingly, when the parameter scale exceeds a certain level, these enlarged language models not only achieve a significant performance improvement, but also exhibit some special abilities (e.g., in-context learning) that are not present in small-scale language models (e.g., BERT). To discriminate the language models in different parameter scales, the research community has coined the term large language models (LLM) for the PLMs of significant size (e.g., containing tens or hundreds of billions of parameters). Recently, the research on LLMs has been largely advanced by both academia and industry, and a remarkable progress is the launch of ChatGPT (a powerful AI chatbot developed based on LLMs), which has attracted widespread attention from society. The technical evolution of LLMs has been making an important impact on the entire AI community, which would revolutionize the way how we develop and use AI algorithms. Considering this rapid technical progress, in this survey, we review the recent advances of LLMs by introducing the background, key findings, and mainstream techniques. In particular, we focus on four major aspects of LLMs, namely pre-training, adaptation tuning, utilization, and capacity evaluation. Furthermore, we also summarize the available resources for developing LLMs and discuss the remaining issues for future directions. This survey provides an up-to-date review of the literature on LLMs, which can be a useful resource for both researchers and engineers.
73
+
74
+ #### **Expected Output:**
75
+ "entity list": ["language model", "plm", "large language", "llm"]
76
+ "concept list": ["turing", "language intelligence", "ai", "generation", "statistical", "neural", "pre-train", "transformer", "corpora", "nlp", "in-context", "bert", "chatgpt", "adaptation", "utilization"]
77
+ Make sure to strictly follow this format in your response.
78
+ """
79
+
80
+ user_prompt_abstract = f"""
81
+ Topic: {topic}
82
+ Abstract: {abstract_text}
83
+
84
+ Based on the given topic and abstract, extract the following:
85
+ 1. A **list of entities** that are synonymous or closely related to the topic. Keep each entity under two words and in its simplest form.
86
+ 2. A **list of core concepts** from the abstract that are highly relevant to the topic. Keep each concept under two words and in its simplest form.
87
+ """
88
+
89
+ messages_abstract = [
90
+ {"role": "system", "content": system_prompt_abstract},
91
+ {"role": "user", "content": user_prompt_abstract}
92
+ ]
93
+
94
+ entity_response = client.chat.completions.create(
95
+ model="Qwen2.5-72B-Instruct",
96
+ max_tokens=2048,
97
+ temperature=0.5,
98
+ stop="<|im_end|>",
99
+ stream=True,
100
+ messages=messages_abstract
101
+ )
102
+
103
+ entity_list = ""
104
+ for chunk in entity_response:
105
+ if chunk.choices[0].delta.content:
106
+ entity_list += chunk.choices[0].delta.content
107
+ entity_list = entity_list.strip()
108
+ print("The entity lists are:", entity_list)
109
+
110
+ return entity_list
111
+
112
+
113
+ def generate_query_qwen(topic):
114
+ # Calculate the date range for the arXiv query (roughly the last 10 years)
115
+ abstract_text = generate_abstract_qwen(topic)
116
+ entity_list = generate_entity_lists_qwen(topic, abstract_text)
117
+ today = datetime.now()
118
+ ten_years_ago = today - timedelta(days=10 * 365)  # approximate calculation
119
+ start_date = ten_years_ago.strftime('%Y%m%d')
120
+ end_date = today.strftime('%Y%m%d')
121
+
122
+
123
+ # System prompt: Focus on how to extract keywords from the abstract.
124
+ system_prompt_query = """
125
+ You are a research assistant specializing in constructing effective arXiv search queries. Your task is to generate a structured search query using **pre-extracted entity and concept lists** from a given abstract. Follow these instructions exactly:
126
+
127
+ 1. **Input Data:**
128
+ - **Entity List:** A list of entities that are synonymous or closely related to the given topic.
129
+ - **Concept List:** A list of core concepts from the abstract that are highly relevant to the topic.
130
+
131
+ 2. **Ensure Minimum Keyword Count:**
132
+ - **Entity List** must contain at least **5** terms. If there are fewer, intelligently supplement additional relevant terms.
133
+ - **Concept List** must contain **12-15** terms. If there are fewer, intelligently supplement additional relevant terms.
134
+
135
+ 3. **Standardize Formatting:**
136
+ - Convert all terms to their **base form** and ensure they end with a wildcard `*`.
137
+ - Examples: `verification → verif*`, `optimization → optim*`, `retrieval → retriev*`, `embedding → embed*`
138
+ - All terms must be **in lowercase**.
139
+
140
+ 4. **Construct the Final Query:**
141
+ - The query must follow this exact structure:
142
+ ```
143
+ (abs:"<Entity1*>" OR abs:"<Entity2*>" OR abs:"<Entity3*>" OR abs:"<Entity4*>" OR abs:"<Entity5*>") AND
144
+ (abs:"<Concept1*>" OR abs:"<Concept2*>" OR ... OR abs:"<Concept12*>")
145
+ ```
146
+ - **Entities are grouped together using `OR` in the first part.**
147
+ - **Concepts are grouped together using `OR` in the second part.**
148
+ - **The two groups are combined using `AND`.**
149
+ - **Do not include any explanations or extra text. Output only the final query.**
150
+ """
151
+
152
+ # User prompt: Provide examples of topics with corresponding query formats.
154
+ # User prompt: Uses pre-extracted entities and concepts, ensures minimum count, and applies stemming + wildcards.
155
+ user_prompt_query = f"""
156
+ Below are the pre-extracted keywords for constructing the final arXiv query.
157
+
158
+ **Topic:** {topic}
159
+ **Entity list and Concept list:** {entity_list}
160
+
161
+ ### **Processing Rules Applied:**
162
+ - **Ensure at least 5 entities** (if fewer, supplement additional relevant terms).
163
+ - **Ensure 12-15 concepts** (if fewer, supplement additional relevant terms).
164
+ - **Convert all terms to base form and append wildcard `*`.**
165
+ - **Output only the final query with no extra text.**
166
+
167
+ ### **Example Query Format:**
168
+
169
+ 1. **Topic:** Large Language Models
170
+ **Transformed Entity List:** ["languag model*", "plm*", "larg languag*", "llm*", "deep model*"]
171
+ **Transformed Concept List:** ["tur*", "languag intellig*", "ai*", "gener*", "statist*", "neural*", "pre-train*", "transform*", "corpora*", "nlp*", "in-context*", "bert*", "chatgpt*", "adapt*", "utiliz*"]
172
+ **Query:**
173
+ (abs:"languag model*" OR abs:"plm*" OR abs:"larg languag*" OR abs:"llm*" OR abs:"deep model*") AND (abs:"tur*" OR abs:"languag intellig*" OR abs:"ai*" OR abs:"gener*" OR abs:"statist*" OR abs:"neural*" OR abs:"pre-train*" OR abs:"transform*" OR abs:"corpora*" OR abs:"nlp*" OR abs:"in-context*" OR abs:"bert*" OR abs:"chatgpt*" OR abs:"adapt*" OR abs:"utiliz*")
174
+ 2. **Topic:** Quantum Computing
175
+ **Transformed Entity List:** ["quant comput*", "qubit*", "qc*", "quant devic*", "topolog comput*"]
176
+ **Transformed Concept List:** ["decoheren*", "entangl*", "error*", "topolog*", "anneal*", "photon*", "superconduct*", "algorithm*", "optim*", "verif*", "fault-toler*", "nois*", "cirquit*", "quant machin*", "measur*"]
177
+ **Query:**
178
+ (abs:"quant comput*" OR abs:"qubit*" OR abs:"qc*" OR abs:"quant devic*" OR abs:"topolog comput*") AND (abs:"decoheren*" OR abs:"entangl*" OR abs:"error*" OR abs:"topolog*" OR abs:"anneal*" OR abs:"photon*" OR abs:"superconduct*" OR abs:"algorithm*" OR abs:"optim*" OR abs:"verif*" OR abs:"fault-toler*" OR abs:"nois*" OR abs:"cirquit*" OR abs:"quant machin*" OR abs:"measur*")
179
+ ---
180
+
181
+ ### **Now Generate the Query for This Topic:**
182
+ Using the provided **Entity List** and **Concept List**, apply the following steps:
183
+ 1. **Ensure Entity List contains at least 5 items.** If fewer, supplement additional relevant terms.
184
+ 2. **Ensure Concept List contains 12-15 items.** If fewer, supplement additional relevant terms.
185
+ 3. **Convert all terms to their base form and append `*`.**
186
+ 4. **Construct the arXiv search query in the same format as the examples above.**
187
+ 5. **Return only the final query. Do not include explanations or additional text.**
188
+ """
189
+
190
+ # Initialize the OpenAI API client
191
+ openai_api_key = os.getenv("OPENAI_API_KEY")
192
+ openai_api_base = os.getenv("OPENAI_API_BASE")
193
+ client = OpenAI(
194
+ api_key=openai_api_key,
195
+ base_url=openai_api_base,
196
+ )
197
+
198
+ messages = [
199
+ {"role": "system", "content": system_prompt_query},
200
+ {"role": "user", "content": user_prompt_query}
201
+ ]
202
+
203
+ response = client.chat.completions.create(
204
+ model="Qwen2.5-72B-Instruct",
205
+ max_tokens=512,
206
+ temperature=0.5,
207
+ stop="<|im_end|>",
208
+ stream=True,
209
+ messages=messages
210
+ )
211
+
212
+ output_query = ""
213
+ for chunk in response:
214
+ if chunk.choices[0].delta.content:
215
+ output_query += chunk.choices[0].delta.content
216
+ match = re.search(r'\(.*\)', output_query, re.DOTALL)
217
+
218
+ if match:
219
+ extracted_query = match.group(0) # keep the full parenthesized expression that was matched
220
+ else:
221
+ extracted_query = output_query.strip() # fall back to the raw output if no parentheses were matched
222
+
223
+ # Append the submittedDate range to the extracted query
224
+ updated_query = f"{extracted_query} AND submittedDate:[{start_date} TO {end_date}]"
225
+ print('The response is:', updated_query)
226
+ return updated_query.strip()
227
+
228
+
229
+ # Example usage:
230
+ if __name__ == "__main__":
231
+ topic = "Quantum Computing"
232
+ final_query = generate_query_qwen(topic)
233
+ print("\nFinal Query Returned:")
234
+ print(final_query)
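For reference, the final string that `generate_query_qwen` asks the model to produce can also be assembled deterministically. The sketch below is a hypothetical helper (not part of this module) showing the target query shape, assuming the entity and concept terms are already stemmed, lowercased, and wildcarded:

```python
from datetime import datetime, timedelta

def build_arxiv_query(entities, concepts, years=5):
    # Entities are OR-ed together, concepts are OR-ed together,
    # and the two groups are AND-ed, matching the prompt's format.
    entity_part = " OR ".join(f'abs:"{term}"' for term in entities)
    concept_part = " OR ".join(f'abs:"{term}"' for term in concepts)
    end = datetime.now()
    start = end - timedelta(days=years * 365)  # approximate window
    return (f"({entity_part}) AND ({concept_part}) "
            f"AND submittedDate:[{start:%Y%m%d} TO {end:%Y%m%d}]")

print(build_arxiv_query(["llm*", "languag model*"], ["pre-train*", "transform*"]))
```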
src/demo/references.py ADDED
@@ -0,0 +1,83 @@
1
+ import os
2
+ import re
3
+ from openai import OpenAI
4
+
5
+ def getQwenClient():
6
+ openai_api_key = os.environ.get("OPENAI_API_KEY")
7
+ openai_api_base = os.environ.get("OPENAI_API_BASE")
8
+
9
+ client = OpenAI(
10
+ # defaults to os.environ.get("OPENAI_API_KEY")
11
+ api_key = openai_api_key,
12
+ base_url = openai_api_base,
13
+ )
14
+ return client
15
+
16
+ def generateResponse(client, prompt):
17
+ chat_response = client.chat.completions.create(
18
+ model=os.environ.get("MODEL"),
19
+ max_tokens=1536,
20
+ temperature=0.5,
21
+ stop="<|im_end|>",
22
+ stream=True,
23
+ messages=[{"role": "user", "content": prompt}]
24
+ )
25
+ # Stream the response to console
26
+ text = ""
27
+ for chunk in chat_response:
28
+ if chunk.choices[0].delta.content:
29
+ text += chunk.choices[0].delta.content
30
+ return text
31
+
32
+ def generate_references(papers_info, client):
33
+
34
+ # In-Context Learning
35
+ examples = '''
36
+ Example1:
37
+ Authors: Armen Aghajanyan, Armen Aghajanyan, Anchit Gupta, Akshat Shrivastava, Xilun Chen, Luke Zettlemoyer, and Sonal Gupta
38
+ Title: Muppet: Massive multi-task representations with pre-finetuning
39
+ Reference: Armen Aghajanyan, Anchit Gupta, Akshat Shrivastava, Xilun Chen, Luke Zettlemoyer, and Sonal Gupta. Muppet: Massive multi-task representations with pre-finetuning
40
+
41
+ Example2:
42
+ Authors: Ari Holtzman1, Peter West222, Vered Shwartz3, Yejin Choi4, Luke Zettlemoyer12001
43
+ Title: Surface form competition: Why the highest probability answer isn't always right.
44
+ Reference: Ari Holtzman, Peter West, Vered Shwartz, Yejin Choi, Luke Zettlemoyer. Surface form competition: Why the highest probability answer isn't always right.
45
+
46
+ Example3:
47
+ Authors: Mikel Artetxe, Shruti Bhosale, Naman Goyal, Todor Mihaylov, Myle Ott, Sam Shleifer, Xi Victoria Lin, Jingfei Du, Srinivasan Iyer, Ramakanth Pasunuru, Giri Anantharaman, Xian Li, Shuohui Chen, Halil Akin, Mandeep Baines, Louis Martin, Xing Zhou, Punit Singh Koura, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Mona Diab, Zornitsa Kozareva, Ves Stoyanov
48
+ Title: Efficient large scale language modeling with mixtures of experts.
49
+ Reference: Mikel Artetxe, Shruti Bhosale, Naman Goyal, Todor Mihaylov, Myle Ott, Sam Shleifer, Xi Victoria Lin, Jingfei Du, Srinivasan Iyer, Ramakanth Pasunuru, et al. Efficient large scale language modeling with mixtures of experts.
50
+ '''
51
+
52
+ prompt = f'''
53
+ Based on the following examples, generate the references based on the provided paper information.
54
+ The generated references should be clear, valid, and properly formatted.
55
+ If the authors are many, list the first few authors followed by "et al.".
56
+
57
+ Please include the "Reference:" label before each reference as shown in the examples.
58
+
59
+ {examples}
60
+ Now, please generate the references:
61
+
62
+ '''
63
+
64
+ for idx, paper in enumerate(papers_info):
65
+ authors = paper['authors']
66
+ title = paper['title']
67
+ prompt += f'''
68
+ Paper{idx+1}:
69
+ Authors: {authors}
70
+ Title: {title}
71
+ Reference:'''
72
+
73
+ response = generateResponse(client, prompt)
74
+ references = []
75
+ pattern = r'Reference:(.*?)(?=\n\n|$)'
76
+ matches = re.findall(pattern, response, re.S)
77
+
78
+ for match in matches:
79
+ reference = match.strip()
80
+ if reference:
81
+ references.append(reference)
82
+
83
+ return references
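A minimal usage sketch for this module; the `papers_info` entries below are hypothetical, and the client call assumes `OPENAI_API_KEY`, `OPENAI_API_BASE`, and `MODEL` are set in the environment:

```python
# Hypothetical input mirroring the keys generate_references reads.
papers_info = [
    {"authors": "Ari Holtzman1, Peter West222, Vered Shwartz3",
     "title": "Surface form competition: Why the highest probability answer isn't always right."},
]

client = getQwenClient()  # requires the environment variables above
for reference in generate_references(papers_info, client):
    print(reference)
```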
src/demo/survey_generation_pipeline/asg_abstract.py ADDED
@@ -0,0 +1,247 @@
1
+ import os
2
+
3
+
4
+ class AbstractGenerator:
5
+ def __init__(self, pipeline):
6
+ self.pipeline = pipeline
7
+
8
+ def generate(self, title, intro, mode='lora'):
9
+ if mode == 'lora' or mode == 'test':
10
+ if mode == 'lora':
11
+ self.pipeline.model.set_adapter("abstract")
12
+
13
+ system_prompt = f'''You are a helpful assistant that helps to generate the abstract of a survey paper given the survey title and survey introduction.'''
14
+ # user_prompt = {"survey_title":survey_title, "claims":cluster_with_claims}
15
+ user_prompt = f'''Help me to generate the abstract of a survey paper given the title: *{title}*, and the introduction: {intro}'''
16
+
17
+ messages = [
18
+ {"role": "system", "content": system_prompt},
19
+ {"role": "user", "content": user_prompt},
20
+ {"role": "assistant", "content":"Abstract: This survey "}
21
+ ]
22
+
23
+ outputs = self.pipeline(
24
+ messages,
25
+ max_new_tokens=4096,
26
+ )
27
+ result = outputs[0]["generated_text"][-1]['content']
28
+ return result
29
+ else:
30
+ raise ValueError('mode not supported')
31
+
32
+ if __name__ == '__main__':
33
+ from transformers import pipeline
34
+ import torch
35
+ import transformers
36
+
37
+ model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
38
+ Global_pipeline = transformers.pipeline(
39
+ "text-generation",
40
+ model=model_id,
41
+ model_kwargs={"torch_dtype": torch.bfloat16},
42
+ token = os.getenv('HF_API_KEY'),
43
+ device_map="auto",
44
+ )
45
+ Global_pipeline.model.load_adapter(peft_model_id = "technicolor/llama3.1_8b_outline_generation", adapter_name="outline")
46
+ Global_pipeline.model.load_adapter(peft_model_id ="technicolor/llama3.1_8b_abstract_generation", adapter_name="abstract")
47
+ Global_pipeline.model.load_adapter(peft_model_id ="technicolor/llama3.1_8b_conclusion_generation", adapter_name="conclusion")
48
+ title = "A Survey of Large Language Models"
49
+ intro = '''LANGUAGE is a prominent ability in human beings to
51
+ express and communicate, which develops in early
52
+ childhood and evolves over a lifetime [3, 4]. Machines,
53
+ however, cannot naturally grasp the abilities of understanding and communicating in the form of human language,
54
+ unless equipped with powerful artificial intelligence (AI)
55
+ algorithms. It has been a longstanding research challenge
56
+ to achieve this goal, to enable machines to read, write, and
57
+ communicate like humans [5].
58
+ Technically, language modeling (LM) is one of the major
59
+ approaches to advancing language intelligence of machines.
60
+ In general, LM aims to model the generative likelihood
61
+ of word sequences, so as to predict the probabilities of
62
+ future (or missing) tokens. The research of LM has received
63
+ extensive attention in the literature, which can be divided
64
+ into four major development stages:
65
+ • Statistical language models (SLM). SLMs [6–9] are developed based on statistical learning methods that rose in
66
+ the 1990s. The basic idea is to build the word prediction
67
+ model based on the Markov assumption, e.g., predicting the
68
+ next word based on the most recent context. The SLMs with
69
+ a fixed context length n are also called n-gram language
70
+ models, e.g., bigram and trigram language models. SLMs
71
+ have been widely applied to enhance task performance
72
+ in information retrieval (IR) [10, 11] and natural language
73
+ processing (NLP) [12–14]. However, they often suffer from
74
+ the curse of dimensionality: it is difficult to accurately
75
+ estimate high-order language models since an exponential
76
+ number of transition probabilities need to be estimated.
77
+ Thus, specially designed smoothing strategies such as backoff estimation [15] and Good–Turing estimation [16] have
78
+ been introduced to alleviate the data sparsity problem.
79
+ • Neural language models (NLM). NLMs [1, 17, 18] characterize the probability of word sequences by neural networks,
80
+ e.g., multi-layer perceptron (MLP) and recurrent neural networks (RNNs). As a remarkable contribution, the work in
81
+ [1] introduced the concept of distributed representation of
82
+ words and built the word prediction function conditioned
83
+ on the aggregated context features (i.e., the distributed
84
+ word vectors). By extending the idea of learning effective
85
+ features for text data, a general neural network approach
86
+ was developed to build a unified, end-to-end solution for
87
+ various NLP tasks [2]. Furthermore, word2vec [19, 20] was
88
+ proposed to build a simplified shallow neural network
89
+ for learning distributed word representations, which were
90
+ demonstrated to be very effective across a variety of NLP
91
+ tasks. These studies have initiated the use of language
92
+ models for representation learning (beyond word sequence
93
+ modeling), having an important impact on the field of NLP.
94
+ • Pre-trained language models (PLM). As an early attempt, ELMo [21] was proposed to capture context-aware
95
+ word representations by first pre-training a bidirectional
96
+ LSTM (biLSTM) network (instead of learning fixed word
97
+ representations) and then fine-tuning the biLSTM network
98
+ according to specific downstream tasks. Furthermore, based
99
+ on the highly parallelizable Transformer architecture [22]
100
+ with self-attention mechanisms, BERT [23] was proposed by
101
+ pre-training bidirectional language models with specially
102
+ designed pre-training tasks on large-scale unlabeled corpora. These pre-trained context-aware word representations
103
+ are very effective as general-purpose semantic features,
104
+ which have largely raised the performance bar of NLP
105
+ tasks. This study has inspired a large number of follow-up
106
+ work, which sets the “pre-training and fine-tuning” learning
107
+ paradigm. Following this paradigm, a great number of studies on PLMs have been developed, introducing either different architectures [24, 25] (e.g., GPT-2 [26] and BART [24]) or
108
+ improved pre-training strategies [27–29]. In this paradigm, it
109
+ often requires fine-tuning the PLM for adapting to different
110
+ downstream tasks.
111
+ • Large language models (LLM). Researchers find that
112
+ scaling PLM (e.g., scaling model size or data size) often
113
+ leads to an improved model capacity on downstream tasks
114
+ (i.e., following the scaling law [30]). A number of studies
115
+ have explored the performance limit by training an ever
116
+ larger PLM (e.g., the 175B-parameter GPT-3 and the 540B-parameter PaLM). Although scaling is mainly conducted
117
+ in model size (with similar architectures and pre-training
118
+ tasks), these large-sized PLMs display different behaviors
119
+ from smaller PLMs (e.g., 330M-parameter BERT and 1.5B-parameter GPT-2) and show surprising abilities (called emergent abilities [31]) in solving a series of complex tasks. For
120
+ example, GPT-3 can solve few-shot tasks through in-context
121
+ learning, whereas GPT-2 cannot do well. Thus, the research
122
+ community coins the term “large language models (LLM)”
124
+ for these large-sized PLMs [32–35], which attract increasing
125
+ research attention (See Figure 1). A remarkable application
126
+ of LLMs is ChatGPT
127
+ that adapts the LLMs from the GPT
128
+ series for dialogue, which presents an amazing conversation
129
+ ability with humans. We can observe a sharp increase of the
130
+ arXiv papers that are related to LLMs after the release of
131
+ ChatGPT in Figure 1.
132
+ As discussed before, language model is not a new technical concept specially for LLMs, but has evolved with the
133
+ advance of artificial intelligence over the decades. Early language models mainly aim to model and generate text data,
134
+ while latest language models (e.g., GPT-4) focus on complex
135
+ task solving. From language modeling to task solving, it is an
136
+ important leap in scientific thinking, which is the key to
137
+ understand the development of language models in the research history. From the perspective of task solving, the four
138
+ generations of language models have exhibited different levels of model capacities. In Figure 2, we describe the evolution process of language models in terms of the task solving
139
+ capacity. At first, statistical language models mainly assisted
140
+ in some specific tasks (e.g., retrieval or speech tasks), in
141
+ which the predicted or estimated probabilities can enhance
142
+ the performance of task-specific approaches. Subsequently,
143
+ neural language models focused on learning task-agnostic
144
+ representations (e.g., features), aiming to reduce the efforts
145
+ for human feature engineering. Furthermore, pre-trained
146
+ language models learned context-aware representations that
147
+ can be optimized according to downstream tasks. For the
148
+ latest generation of language model, LLMs are enhanced by
149
+ exploring the scaling effect on model capacity, which can be
150
+ considered as general-purpose task solvers. To summarize,
151
+ in the evolution process, the task scope that can be solved
152
+ by language models have been greatly extended, and the
153
+ task performance attained by language models have been
154
+ significantly enhanced.
155
+ In the existing literature, PLMs have been widely discussed and surveyed [36–39], while LLMs are seldom reviewed in a systematic way. To motivate our survey, we first
156
+ highlight three major differences between LLMs and PLMs.
157
+ First, LLMs display some surprising emergent abilities that
158
+ may not be observed in previous smaller PLMs. These abilities are key to the performance of language models on complex tasks, making AI algorithms unprecedently powerful
159
+ and effective. Second, LLMs would revolutionize the way
160
+ that humans develop and use AI algorithms. Unlike small
161
+ PLMs, the major approach to accessing LLMs is through
162
+ the prompting interface (e.g., GPT-4 API). Humans have to
163
+ understand how LLMs work and format their tasks in a way
164
+ that LLMs can follow. Third, the development of LLMs no
165
+ longer draws a clear distinction between research and engineering. The training of LLMs requires extensive practical
166
+ experiences in large-scale data processing and distributed
167
+ parallel training. To develop capable LLMs, researchers
168
+ have to solve complicated engineering issues, working with
169
+ engineers or being engineers.
170
+ Nowadays, LLMs are posing a significant impact on
171
+ the AI community, and the advent of ChatGPT and GPT-4
172
+ leads to the rethinking of the possibilities of artificial general
173
+ intelligence (AGI). OpenAI has published a technical article
174
+ entitled “Planning for AGI and beyond”, which discusses
175
+ the short-term and long-term plans to approach AGI [40],
176
+ and a more recent paper has argued that GPT-4 might be
177
+ considered as an early version of an AGI system [41]. The
178
+ research areas of AI are being revolutionized by the rapid
179
+ progress of LLMs. In the field of NLP, LLMs can serve as a
180
+ general-purpose language task solver (to some extent), and
181
+ the research paradigm has been shifting towards the use
182
+ of LLMs. In the field of IR, traditional search engines are
183
+ challenged by the new information seeking way through AI
184
+ chatbots (i.e., ChatGPT), and New Bing presents an initial
185
+ attempt that enhances the search results based on LLMs. In
186
+ the field of CV, the researchers try to develop ChatGPT-like
187
+ vision-language models that can better serve multimodal
188
+ dialogues [42–45], and GPT-4 [46] has supported multimodal input by integrating the visual information. This new
189
+ wave of technology would potentially lead to a prosperous
190
+ ecosystem of real-world applications based on LLMs. For
191
+ instance, Microsoft 365 is being empowered by LLMs (i.e.,
192
+ Copilot) to automate the office work, and OpenAI supports
193
+ the use of plugins in ChatGPT for implementing special
194
+ functions.
195
+ Despite the progress and impact, the underlying principles of LLMs are still not well explored. Firstly, it is
196
+ mysterious why emergent abilities occur in LLMs, instead of
197
+ smaller PLMs. As a more general issue, there lacks a deep,
198
+ detailed investigation of the key factors that contribute to
199
+ the superior abilities of LLMs. It is important to study when
200
+ and how LLMs obtain such abilities [47]. Although there are
201
+ some meaningful discussions about this problem [31, 47],
202
+ more principled investigations are needed to uncover the
203
+ “secrets” of LLMs. Secondly, it is difficult for the research
204
+ community to train capable LLMs. Due to the huge demand of computation resources, it is very costly to carry
205
+ out repetitive, ablating studies for investigating the effect
206
+ of various strategies for training LLMs. Indeed, LLMs are
207
+ mainly trained by industry, where many important training
208
+ details (e.g., data collection and cleaning) are not revealed
209
+ to the public. Thirdly, it is challenging to align LLMs with
210
+ human values or preferences. Despite the capacities, LLMs
211
+ are also likely to produce toxic, fictitious, or harmful contents. It requires effective and efficient control approaches
212
+ to eliminating the potential risk of the use of LLMs [46].
213
+ Faced with both opportunities and challenges, it needs
214
+ more attention on the research and development of LLMs. In
215
+ order to provide a basic understanding of LLMs, this survey
216
+ conducts a literature review of the recent advances in LLMs
217
+ from four major aspects, including pre-training (how to pretrain a capable LLM), adaptation (how to effectively adapt
218
+ pre-trained LLMs for better use), utilization (how to use
219
+ LLMs for solving various downstream tasks) and capability
220
+ evaluation (how to evaluate the abilities of LLMs and existing
221
+ empirical findings). We thoroughly comb the literature and
222
+ summarize the key findings, techniques, and methods of
223
+ LLMs. For this survey, we also create a GitHub project
224
+ website by collecting the supporting resources for LLMs, at
225
+ the link https://github.com/RUCAIBox/LLMSurvey. We
226
+ are also aware of several related review articles on PLMs
227
+ or LLMs [32, 36, 38, 39, 43, 48–54]. These papers either
228
+ discuss PLMs or some specific (or general) aspects of LLMs.
229
+ Compared with them, we focus on the techniques and
230
+ methods to develop and use LLMs and provide a relatively
231
+ comprehensive reference to important aspects of LLMs.
232
+ The remainder of this survey is organized as follows:
233
+ Section 2 introduces the background for LLMs and the evolution of GPT-series models, followed by the summarization
234
+ of available resources for developing LLMs in Section 3.
235
+ Sections 4, 5, 6, and 7 review and summarize the recent
236
+ progress from the four aspects of pre-training, adaptation,
237
+ utilization, and capacity evaluation, respectively. Then, Section 8 discusses the practical guide for prompt design,
238
+ and Section 9 reviews the applications of LLMs in several
239
+ representative domains. Finally, we conclude the survey in
240
+ Section 10 by summarizing the major findings and discuss
241
+ the remaining issues for future work.
242
+ '''
243
+
244
+
245
+ abstract_generator = AbstractGenerator(Global_pipeline)
246
+ with_lora = abstract_generator.generate(title, intro, mode='lora')
247
+ with_test = abstract_generator.generate(title, intro, mode='test')
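The `__main__` demo above needs a GPU and the LoRA adapters. To exercise the calling convention without them, one option is a stub pipeline; everything in this sketch is invented for illustration (a stand-in, not part of the project), and it assumes `AbstractGenerator` is importable from this module:

```python
class StubPipeline:
    """Mimics the slice of the transformers pipeline API that AbstractGenerator uses."""
    class _Model:
        def set_adapter(self, name):
            print(f"adapter -> {name}")  # stand-in for LoRA adapter switching
    model = _Model()

    def __call__(self, messages, max_new_tokens=None):
        # Return the nested structure the real pipeline produces.
        return [{"generated_text": messages + [
            {"role": "assistant", "content": "Abstract: This survey ..."}]}]

generator = AbstractGenerator(StubPipeline())
print(generator.generate("A Survey of Large Language Models", "intro text", mode="lora"))
```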
src/demo/survey_generation_pipeline/asg_clustername.py ADDED
@@ -0,0 +1,228 @@
1
+ import os
2
+ import pandas as pd
3
+ import re # Import the regular expressions module
4
+ from openai import OpenAI
5
+ import ast
6
+
7
+ def generate_cluster_name_qwen_sep(tsv_path, survey_title):
8
+ data = pd.read_csv(tsv_path, sep='\t')
9
+
10
+ # Define the system prompt once, outside the loop
11
+ system_prompt = f'''You are a research assistant working on a survey paper. The survey paper is about "{survey_title}". \
12
+ '''
13
+
14
+ result = [] # Initialize the result list
15
+
16
+ for i in range(3): # Assuming labels are 0, 1, 2
17
+ sentence_list = [] # Reset sentence_list for each label
18
+ for j in range(len(data)):
19
+ if data['label'][j] == i:
20
+ sentence_list.append(data['retrieval_result'][j])
21
+
22
+ # Convert the sentence list to a string representation
23
+ user_prompt = f'''
24
+ Given a list of descriptions of sentences about an aspect of the survey, you need to use one phrase (within 8 words) to summarize it and treat it as a section title of your survey paper. \
25
+ Your response should be a list with only one element and without any other information, for example, ["Post-training of LLMs"] \
26
+ Your response must contain one keyword of the survey title, unspecified or irrelevant results are not allowed. \
27
+ The description list is:{sentence_list}'''
28
+
29
+ messages = [
30
+ {"role": "system", "content": system_prompt},
31
+ {"role": "user", "content": user_prompt},
32
+ ]
33
+
34
+ openai_api_key = os.getenv("OPENAI_API_KEY")
35
+ openai_api_base = os.getenv("OPENAI_API_BASE")
36
+ client = OpenAI(
37
+ api_key=openai_api_key,
38
+ base_url=openai_api_base,
39
+ )
40
+
41
+ chat_response = client.chat.completions.create(
42
+ model=os.environ.get("MODEL"),
43
+ max_tokens=768,
44
+ temperature=0.5,
45
+ stop="<|im_end|>",
46
+ stream=True,
47
+ messages=messages
48
+ )
49
+
50
+ # Stream the response to a single text string
51
+ text = ""
52
+ for chunk in chat_response:
53
+ if chunk.choices[0].delta.content:
54
+ text += chunk.choices[0].delta.content
55
+
56
+ # Use regex to extract the first content within []
57
+ match = re.search(r'\[(.*?)\]', text)
58
+ if match:
59
+ cluster_name = match.group(1).strip() # Extract and clean the cluster name
60
+ # Strip any surrounding quotes from the cluster name, if present
61
+ cluster_name = cluster_name.strip('"').strip("'")
62
+ result.append(cluster_name)
63
+ else:
64
+ result.append("No Cluster Name Found") # Handle cases where pattern isn't found
65
+ # print("The generated cluster names are:")
66
+ # print(result)
67
+ return result # This will be a list with three elements
68
+
69
+ # Example usage:
70
+ # result = generate_cluster_name_qwen_sep('path_to_your_file.tsv', 'Your Survey Title')
71
+ # print(result) # Output might look like ["Cluster One", "Cluster Two", "Cluster Three"]
72
+
73
+ def refine_cluster_name(cluster_names, survey_title):
74
+ cluster_names = str(cluster_names) # Convert to string to handle list input
75
+ # Define the system prompt to set the context
76
+ system_prompt = f'''You are a research assistant tasked with optimizing and refining a set of section titles for a survey paper. The survey paper is about "{survey_title}".
77
+ '''
78
+
79
+ # Construct the user prompt, including all cluster names
80
+ user_prompt = f'''
81
+ Here is a set of section titles generated for the survey topic "{survey_title}":
82
+ {cluster_names}
83
+ Please ensure that all cluster names are coherent and consistent with each other, and that each name is clear, concise, and accurately reflects the corresponding section.
84
+ Make sure to remove overlapping information between the cluster names.
85
+ Each cluster name should be within 8 words and include a keyword from the survey title.
86
+ Respond with a list of section titles in the following format, without any other irrelevant information,
87
+ For example, ["Refined Title 1", "Refined Title 2", "Refined Title 3"]
88
+ '''
89
+
90
+ messages = [
91
+ {"role": "system", "content": system_prompt},
92
+ {"role": "user", "content": user_prompt},
93
+ ]
94
+
95
+ # Initialize OpenAI client
96
+ openai_api_key = os.getenv("OPENAI_API_KEY")
97
+ openai_api_base = os.getenv("OPENAI_API_BASE")
98
+ client = OpenAI(
99
+ api_key=openai_api_key,
100
+ base_url=openai_api_base,
101
+ )
102
+
103
+ try:
104
+ chat_response = client.chat.completions.create(
105
+ model=os.environ.get("MODEL"),
106
+ max_tokens=256,
107
+ temperature=0.5,
108
+ stop="<|im_end|>",
109
+ stream=True,
110
+ messages=messages
111
+ )
112
+
113
+ # Stream the response and concatenate into a complete text
114
+ text = ""
115
+ for chunk in chat_response:
116
+ if chunk.choices[0].delta.content:
117
+ text += chunk.choices[0].delta.content
118
+
119
+ # print("The raw response text is:")
120
+ # print(text)
121
+
122
+ # Use regex to extract content within square brackets
123
+ match = re.search(r'\[(.*?)\]', text)
124
+ if match:
125
+ refined_cluster_names = ast.literal_eval(match.group(0)) # Parse the bracketed "[...]" match into a Python list
126
+ else:
127
+ refined_cluster_names = [
128
+ survey_title + ": Definition",
129
+ survey_title + ": Methods",
130
+ survey_title + ": Evaluation"
131
+ ] # Handle cases where pattern isn't found
132
+
133
+ except Exception as e:
134
+ print(f"An error occurred while refining cluster names: {e}")
135
+ refined_cluster_names = ["Refinement Error"] * len(cluster_names)
136
+
138
+
139
+ # print("The refined cluster names are:")
140
+ # print(refined_cluster_names)
141
+ return refined_cluster_names # Returns a list with the refined cluster names
142
+
143
+
144
+
145
+
146
+ def generate_cluster_name_new(tsv_path, survey_title, cluster_num = 3):
147
+ data = pd.read_csv(tsv_path, sep='\t')
148
+ desp = [] # per-cluster description lists
149
+
150
+
151
+ for i in range(cluster_num): # Iterate over cluster labels 0..cluster_num-1
152
+ sentence_list = [] # Initialize the sentence list
153
+ for j in range(len(data)):
154
+ if data['label'][j] == i:
155
+ sentence_list.append(data['retrieval_result'][j])
156
+ desp.append(sentence_list)
157
+
158
+ system_prompt = f'''
159
+ You are a research assistant working on a survey paper. The survey paper is about "{survey_title}". '''
160
+
161
+ cluster_info = "\n".join([f'Cluster {i+1}: "{desp[i]}"' for i in range(cluster_num)])
162
+
163
+ user_prompt = f'''
164
+ Your task is to generate {cluster_num} distinctive cluster names (e.g., "Pre-training of LLMs") of the given clusters of reference papers, each reference paper is described by a sentence.
165
+
166
+ The clusters of reference papers are:
167
+ {cluster_info}
168
+
169
+ Your output should be a single list of {cluster_num} cluster names, e.g., ["Pre-training of LLMs", "Fine-tuning of LLMs", "Evaluation of LLMs"]
170
+ Do not output any other text or information.
171
+ '''
172
+
173
+ messages = [
174
+ {"role": "system", "content": system_prompt},
175
+ {"role": "user", "content": user_prompt},
176
+ ]
177
+
178
+ openai_api_key = os.getenv("OPENAI_API_KEY")
179
+ openai_api_base = os.getenv("OPENAI_API_BASE")
180
+ client = OpenAI(
181
+ api_key=openai_api_key,
182
+ base_url=openai_api_base,
183
+ )
184
+
185
+ chat_response = client.chat.completions.create(
186
+ model=os.environ.get("MODEL"),
187
+ max_tokens=768,
188
+ temperature=0.5,
189
+ stop="<|im_end|>",
190
+ stream=True,
191
+ messages=messages
192
+ )
193
+
194
+ # Stream the response to a single text string
195
+ text = ""
196
+ for chunk in chat_response:
197
+ if chunk.choices[0].delta.content:
198
+ text += chunk.choices[0].delta.content
199
+ # print("The raw response text is:")
200
+ # print(text)
201
+
202
+ # Use regex to extract content within square brackets
203
+ match = re.search(r'\[(.*?)\]', text)
204
+ if match:
205
+ refined_cluster_names = ast.literal_eval(match.group(0)) # Parse the bracketed "[...]" match into a Python list
206
+ else:
207
+ predefined_sections = [
208
+ "Definition", "Methods", "Evaluation", "Applications",
209
+ "Challenges", "Future Directions", "Comparisons", "Case Studies"
210
+ ]
211
+
212
+ # Select the first cluster_num predefined section names
213
+ refined_cluster_names = [
214
+ f"{survey_title}: {predefined_sections[i]}" for i in range(cluster_num)
215
+ ]
216
+
218
+
219
+ # print("The refined cluster names are:")
220
+ # print(refined_cluster_names)
221
+ return refined_cluster_names # Returns a list with the refined cluster names
222
+
223
+
224
+ if __name__ == "__main__":
225
+ refined_result = refine_cluster_name(["Pre-training of LLMs", "Fine-tuning of LLMs", "Evaluation of LLMs"], 'Survey of LLMs')
226
+ # print(refined_result)
227
+
228
+
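All three helpers in this file share one parsing idiom: find the first bracketed list in the streamed response and turn it into a Python list, with a fallback when the model strays from the requested format. A standalone, defensive version of that idiom (a sketch, not the project's code verbatim) is:

```python
import ast
import re

def parse_bracketed_list(text, fallback):
    """Extract the first [...] span from an LLM response and parse it as a list."""
    match = re.search(r'\[.*?\]', text, re.DOTALL)
    if not match:
        return fallback
    try:
        return list(ast.literal_eval(match.group(0)))  # brackets kept for literal_eval
    except (ValueError, SyntaxError):
        return fallback

print(parse_bracketed_list('Sure! ["A", "B", "C"]', fallback=["Definition"]))
```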