#!/bin/bash

# Create the results directory
RESULTS_DIR="./clustering_results"
mkdir -p "$RESULTS_DIR"

# Database path
DB_PATH="/home/dyvm6xra/dyvm6xrauser11/workspace/projects/HKU/Chatbot/Data/database"
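# (Absolute path from the original environment; adjust to your own setup if needed.)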

# Log file for this run
LOG_FILE="${RESULTS_DIR}/experiments_$(date +%Y%m%d_%H%M%S).log"

# Function: run an experiment and append its output to the log
run_experiment() {
    echo "Running experiment: $1" | tee -a "$LOG_FILE"
    echo "Command: $2" | tee -a "$LOG_FILE"
    echo "Start time: $(date)" | tee -a "$LOG_FILE"

    # Run the command and tee its output into the log
    eval "$2" | tee -a "$LOG_FILE"

    echo "End time: $(date)" | tee -a "$LOG_FILE"
    echo "==========================================" | tee -a "$LOG_FILE"
    echo "" | tee -a "$LOG_FILE"
}
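# Usage sketch (illustrative only): run_experiment "<human-readable label>" "<full command string>"
# The command string is executed via eval, so pass it as a single quoted argument.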

echo "================ 聚类实验 ================" | tee -a $LOG_FILE
echo "开始时间: $(date)" | tee -a $LOG_FILE
echo "==========================================" | tee -a $LOG_FILE
echo "" | tee -a $LOG_FILE

# ==== Experiment 1: PCA-only reduction + different clustering methods ====
# # PCA(50) + HDBSCAN
# run_experiment "PCA(50) + HDBSCAN" \
#     "python cluster_topic_exp.py --name pca50_hdbscan --dim_reduction pca --pca_components 50 --clustering hdbscan --db_path $DB_PATH --output_dir $RESULTS_DIR --use_gpu"

# PCA(4) + KMEANS (auto-search for the best K)
run_experiment "PCA(4) + KMEANS (auto-search for the best K)" \
    "python cluster_topic_exp.py --name pca4_kmeans_auto --dim_reduction pca --pca_components 4 --clustering kmeans --kmeans_min_k 4 --kmeans_max_k 31 --kmeans_step 2 --db_path $DB_PATH --output_dir $RESULTS_DIR --use_gpu"


# # ==== Experiment 2: UMAP-only reduction + different clustering methods ====
# # UMAP(2, n_neighbors=50) + HDBSCAN
# run_experiment "UMAP(2, n_neighbors=50) + HDBSCAN" \
#     "python cluster_topic_exp.py --name umap2_nn50_hdbscan --dim_reduction umap --umap_components 2 --umap_neighbors 50 --umap_min_dist 0.2 --clustering hdbscan --db_path $DB_PATH --output_dir $RESULTS_DIR --use_gpu"

# # UMAP(2, n_neighbors=30) + HDBSCAN
# run_experiment "UMAP(2, n_neighbors=30) + HDBSCAN" \
#     "python cluster_topic_exp.py --name umap2_nn30_hdbscan --dim_reduction umap --umap_components 2 --umap_neighbors 30 --umap_min_dist 0.2 --clustering hdbscan --db_path $DB_PATH --output_dir $RESULTS_DIR --use_gpu"

# # UMAP(2, n_neighbors=50) + KMEANS (auto-search for the best K)
# run_experiment "UMAP(2, n_neighbors=50) + KMEANS (auto-search for the best K)" \
#     "python cluster_topic_exp.py --name umap2_nn50_kmeans_auto --dim_reduction umap --umap_components 2 --umap_neighbors 50 --umap_min_dist 0.2 --clustering kmeans --db_path $DB_PATH --output_dir $RESULTS_DIR --use_gpu --kmeans_min_k 10 --kmeans_max_k 210 --kmeans_step 20"


# run_experiment "UMAP(4, n_neighbors=50) + KMEANS(自动寻找最佳K)" \
#     "python cluster_topic_exp.py --name umap4_nn50_kmeans_auto --dim_reduction umap --umap_components 4 --umap_neighbors 50 --umap_min_dist 0.2 --clustering kmeans --db_path $DB_PATH --output_dir $RESULTS_DIR --use_gpu --kmeans_min_k 10 --kmeans_max_k 210 --kmeans_step 20"

# run_experiment "UMAP(16, n_neighbors=50) + KMEANS(自动寻找最佳K)" \
#     "python cluster_topic_exp.py --name umap16_nn50_kmeans_auto --dim_reduction umap --umap_components 16 --umap_neighbors 50 --umap_min_dist 0.2 --clustering kmeans --db_path $DB_PATH --output_dir $RESULTS_DIR --use_gpu --kmeans_min_k 10 --kmeans_max_k 210 --kmeans_step 20"

# run_experiment "UMAP(32, n_neighbors=50) + KMEANS(自动寻找最佳K)" \
#     "python cluster_topic_exp.py --name umap32_nn50_kmeans_auto --dim_reduction umap --umap_components 32 --umap_neighbors 50 --umap_min_dist 0.2 --clustering kmeans --db_path $DB_PATH --output_dir $RESULTS_DIR --use_gpu --kmeans_min_k 10 --kmeans_max_k 210 --kmeans_step 20"


# # UMAP(100, n_neighbors=50) + KMEANS (auto-search for the best K)
# run_experiment "UMAP(100, n_neighbors=50) + KMEANS (auto-search for the best K)" \
#     "python cluster_topic_exp.py --name umap100_nn50_kmeans_auto --dim_reduction umap --umap_components 100 --umap_neighbors 50 --umap_min_dist 0.2 --clustering kmeans --db_path $DB_PATH --output_dir $RESULTS_DIR --use_gpu"

# # UMAP(64, n_neighbors=50) + KMEANS (auto-search for the best K)
# run_experiment "UMAP(64, n_neighbors=50) + KMEANS (auto-search for the best K)" \
#     "python cluster_topic_exp.py --name umap64_nn50_kmeans_auto --dim_reduction umap --umap_components 64 --umap_neighbors 50 --umap_min_dist 0.2 --clustering kmeans --db_path $DB_PATH --output_dir $RESULTS_DIR --use_gpu"

# # ==== Experiment 3: two-step PCA+UMAP reduction + different clustering methods ====
# PCA(50) + UMAP(2) + HDBSCAN(min_cluster_size=100)
# run_experiment "PCA(50) + UMAP(2) + HDBSCAN(min_cluster_size=100)" \
#     "python cluster_topic_exp.py --name pca50_umap2_hdbscan100 --dim_reduction pca_umap --pca_components 50 --umap_components 2 --clustering hdbscan --hdbscan_min_cluster_size 100 --hdbscan_min_samples 10 --db_path $DB_PATH --output_dir $RESULTS_DIR --use_gpu"

# # PCA(50) + UMAP(2) + HDBSCAN(min_cluster_size=50)
# run_experiment "PCA(50) + UMAP(2) + HDBSCAN(min_cluster_size=50)" \
#     "python cluster_topic_exp.py --name pca50_umap2_hdbscan50 --dim_reduction pca_umap --pca_components 50 --umap_components 2 --clustering hdbscan --hdbscan_min_cluster_size 50 --hdbscan_min_samples 5 --db_path $DB_PATH --output_dir $RESULTS_DIR --use_gpu"

# # PCA(50) + UMAP(2) + KMEANS (auto-search for the best K)
# run_experiment "PCA(50) + UMAP(2) + KMEANS (auto-search for the best K)" \
#     "python cluster_topic_exp.py --name pca50_umap2_kmeans_auto --dim_reduction pca_umap --pca_components 50 --umap_components 2 --clustering kmeans --db_path $DB_PATH --output_dir $RESULTS_DIR --use_gpu"

# # PCA(100) + UMAP(2) + HDBSCAN
# run_experiment "PCA(100) + UMAP(2) + HDBSCAN" \
#     "python cluster_topic_exp.py --name pca100_umap2_hdbscan --dim_reduction pca_umap --pca_components 100 --umap_components 2 --clustering hdbscan --db_path $DB_PATH --output_dir $RESULTS_DIR --use_gpu"

# # ==== Experiment 4: different UMAP parameters + HDBSCAN ====
# # PCA(50) + UMAP(2, min_dist=0.1) + HDBSCAN
# run_experiment "PCA(50) + UMAP(2, min_dist=0.1) + HDBSCAN" \
#     "python cluster_topic_exp.py --name pca50_umap2_md01_hdbscan --dim_reduction pca_umap --pca_components 50 --umap_components 2 --umap_min_dist 0.1 --clustering hdbscan --db_path $DB_PATH --output_dir $RESULTS_DIR --use_gpu"

# # PCA(50) + UMAP(2, min_dist=0.5) + HDBSCAN
# run_experiment "PCA(50) + UMAP(2, min_dist=0.5) + HDBSCAN" \
#     "python cluster_topic_exp.py --name pca50_umap2_md05_hdbscan --dim_reduction pca_umap --pca_components 50 --umap_components 2 --umap_min_dist 0.5 --clustering hdbscan --db_path $DB_PATH --output_dir $RESULTS_DIR --use_gpu"

# ==== Experiment 5: results analysis script ====
echo "All experiments finished; generating the analysis script..." | tee -a "$LOG_FILE"

cat > "${RESULTS_DIR}/analyze_results.py" << 'EOL'
#!/usr/bin/env python
import os
import sys
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate

# Results directory
results_dir = "./clustering_results"

# Load all experiment results
results = []
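# Each experiment is expected to write a "<name>_results.json" file into the
# results directory (assumed output convention of cluster_topic_exp.py).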
for filename in os.listdir(results_dir):
    if filename.endswith("_results.json"):
        with open(os.path.join(results_dir, filename), 'r') as f:
            try:
                data = json.load(f)
                results.append(data)
            except json.JSONDecodeError:
                print(f"无法解析: {filename}")

if not results:
    print("No result files found")
    sys.exit(1)

# Extract key metrics into a DataFrame
data = []
for res in results:
    row = {
        "Experiment": res.get("experiment_name", "unknown"),
        "DimReduction": res.get("parameters", {}).get("dimension_reduction", {}).get("method", "unknown"),
        "Components": res.get("parameters", {}).get("dimension_reduction", {}).get("n_components", "unknown"),
        "Clustering": res.get("parameters", {}).get("clustering", {}).get("method", "unknown"),
    }

    # Add clustering-specific parameters
    if row["Clustering"] == "hdbscan":
        row["min_cluster_size"] = res.get("parameters", {}).get("clustering", {}).get("min_cluster_size", "unknown")
        row["min_samples"] = res.get("parameters", {}).get("clustering", {}).get("min_samples", "unknown")
    elif row["Clustering"] == "kmeans":
        row["n_clusters"] = res.get("parameters", {}).get("clustering", {}).get("n_clusters", "unknown")

    # Add UMAP-specific parameters
    if row["DimReduction"] == "umap":
        row["umap_n_neighbors"] = res.get("parameters", {}).get("dimension_reduction", {}).get("umap_n_neighbors", "unknown")
        row["umap_min_dist"] = res.get("parameters", {}).get("dimension_reduction", {}).get("umap_min_dist", "unknown")

    # Add metrics
    row["Num Clusters"] = res.get("metrics", {}).get("n_clusters", "unknown")
    row["Noise Ratio"] = res.get("metrics", {}).get("noise_ratio", "unknown")
    row["Silhouette"] = res.get("metrics", {}).get("silhouette_score", "unknown")
    row["CH Index"] = res.get("metrics", {}).get("calinski_harabasz_score", "unknown")
    row["Best K"] = res.get("metrics", {}).get("best_k", "N/A")

    data.append(row)

df = pd.DataFrame(data)

# Generate the results report
print("=" * 80)
print("Experiment Results Analysis")
print("=" * 80)

# Print summary table
print("\nExperiment results overview:")
print(tabulate(df, headers="keys", tablefmt="pipe", showindex=False))

# Save to Excel
excel_file = os.path.join(results_dir, "experiment_results.xlsx")
df.to_excel(excel_file, index=False)
print(f"\nDetailed results saved to: {excel_file}")

# Plot silhouette score comparison
plt.figure(figsize=(12, 6))
sns.barplot(x="Experiment", y="Silhouette", data=df)
plt.xticks(rotation=90)
plt.title("Silhouette score by experiment")
plt.tight_layout()
plt.savefig(os.path.join(results_dir, "silhouette_comparison.png"))

# Plot Calinski-Harabasz index comparison
plt.figure(figsize=(12, 6))
sns.barplot(x="Experiment", y="CH Index", data=df)
plt.xticks(rotation=90)
plt.title("Calinski-Harabasz index by experiment")
plt.tight_layout()
plt.savefig(os.path.join(results_dir, "ch_index_comparison.png"))

print("\n分析图表已生成")
EOL

# Make the generated analysis script executable
chmod +x "${RESULTS_DIR}/analyze_results.py"

echo "实验全部完成!" | tee -a $LOG_FILE
echo "总结果保存在: ${RESULTS_DIR}" | tee -a $LOG_FILE
echo "您可以运行以下命令分析结果:" | tee -a $LOG_FILE
echo "python ${RESULTS_DIR}/analyze_results.py" | tee -a $LOG_FILE