File size: 71,859 Bytes
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
 
 
 
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
 
 
 
 
 
 
 
a97d040
 
 
92d8c87
a97d040
92d8c87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
 
a97d040
92d8c87
 
 
 
 
 
 
 
 
 
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
 
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
 
a97d040
92d8c87
 
a97d040
92d8c87
 
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
a97d040
 
 
92d8c87
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
a97d040
 
 
 
 
 
 
 
 
92d8c87
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
a97d040
 
 
 
 
92d8c87
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
a97d040
92d8c87
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92d8c87
 
 
 
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
from __future__ import unicode_literals
import sys

from django.shortcuts import render
from django.http import HttpResponse
from django.views.decorators.csrf import csrf_exempt
from django.core.files.uploadedfile import InMemoryUploadedFile
import os
import json
import requests
import time
import pandas as pd
import shutil
import traceback
from io import BytesIO

import hashlib
import re
import os
import csv
import xml.etree.ElementTree as ET
import urllib.parse

from django.http import JsonResponse
from django.http import HttpResponse
from django.views.decorators.csrf import csrf_exempt
from django.core.files.storage import default_storage

# from .parse import DocumentLoading
from .asg_retriever import legal_pdf, process_pdf, query_embeddings_new_new
from .asg_generator import generate,generate_sentence_patterns
from .asg_outline import OutlineGenerator,generateOutlineHTML_qwen, generateSurvey_qwen_new
from .asg_clustername import generate_cluster_name_new
from .postprocess import generate_references_section
from .asg_query import generate_generic_query_qwen, generate_query_qwen
from .asg_add_flowchart import insert_ref_images, detect_flowcharts
from .asg_mindmap import generate_graphviz_png, insert_outline_image
from .asg_latex import tex_to_pdf, insert_figures, md_to_tex, preprocess_md
# from .survey_generator_api import ensure_all_papers_cited
import glob

from langchain_huggingface import HuggingFaceEmbeddings
from dotenv import load_dotenv
from pathlib import Path
from markdown_pdf import MarkdownPdf, Section
import tempfile
from .path_utils import get_path

# Locate the package-local .env file.
# NOTE(review): dotenv_path is computed but NOT passed to load_dotenv(), so
# load_dotenv() searches from the current working directory instead — confirm
# whether load_dotenv(dotenv_path) was intended.
dotenv_path = os.path.join(os.path.dirname(__file__), ".env")
load_dotenv()
# # Print all environment variables (optional, for debugging)
# print("All environment variables:", os.environ)

# # Read specific environment variables
# openai_api_key = os.getenv("OPENAI_API_KEY")
# openai_api_base = os.getenv("OPENAI_API_BASE")

# # Print the values that were read
# print(f"OPENAI_API_KEY: {openai_api_key}")
# print(f"OPENAI_API_BASE: {openai_api_base}")

# Resolve the storage path configuration.
# NOTE(review): paths_config duplicates DATA_PATH below and is never read
# again — confirm it can be removed.
paths_config = get_path('pdf')  # use the get_path helper to obtain the path configuration
DATA_PATH = get_path('pdf')
TXT_PATH = get_path('txt')
TSV_PATH = get_path('tsv')
MD_PATH = get_path('md')
INFO_PATH = get_path('info')
IMG_PATH = get_path('img')

paths = [DATA_PATH, TXT_PATH, TSV_PATH, MD_PATH, INFO_PATH, IMG_PATH]

# Create the working directories safely at import time.
for path in paths:
    try:
        path_obj = Path(path)
        if not path_obj.exists():
            path_obj.mkdir(parents=True, exist_ok=True)
            print(f"Created directory: {path}")
        else:
            print(f"Directory already exists: {path}")
    except (PermissionError, OSError) as e:
        print(f"Warning: Could not create directory {path}: {e}")
        # On Hugging Face Spaces the app directory can be read-only; fall back
        # to a fresh temporary directory for the path that failed.
        if os.environ.get('SPACE_ID') or os.environ.get('HF_SPACE_ID'):
            temp_dir = tempfile.mkdtemp()
            # Rebind the affected *_PATH module global to the temp location.
            # NOTE(review): the fallback subdirectory itself is never created
            # here, and the `paths` list still holds the old value — confirm
            # downstream consumers tolerate both.
            if 'pdf' in path:
                DATA_PATH = os.path.join(temp_dir, 'pdf/')
            elif 'txt' in path:
                TXT_PATH = os.path.join(temp_dir, 'txt/')
            elif 'tsv' in path:
                TSV_PATH = os.path.join(temp_dir, 'tsv/')
            elif 'md' in path:
                MD_PATH = os.path.join(temp_dir, 'md/')
            elif 'info' in path:
                INFO_PATH = os.path.join(temp_dir, 'info/')
            elif 'img' in path:
                IMG_PATH = os.path.join(temp_dir, 'img/')
            print(f"Using temporary directory: {temp_dir}")


# Hard-coded demo surveys: survey id -> survey title.
Survey_dict = {
    '2742488' : 'Energy Efficiency in Cloud Computing',
    '2830555' : 'Cache Management for Real-Time Systems',
    '2907070' : 'Predictive Modeling on Imbalanced Data',
    '3073559' : 'Malware Detection with Data Mining',
    '3274658' : 'Analysis of Handwritten Signature'
}


# Topic keywords per demo survey (used to match sentences against the topic).
Survey_Topic_dict = {
    '2742488' : ['energy'],
    '2830555' : ['cache'],
    '2907070' : ['imbalanced'],
    '3073559' : ['malware', 'detection'],
    '3274658' : ['handwritten', 'signature']
}


# Number of reference clusters per demo survey.
Survey_n_clusters = {
    '2742488' : 3,
    '2830555' : 3,
    '2907070' : 3,
    '3073559' : 3,
    '3274658' : 2
}

# Mutable module-level state shared across the view functions below.
# NOTE(review): this makes the app stateful per worker process — concurrent
# requests will interleave these values; confirm single-request usage.
Global_survey_id = ""
Global_survey_title=""
Global_ref_list = []
Global_category_description = []
Global_category_label = []
Global_df_selected = ""
Global_test_flag = True
Global_collection_names = []
Global_collection_names_clustered = []
Global_file_names=[]
Global_description_list = []
Global_pipeline = None
Global_cluster_names = []
Global_citation_data = []
Global_cluster_num = 4


# Sentence embedder shared by the PDF-processing pipeline (loaded once at import).
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Imported here, after the globals above, rather than at the top of the file.
# NOTE(review): presumably to avoid a circular import — confirm.
from demo.category_and_tsne import clustering

class reference_collection(object):
    """Wraps a references DataFrame and matches query titles against it.

    The wrapped DataFrame must contain a ``ref_title`` column. Matching is
    case-insensitive, but matched rows are returned with their original
    (un-lowered) content.
    """

    def __init__(
            self,
            input_df
    ):
        # DataFrame of candidate reference papers, one row per paper.
        self.input_df = input_df

    def full_match_with_entries_in_pd(self, query_paper_titles):
        """Return (matched_rows, count) for exact case-insensitive title matches."""
        entries_in_pd = self.input_df.copy()
        entries_in_pd['ref_title'] = entries_in_pd['ref_title'].apply(str.lower)
        query_paper_titles = [i.lower() for i in query_paper_titles]

        # Index the ORIGINAL frame with the lower-cased mask so the returned
        # rows keep their original casing.
        matched_entries = self.input_df[entries_in_pd['ref_title'].isin(query_paper_titles)]
        return matched_entries, matched_entries.shape[0]

    # select the sentences that can match with the topic words
    def match_ref_paper(self, query_paper_titles, match_mode='full', match_ratio=70):
        """Match *query_paper_titles* against the collection.

        Only ``match_mode='full'`` (exact, case-insensitive) is implemented.
        ``match_ratio`` is reserved for a future fuzzy mode and is unused.

        Raises:
            ValueError: for any unsupported ``match_mode`` (previously this
                raised an obscure ``UnboundLocalError``).
        """
        if match_mode != 'full':
            raise ValueError(f"Unsupported match_mode: {match_mode!r}")
        matched_entries, matched_num = self.full_match_with_entries_in_pd(query_paper_titles)
        return matched_entries, matched_num

def generate_uid():
    """Return a 10-character hex UID derived from the current timestamp."""
    digest = hashlib.sha1(str(time.time()).encode('utf-8')).hexdigest()
    return digest[:10]

def index(request):
    """Render the demo landing page."""
    return render(request, 'demo/index.html')

def delete_files(request):
    """Clear all files and sub-directories from the working data folders.

    POST only. Aborts and reports the first deletion error encountered;
    returns ``{'success': True}`` once every folder has been emptied.
    """
    if request.method != 'POST':
        return JsonResponse({'success': False, 'message': 'Invalid request method'})
    try:
        # Dynamic paths (module globals) rather than hard-coded locations.
        for folder in (DATA_PATH, TSV_PATH, TXT_PATH, MD_PATH):
            if not os.path.exists(folder):
                continue
            for entry in os.listdir(folder):
                entry_path = os.path.join(folder, entry)
                try:
                    if os.path.isfile(entry_path) or os.path.islink(entry_path):
                        os.unlink(entry_path)
                    elif os.path.isdir(entry_path):
                        shutil.rmtree(entry_path)
                except Exception as e:
                    # Stop at the first failing entry and surface the error.
                    return JsonResponse({'success': False, 'message': str(e)})
        return JsonResponse({'success': True})
    except Exception as e:
        return JsonResponse({'success': False, 'message': str(e)})

def clean_str(input_str):
    """Lower-case and normalise *input_str*; '', 'none' and 'nan' map to ''."""
    input_str = str(input_str).strip().lower()
    if input_str in ("none", "nan") or not input_str:
        return ""
    # Ordered (old, new) pairs; the repeated entries are intentional — each
    # pass can expose new occurrences (e.g. runs of dots collapsing).
    replacements = [
        ('\\n', ' '), ('\n', ' '), ('\r', ' '),
        ('——', ' '), ('——', ' '), ('__', ' '), ('__', ' '),
        ('........', '.'), ('....', '.'), ('....', '.'),
        ('..', '.'), ('..', '.'), ('..', '.'),
        ('. . . . . . . . ', '. '), ('. . . . ', '. '), ('. . . . ', '. '),
        ('. . ', '. '), ('. . ', '. '),
    ]
    for old, new in replacements:
        input_str = input_str.replace(old, new)
    # Strip literal "\uXXXX" escape sequences left by bad decoding.
    input_str = re.sub(r'\\u[0-9a-z]{4}', ' ', input_str)
    return input_str.replace('  ', ' ').replace('  ', ' ')

def PosRank_get_top5_ngrams(input_pd):
    """Extract top-5 uni-, bi- and tri-gram keyphrases from each abstract.

    Uses PosRank restricted to nouns, proper nouns and adjectives.
    NOTE(review): ``PosRank`` (from the ``pke`` library) is not imported in
    this file — confirm the import exists elsewhere before calling this.

    Args:
        input_pd: DataFrame with an ``abstract`` column of plain text.

    Returns:
        Three parallel lists (one entry per input row): the top-5 unigrams,
        bigrams and trigrams, each a list of keyphrases longer than 2 chars.
        (The previous version also built ``intro_top5_*`` lists that were
        never populated or returned; that dead code has been removed.)
    """
    pos = {'NOUN', 'PROPN', 'ADJ'}
    extractor = PosRank()

    abs_top5_unigram_list_list = []
    abs_top5_bigram_list_list = []
    abs_top5_trigram_list_list = []

    def _top5_ngrams(n_words):
        # Re-run selection/weighting on the shared extractor for exactly
        # n_words-grams and return its top-5 cleaned keyphrases.
        # NOTE(review): all n-gram sizes reuse ONE extractor instance (as the
        # original code did via aliases) — confirm candidate_selection fully
        # resets extractor state in pke.
        extractor.candidate_selection(maximum_word_number=n_words,
                                      minimum_word_number=n_words)
        extractor.candidate_weighting(window=6, pos=pos, normalized=False)
        top = []
        for keyphrase, _score in extractor.get_n_best(n=5, stemming=True):
            keyphrase = keyphrase.replace('-', '')
            if len(keyphrase) > 2:
                top.append(keyphrase)
        return top

    for _line_index, pd_row in input_pd.iterrows():
        # Hyphens are stripped up front so keyphrases come back unhyphenated.
        input_str = pd_row["abstract"].replace('-', '')
        extractor.load_document(input=input_str, language='en', normalization=None)

        abs_top5_unigram_list_list.append(_top5_ngrams(1))
        abs_top5_bigram_list_list.append(_top5_ngrams(2))
        abs_top5_trigram_list_list.append(_top5_ngrams(3))

    return abs_top5_unigram_list_list, abs_top5_bigram_list_list, abs_top5_trigram_list_list

def process_file(file_name, survey_id, mode):
    """Run the PDF pipeline on one saved file using the shared embedder.

    Returns the (collection_name, document_name) pair taken from the first
    and last elements of the pipeline result.
    """
    global embedder
    pipeline_result = process_pdf(file_name, survey_id, embedder, mode)
    return pipeline_result[0], pipeline_result[-1]

def sanitize_filename_py(filename):
    """Normalise a filename for storage.

    Both the base name and the extension are lower-cased, every
    non-alphanumeric run becomes a single space, and the first word of each
    part is capitalised. Hidden files (leading dot) keep the dot; names
    without a dot are treated as extension-less.
    """
    def _clean(segment):
        # lower-case -> non-alphanumerics to spaces -> collapse whitespace
        # -> strip -> capitalise only the first word.
        segment = re.sub(r'\s+', ' ', re.sub(r'[^a-z0-9]', ' ', segment.lower())).strip()
        words = segment.split(' ')
        if not words:
            return ''
        return ' '.join([words[0].capitalize()] + words[1:])

    dot = filename.rfind('.')
    if dot == -1:
        # No extension at all.
        return _clean(filename)
    if dot == 0:
        # Hidden file such as ".bashrc".
        return '.' + _clean(filename[1:])
    # Regular "name.ext": sanitise both halves around the LAST dot.
    return _clean(filename[:dot]) + '.' + _clean(filename[dot + 1:])

def get_existing_survey_ids():
    """Return survey ids inferred from *.tsv filenames in the tsv directory.

    Any error while listing the directory is printed and an empty list is
    returned (best-effort behaviour, matching the original).
    """
    tsv_directory = get_path('tsv')
    try:
        # Strip the trailing ".tsv" (4 chars) to recover the survey id.
        return [name[:-4] for name in os.listdir(tsv_directory) if name.endswith(".tsv")]
    except Exception as e:
        print("Error reading tsv directory:", e)
        return []

def get_surveys(request):
    """Return all known survey ids as a JSON payload."""
    return JsonResponse({'surveys': get_existing_survey_ids()})

@csrf_exempt
def upload_refs(request):
    """Ingest uploaded reference PDFs, run each through the PDF pipeline,
    and write the per-survey TSV and title-abstract JSON artifacts.

    POST only; also sweeps in any PDFs left in the "recommend_pdfs" folder
    by a previous recommendation step, then deletes that folder.
    Responds with a JSON payload describing the processed files.

    NOTE(review): a non-POST request falls off the end of the function and
    returns None (Django 500) — confirm whether an explicit error response
    is intended.
    """

    start_time = time.time()
    RECOMMENDED_PDF_DIR = get_path('pdf', 'recommend_pdfs')
    if request.method == 'POST':
        # A request with no uploads is acceptable only when recommended PDFs
        # are already waiting on disk.
        if not request.FILES:
            if not os.path.exists(RECOMMENDED_PDF_DIR):
                return JsonResponse({'error': 'No file part'}, status=400)


        is_valid_submission = True
        has_label_id = False  # NOTE(review): never set True below, so the tsv label column is always blank — confirm intended
        has_ref_link = False  # NOTE(review): assigned but never used

        filenames = []
        collection_names = []  # NOTE(review): unused; collections go to Global_collection_names instead
        filesizes = []
        file_dict = request.FILES  # overwritten by the .copy() below

        global Global_survey_id
        global Global_test_flag
        global Global_collection_names
        global Global_survey_title
        global Global_file_names

        Global_survey_title = request.POST.get('topic', False)
        process_pdf_mode = request.POST.get('mode', False)
        file_dict = request.FILES.copy()
        # Fold any previously recommended PDFs into the upload dict as
        # in-memory uploads, then remove the staging folder.
        if os.path.exists(RECOMMENDED_PDF_DIR):
            for pdf_name in os.listdir(RECOMMENDED_PDF_DIR):
                if pdf_name.endswith(".pdf"):
                    pdf_path = os.path.join(RECOMMENDED_PDF_DIR, pdf_name)

                    pdf_content = BytesIO()
                    with open(pdf_path, 'rb') as f:
                        shutil.copyfileobj(f, pdf_content)
                    pdf_content.seek(0)

                    uploaded_pdf = InMemoryUploadedFile(
                        pdf_content,
                        field_name="file",
                        name=pdf_name,
                        content_type="application/pdf",
                        size=os.path.getsize(pdf_path),
                        charset=None
                    )

                    file_dict[f"recommend_{pdf_name}"] = uploaded_pdf

            shutil.rmtree(RECOMMENDED_PDF_DIR)

        # Pick the survey id: an existing one, a user-supplied custom id, or
        # a generated one (fixed 'test_4' while Global_test_flag is on).
        survey_id_choice = request.POST.get('survey_id')
        if survey_id_choice == "new":
            custom_survey_id = request.POST.get('custom_survey_id', '').strip()
            if custom_survey_id:
                Global_survey_id = custom_survey_id
            else:
                Global_survey_id = 'test_4' if Global_test_flag else generate_uid()
        else:
            Global_survey_id = survey_id_choice
        uid_str = Global_survey_id

        # Save and process every uploaded file; duplicates (same sanitized
        # name) are skipped.
        for file_name in file_dict:
            file = file_dict[file_name]
            if not file.name:
                return JsonResponse({'error': 'No selected file'}, status=400)
            if file:
                sanitized_filename = sanitize_filename_py(os.path.splitext(file.name)[0])
                file_extension = os.path.splitext(file.name)[1].lower()
                if sanitized_filename in filenames:
                    continue
                sanitized_filename = f"{sanitized_filename}{file_extension}"

                file_path = os.path.join(get_path('pdf', Global_survey_id), sanitized_filename)
                if default_storage.exists(file_path):
                    default_storage.delete(file_path)

                saved_file_name = default_storage.save(file_path, file)
                file_size = round(float(file.size) / 1024000, 2)  # size in MB (decimal-ish divisor)

                collection_name, processed_file = process_file(saved_file_name, Global_survey_id, process_pdf_mode)
                Global_collection_names.append(collection_name)
                Global_file_names.append(processed_file)
                filenames.append(processed_file)
                filesizes.append(file_size)
                print(filenames)
                print(filesizes)

        new_file_name = Global_survey_id
        # NOTE(review): csvfile_name is never used below, and file_name here
        # leaks from the loop above (NameError if file_dict was empty) —
        # confirm this line can be dropped.
        csvfile_name = new_file_name + '.'+ file_name.split('.')[-1]

        json_data_pd = pd.DataFrame()
        json_files_path = get_path('txt', Global_survey_id) + '/*.json'
        json_files = glob.glob(json_files_path)

        # Dictionary to hold title and abstract pairs
        title_abstract_dict = {}
        # Keep only the JSON sidecars that correspond to files processed in
        # THIS request.
        filtered_json_files = [
            json_file for json_file in json_files
            if os.path.splitext(os.path.basename(json_file))[0] in filenames
        ]
        ref_paper_num = len(filtered_json_files)
        print(f'The length of the json files is {ref_paper_num}')

        # Iterate over each JSON file
        for file_path in filtered_json_files:
            with open(file_path, 'r', encoding= "utf-8") as file:
                data = json.load(file)

                # Extract necessary information
                title = data.get("title", "")
                abstract = data.get("abstract", "")
                authors = data.get("authors", "")
                introduction = data.get("introduction", "")

                new_data = {
                    "reference paper title": title,
                    "reference paper citation information (can be collected from Google scholar/DBLP)": authors,
                    "reference paper abstract (Please copy the text AND paste here)": abstract,
                    "reference paper introduction (Please copy the text AND paste here)": introduction,
                    "reference paper doi link (optional)": "",
                    "reference paper category label (optional)": ""
                }

                new_data_df = pd.DataFrame([new_data])
                json_data_pd = pd.concat([json_data_pd, new_data_df], ignore_index=True)
                title_abstract_dict[title] = abstract

        input_pd = json_data_pd
        output_path = get_path('txt', Global_survey_id, 'title_abstract_pairs.json')
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        with open(output_path, 'w', encoding="utf-8") as outfile:
            json.dump(title_abstract_dict, outfile, indent=4, ensure_ascii=False)

        print(f'Title-abstract pairs have been saved to {output_path}')

        if ref_paper_num>0:

            print('The filenames are:', filenames)
            print('The json files are:', filtered_json_files)
            # NOTE(review): this pairs filenames with JSON rows by position;
            # it assumes both lists are the same length and order — confirm.
            input_pd['ref_title'] = [filename for filename in filenames]
            input_pd["ref_context"] = [""]*ref_paper_num
            input_pd["ref_entry"] = input_pd["reference paper citation information (can be collected from Google scholar/DBLP)"]
            input_pd["abstract"] = input_pd["reference paper abstract (Please copy the text AND paste here)"].apply(lambda x: clean_str(x) if len(str(x))>0 else 'Invalid abstract')
            input_pd["intro"] = input_pd["reference paper introduction (Please copy the text AND paste here)"].apply(lambda x: clean_str(x) if len(str(x))>0 else 'Invalid introduction')

            input_pd["label"] = input_pd["reference paper category label (optional)"].apply(lambda x: str(x) if len(str(x))>0 else '')

            try:
                output_tsv_filename = get_path('tsv', filename=new_file_name + '.tsv')
                os.makedirs(os.path.dirname(output_tsv_filename), exist_ok=True)

                output_df = input_pd[["ref_title","ref_context","ref_entry","abstract","intro"]]

                if has_label_id == True:
                    output_df["label"]=input_pd["label"]
                else:
                    output_df["label"]=[""]*input_pd.shape[0]

                output_df.to_csv(output_tsv_filename, sep='\t')
            except:
                print("Cannot output tsv")
                is_valid_submission = False

        else:
            is_valid_submission = False

        if is_valid_submission == True:
            ref_ids = [i for i in range(output_df['ref_title'].shape[0])]
            ref_list = {
                        'ref_ids':ref_ids,
                        'is_valid_submission':is_valid_submission,
                        "uid":uid_str,
                        "tsv_filename":output_tsv_filename,
                        # 'topic_words': clusters_topic_words,
                        'filenames': filenames,
                        'filesizes': filesizes,
                        'survey_id': Global_survey_id
                        }

        else:
            # NOTE(review): if ref_paper_num == 0 the try block above never
            # ran, so output_tsv_filename is unbound here and this line
            # raises UnboundLocalError — confirm and fix.
            ref_list = {'ref_ids':[],'is_valid_submission':is_valid_submission,"uid":uid_str,"tsv_filename":output_tsv_filename, 'filenames': filenames, 'filesizes': filesizes, 'survey_id': Global_survey_id}
        ref_list = json.dumps(ref_list)
        print("--- %s seconds used in processing files ---" % (time.time() - start_time))
        return HttpResponse(ref_list)

@csrf_exempt
def generate_arxiv_query(request):
    """Build an arXiv search query for a topic and return matching papers.

    POST body: ``{"topic": "..."}``. Generates a strict query via
    generate_query_qwen; if fewer than 10 unique papers are found, relaxes
    the query up to 5 times with generate_generic_query_qwen. Responds with
    ``{"papers": [...], "count": N}`` or an error.
    """
    def search_arxiv_with_query(query, max_results=50):
        """Query the arXiv Atom API; return a list of paper dicts
        (title/summary/pdf_link/arxiv_id), or [] on any HTTP/parse error."""
        encoded_query = urllib.parse.quote_plus(query)
        url = f"https://export.arxiv.org/api/query?search_query={encoded_query}&start=0&max_results={max_results}&sortBy=submittedDate"

        response = requests.get(url)
        if response.status_code != 200:
            print(f"Error fetching data with query: {query} | status code: {response.status_code}")
            return []

        try:
            root = ET.fromstring(response.text)
        except Exception as e:
            print("Error parsing XML:", e)
            return []

        # Atom XML namespace used by the arXiv API.
        ns = "{http://www.w3.org/2005/Atom}"
        entries = root.findall(f"{ns}entry")
        papers = []
        for entry in entries:
            title_elem = entry.find(f"{ns}title")
            title = title_elem.text.strip() if title_elem is not None else ""
            summary_elem = entry.find(f"{ns}summary")
            summary_text = summary_elem.text.strip() if summary_elem is not None else ""
            link_elem = entry.find(f"{ns}id")
            link_text = link_elem.text.strip() if link_elem is not None else ""
            # The Atom <id> ends with the arXiv id; build the PDF URL from it.
            arxiv_id = link_text.split('/')[-1]
            pdf_link = f"https://arxiv.org/pdf/{arxiv_id}.pdf"

            papers.append({
                "title": title,
                "summary": summary_text,
                "pdf_link": pdf_link,
                "arxiv_id": arxiv_id
            })

        return papers
    if request.method == 'POST':
        try:
            data = json.loads(request.body)
            topic = data.get('topic', '').strip()
            if not topic:
                return JsonResponse({'error': 'Topic is required.'}, status=400)
            max_results = 50
            min_results = 10
            # First pass: strict query generated from the topic.
            strict_query = generate_query_qwen(topic)
            papers_strict = search_arxiv_with_query(strict_query, max_results=max_results)

            # De-duplicate by title.
            total_papers = {paper["title"]: paper for paper in papers_strict}

            if len(total_papers) >= min_results:
                papers_list = list(total_papers.values())  # dict -> list

                return JsonResponse({
                    "papers": papers_list,  # e.g. [{"title": "...", "summary": "...", "pdf_link": "...", "arxiv_id": "..."}]
                    "count": len(papers_list),
                }, status=200)

            attempts = 0
            MAX_ATTEMPTS = 5
            current_query = strict_query  # track the query used in the current round

            while len(total_papers) < min_results and attempts < MAX_ATTEMPTS:
                # Generate a looser query from the previous one.
                generic_query = generate_generic_query_qwen(current_query, topic)
                papers_generic = search_arxiv_with_query(generic_query, max_results=max_results)

                # Merge the new results (still de-duplicated by title).
                new_count = 0
                for paper in papers_generic:
                    if paper["title"] not in total_papers:
                        total_papers[paper["title"]] = paper
                        new_count += 1

                attempts += 1
                current_query = generic_query  # use this round's relaxed query as the next round's base

                if len(total_papers) >= min_results:
                    # Enough papers collected — return immediately.
                    papers_list = list(total_papers.values())  # dict -> list

                    return JsonResponse({
                        "papers": papers_list,  # e.g. [{"title": "...", "summary": "...", "pdf_link": "...", "arxiv_id": "..."}]
                        "count": len(papers_list),
                    }, status=200)

            return JsonResponse({
                'error': f'Not enough references found even after {attempts} attempts.',
                'count': len(total_papers),
            }, status=400)

        except Exception as e:
            return JsonResponse({'error': str(e)}, status=500)

    return JsonResponse({'error': 'Invalid request method.'}, status=405)

@csrf_exempt
def download_pdfs(request):
    """Download a batch of PDFs given parallel lists of links and titles.

    POST body (JSON): {"pdf_links": [...], "pdf_titles": [...]}.
    Each PDF is saved under get_path('pdf', 'recommend_pdfs') using a
    sanitized title as the file name; returns the list of saved paths.
    Individual download failures are logged and skipped, not fatal.
    """
    def clean_filename(filename):
        # Strip surrounding whitespace/newlines, then drop characters that
        # are illegal in file names on common filesystems.
        filename = filename.strip()
        filename = re.sub(r'[\\/*?:"<>|\n\r]', '', filename)
        return filename

    if request.method == "POST":
        try:
            data = json.loads(request.body)
            pdf_links = data.get("pdf_links", [])
            pdf_titles = data.get("pdf_titles", [])  # parallel list of PDF titles
            print(pdf_links)

            if not pdf_links:
                return JsonResponse({"message": "No PDFs to download."}, status=400)

            base_dir = get_path('pdf', 'recommend_pdfs')
            os.makedirs(base_dir, exist_ok=True)  # make sure the folder exists

            downloaded_files = []
            for i, pdf_url in enumerate(pdf_links):
                try:
                    # timeout so one dead server cannot hang the whole request;
                    # the context manager closes the streamed connection.
                    with requests.get(pdf_url, stream=True, timeout=30) as response:
                        if response.status_code == 200:
                            # Build a legal file name; fall back to an
                            # index-based name when no usable title exists.
                            sanitized_title = clean_filename(pdf_titles[i]) if i < len(pdf_titles) else f"file_{i}"
                            if not sanitized_title:
                                sanitized_title = f"file_{i}"
                            pdf_filename = os.path.join(base_dir, f"{sanitized_title}.pdf")

                            # Stream the PDF to disk chunk by chunk.
                            with open(pdf_filename, "wb") as pdf_file:
                                for chunk in response.iter_content(chunk_size=1024):
                                    if chunk:  # skip keep-alive chunks
                                        pdf_file.write(chunk)

                            downloaded_files.append(pdf_filename)
                            print(f"Success: {pdf_filename}")
                        else:
                            print(f"Failed to download {pdf_url}")

                except Exception as e:
                    # Best effort: log and continue with the next link.
                    print(f"Error downloading {pdf_url}: {e}")

            print("Download finished")
            return JsonResponse({"message": f"Downloaded {len(downloaded_files)} PDFs successfully!", "files": downloaded_files})

        except json.JSONDecodeError:
            return JsonResponse({"message": "Invalid JSON data."}, status=400)
        except Exception as e:
            return JsonResponse({"message": "An error occurred.", "error": str(e)}, status=500)

    return JsonResponse({"message": "Invalid request method."}, status=405)
@csrf_exempt
def annotate_categories(request):
    """Render the current survey's outline as annotated HTML."""
    outline_html = generateOutlineHTML_qwen(Global_survey_id)
    print("The outline has been parsed successfully.")
    return JsonResponse({'html': outline_html})

@csrf_exempt
def get_topic(request):
    """Look up references for the submitted topic and remember the topic globally."""
    global Global_survey_id
    topic = request.POST.get('topics', False)
    references, ref_links, ref_ids = get_refs(topic)
    # The topic doubles as the survey id for the rest of the pipeline.
    Global_survey_id = topic
    payload = {
        'references': references,
        'ref_links': ref_links,
        'ref_ids': ref_ids,
    }
    return HttpResponse(json.dumps(payload))

@csrf_exempt
def automatic_taxonomy(request):
    """Cluster the selected references and generate the survey outline.

    Reads the selected reference indices ("refs") and a taxonomy standard
    ("taxonomy_standard") from the POST data, describes each embedded
    collection against that standard, clusters the selected references,
    names the clusters, persists citation/cluster info to disk, and finally
    generates the outline.  Results are communicated both through the HTTP
    response and through many module-level globals (declared below).
    """
    global Global_description_list, Global_df_selected, Global_cluster_names, Global_ref_list, Global_category_label, Global_collection_names_clustered, Global_cluster_num
    refs_json = request.POST.get("refs")
    ref_list = json.loads(refs_json)
    ref_list = [int(item) for item in ref_list]
    print(ref_list)
    query = request.POST.get("taxonomy_standard")

    query_list = generate_sentence_patterns(query)

    # Describe every collection with respect to the taxonomy query and
    # accumulate the citation contexts that support each description.
    for name in Global_collection_names:
        context, citation_data = query_embeddings_new_new(name, query_list)
        Global_citation_data.extend(citation_data)

        description = generate(context, query, name)
        Global_description_list.append(description)

    # Persist the citation data collected above.
    citation_path = get_path('info', Global_survey_id, 'citation_data.json')
    os.makedirs(os.path.dirname(citation_path), exist_ok=True)
    with open(citation_path, 'w', encoding='utf-8') as f:
        json.dump(Global_citation_data, f, ensure_ascii=False, indent=2)

    # Path of the survey's TSV file.
    # NOTE(review): file_path is computed but never used below — confirm
    # whether this read step was removed intentionally.
    file_path = get_path('tsv', Global_survey_id + '.tsv')

    Global_ref_list = ref_list

    print('Categorization survey id', Global_survey_id)

    colors, category_label =  Clustering_refs(n_clusters=Global_cluster_num)
    Global_category_label = category_label

    # Group the clustered references by label: per-cluster titles and
    # per-cluster positional indices.
    df_tmp = Global_df_selected.reset_index()
    df_tmp['index'] = df_tmp.index
    ref_titles = list(df_tmp.groupby(df_tmp['label'])['ref_title'].apply(list))
    ref_indexs = list(df_tmp.groupby(df_tmp['label'])['index'].apply(list))

    # NOTE(review): category_label from Clustering_refs is overwritten here
    # with the KeyBERT labels, and both are superseded by the summarized
    # names below — confirm the intermediate assignments are still needed.
    info = pd.read_json(get_path('info', Global_survey_id, 'topic.json'))
    category_label = info['KeyBERT'].to_list()
    category_label_summarized=[]

    tsv_path = get_path('tsv', Global_survey_id + '.tsv')

    # Ask the model for human-readable cluster names.
    cluster_num = Global_cluster_num
    category_label_summarized = generate_cluster_name_new(tsv_path, Global_survey_title, cluster_num)
    Global_cluster_names = category_label_summarized

    cate_list = {
        'colors': colors,
        'category_label': category_label_summarized,
        'survey_id': Global_survey_id,
        'ref_titles': [[i.title() for i in j] for j in ref_titles],
        'ref_indexs': ref_indexs
    }
    print(cate_list)
    cate_list = json.dumps(cate_list)

    # Map cluster name -> sanitized per-reference identifiers, persist it,
    # and mirror the grouping into the global clustered-collections list.
    cluster_info = {category_label_summarized[i]:ref_titles[i] for i in range(len(category_label_summarized))}
    for key, value in cluster_info.items():
        temp = [legal_pdf(i) for i in value]
        cluster_info[key] = temp
        Global_collection_names_clustered.append(temp)
    cluster_info_path = get_path('info', Global_survey_id, 'cluster_info.json')
    with open(cluster_info_path, 'w', encoding="utf-8") as outfile:
        json.dump(cluster_info, outfile, indent=4, ensure_ascii=False)


    # Generate and persist the outline for the clustered survey.
    outline_generator = OutlineGenerator(Global_df_selected, Global_cluster_names)
    outline_generator.get_cluster_info()
    messages, outline = outline_generator.generate_outline_qwen(Global_survey_title, Global_cluster_num)

    outline_json = {'messages':messages, 'outline': outline}
    output_path = get_path('txt', Global_survey_id, 'outline.json')
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding="utf-8") as outfile:
        json.dump(outline_json, outfile, indent=4, ensure_ascii=False)

    return HttpResponse(cate_list)

@csrf_exempt
def save_updated_cluster_info(request):
    """Persist the user-edited cluster layout to cluster_info_updated.json.

    POST body (JSON): {"updated_cate_list": {..., "ref_indexs": [[...], ...]}}.
    The ref_indexs groups are mapped back onto Global_collection_names and
    stored alongside the payload under info/<survey_id>/.
    """
    global Global_collection_names
    if request.method == 'POST':
        try:
            data = json.loads(request.body)
            survey_id = Global_survey_id
            updated_cate_list = data.get('updated_cate_list')

            # Validate BEFORE dereferencing the payload: previously this check
            # ran after updated_cate_list.get(...), so a missing payload raised
            # AttributeError and surfaced as a 500 instead of the intended 400.
            if not survey_id or not updated_cate_list:
                return JsonResponse({"error": "Missing survey_id or updated_cate_list"}, status=400)

            # Rearrange the collection names to mirror the new grouping.
            ref_indexs = updated_cate_list.get("ref_indexs", [])
            rearranged_collection_names = [
                [Global_collection_names[index] for index in group] for group in ref_indexs
            ]
            updated_cate_list["collection_name"] = rearranged_collection_names

            save_dir = get_path('info', str(survey_id))
            os.makedirs(save_dir, exist_ok=True)
            save_path = os.path.join(save_dir, 'cluster_info_updated.json')

            with open(save_path, 'w', encoding='utf-8') as f:
                json.dump(updated_cate_list, f, ensure_ascii=False, indent=4)

            return JsonResponse({"message": "Cluster info updated and saved successfully!"}, status=200)
        except Exception as e:
            return JsonResponse({"error": str(e)}, status=500)
    else:
        return JsonResponse({"error": "Invalid request method. Only POST is allowed."}, status=405)

import os
import json
from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt
from django.conf import settings

@csrf_exempt
def save_outline(request):
    """Store the edited outline under txt/<survey_id>/outline.json and
    return the regenerated outline HTML."""
    if request.method != 'POST':
        return JsonResponse({"status": "error", "message": "Invalid request method"}, status=405)

    try:
        payload = json.loads(request.body)
        updated_outline = payload.get('outline', [])

        # Wrap the outline in the same message envelope the generator writes.
        outline_data = {
            "messages": [
                {
                    "role": "system",
                    "content": "Finish the outline of the survey paper..."
                },
                {
                    "role": "user",
                    "content": "Finish the outline..."
                }
            ],
            "outline": str(updated_outline)
        }

        file_path = get_path('txt', Global_survey_id, 'outline.json')
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(outline_data, file, indent=4, ensure_ascii=False)

        html = generateOutlineHTML_qwen(Global_survey_id)
        return JsonResponse({"status": "success", "html": html})
    except Exception as e:
        return JsonResponse({"status": "error", "message": str(e)}, status=400)

@csrf_exempt
def select_sections(request):
    sections = request.POST
    survey = {}

    for k,v in sections.items():
        survey['title'] = "A Survey of " + Survey_dict[Global_survey_id]

        if k == "abstract":
            survey['abstract'] = ["The issue of class imbalance is pervasive in various practical applications of machine learning and data mining, including information retrieval and filtering, and the detection of credit card fraud. The problem of imbalanced learning concerns the effectiveness of learning algorithms when faced with underrepresented data and severe class distribution skews. The classification of data with imbalanced class distribution significantly hinders the performance of most standard classifier learning algorithms that assume a relatively balanced class distribution and equal misclassification costs.",
                                  "In this survey, we present a comprehensive overview of predictive modeling on imbalanced data. We categorize existing literature into three clusters: Sampling approaches, Algorithmic approaches, and Meta-learning approaches, which we introduce in detail. Our aim is to provide readers with a thorough understanding of the different strategies proposed to tackle the class imbalance problem and evaluate their effectiveness in enhancing the performance of learning algorithms."]
        if k == "introduction":
            survey['introduction'] = [
              {
                'subtitle': 'Background',
                'content' : '''Class imbalance is a common problem in machine learning and data mining, where the distribution of classes in the training dataset is highly skewed, with one class being significantly underrepresented compared to the other(s). This issue is prevalent in many real-world applications, including fraud detection, medical diagnosis, anomaly detection, and spam filtering, to name a few.
                               \nThe problem of imbalanced data affects the performance of many learning algorithms, which typically assume a balanced class distribution and equal misclassification costs. When the data is imbalanced, standard learning algorithms tend to favor the majority class, resulting in low accuracy in predicting the minority class. This drawback can lead to serious consequences, such as false negative errors in fraud detection or misdiagnosis in medical applications.
                               \nTo address the class imbalance problem, various techniques have been proposed, including resampling methods, cost-sensitive learning, and ensemble methods, among others. Resampling methods involve creating synthetic samples or under/oversampling the minority/majority classes to balance the data. Cost-sensitive learning assigns different misclassification costs to different classes to prioritize the minority class's correct prediction. Ensemble methods combine multiple models to improve predictive performance.
                               \nThe effectiveness of these techniques varies depending on the dataset and problem at hand. Hence, it is crucial to conduct a comprehensive evaluation of the different approaches to identify the most suitable one for a specific application. As such, your survey paper aims to provide an overview of the current state-of-the-art predictive modeling techniques for imbalanced data and highlight their strengths and limitations.
                            '''
              },
             {
                'subtitle': 'Methodologies', # Sampling approaches, Algorithmic approaches, and Meta-learning approaches
                'content' : '''Exisiting works are mainly categorized into Sampling approaches, Algorithmic approaches, and Meta-learning approaches.
                              \nSampling approaches:
                              \nResampling techniques are among the most popular methods for handling imbalanced data. These techniques involve either oversampling the minority class or undersampling the majority class to create a more balanced dataset. Examples of oversampling methods include SMOTE (Synthetic Minority Over-sampling Technique), ADASYN (Adaptive Synthetic Sampling), and Borderline-SMOTE. Undersampling techniques include random undersampling and Tomek Links. Moreover, hybrid methods, which combine both oversampling and undersampling, have also been proposed.
                              \nAlgorithmic approaches:
                              \nAnother approach to address the class imbalance problem is to modify the learning algorithm itself. Examples of such algorithmic approaches include cost-sensitive learning, where different costs are assigned to different types of misclassifications. Another approach is to adjust the decision threshold of the classifier, where the threshold is shifted to increase sensitivity towards the minority class. Additionally, ensemble methods, such as bagging, boosting, and stacking, have been proposed to combine multiple classifiers to improve predictive performance.
                              \nMeta-learning approaches:
                              \nMeta-learning approaches aim to automatically select the most suitable sampling or algorithmic approach for a specific dataset and problem. These approaches involve training a meta-classifier on multiple base classifiers, each using a different sampling or algorithmic approach. The meta-classifier then selects the most appropriate approach based on the characteristics of the input dataset. Examples of meta-learning approaches include MetaCost, MetaCostNN, and RAkEL.
                              \nThese approaches have shown promising results in addressing the class imbalance problem. However, their effectiveness depends on the specific characteristics of the dataset and problem at hand. Therefore, a comprehensive evaluation of different approaches is necessary to identify the most suitable one for a particular application.
                            '''
             },
             {
                'subtitle': 'Reminder',
                'content' : 'The rest of the paper is organized as follows. In section 2, we introduce the class imbalance problem and its causes and characteristics. Evaluation metrics are addressed in section 3. Section 4 presents an overview of the existing techniques for handling imbalanced data. Applications is illustrated in Section 5. Section 6 shows challenges and open issues. Conclusion and future directions are in Section 7.'
             }
            ]

        if k == "c_and_c":
            survey['c_and_c'] = '''Imbalanced data is a common problem in many real-world applications of machine learning and data mining, where the distribution of classes is highly skewed, with one or more classes being significantly underrepresented compared to the others. This can occur due to various reasons, such as sampling bias, data collection limitations, class overlap, or natural class distribution. The causes of imbalanced data can differ across different domains and applications, and understanding them is essential for developing effective predictive modeling techniques.
                            \nIn addition to the causes, imbalanced data is characterized by several properties that make it challenging for traditional machine learning algorithms. Firstly, the data imbalance results in a class distribution bias, where the majority class dominates the data, and the minority class(es) are often overshadowed, leading to poor classification performance. Secondly, the imbalance can lead to an asymmetric misclassification cost, where misclassifying the minority class is often more costly than misclassifying the majority class, resulting in high false negative rates. Thirdly, imbalanced data can exhibit class overlap, where instances from different classes are difficult to distinguish, leading to low discriminative power of the features and classifiers. Finally, imbalanced data can pose challenges for model evaluation and comparison, as traditional performance metrics such as accuracy, precision, and recall, can be misleading or inadequate in imbalanced settings.
                            \nUnderstanding the causes and characteristics of imbalanced data is crucial for developing effective and efficient predictive modeling techniques that can handle such data. The next section of this survey will discuss the various approaches proposed in the literature to address the imbalanced learning problem, with a focus on sampling, algorithmic, and meta-learning approaches.
                            '''
        if k == "evaluation":
            survey['evaluation'] = '''Evaluation metrics are an essential aspect of machine learning and data mining, as they quantify the performance of predictive models on a given dataset. In the case of imbalanced data, traditional evaluation metrics such as accuracy, precision, and recall may not be sufficient or even appropriate due to the class imbalance and asymmetry in misclassification costs. Therefore, alternative metrics have been proposed to measure the performance of predictive models on imbalanced datasets.
                            \nOne commonly used evaluation metric for imbalanced data is the area under the receiver operating characteristic curve (AUC-ROC). The AUC-ROC is a measure of the model's ability to distinguish between positive and negative instances and is computed as the area under the curve of the ROC plot. The ROC plot is a graphical representation of the trade-off between true positive rate (TPR) and false positive rate (FPR) for different decision thresholds. A perfect classifier would have an AUC-ROC score of 1, while a random classifier would have a score of 0.5.
                            \nAnother popular evaluation metric for imbalanced data is the area under the precision-recall curve (AUC-PR). The AUC-PR measures the precision-recall trade-off of the model and is computed as the area under the curve of the precision-recall plot. The precision-recall plot shows the relationship between precision and recall for different decision thresholds. A perfect classifier would have an AUC-PR score of 1, while a random classifier would have a score proportional to the ratio of positive to negative instances.
                            \nOther evaluation metrics for imbalanced data include F-measure, geometric mean, balanced accuracy, and cost-sensitive measures such as weighted and cost-sensitive versions of traditional metrics. F-measure is a harmonic mean of precision and recall, which balances the trade-off between them. The geometric mean is another metric that balances TPR and FPR and is useful in highly imbalanced datasets. Balanced accuracy is the average of TPR and TNR (true negative rate) and is useful in datasets where the class imbalance is extreme. Cost-sensitive measures incorporate the cost of misclassification and can be tailored to the specific application domain.
                            \nChoosing an appropriate evaluation metric for imbalanced data is essential to avoid biased or misleading performance estimates. The selection of metrics should be based on the application requirements, the class distribution, and the misclassification costs. In the next section, we will discuss various sampling, algorithmic, and meta-learning approaches proposed in the literature to address the imbalanced learning problem and their associated evaluation metrics.
                            '''

        if k == "methodology":
            survey['methodology'] = [
                'Our survey categorized existing works into three types: Sampling approaches, Algorithmic approaches, and Meta-learning approaches. Sampling approaches involve oversampling or undersampling, while algorithmic approaches modify the learning algorithm itself. Meta-learning approaches aim to automatically select the most suitable approach based on the characteristics of the input dataset.',
                [{'subtitle': 'Sampling approaches',
                  'content': 'For sampling approaches, Batista, et al. [1] proposed a simple experimental design to assess the performance of class imbalance treatment methods.  E.A.P.A. et al. [2] performs a broad experimental evaluation involving ten methods, three of them proposed by the authors, to deal with the class imbalance problem in thirteen uci data sets.  Batuwita, et al. [3] presents a method to improve fsvms for cil (called fsvm-cil), which can be used to handle the class imbalance problem in the presence of outliers and noise.  V. et al. [4] implements a wrapper approach that computes the amount of under-sampling and synthetic generation of the minority class examples (smote) to improve minority class accuracy.  Chen, et al. [5] presents ranked minority oversampling in boosting (ramoboost), which is a ramo technique based on the idea of adaptive synthetic data generation in an ensemble learning system.  Chen, et al. [6] proposes a new feature selection method, feature assessment by sliding thresholds (fast), which is based on the area under a roc curve generated by moving the decision boundary of a single feature classifier with thresholds placed using an even-bin distribution.  Davis, et al. [7] shows that a deep connection exists between roc space and pr space, such that a curve dominates in roc space if and only if it dominates in pr space.  In classifying documents, the system combines the predictions of the learners by applying evolutionary techniques as well [8]. Ertekin, et al. [9] is concerns with the class imbalance problem which has been known to hinder the learning performance of classification algorithms.  Ertekin, et al. [10] demonstrates that active learning is capable of solving the problem.  Garcı́aÿ, et al. 
[11] analyzes a generalization of a new metric to evaluate the classification performance in imbalanced domains, combining some estimate of the overall accuracy with a plain index about how dominant the class with the highest individual accuracy is.  Ghasemi, et al. [12] proposes an active learning algorithm that can work when only samples of one class as well as a set of unlabeled data are available.  He, et al. [13] provides a comprehensive review of the development of research in learning from imbalanced data.  Li, et al. [14] proposes an oversampling method based on support degree in order to guide people to select minority class samples and generate new minority class samples.  Li, et al. [15] analyzes the intrinsic factors behind this failure and proposes a suitable re-sampling method.  Liu, et al. [16] proposes two algorithms to overcome this deficiency.  J. et al. [17] considers the application of these ensembles to imbalanced data : classification problems where the class proportions are significantly different.  Seiffert, et al. [18] presents a new hybrid sampling/boosting algorithm, called rusboost, for learning from skewed training data.  Song, et al. [19] proposes an improved adaboost algorithm called baboost (balanced adaboost), which gives higher weights to the misclassified examples from the minority class.  Sun, et al. [20] develops a cost-sensitive boosting algorithm to improve the classification performance of imbalanced data involving multiple classes.  Van et al. [21] presents a comprehensive suite of experimentation on the subject of learning from imbalanced data.  Wasikowski, et al. [22] presents a first systematic comparison of the three types of methods developed for imbalanced data classification problems and of seven feature selection metrics evaluated on small sample data sets from different applications.  an active under-sampling approach is proposed for handling the imbalanced problem in Yang, et al. [23]. Zhou, et al. 
[24] studies empirically the effect of sampling and threshold-moving in training cost-sensitive neural networks. \n'},
                 {'subtitle': 'Algorithmic approaches',
                  'content': 'For algorithmic approaches, Baccianella, et al. [25] proposed a simple way to turn standard measures for or into ones robust to imbalance.  Lin, et al. [26] applies a fuzzy membership to each input point and reformulate the svms such that different input points can make different constributions to the learning of decision surface. \n'},
                 {'subtitle': 'Meta-learning approaches',
                  'content': 'For meta-learning approaches, Drummond, et al. [27] proposed an alternative to roc representation, in which the expected cost of a classi er is represented explicitly.  Tao, et al. [28] develops a mechanism to overcome these problems.  Torgo et al. [29] presents a generalization of regression error characteristic (rec) curves.  C. et al. [30] demonstrates that class probability estimates attained via supervised learning in imbalanced scenarios systematically underestimate the probabilities for minority class instances, despite ostensibly good overall calibration.  Yoon, et al. [31] proposes preprocessing majority instances by partitioning them into clusters.  Zheng, et al. [32] investigates the usefulness of explicit control of that combination within a proposed feature selection framework.'}]]



        if k == "app":
            survey['app'] = '''The problem of imbalanced data is pervasive in many real-world applications of predictive modeling, where the data is often skewed towards one or more minority class or classes. Such applications include, but are not limited to, fraud detection in finance, rare disease diagnosis in healthcare, fault detection in manufacturing, spam filtering in email systems, and anomaly detection in cybersecurity. In these scenarios, accurately identifying the minority class instances is of utmost importance, as they often represent critical and rare events that have significant impact or consequences.
                            \nHowever, traditional classification algorithms tend to perform poorly on imbalanced datasets, since they are often biased towards the majority class due to its abundance in the data. This results in low accuracy, high false negative rates, and poor generalization performance, especially for the minority class(es) of interest. In addition, the cost of misclassifying the minority class is often much higher than that of the majority class, making it even more critical to achieve high accuracy and low false negative rates for these instances.
                            \nTo overcome the class imbalance problem, a variety of predictive modeling techniques have been proposed and developed in the literature, specifically designed to handle imbalanced datasets. These techniques range from simple preprocessing methods that adjust the class distribution, to more complex algorithmic modifications that incorporate class imbalance considerations into the learning process. The effectiveness of these techniques depends on the specific characteristics of the dataset and problem, and thus, their selection and evaluation require careful experimentation and analysis.
                            \nOverall, the development and application of predictive modeling techniques for imbalanced data is an active and important research area, with many practical and societal implications. Advancements in this field have the potential to improve the accuracy, efficiency, and fairness of many critical applications, and thus, benefit society as a whole.
                            '''

        if k == "app":
            survey['clg'] = '''Selecting the most appropriate sampling, algorithmic, or meta-learning approach for a specific dataset: There is no one-size-fits-all solution, and choosing the right approach can be challenging.
                            \nLack of standard evaluation metrics that can capture the performance of models on imbalanced data, especially for rare events: Existing evaluation metrics like accuracy can be misleading in imbalanced datasets, and there is a need for metrics that can capture the performance of models on rare events.
                            \nInterpretability and explainability of models trained on imbalanced data: It can be difficult to understand how a model arrives at its predictions, especially when the data is heavily skewed, and there is a need for more interpretable models.
                            \nScalability of methods to handle very large datasets with imbalanced class distributions: As datasets grow in size, it can be challenging to scale methods to handle the imbalanced class distribution efficiently.
                            \nNeed for better feature engineering techniques to handle imbalanced data: Feature engineering is an important step in predictive modeling, and there is a need for better techniques that can handle imbalanced data.
                            \nDevelopment of new learning algorithms that are specifically designed to work well on imbalanced datasets: Most standard learning algorithms assume a relatively balanced class distribution, and there is a need for new algorithms that can handle imbalanced data more effectively.
                            \nResearch into the use of semi-supervised and unsupervised learning techniques for imbalanced data: Semi-supervised and unsupervised learning techniques have shown promise in imbalanced data, and there is a need for more research to explore their potential.
                            \nPotential benefits of using ensemble methods to combine multiple models trained on imbalanced data: Ensemble methods can improve the performance of models on imbalanced data by combining multiple models, and there is a need for more research to explore their potential.
                            \nDeveloping more effective methods for dealing with concept drift and evolving class distributions over time in imbalanced datasets: As class distributions evolve over time, it can be challenging to adapt models to the new distribution, and there is a need for more effective methods to handle concept drift.
                            '''


        if k == "conclusion":
            conclusion = '''In conclusion, the class imbalance problem is a significant challenge in predictive modeling, which can lead to biased models and poor performance. In this survey, we have provided a comprehensive overview of existing works on predictive modeling on imbalanced data. We have discussed different approaches to address this problem, including sampling approaches, algorithmic approaches, and meta-learning approaches, as well as evaluation metrics and challenges in this field. We also presented some potential future research directions in this area. The insights and knowledge provided in this survey paper can help researchers and practitioners better understand the challenges and opportunities in predictive modeling on imbalanced data and design more effective approaches to address this problem in real-world applications.
            \nThere are also some potencial directions for future research:
            \n1. Incorporating domain knowledge: Incorporating domain-specific knowledge can help improve the performance of models on imbalanced data. Research can be done on developing techniques to effectively integrate domain knowledge into the modeling process.
            \n2. Explainability of models: With the increasing adoption of machine learning models in critical applications, it is important to understand how the models make predictions. Research can be done on developing explainable models for imbalanced data, which can provide insights into the reasons for model predictions.
            \n3. Online learning: Imbalanced data can evolve over time, and models need to be adapted to new data as it becomes available. Research can be done on developing online learning algorithms that can adapt to imbalanced data in real-time.
            \n4. Multi-label imbalanced classification: In many real-world scenarios, multiple classes can be imbalanced simultaneously. Research can be done on developing techniques for multi-label imbalanced classification that can effectively handle such scenarios.
            \n5. Transfer learning: In some cases, imbalanced data in one domain can be used to improve the performance of models in another domain. Research can be done on developing transfer learning techniques for imbalanced data, which can leverage knowledge from related domains to improve performance.
            \n6. Incorporating fairness considerations: Models trained on imbalanced data can have biases that can disproportionately affect certain groups. Research can be done on developing techniques to ensure that models trained on imbalanced data are fair and do not discriminate against any particular group.
            \n7. Imbalanced data in deep learning: Deep learning has shown great promise in various applications, but its effectiveness on imbalanced data is not well understood. Research can be done on developing techniques to effectively apply deep learning to imbalanced data.
            \n8. Large-scale imbalanced data: With the increasing availability of large-scale datasets, research can be done on developing scalable techniques for predictive modeling on imbalanced data.
            '''
            survey['conclusion'] = conclusion

    survey['references'] = []
    try:
        for ref in Global_df_selected['ref_entry']:
            entry = str(ref)
            survey['references'].append(entry)
    except:
        import traceback
        print(traceback.print_exc())

    survey_dict = json.dumps(survey)

    return HttpResponse(survey_dict)

@csrf_exempt
def get_survey(request):
    """Return the generated survey as a JSON payload."""
    # Build the survey structure, then serialize it for the HTTP response.
    payload = json.dumps(get_survey_text())
    return HttpResponse(payload)
    
@csrf_exempt
def get_survey_id(request):
    """Run survey generation for the current global state and return its id."""
    global Global_survey_id, Global_survey_title, Global_collection_names_clustered, Global_pipeline, Global_citation_data
    generateSurvey_qwen_new(
        Global_survey_id,
        Global_survey_title,
        Global_collection_names_clustered,
        Global_pipeline,
        Global_citation_data,
    )
    # Debug dump of the clustered collection names.
    print("Global_collection_names_clustered: ")
    for idx, name in enumerate(Global_collection_names_clustered):
        print(f"第 {idx} 个元素:{name}")
    return JsonResponse({"survey_id": Global_survey_id})

def _ensure_dir(path):
    # Create *path* if it is missing, keeping the original logging on both branches.
    if not os.path.exists(path):
        os.makedirs(path)
        print(f"Directory '{path}' created.")
    else:
        print(f"Directory '{path}' already exists.")


def _save_markdown(markdown_dir, markdown_filename, content):
    # Write *content* to markdown_dir/markdown_filename, creating the directory first.
    # Returns the full path of the written file.
    _ensure_dir(markdown_dir)
    filepath = os.path.join(markdown_dir, markdown_filename)
    with open(filepath, 'w', encoding='utf-8') as markdown_file:
        markdown_file.write(content)
    print(f"Markdown content saved to: {filepath}")
    return filepath


@csrf_exempt
def generate_pdf(request):
    """POST view: save the submitted survey markdown (raw and post-processed)
    and return the rendered PDF as an attachment.

    Expects POST fields 'survey_id' and 'content'.  Side effects: writes
    survey_<id>_vanilla.md and survey_<id>_processed.md under the survey's
    info directory and survey_<id>.pdf under the results directory.
    Non-POST requests get a 400 JSON error.
    """
    if request.method == 'POST':
        survey_id = request.POST.get('survey_id', '')
        markdown_content = request.POST.get('content', '')
        markdown_dir = get_path('info', survey_id) + '/'

        # Save the raw ("vanilla") markdown exactly as received.
        _save_markdown(markdown_dir, f'survey_{survey_id}_vanilla.md', markdown_content)

        # Post-process citations/figures, then save the processed markdown.
        markdown_content = finalize_survey_paper(markdown_content, Global_collection_names, Global_file_names)
        _save_markdown(markdown_dir, f'survey_{survey_id}_processed.md', markdown_content)

        # Render the processed markdown to PDF under the results directory.
        pdf_filename = f'survey_{survey_id}.pdf'
        pdf_dir = get_path('results')
        pdf_filepath = os.path.join(pdf_dir, pdf_filename)
        _ensure_dir(pdf_dir)
        print(f"PDF will be saved to: {pdf_filepath}")

        pdf = MarkdownPdf()
        pdf.meta["title"] = "Survey Results"  # PDF metadata title
        pdf.add_section(Section(markdown_content, toc=False))  # no table of contents
        pdf.save(pdf_filepath)

        # Stream the finished PDF back as a download.
        with open(pdf_filepath, 'rb') as pdf_file:
            response = HttpResponse(pdf_file.read(), content_type='application/pdf')
            response['Content-Disposition'] = f'attachment; filename="{pdf_filename}"'
            return response

    return JsonResponse({'error': 'Invalid request method'}, status=400)

@csrf_exempt
def generate_pdf_from_tex(request):
    """POST view: convert the processed survey markdown to LaTeX (ACL template),
    insert outline/flowchart figures, compile with xelatex, and return the PDF.

    Reads survey_<id>_processed.md from the survey's info directory; writes the
    compiled PDF to <results>/survey_<id>_latex.pdf.  Non-POST requests get a
    400 JSON error; a read failure on the final PDF returns a 500 JSON error.
    """
    global Global_survey_id, Global_survey_title
    if request.method == 'POST':
        base_dir = get_path('info', Global_survey_id)
        md_path = os.path.join(base_dir, f'survey_{Global_survey_id}_processed.md')
        new_md_path = os.path.join(base_dir, f'survey_{Global_survey_id}_preprocessed.md')
        tex_path = os.path.join(base_dir, 'template.tex')
        new_tex_path = os.path.join(base_dir, 'template_with_figure.tex')
        sty_path = os.path.join(base_dir, 'acl.sty')
        pdf_dir = get_path('results')

        os.makedirs(base_dir, exist_ok=True)
        print(f"Directory '{base_dir}' checked or created.")

        # Copy the LaTeX template and style file into the working directory.
        origin_template = 'src/demo/latex_template/template.tex'
        origin_acl_sty = 'src/demo/latex_template/acl.sty'
        shutil.copy(origin_template, tex_path)
        shutil.copy(origin_acl_sty, sty_path)

        os.makedirs(pdf_dir, exist_ok=True)

        # Markdown -> LaTeX, then splice figures into the generated .tex.
        preprocess_md(md_path, new_md_path)
        md_to_tex(new_md_path, tex_path, Global_survey_title)

        insert_figures(
            png_path=get_path('info', Global_survey_id, 'outline.png'),
            tex_path= tex_path, 
            json_path=get_path('info', Global_survey_id, 'flowchart_results.json'),
            ref_names= Global_ref_list,
            survey_title=Global_survey_title,
            new_tex_path=new_tex_path
        )

        tex_to_pdf(
            new_tex_path,
            output_dir=os.path.dirname(new_tex_path),
            compiler="xelatex"
        )
        pdf_path = os.path.join(os.path.dirname(new_tex_path), 'template_with_figure.pdf' )
        final_pdf_path = os.path.join(pdf_dir, f'survey_{Global_survey_id}_latex.pdf')
        shutil.copy2(pdf_path, final_pdf_path)

        try:
            with open(final_pdf_path, 'rb') as pdf_file:
                response = HttpResponse(pdf_file.read(), content_type='application/pdf')
                filename = os.path.basename(final_pdf_path)
                # Bug fix: the computed filename was never used — the header
                # previously contained a hard-coded placeholder string.
                response['Content-Disposition'] = f'attachment; filename="{filename}"'
                return response
        except Exception as e:
            return JsonResponse({'error': f'读取 PDF 文件失败: {e}'}, status=500)

    return JsonResponse({'error': 'Invalid request method'}, status=400)

def get_refs(topic):
    '''
    Get the references for a given topic.

    Reads <DATA_PATH>/<topic>.tsv and collects the title, link and row index of
    every reference whose 'intro' column is non-null.  On any failure (missing
    file, malformed TSV, undefined DATA_PATH, ...) falls back to placeholder
    values so callers always receive three same-length lists.

    Returns:
        (references, ref_links, ref_ids): three parallel lists.
    '''
    default_references = ['ref1','ref2','ref3','ref4','ref5','ref6','ref7','ref8','ref9','ref10']
    default_ref_links = [''] * 10
    default_ref_ids = [''] * 10
    references = []
    ref_links = []
    ref_ids = []

    try:
        ## here is the algorithm part
        ref_path   = os.path.join(DATA_PATH, topic + '.tsv')
        df         = pd.read_csv(ref_path, sep='\t')
        for i, r in df.iterrows():
            # Only rows with an introduction are usable downstream.
            if not pd.isnull(r['intro']):
                references.append(r['ref_title'])
                ref_links.append(r['ref_link'])
                ref_ids.append(i)
    except Exception:
        # Narrowed from a bare except.  traceback.print_exc() already prints;
        # the original print(traceback.print_exc()) only emitted an extra None.
        traceback.print_exc()
        references = default_references
        ref_links = default_ref_links
        ref_ids = default_ref_ids
    print(len(ref_ids))
    return references, ref_links, ref_ids

def get_survey_text(refs=Global_ref_list):
    '''
    Build the survey dict (Title/Abstract/Introduction/Methodology/Conclusion/
    References) from the currently selected references held in module globals.

    NOTE(review): `refs` is never read in the body; its default is captured
    from Global_ref_list once at definition time, not per call.

    Returns the survey dict; on any generation failure the placeholder
    template values below are left in place.
    '''
    survey = {
        'Title': "A Survey of " + Survey_dict[Global_survey_id],
        'Abstract': "test "*150,
        'Introduction': "test "*500,
        'Methodology': [
            "This is the proceeding",
            [{"subtitle": "This is the first subtitle", "content": "test "*500},
             {"subtitle": "This is the second subtitle", "content": "test "*500},
             {"subtitle": "This is the third subtitle", "content": "test "*500}]
        ],
        'Conclusion': "test "*150,
        'References': []
    }

    try:
        ## abs generation ('abs' renamed: it shadowed the builtin)
        abstract, last_sent = absGen(Global_survey_id, Global_df_selected, Global_category_label)
        survey['Abstract'] = [abstract, last_sent]

        ## Intro generation
        intro = introGen(Global_survey_id, Global_df_selected, Global_category_label, Global_category_description)
        survey['Introduction'] = intro

        ## Methodology generation
        proceeding, detailed_des = methodologyGen(Global_survey_id, Global_df_selected, Global_category_label, Global_category_description)
        survey['Methodology'] = [proceeding, detailed_des]

        ## Conclusion generation
        conclusion = conclusionGen(Global_survey_id, Global_category_label)
        survey['Conclusion'] = conclusion

        try:
            for ref in Global_df_selected['ref_entry']:
                survey['References'].append(str(ref))
        except Exception:
            # Re-cluster once and retry.  Clustering_refs returns two values;
            # the original unpacked three, which always raised here and
            # silently aborted the retry via the outer except.
            colors, category_label = Clustering_refs(n_clusters=Survey_n_clusters[Global_survey_id])
            for ref in Global_df_selected['ref_entry']:
                survey['References'].append(str(ref))

    except Exception:
        traceback.print_exc()
    return survey

def Clustering_refs(n_clusters):
    '''
    Cluster the currently selected references and cache the result in globals.

    Loads the survey TSV, restricts it to the rows in Global_ref_list, runs
    clustering with candidate topic counts [3, 4, 5], and stores the resulting
    DataFrame in Global_df_selected and the winning topic count in
    Global_cluster_num.

    Args:
        n_clusters: requested cluster count.  NOTE(review): currently unused —
            the candidate list [3, 4, 5] is tried instead; confirm intent.

    Returns:
        (colors, category_label) where category_label is a zero-filled
        placeholder list the same length as colors.
    '''
    global Global_cluster_num
    df = pd.read_csv(get_path('tsv', Global_survey_id + '.tsv'), sep='\t', index_col=0, encoding='utf-8')

    print(Global_ref_list)
    df_selected = df.iloc[Global_ref_list]
    df_selected, colors, best_n_topics = clustering(df_selected, [3, 4, 5], Global_survey_id)
    Global_cluster_num = best_n_topics

    global Global_df_selected
    Global_df_selected = df_selected

    # Placeholder labels; the unused category_description local was removed.
    category_label = [0] * len(colors)

    return colors, category_label

def remove_invalid_citations(text, valid_collection_names):
    """Strip every ``[name\\]`` citation whose name is not a known collection.

    Citations whose (backslash-stripped) name appears in
    ``valid_collection_names`` are kept untouched.
    """
    citation_pattern = r"\[(.*?)\\\]"
    cleaned_text = text
    for token in re.findall(citation_pattern, text):
        # Trailing backslashes are not part of the collection name.
        if token.rstrip('\\') in valid_collection_names:
            continue
        cleaned_text = cleaned_text.replace(f"[{token}\\]", "")
    return cleaned_text

def normalize_citations_with_mapping(paper_text):
    """Renumber bracketed citations sequentially.

    Every distinct ``[...]`` token is replaced, in first-appearance order, by
    ``[1]``, ``[2]``, ...  Returns the rewritten text together with a mapping
    from each new number back to the original citation name (brackets and any
    trailing backslash stripped).
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    seen = list(dict.fromkeys(re.findall(r'\[.*?\]', paper_text)))

    normalized_text = paper_text
    reverse_mapping = {}
    for index, original in enumerate(seen, start=1):
        normalized_text = normalized_text.replace(original, f'[{index}]')
        reverse_mapping[index] = original.strip('[]').rstrip('\\')

    return normalized_text, reverse_mapping

def generate_references_section(citation_mapping, collection_pdf_mapping):
    """Render a markdown "# References" section from numbered citations.

    ``citation_mapping`` maps citation number -> collection name;
    ``collection_pdf_mapping`` maps collection name -> PDF file name.
    Returns the section text and the list of PDF base names in number order.
    """
    lines = ["# References"]
    ref_list = []
    for number in sorted(citation_mapping):
        pdf_name = collection_pdf_mapping.get(citation_mapping[number], "Unknown PDF")
        if pdf_name.endswith(".pdf"):
            pdf_name = pdf_name[:-4]
        ref_list.append(pdf_name)
        # Two trailing spaces force a markdown line break.
        lines.append(f"[{number}] {pdf_name}  ")

    return "\n".join(lines), ref_list

def fix_citation_punctuation_md(text):
    """Move a numeric citation that follows a period to before it.

    Rewrites ``. \\[3]`` as `` \\[3].`` so the citation sits inside the
    sentence it supports.
    """
    return re.sub(r'\.\s*(\\\[\d+\])', r' \1.', text)

def finalize_survey_paper(paper_text, 
                          Global_collection_names, 
                          Global_file_names):
    """Turn the raw generated survey markdown into the final paper text.

    Pipeline: drop citations to unknown collections, renumber the remaining
    citations, fix citation/period ordering, build the references section,
    generate the outline figure, and splice reference/outline images into the
    markdown.  Mutates the module global Global_ref_list with the ordered PDF
    base names.

    Args:
        paper_text: the generated survey in markdown.
        Global_collection_names: valid collection names (shadows the module
            global of the same name).
        Global_file_names: PDF file names aligned with the collection names.

    Returns:
        The final markdown with figures inserted and references appended.
    """
    global Global_survey_id, Global_survey_title, Global_ref_list

    # Citation cleanup: remove invalid ones, then renumber to [1], [2], ...
    paper_text = remove_invalid_citations(paper_text, Global_collection_names)
    normalized_text, citation_mapping = normalize_citations_with_mapping(paper_text)
    normalized_text = fix_citation_punctuation_md(normalized_text)
    # collection name -> PDF file name, used to resolve reference entries.
    collection_pdf_mapping = dict(zip(Global_collection_names, Global_file_names))
    
    references_section, ref_list = generate_references_section(citation_mapping, collection_pdf_mapping)
    Global_ref_list = ref_list
    print(ref_list)

    json_path = get_path('txt', Global_survey_id, 'outline.json')
    output_png_path = get_path('info', Global_survey_id, 'outline')
    # NOTE(review): md_path is computed but never used below — confirm intent.
    md_path = get_path('info', Global_survey_id, f'survey_{Global_survey_id}_processed.md')
    flowchart_results_path = get_path('info', Global_survey_id, 'flowchart_results.json')
    detect_flowcharts(Global_survey_id)
    png_path = generate_graphviz_png(
        json_path=json_path,
        output_png_path=output_png_path,
        md_content=normalized_text,
        title=Global_survey_title,
        max_root_chars=30
    )

    # Image insertion is best-effort: failures are logged and skipped so the
    # paper is still produced without figures.
    try:
        normalized_text = insert_ref_images(flowchart_results_path, ref_list, normalized_text)
    except Exception as e:
        print(f"Error inserting ref image: {e}. Continuing with next step.")
    try:
        normalized_text = insert_outline_image(
            png_path=png_path,
            md_content=normalized_text,
            survey_title =Global_survey_title
        )
    except Exception as e:
        print(f"Error inserting outline image: {e}. Continuing with next step.")

    final_paper = normalized_text.strip() + "\n\n" + references_section
    return final_paper