Spaces:

cuongnguyen910
/

topic-clustering-global-dashboard

Build error

File size: 24,963 Bytes
from tensorRT import inference
import re
from collections import Counter
from vncorenlp import VnCoreNLP
from nltk.tokenize import sent_tokenize
import torch
import datetime
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json
from . import utils
import time
from summary import text_summary, get_summary_bert
from function.clean_text import normalize_text
from .summary_with_llm import summary_with_llama
from .translate import translate_text_multi_layer
from scipy.spatial import distance
import copy
from .sentence_embbeding import embbeded_zh, embbeded_en, embedded_bge


# from . import detect_time as dt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
use_cuda = torch.cuda.is_available()
print(torch.cuda.is_available())

# annotator = VnCoreNLP('vncorenlp/VnCoreNLP-1.1.1.jar', port=9191, annotators="wseg,pos", max_heap_size='-Xmx2g')


def detect_postaging(text_in):
    word_segmented_text = annotator.annotate(text_in)
    lst_k = []
    for se in word_segmented_text["sentences"]:
        for kw in se:
            if kw["posTag"] in ("Np", "Ny", "N"):
                if kw["posTag"] == "N" and "_" not in kw["form"]:
                    continue
                lst_k.append(kw["form"].replace("_", " "))
    return list(set(lst_k))

def clean_text(text_in):
    doc = re.sub('<.*?>', '', text_in)
    doc = re.sub('(function).*}', ' ', doc)
    # link
    doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
    doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
    doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
    doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
    doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
    doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
    doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
    doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
    doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)

    doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
    doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
    doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
    doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
    doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
    doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
    doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
    doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
    doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
    # escape sequence
    doc = re.sub('\n', ' ', doc)
    doc = re.sub('\t', ' ', doc)
    doc = re.sub('\r', ' ', doc)

    doc = normalize_text(doc)
    return doc


def data_cleaning(docs):
    res = []
    for d in docs:
        if 'message' in d:
            # css and js
            doc = re.sub('<.*?>', '', d['message'])
            doc = re.sub('(function).*}', ' ', doc)

            # link
            doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
            doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
            doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
            doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
            doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
            doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
            doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
            doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
            doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)

            doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
            doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
            doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
            doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
            doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
            doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
            doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
            doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
            doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
            # escape sequence
            doc = re.sub('\n', ' ', doc)
            doc = re.sub('\t', ' ', doc)
            doc = re.sub('\r', ' ', doc)

            d['message'] = doc
            res.append(d)
    return res


def segment(docs, lang="vi"):
    segmented_docs = []
    for d in docs:
        # print(d)
        # if len(d.get('message', "")) > 8000 or len(d.get('message', "")) < 100:
        if len(d.get('message', "")) > 8000:
            continue
        if 'snippet' not in d:
            continue
        try:
            if lang == "vi":
                snippet = d.get('snippet', "")
                segmented_snippet = ""
                segmented_sentences_snippet = annotator.tokenize(snippet)
                for sentence in segmented_sentences_snippet:
                    segmented_snippet += ' ' + ' '.join(sentence)
                segmented_snippet = segmented_snippet.replace('\xa0', '')
                d['segmented_snippet'] = segmented_snippet
            segmented_docs.append(d)
        except Exception:
            pass
    return segmented_docs


def timestamp_to_date(timestamp):
    return datetime.datetime.fromtimestamp(timestamp).strftime('%d/%m/%Y')


def re_ranking(result_topic, vectors_prompt, sorted_field):
    lst_score = []
    lst_ids = []
    lst_top = []
    try:
        for k in result_topic:
            lst_ids.append(k)
            if  not sorted_field.strip():
                    lst_top.append(len(result_topic[k]))
            else:
                lst_top.append(result_topic[k][0]['max_score'])
            vector_center = result_topic[k][0]["vector"]
            max_score = 11.0
            for vec in vectors_prompt:
                score = distance.cosine(np.array(vec), np.array(vector_center))
                if score < max_score:
                    max_score = score
            lst_score.append(max_score)
            result_topic[k][0]["similarity_score"] = max_score
            for d in result_topic[k]:
                d["similarity_score"] = max_score
            del result_topic[k][0]["vector"]
        idx = np.argsort(np.array(lst_score))
    except Exception as ve:
        return [], lst_ids, lst_top
    return idx, lst_ids, lst_top

def post_processing(response, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster = 50, delete_message=True, prompt="", hash_str: str= "", vectors_prompt: list = []):
    print(f'[INFO] sorted_field: {sorted_field}')
    MAX_DOC_PER_CLUSTER = max_doc_per_cluster

    lst_ids = []
    lst_top = []
    lst_res = []
    idx = []
    if prompt:
        idx, lst_ids, lst_top = re_ranking(response, vectors_prompt, sorted_field)
        print("idx_prompt: ", idx)
    if len(prompt) == 0 or len(idx) == 0:
        for i in response:
            lst_ids.append(i)
            if  not sorted_field.strip():
                lst_top.append(len(response[i]))
            else:
                lst_top.append(response[i][0]['max_score'])
        idx = np.argsort(np.array(lst_top))[::-1] 
        print("idx_not_prompt: ", idx)
    if top_cluster == -1:
        top_cluster = len(idx)
    for i in idx[: top_cluster]:
        ik = lst_ids[i]
        if top_sentence == -1:
            top_sentence = len(response[ik])
        lst_check_title = []
        lst_check_not_title = []
        i_c_t = 0
        for resss in response[ik]:
            r_title = resss.get("title", "")
            if r_title and not r_title.endswith("..."):
                lst_check_title.append(resss)
                i_c_t += 1
            else:
                lst_check_not_title.append(resss)
            if i_c_t == top_sentence:
                break
        if i_c_t == top_sentence:
            lst_res.append(lst_check_title)
        else:
            lst_check_title.extend(lst_check_not_title)
            lst_res.append(lst_check_title[:top_sentence])
        #lst_res.append(response[ik][:top_sentence])
    dict_res = {}
    for i in range(len(lst_res)):
        dict_res[str(i + 1)] = lst_res[i][:MAX_DOC_PER_CLUSTER]
        for j in range(min(len(dict_res[str(i + 1)]), 3)):
            dict_res[str(i + 1)][0]["title_summarize"].append(dict_res[str(i + 1)][j].get("snippet", ""))
        # t11 = time.time()
        summary_text = get_summary_bert(dict_res[str(i + 1)][0].get("message", ""), dict_res[str(i + 1)][0].get("lang", "vi"), topn=topn_summary, title=dict_res[str(i + 1)][0].get("title", ""), snippet=dict_res[str(i + 1)][0].get("snippet", ""))
        # print("time_summary: ", time.time() - t11)
        if len(summary_text) < 10:
            summary_text = dict_res[str(i + 1)][0].get("snippet", "")
            if len(summary_text) < 10:
                summary_text = dict_res[str(i + 1)][0].get("title", "")
        summary_text = utils.remove_image_keyword(summary_text)
        # if prompt:
        #     if dict_res[str(i + 1)][0].get("message", ""):
        #         src_lang = dict_res[str(i + 1)][0].get("lang", "")
        #         print("src_lang: ", src_lang)
        #         print("summary_text: ", summary_text)
        #         summary_text = translate_text_multi_layer(src_lang, "vi", summary_text)
        #         text_tran = translate_text_multi_layer(src_lang, "vi", dict_res[str(i + 1)][0].get("message", ""))
        #         ans_from_llama = summary_with_llama(prompt, text_tran, "vi", version="vi-llama", max_word_per_context=1000)
        #         print("ans_from_llama: ", ans_from_llama)
        #     summary_text = summary_text + "$$$$\n" + ans_from_llama
        #     print("summary_text: ", summary_text, len(summary_text))
        dict_res[str(i + 1)][0]["content_summary"] = summary_text
        dict_res[str(i + 1)][0]["num_of_post"] = len(lst_res[i])
        kew_phares = []
        dict_res[str(i + 1)][0]["topic_keywords"] = kew_phares
        
        # print("delete_message: ", delete_message)
        if delete_message:
            for j in range(len(dict_res[str(i + 1)])):
                if "message" in dict_res[str(i + 1)][j]:
                    del dict_res[str(i + 1)][j]["message"]
    
    with open(f"log_llm/topic_result_after_postprocessing/{hash_str}.json", "w") as f:
        dict_log_pos = {}
        for k in dict_res:
            dict_log_pos[k] = copy.deepcopy(dict_res[k])
            for d in dict_log_pos[k]:
                if "message" in d:
                    del d["message"]
                if "vector" in d:
                    del d["vector"]
        json.dump(dict_log_pos, f, ensure_ascii= False)
    return dict_res


def get_lang(docs):
    lang_vi = 0
    lang_en = 0
    dict_lang = {}
    for d in docs:
        lang = d.get("lang", "")
        if lang not in dict_lang:
            dict_lang[lang] = 0
        dict_lang[lang] += 1
        # if d.get("lang", "") == "vi":
        #     lang_vi += 1
        # else:
        #     lang_en += 1
    lst_lang = []
    lst_cnt = []
    for k in dict_lang:
        lst_lang.append(k)
        lst_cnt.append(dict_lang[k])
    idx_max = np.argsort(np.array(lst_cnt))[::-1][0]
    lang = lst_lang[int(idx_max)]
    
    if lang.startswith("zh_"):
        lang = "zh"
    print("lang: ", lang, lst_cnt[int(idx_max)])
    return lang


def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster=50, 

                        delete_message=True, prompt="", type_cluster:str = "single", hash_str: str= "", id_topic=""):
    # global model, model_en
    with open("data/topic_name.txt") as f:
        dict_topic_name = json.load(f)
    topic_name_relevant = dict_topic_name.get(id_topic , "")
    docs = docs[:30000]
    lang = get_lang(docs)
    if type_cluster == "complete" and lang == "zh":
        distance_threshold = 0.4
    if type_cluster == "complete" and lang == "en":
        distance_threshold = 0.4
        # type_cluster = "single"

    result = {}
    cluster_score = {}
    cluster_real_vectors = {}
    # docs = segment(docs, lang=lang)

    t1 = time.time()
    if len(docs) < 1:
        return result
    elif len(docs) == 1:
        return {
            "0": docs
        }
    vec_prompt = []
    prompt_strips = []
    # prompt = ""
    if topic_name_relevant:
        prompt_split = topic_name_relevant.split("#####")
        for prom in prompt_split:
            sys_p = prom.strip().split("$$$$")
            if len(sys_p) == 1:
                prompt_strips.append(prom.strip())
            else:
                prompt_strips.append(sys_p[1].strip())
        if lang == "zh":
            vec_prompt = embbeded_zh(prompt_split)
        elif lang == "en":
            vec_prompt = embbeded_en(prompt_split)
        else:
            vec_prompt = inference.encode(prompt_split, lang=lang)
    if lang == "zh":
        features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
        vectors = embbeded_zh(features)
        # vectors = embedded_bge(features)
        if len(vectors) == 0:
            print(f"[WARNING] Embedded {lang}: {len(vectors)} / {len(features)}")
            vectors = inference.encode(features, lang=lang)
        # vectors = model.encode(features, show_progress_bar=False)
    elif lang == "en":
        features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
        vectors = embbeded_en(features)
        # vectors = embedded_bge(features)
        if len(vectors) == 0:
            print(f"[WARNING] Embedded {lang}: {len(vectors)} / {len(features)}")
            vectors = inference.encode(features, lang=lang)
    else:
        features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
        # vectors = embedded_bge(features)
        # if len(vectors) == 0:
        #     print(f"[WARNING] Embedded {lang}: {len(vectors)} / {len(features)}")
        vectors = inference.encode(features, lang=lang)
        # vectors = model_en.encode(features, show_progress_bar=False)
    clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
                                        linkage=type_cluster, distance_threshold=distance_threshold)
    clusteror.fit(vectors)
    matrix_vec = np.stack(vectors, axis=0)
    print(f"Time encode + clustering: {time.time() - t1} {clusteror.n_clusters_}")
    for i in range(clusteror.n_clusters_):
        result[str(i + 1)] = []
        cluster_score[str(i + 1)] = 0
        ids = clusteror.labels_ # == i
        # cluster_real_vectors[str(i + 1)] = re_clustering(ids, matrix_vec, distance_threshold, max_doc_per_cluster)

    for i in range(len(clusteror.labels_)):
        cluster_no = clusteror.labels_[i]
        # if any((cluster_real_vectors[str(cluster_no+1)][:] == vectors[i]).all(1)):
        if docs[i].get('domain','') not in ["cungcau.vn","baomoi.com","news.skydoor.net"]:
            response_doc = {}
            response_doc = docs[i]
            score = response_doc.get('score', 0)
            if not docs[i].get('message','').strip():
                continue 
            if score > cluster_score[str(cluster_no + 1)]:
                cluster_score[str(cluster_no + 1)] = score
            if 'domain' in docs[i]:
                response_doc['domain'] = docs[i]['domain']
            if 'url' in docs[i]:
                response_doc['url'] = docs[i]['url']
            if 'title' in docs[i]:
                response_doc['title'] = clean_text(docs[i]['title'])
            if 'snippet' in docs[i]:
                response_doc['snippet'] = clean_text(docs[i]['snippet'])
            if 'created_time' in docs[i]:
                response_doc['created_time'] = docs[i]['created_time']
            if "sentiment" in docs[i]:
                response_doc['sentiment'] = docs[i]['sentiment']
            if 'message' in docs[i]:
                title = docs[i].get('title','')
                snippet = docs[i].get('snippet','')
                message = docs[i].get('message','')
                if title.strip():
                    split_mess = message.split(title)
                    if len(split_mess) > 1:
                        message = title.join(split_mess[1:])
                if snippet.strip():
                    split_mess = message.split(snippet)
                    if len(split_mess) > 1:
                        message = snippet.join(split_mess[1:])

                response_doc['message'] = clean_text(message)
            if 'id' in docs[i]:
                response_doc['id'] = docs[i]['id']
            # response_doc['score'] = 0.0
            response_doc['title_summarize'] = []
            response_doc['content_summary'] = ""
            response_doc['total_facebook_viral'] = 0
            response_doc["vector"] = np.array(vectors[i]).tolist()
            result[str(cluster_no + 1)].append(response_doc)
    empty_clus_ids = []
    for x in result:
        result[x] = sorted(result[x], key=lambda i: -len(i.get('message','')))
        if len( result[x]) > 0:
            if len(result[x]) > 1:
                result[x] = check_duplicate_title_domain(result[x])
            result[x][0]['num_docs'] = len(result[x])
            result[x][0]['max_score'] = cluster_score[x]
        else:
            empty_clus_ids.append(x)
    
    for x in empty_clus_ids:
        result.pop(x,None)
    # result = dict(sorted(result.items(), key=lambda i: -len(i[1])))[:top_cluster]
    with open(f"log_llm/topic_result_before_postprocessing/{hash_str}.json", "w") as f:
        dict_log = {}
        for k in result:
            dict_log[k] = copy.deepcopy(result[k])
            for d in dict_log[k]:
                if "message" in d:
                    del d["message"]
                if "vector" in d:
                    del d["vector"]
        json.dump(dict_log, f, ensure_ascii= False)
    return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary, sorted_field = sorted_field, max_doc_per_cluster=max_doc_per_cluster, delete_message=delete_message,
                            prompt=topic_name_relevant, hash_str=hash_str, vectors_prompt=vec_prompt)

def check_duplicate_title_domain(docs):
    lst_title_domain = [f"{d.get('domain', '')} {d.get('title','')}" for d in docs]
    for i in range(1,len(lst_title_domain) -1):
        for j in range(i+1,len(lst_title_domain)):
            if lst_title_domain[j] == lst_title_domain[i]:
                lst_title_domain[j] = 'dup'
    lst_filter_docs = [docs[i] for i,x in enumerate(lst_title_domain) if x != 'dup']
    return lst_filter_docs
def convert_date(text):
    text = text.replace(".", "/")
    text = text.replace("-", "/")
    return text


def check_keyword(sentence):
    keyword = ['sáng', 'trưa', 'chiều', 'tối', 'đến', 'hôm', 'ngày', 'tới']
    for k in keyword:
        if k in sentence:
            return True
    return False


def extract_events_and_time(docs, publish_date):
    def standardize(date_str):
        return date_str.replace('.', '/').replace('-', '/')

    def add_0(date_str):

        date_str = date_str.split('/')
        res = []
        for o in date_str:
            o = re.sub('\s+', '', o)
            if len(o) < 2:
                o = '0' + o
            res.append(o)
        date_str = '/'.join(res)
        return date_str

    def get_date_list(reg, sentence):
        find_object = re.finditer(reg, sentence)
        date_list = [x.group() for x in find_object]
        return date_list

    year = publish_date.split('/')[2]

    # dd/mm/yyyy
    reg_exp_1 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
    # #mm/yyyy
    # reg_exp_5 = '(\D|^)(?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
    # dd/mm
    reg_exp_2 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])(\D|$)'

    # ngày  dd tháng mm năm yyyy
    reg_exp_3 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}\s*(năm)\s*\d{4}'
    # ngày dd tháng mm
    reg_exp_4 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}'

    result = []
    for d in docs:
        text = d['message']
        for sentence in sent_tokenize(text):
            lower_sentence = sentence.lower()
            c = re.search(reg_exp_3, sentence.lower())
            d = re.search(reg_exp_4, sentence.lower())
            # e = re.search(reg_exp_5, sentence.lower())
            a = re.search(reg_exp_1, sentence)
            b = re.search(reg_exp_2, sentence)
            #
            if (a or b or c or d) and check_keyword(lower_sentence):
                date_list = get_date_list(reg_exp_1, lower_sentence)
                date_entity = ''
                if date_list:
                    date_entity = add_0(standardize(date_list[0]))
                elif get_date_list(reg_exp_2, lower_sentence):
                    date_list = get_date_list(reg_exp_2, lower_sentence)
                    date_entity = add_0(standardize(date_list[0]) + '/' + year)
                elif get_date_list(reg_exp_3, lower_sentence):
                    date_list = get_date_list(reg_exp_3, lower_sentence)

                    date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
                    date_entity = re.sub('\s+', ' ', date_entity)
                    date_entity = date_entity.replace(' ', '/')
                    date_entity = add_0(date_entity)
                else:
                    date_list = get_date_list(reg_exp_4, lower_sentence)
                    if date_list != []:
                        date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
                        date_entity = re.sub('\s+', ' ', date_entity)
                        date_entity = date_entity.replace(' ', '/')
                        date_entity = date_entity + '/' + year
                        date_entity = add_0(date_entity)
                result.append((sentence, date_entity))
    return result

def find_index_nearest_vector(cluster, vectors):
    # Compute the centroid of the cluster
    centroid = np.mean(cluster, axis=0, keepdims=True)
    
    # Calculate the Euclidean distance between each vector and the centroid
    distances = cosine_similarity(centroid, vectors)
    
    # Find the index of the vector with the minimum distance
    nearest_index = np.argmin(distances, axis=1)
    
    
    return nearest_index

def re_clustering(ids, vectors, distance_threshold, max_doc_per_cluster):
    sub_vectors = vectors[ids]

    try:
        if sub_vectors.shape[0] < 2:
            return sub_vectors
        sub_clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
                        linkage='complete', distance_threshold=0.12)
        sub_clusteror.fit(sub_vectors)
        dict_cluster = {id_clus: sub_vectors[sub_clusteror.labels_ == id_clus] for id_clus in range(sub_clusteror.n_clusters_)}
        dict_num_vec = {id_clus: v.shape[0] for id_clus, v in dict_cluster.items()}

        max_num_cluster = max(dict_num_vec, key=dict_num_vec.get)
        other_vectors = sub_vectors[sub_clusteror.labels_ != max_num_cluster]

        # if other_vectors.shape[0]:
        #     while dict_num_vec[max_num_cluster] < max_doc_per_cluster:
        #         tmp_index_vec = find_index_nearest_vector(dict_cluster[max_num_cluster], other_vectors)
        #         dict_cluster[max_num_cluster] = np.vstack((dict_cluster[max_num_cluster], other_vectors[tmp_index_vec]))
        #         dict_num_vec[max_num_cluster] += 1
        #         if other_vectors.shape[0] != 1:
        #             other_vectors = np.delete(other_vectors, tmp_index_vec, axis=0)
        #         else:
        #             break
        cosine_scores = cosine_similarity(dict_cluster[max_num_cluster], dict_cluster[max_num_cluster])
        with open("/home/vietle/topic-clustering/log_score.txt", "a") as f:
            f.write(str(cosine_scores) + "\n")
        return dict_cluster[max_num_cluster]
    except Exception as e:
        with open("/home/vietle/topic-clustering/log_clustering_diemtin/log_cluster_second.txt", "a") as f:
            f.write(str(e)+"$$"+json.dumps({"ids": ids.tolist(), "vectors": vectors.tolist()}))
        return sub_vectors