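"""Merge pre-clustered, per-date topics into global topic clusters.

Each incoming topic is represented by the title and snippet of its first
document; those labels are embedded with SBERT and grouped by agglomerative
clustering, then the underlying topics are merged, de-duplicated by
(domain, title), capped, and written to disk.
"""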
import json
import os

# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # uncomment to force CPU-only inference

from function.topic_clustering import model, AgglomerativeClustering
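# Assumption: `model` is a sentence-transformers SentenceTransformer and
# `AgglomerativeClustering` is scikit-learn's class, both re-exported by
# function.topic_clustering (not shown here).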

def check_duplicate_title_domain(docs):
    """Keep only the first document seen for each (domain, title) pair."""
    seen = set()
    lst_filter_docs = []
    for doc in docs:
        key = f"{doc.get('domain', '')} {doc.get('title', '')}"
        if key not in seen:
            seen.add(key)
            lst_filter_docs.append(doc)
    return lst_filter_docs
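
# Expected request shape, inferred from the field accesses in main() below
# (illustrative, not a schema):
# {
#     "type": "monthly",                     # anything else lowers the cluster cap
#     "preprocess": [
#         {"topic": {"<topic_id>": [         # topic_id -> list of documents
#             {"title": ..., "snippet": ..., "domain": ...,
#              "created_time": ..., "num_docs": ...},
#         ]}},
#     ]
# }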

def main(req):
    """Merge the pre-clustered topics in `req` into at most MAX_CLUSTER clusters."""
    req_type = req['type']
    if req_type == 'monthly':
        MAX_CLUSTER = 50
    else:
        MAX_CLUSTER = 20

    MAX_NUM_DOC_PER_CLUSTER = 50

    # threshold = req.get('threshold', 0.3)
    threshold = 0.4

    preprocess = req.get('preprocess', [])
    lst_labels = []  # one representative label per topic
    lst_topics = []  # the topics themselves (lists of docs), aligned with lst_labels
    for date_clusters in preprocess:
        topic = date_clusters.get('topic', {})
        for topic_id in topic:
            topic_docs = topic[topic_id]
            if not topic_docs:
                continue
            lst_topics.append(topic_docs)
            # Represent each topic by the title and snippet of its first document.
            label = '. '.join([topic_docs[0].get('title', ''), topic_docs[0].get('snippet', '')])
            lst_labels.append(label)
    
    final_clusters = []
    # Cluster the topic labels; return_ids=True yields indices into lst_topics.
    label_clusters = sbert_clustering(lst_labels, distance_threshold=threshold, return_ids=True)

    if label_clusters:
        for id_label_cluster in label_clusters:
            merge_clusters = []
            num_docs = 0
            for topic_id in label_clusters[id_label_cluster]:
                topic = lst_topics[topic_id]
                num_docs += topic[0].get('num_docs', 1)
                merge_clusters.extend(topic)

            # Newest first, then drop duplicate (domain, title) pairs and cap the size.
            merge_clusters = sorted(merge_clusters, key=lambda x: -x.get('created_time', 0))
            merge_clusters = check_duplicate_title_domain(merge_clusters)
            merge_clusters = merge_clusters[:MAX_NUM_DOC_PER_CLUSTER]

            for doc in merge_clusters:
                doc['num_docs'] = num_docs
            final_clusters.append(merge_clusters)

    # Keep only the largest clusters, measured by merged document count.
    final_clusters = sorted(final_clusters, key=lambda x: -x[0]['num_docs'])
    final_clusters = final_clusters[:MAX_CLUSTER]

    final_result = {i: cluster for i, cluster in enumerate(final_clusters)}
    # ensure_ascii=False emits raw Unicode, so pin the file encoding explicitly.
    with open('zzz.json', 'w', encoding='utf-8') as f:
        json.dump(final_result, f, ensure_ascii=False)
    return final_result

def get_sbert_embedding(lst_sentence):
    """Encode sentences into dense vectors with the shared SBERT model."""
    embs = model.encode(lst_sentence)
    return embs

def sbert_clustering(lst_sentence, distance_threshold=0.25, return_ids=False):
    """Cluster sentences by cosine distance over SBERT embeddings.

    Returns {cluster_id: [sentence, ...]} (or {cluster_id: [index, ...]} when
    return_ids=True), with clusters sorted largest first.
    """
    lst_sentence = [sen.replace('_', ' ') for sen in lst_sentence]
    if len(lst_sentence) == 0:
        return {}
    if len(lst_sentence) == 1:
        # A single sentence is trivially its own cluster.
        if return_ids:
            return {0: [0]}
        return {0: lst_sentence}

    embs = get_sbert_embedding(lst_sentence)

    # Complete linkage on cosine distance; compute_full_tree must be True when
    # cutting by distance_threshold instead of a fixed n_clusters.
    # NOTE: scikit-learn >= 1.2 renames `affinity` to `metric` (removed in 1.4).
    clusterer = AgglomerativeClustering(n_clusters=None, compute_full_tree=True,
                                        affinity='cosine', linkage='complete',
                                        distance_threshold=distance_threshold)
    clusterer.fit(embs)

    # Group sentences (and their indices) by assigned cluster label.
    dict_result = {}
    dict_ids = {}
    for j, label in enumerate(clusterer.labels_):
        label = int(label)
        dict_result.setdefault(label, []).append(lst_sentence[j])
        dict_ids.setdefault(label, []).append(j)

    output = dict_ids if return_ids else dict_result
    # Sort clusters by size, largest first.
    return dict(sorted(output.items(), key=lambda kv: -len(kv[1])))
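
# Illustrative behaviour (hypothetical inputs; cluster ids are arbitrary and the
# grouping depends on the loaded SBERT model):
#   sbert_clustering(["gold price rises", "gold price climbs", "storm hits coast"])
#   -> {0: ["gold price rises", "gold price climbs"], 1: ["storm hits coast"]}
#   ... with return_ids=True -> {0: [0, 1], 1: [2]}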

if __name__ == '__main__':
    with open("input_merge.json", 'r', encoding='utf-8') as f:
        req = json.load(f)
    main(req)