File size: 2,320 Bytes
cdc5783
 
 
 
 
 
 
e2ec8e0
090f2d4
cdc5783
 
e2ec8e0
 
 
cdc5783
a680719
 
cdc5783
 
 
 
 
fd9b289
858ef45
1f6b7aa
2a78aa3
cdc5783
a680719
 
cdc5783
 
 
 
a680719
cdc5783
 
 
 
 
 
e2ec8e0
cdc5783
 
 
 
 
 
 
 
 
 
 
e2ec8e0
cdc5783
 
 
 
 
 
 
e2ec8e0
cdc5783
 
 
858ef45
cdc5783
 
 
3784e1c
 
 
858ef45
 
 
 
1f6b7aa
e2ec8e0
cdc5783
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import logging
import re

from langdetect import detect
from transformers import pipeline

from utils.tag_utils import filter_tags

# Version stamp written to the "ver" field of every summarize() result.
AiSummaryVersion = 2
# Minimum classifier score a label needs in order to be kept by get_tags().
MinTagScore = 0.7
# The pipelines below are constructed at import time, so importing this module
# instantiates all five of them.
summarization_pipeline = pipeline("summarization", model="csebuetnlp/mT5_multilingual_XLSum")
en_translation_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
# Three text classifiers whose labels get_tags() concatenates into one list.
tag_gen_pipe_1 = pipeline("text-classification", model="yiyanghkust/finbert-esg-9-categories")
tag_gen_pipe_2 = pipeline("text-classification", model="dima806/news-category-classifier-distilbert")
tag_gen_pipe_3 = pipeline("text-classification", model="elozano/bert-base-cased-news-category")


def summarize(id: str, text: str):
    """Build the AI summary record for a single item.

    Args:
        id: Identifier of the item; echoed back in the result and forwarded
            to ``get_tags``.
        text: Raw text of the item; may be ``None``.

    Returns:
        A dict with only ``"ver"`` when ``text`` is ``None`` or shorter than
        10 characters. Otherwise a dict with ``"id"``, ``"ver"``,
        ``"summary"`` and ``"tags"``, where ``"tags"`` is a sorted list of
        unique tags.
    """
    # Nothing worth processing: return only the version marker.
    if text is None or len(text) < 10:
        return {"ver": AiSummaryVersion}

    # Texts of up to 100 characters are used verbatim as their own summary.
    if len(text) > 100:
        summary = get_summarization(text)
    else:
        summary = text

    # Tags are generated from the English rendering of the full text, passed
    # through filter_tags, then de-duplicated.
    english_text = get_en_translation(text)
    unique_tags = set(filter_tags(get_tags(english_text, id)))

    return {
        "id": id,
        "ver": AiSummaryVersion,
        "summary": summary,
        "tags": sorted(unique_tags),
    }


def get_summarization(text: str):
    """Summarize ``text`` with the module-level summarization pipeline.

    Args:
        text: The text to summarize.

    Returns:
        The ``summary_text`` of the pipeline result (taken from the first
        entry when the pipeline returns a list), or ``None`` if summarization
        fails for any reason.
    """
    try:
        result = summarization_pipeline(text)
        # The pipeline may hand back a list of results or a single mapping.
        first = result[0] if isinstance(result, list) else result
        return first['summary_text']
    except Exception:
        # Best-effort: record the failure and fall back to "no summary".
        # Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        logging.getLogger(__name__).exception("Summarization failed")
        return None


def get_en_translation(text: str):
    """Return an English rendering of ``text``.

    Text that ``is_english`` reports as English is returned unchanged;
    anything else is sent through the module-level translation pipeline.

    Args:
        text: The text to translate; may be ``None``.

    Returns:
        The original or translated text, or ``None`` when ``text`` is
        ``None`` or translation fails.
    """
    if text is None:
        return None
    try:
        if is_english(text):
            return text
        result = en_translation_pipe(text)
        # The pipeline may hand back a list of results or a single mapping.
        first = result[0] if isinstance(result, list) else result
        return first['translation_text']
    except Exception:
        # Best-effort: record the failure and signal "no translation".
        # Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        logging.getLogger(__name__).exception("Translation to English failed")
        return None


def is_english(text):
    """Report whether ``detect`` classifies ``text`` as English.

    Args:
        text: The text whose language should be detected.

    Returns:
        ``True`` when the detected language code is ``'en'``; ``False``
        otherwise, including when detection raises.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # A detection failure is treated as "not English". Catching Exception
        # (rather than a bare except) lets KeyboardInterrupt and SystemExit
        # propagate.
        return False


def get_tags(text: str, id: str):
    """Collect labels from the three tag classifiers for ``text``.

    Args:
        text: Text to classify; ``None`` yields no tags.
        id: Identifier of the item being tagged; used only for logging.

    Returns:
        Labels whose score is at least ``MinTagScore``, in classifier order
        (pipe 1, then 2, then 3). Duplicates are not removed here. Returns
        ``[]`` when ``text`` is ``None`` or when any classifier fails.
    """
    if text is None:
        return []
    logger = logging.getLogger(__name__)
    try:
        per_model = []
        for classify in (tag_gen_pipe_1, tag_gen_pipe_2, tag_gen_pipe_3):
            per_model.append(
                [tag['label'] for tag in classify(text) if tag['score'] >= MinTagScore]
            )
    except Exception:
        # All-or-nothing: a failure in any classifier yields no tags at all.
        # Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        logger.exception("Tag generation failed for id=%s", id)
        return []
    # Diagnostics go through logging at DEBUG level and outside the try, so
    # an output problem (e.g. an un-encodable character) cannot discard tags.
    logger.debug("Tags for id=%s: %s (text=%r)", id, per_model, text)
    return [label for labels in per_model for label in labels]