File size: 1,908 Bytes
cdc5783
 
 
 
 
02f8d21
1d4052e
cdc5783
02f8d21
cdc5783
a680719
 
a45c805
 
cdc5783
 
 
 
1d4052e
2c41d85
ed04ca3
2a78aa3
1d4052e
 
cdc5783
a680719
 
cdc5783
 
 
 
a680719
cdc5783
 
 
 
02f8d21
cdc5783
e2ec8e0
cdc5783
 
 
 
 
 
 
 
 
 
 
e2ec8e0
cdc5783
 
 
 
 
 
 
e2ec8e0
cdc5783
 
 
02f8d21
cdc5783
 
 
02f8d21
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import logging

from langdetect import detect
from transformers import pipeline

from utils.tag_utils import filter_tags

# Version number written into the "ver" field of every dict returned by summarize().
AiSummaryVersion = 4
# NOTE: the three pipelines below are constructed at module import time, so
# importing this module runs the pipeline() constructors immediately.
# Summarizer used by get_summarization(); max_length=512 is forwarded to the
# pipeline (presumably a cap on generated summary length in tokens — confirm
# against the transformers documentation).
summarization_pipeline = pipeline("summarization", model="csebuetnlp/mT5_multilingual_XLSum", max_length=512)
# Translator used by get_en_translation() for text not detected as English
# (the model name suggests many-languages-to-English — TODO confirm).
en_translation_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
# Tag generator used by get_tags(); 'models/text2tags' looks like a local
# model directory rather than a hub id — verify against the deployment layout.
text_to_tags_pipe = pipeline('text2text-generation', model='models/text2tags')


def summarize(id: str, text: str):
    """Build the AI summary record for one piece of text.

    The id is echoed to stdout first. Text that is ``None`` or shorter than
    10 characters yields a record containing only the version. Otherwise the
    text is summarized (only when longer than 1000 characters; shorter text is
    used verbatim as the summary), translated to English, and turned into a
    sorted list of unique tags.

    Args:
        id: Identifier of the item being processed; copied into the result.
        text: Raw text to summarize and tag.

    Returns:
        ``{"ver": ...}`` for missing/too-short text, otherwise a dict with
        the keys ``"id"``, ``"ver"``, ``"summary"`` and ``"tags"``.
    """
    print('=================')
    print(id)

    # Nothing useful can be extracted from missing or very short text.
    too_short = text is None or len(text) < 10
    if too_short:
        return {"ver": AiSummaryVersion}

    # Only long texts go through the summarization model.
    if len(text) > 1000:
        summary = get_summarization(text)
    else:
        summary = text

    # Tags are generated from the English rendering of the summary.
    english = get_en_translation(summary)
    tags = sorted(set(get_tags(english)))

    print(summary)
    print(tags)

    return {
        "id": id,
        "ver": AiSummaryVersion,
        "summary": summary,
        "tags": tags,
    }


def get_summarization(text: str):
    """Summarize ``text`` with the module-level summarization pipeline.

    This is best-effort: an error raised while running the pipeline or
    reading its result is logged and ``None`` is returned instead of
    propagating, so callers must be prepared for a ``None`` summary.

    Args:
        text: The text to summarize.

    Returns:
        The generated summary string, or ``None`` if summarization failed.
    """
    try:
        result = summarization_pipeline(text)
        # The result is handled both as a list of dicts and as a single dict.
        first = result[0] if isinstance(result, list) else result
        return first['summary_text']
    except Exception:
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate; log so failures are not silent.
        logging.getLogger(__name__).exception("Summarization failed")
        return None


def get_en_translation(text: str):
    """Return an English version of ``text``.

    Text already detected as English by ``is_english`` is returned
    unchanged; anything else is passed through the module-level translation
    pipeline. This is best-effort: errors are logged and ``None`` is returned.

    Args:
        text: The text to translate, or ``None``.

    Returns:
        The English text, or ``None`` if ``text`` is ``None`` or translation
        failed.
    """
    if text is None:
        return None
    try:
        if is_english(text):
            return text
        result = en_translation_pipe(text)
        # The result is handled both as a list of dicts and as a single dict.
        first = result[0] if isinstance(result, list) else result
        return first['translation_text']
    except Exception:
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate; log so failures are not silent.
        logging.getLogger(__name__).exception("Translation to English failed")
        return None


def is_english(text):
    """Return ``True`` when language detection reports ``text`` as English.

    Args:
        text: The text to inspect.

    Returns:
        ``True`` if ``detect(text)`` returns ``'en'``; ``False`` for any
        other language or when detection raises an error.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # A detection failure is treated as "not English" so the caller
        # falls back to translating. Catch Exception rather than a bare
        # except so KeyboardInterrupt and SystemExit still propagate.
        return False


def get_tags(text: str):
    """Generate tags for ``text`` with the module-level text-to-tags pipeline.

    The generated text is split on commas. Each piece is stripped of
    surrounding whitespace and empty pieces are dropped, so a trailing comma
    or an empty generation does not produce ``''`` tags. This is best-effort:
    errors are logged and an empty list is returned.

    Args:
        text: The text to tag, or ``None``.

    Returns:
        A list of non-empty tag strings, in generation order and possibly
        containing duplicates; ``[]`` if ``text`` is ``None`` or empty, or if
        generation failed.
    """
    # Nothing to tag: skip the model call entirely.
    if not text:
        return []
    try:
        result = text_to_tags_pipe(text)
        # The result is handled both as a list of dicts and as a single dict.
        first = result[0] if isinstance(result, list) else result
        pieces = (piece.strip() for piece in first['generated_text'].split(','))
        return [tag for tag in pieces if tag]
    except Exception:
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate; log so failures are not silent.
        logging.getLogger(__name__).exception("Tag generation failed")
        return []