File size: 1,829 Bytes
cdc5783
 
 
1c82268
d9a5c81
cdc5783
5513f06
cdc5783
a680719
 
cdc5783
 
 
 
1c82268
2c41d85
ed04ca3
2a78aa3
a680719
 
cdc5783
 
 
 
a680719
cdc5783
 
 
 
02f8d21
cdc5783
e2ec8e0
cdc5783
 
 
 
 
 
 
 
 
 
 
e2ec8e0
cdc5783
 
 
 
 
 
 
e2ec8e0
cdc5783
 
 
02f8d21
cdc5783
 
 
02f8d21
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import logging

from langdetect import detect
from transformers import pipeline

# Version number stamped into every result dict under the "ver" key.
AiSummaryVersion = 1

# Model pipelines are built once, at import time, and reused by every call.
summarization_pipeline = pipeline(
    "summarization",
    model="csebuetnlp/mT5_multilingual_XLSum",
    max_length=512,
    min_length=50,
)
en_translation_pipe = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-mul-en",
)
text_to_tags_pipe = pipeline(
    "text2text-generation",
    model="PageOrg/t5-small-tagging-text",
    max_length=64,
)


def summarize(id: str, text: str):
    """Build the summary record for one piece of text.

    Text longer than 2000 characters is condensed with ``get_summarization``;
    anything shorter is used verbatim as the summary. Tags are generated from
    the English translation of that summary, then de-duplicated and sorted.

    Args:
        id: Identifier copied unchanged into the result under ``"id"``.
        text: The text to process; may be ``None``.

    Returns:
        A dict with ``"id"``, ``"ver"``, ``"summary"`` and ``"tags"``. When
        *text* is ``None`` or shorter than 10 characters, only
        ``{"ver": AiSummaryVersion}`` is returned.
    """
    too_short = text is None or len(text) < 10
    if too_short:
        return {"ver": AiSummaryVersion}

    # Only long inputs are worth a summarization pass.
    if len(text) > 2000:
        summary = get_summarization(text)
    else:
        summary = text

    english = get_en_translation(summary)
    unique_tags = list(set(get_tags(english)))
    unique_tags.sort()

    return {
        "id": id,
        "ver": AiSummaryVersion,
        "summary": summary,
        "tags": unique_tags,
    }


def get_summarization(text: str):
    """Summarize *text* with the module-level summarization pipeline.

    This is a best-effort helper: a failing pipeline call is logged and
    reported as ``None`` rather than raised.

    Args:
        text: The text to summarize; ``None`` is tolerated.

    Returns:
        The ``summary_text`` string from the pipeline result, or ``None``
        when *text* is ``None`` or the pipeline call fails.
    """
    # Nothing to summarize; skip the model call entirely.
    if text is None:
        return None
    try:
        result = summarization_pipeline(text)
        # Accept either a list of result dicts or a single result dict.
        output = result[0] if isinstance(result, list) else result
        return output['summary_text']
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt and SystemExit must still
        # propagate. The failure is logged so it is no longer invisible.
        logging.getLogger(__name__).exception("Summarization failed")
        return None


def get_en_translation(text: str):
    """Return *text* in English, translating it only when necessary.

    This is a best-effort helper: a failing translation is logged and
    reported as ``None`` rather than raised.

    Args:
        text: The text to translate; may be ``None``.

    Returns:
        *text* unchanged when ``is_english(text)`` is true, otherwise the
        ``translation_text`` string from the translation pipeline. ``None``
        when *text* is ``None`` or the pipeline call fails.
    """
    if text is None:
        return None
    # Already English: no need to run the translation model.
    if is_english(text):
        return text
    try:
        result = en_translation_pipe(text)
        # Accept either a list of result dicts or a single result dict.
        output = result[0] if isinstance(result, list) else result
        return output['translation_text']
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt and SystemExit must still
        # propagate. The failure is logged so it is no longer invisible.
        logging.getLogger(__name__).exception("Translation to English failed")
        return None


def is_english(text):
    """Report whether ``langdetect`` classifies *text* as English.

    Args:
        text: The text to inspect; empty or ``None`` input is tolerated.

    Returns:
        ``True`` when the detected language code is ``'en'``; ``False`` for
        any other language, for empty/``None`` input, or when detection fails.
    """
    # There is nothing to detect in empty input; answer without calling the detector.
    if not text:
        return False
    try:
        # NOTE(review): langdetect documents its results as non-deterministic
        # unless DetectorFactory.seed is set -- confirm whether that matters here.
        return detect(text) == 'en'
    except Exception:
        # Detection errors (presumably text with no usable features) are
        # treated as "not English". Not a bare ``except``, so
        # KeyboardInterrupt and SystemExit still propagate.
        return False


def get_tags(text: str):
    """Generate a list of tags for *text* with the tagging pipeline.

    The pipeline's ``generated_text`` is treated as a comma-separated list.
    This is a best-effort helper: a failing pipeline call is logged and
    reported as an empty list rather than raised.

    Args:
        text: The text to tag; may be ``None``.

    Returns:
        The whitespace-stripped, non-empty tags in the order produced
        (duplicates are not removed here). An empty list when *text* is
        ``None`` or tag generation fails.
    """
    if text is None:
        return []
    try:
        result = text_to_tags_pipe(text)
        # Accept either a list of result dicts or a single result dict.
        output = result[0] if isinstance(result, list) else result
        tag_str = output['generated_text']
        stripped = (part.strip() for part in tag_str.split(','))
        # Drop blanks: "".split(',') yields [''], and stray or trailing
        # commas would otherwise produce empty-string tags.
        return [tag for tag in stripped if tag]
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt and SystemExit must still
        # propagate. The failure is logged so it is no longer invisible.
        logging.getLogger(__name__).exception("Tag generation failed")
        return []