File size: 4,191 Bytes
f657d03
71775e2
 
f657d03
71775e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97a23a7
71775e2
 
 
97a23a7
71775e2
 
 
 
 
 
 
 
 
f657d03
 
 
 
71775e2
 
 
97a23a7
71775e2
97a23a7
71775e2
f657d03
 
 
71775e2
f657d03
 
71775e2
 
f657d03
 
97a23a7
 
 
 
 
 
 
 
 
3318d6e
97a23a7
f657d03
 
71775e2
 
f657d03
71775e2
 
f657d03
71775e2
 
97a23a7
71775e2
f657d03
71775e2
 
f657d03
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import datetime
import gradio as gr
from langdetect import detect, DetectorFactory, detect_langs
from transformers import pipeline

# Zero-shot classification checkpoint to load for each supported language.
# Keys are the language codes looked up by detect_lang(); values are passed
# as the ``model`` argument of the transformers pipeline.
models = dict(
    en='Narsil/deberta-large-mnli-zero-cls',       # English
    de='Sahajtomar/German_Zeroshot',               # German
    es='Recognai/zeroshot_selectra_medium',        # Spanish
    it='joeddav/xlm-roberta-large-xnli',           # Italian
    ru='DeepPavlov/xlm-roberta-large-en-ru-mnli',  # Russian
    no='NbAiLab/nb-bert-base-mnli',                # Norwegian
)

# Per-language hypothesis sentence handed to the zero-shot pipeline as
# ``hypothesis_template``; each contains one ``{}`` placeholder.
# Keys mirror the language codes used in ``models``.
hypothesis_templates = dict(
    en='This example is {}.',      # English
    de='Dieses beispiel ist {}.',  # German
    es='Este ejemplo es {}.',      # Spanish
    it='Questo esempio è {}.',     # Italian
    ru='Этот пример {}.',          # Russian
    no='Dette eksempelet er {}.',  # Norwegian
)

def detect_lang(sequence, labels):
    """Choose the model language key for *sequence*.

    Runs langdetect on the sequence and on the raw label string, reports the
    outcome with ``print``, and returns the detected sequence language when
    it is a key of ``models``. Any other outcome (unsupported language or a
    failed detection) falls back to ``'en'``.

    Args:
        sequence: The text that will be classified.
        labels: The candidate labels exactly as entered; only used to warn
            when their detected language differs from the sequence's.

    Returns:
        A language code that is guaranteed to be a key of ``models``.
    """
    # Seed the detector so repeated calls on the same text agree.
    DetectorFactory.seed = 0
    seq_lang = 'en'
    # Stays None when detection fails, so the comparison below is skipped
    # instead of hitting an unbound name.
    lbl_lang = None

    try:
        seq_lang = detect(sequence)
        lbl_lang = detect(labels)
    except Exception:
        # Best-effort: detection problems (e.g. empty or feature-less text)
        # must not abort classification; log and continue with defaults.
        print("Language detection failed!",
              "Date:{}, Sequence:{}, Labels:{}".format(
                  str(datetime.datetime.now()),
                  sequence,
                  labels))

    if lbl_lang is not None and seq_lang != lbl_lang:
        print("Different languages detected for sequence and labels!",
              "Date:{}, Sequence:{}, Labels:{}, Sequence Language:{}, Label Language:{}".format(
                  str(datetime.datetime.now()),
                  sequence,
                  labels,
                  seq_lang,
                  lbl_lang))

    if seq_lang in models:
        print("Sequence Language detected.",
              "Date:{}, Sequence:{}, Sequence Language:{}".format(
                  str(datetime.datetime.now()),
                  sequence,
                  seq_lang))
    else:
        print("Language not supported. Defaulting to English!",
              "Date:{}, Sequence:{}, Sequence Language:{}".format(
                  str(datetime.datetime.now()),
                  sequence,
                  seq_lang))
        seq_lang = 'en'

    return seq_lang


def sequence_to_classify(sequence, labels):
    """Score *sequence* against a comma-separated list of candidate labels.

    The language of the sequence selects the model and hypothesis template;
    the zero-shot pipeline is then run with ``multi_label=True``.

    Args:
        sequence: The text to classify.
        labels: Candidate labels separated by commas. Whitespace around each
            label is ignored and blank entries (e.g. from a trailing comma)
            are dropped.

    Returns:
        A dict mapping each candidate label to its score as a ``float``, in
        the order returned by the pipeline.

    Raises:
        ValueError: If *labels* contains no non-blank label.
    """
    # Normalise user input: "a, b," -> ["a", "b"].
    label_clean = [part.strip() for part in str(labels).split(",") if part.strip()]
    if not label_clean:
        # Fail before loading a model; there is nothing to score against.
        raise ValueError("At least one candidate label is required.")

    lang = detect_lang(sequence, labels)
    # NOTE(review): the pipeline (and its model) is rebuilt on every call.
    # Caching per language would speed this up at the cost of keeping several
    # large models in memory - decide based on the deployment's RAM budget.
    classifier = pipeline("zero-shot-classification",
                          hypothesis_template=hypothesis_templates[lang],
                          model=models[lang])
    response = classifier(sequence, label_clean, multi_label=True)

    predicted_labels = response['labels']
    predicted_scores = response['scores']
    # Pair labels with scores without mutating the pipeline's response.
    clean_output = {label: float(score)
                    for label, score in zip(predicted_labels, predicted_scores)}
    print("Date:{}, Sequence:{}, Labels: {}".format(
        str(datetime.datetime.now()),
        sequence,
        predicted_labels))

    return clean_output

# Example inputs offered in the demo UI: one English and one Norwegian text,
# each with a matching comma-separated label list.
# Adjacent string literals are joined by the parser; every fragment carries
# its own trailing space so words are not fused across line breaks.
example_text1 = (
    "Climate change refers to long-term shifts in temperatures and weather patterns. "
    "These shifts may be natural, but since the 1800s, human activities have been the main driver of climate change, "
    "primarily due to the burning of fossil fuels (like coal, oil, and gas), which produces heat-trapping gases."
)
example_labels1 = "business,nature,religion"
example_text2 = (
    "Ja, vi elsker dette landet, "
    "som det stiger frem, "
    "furet, værbitt over vannet, "
    "med de tusen hjem. "
    "Og som fedres kamp har hevet "
    "det av nød til seir"
)
example_labels2 = "helse,sport,religion,mat,patriotisme og nasjonalisme"

# Assemble the Gradio UI around sequence_to_classify: two text boxes in
# (text to classify, comma-separated labels), one label widget out, plus the
# two predefined examples. The interface is launched as soon as it is built.
iface = gr.Interface(
    fn=sequence_to_classify,
    inputs=[
        gr.inputs.Textbox(
            lines=20,
            label="Please enter the text you would like to classify...",
            placeholder="Text here...",
        ),
        gr.inputs.Textbox(
            lines=5,
            label="Possible candidate labels (separated by comma)...",
            placeholder="Labels here separated by comma...",
        ),
    ],
    outputs=gr.outputs.Label(num_top_classes=5),
    examples=[
        [example_text1, example_labels1],
        [example_text2, example_labels2],
    ],
    title="Multilingual Multi-label Zero-shot Classification",
    description="Currently supported languages are English, German, Spanish, Italian, Russian, Norsk.",
    capture_session=True,
    # interpretation="default" is intentionally left disabled.
)
iface.launch()