File size: 5,328 Bytes
34475ca
1f8139d
3c8522e
1f8139d
 
3e1d85e
1f8139d
 
96d6ff2
 
 
 
 
 
3e1d85e
 
 
 
 
96d6ff2
 
3e1d85e
3c8522e
3e1d85e
 
 
 
1a6e1ca
546ed8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd8fa87
 
 
 
 
 
 
 
 
 
 
 
96d6ff2
3e1d85e
cd8fa87
3e1d85e
 
 
 
85c7334
 
 
3e1d85e
 
 
 
 
 
85c7334
3e1d85e
 
 
 
 
96d6ff2
3e1d85e
 
b284e87
cd8fa87
 
3e1d85e
 
cd8fa87
 
6c4f7e8
85c7334
1a6e1ca
0e88fea
6f918d6
0e88fea
3e1d85e
318e9b0
 
b284e87
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import pkg_resources
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import gradio as gr
from ferret import Benchmark

# Load models and tokenizers
sentiment_tokenizer = AutoTokenizer.from_pretrained("Birkir/electra-base-igc-is-sentiment-analysis")
sentiment_model = AutoModelForSequenceClassification.from_pretrained("Birkir/electra-base-igc-is-sentiment-analysis")
formality_tokenizer = AutoTokenizer.from_pretrained("svanhvit/formality-classification-icebert")
formality_model = AutoModelForSequenceClassification.from_pretrained("svanhvit/formality-classification-icebert")
toxicity_tokenizer = AutoTokenizer.from_pretrained("unitary/toxic-bert")
toxicity_model = AutoModelForSequenceClassification.from_pretrained("unitary/toxic-bert")
politeness_tokenizer = AutoTokenizer.from_pretrained("Genius1237/xlm-roberta-large-tydip")
politeness_model = AutoModelForSequenceClassification.from_pretrained("Genius1237/xlm-roberta-large-tydip")

# Initialize benchmarks
sentiment_bench = Benchmark(sentiment_model, sentiment_tokenizer)
formality_bench = Benchmark(formality_model, formality_tokenizer)
toxicity_bench = Benchmark(toxicity_model, toxicity_tokenizer)
politeness_bench = Benchmark(politeness_model, politeness_tokenizer)

# Initialize pipelines for translation and text classification
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-is-en")
sentiment_classifier = pipeline("text-classification", model="Birkir/electra-base-igc-is-sentiment-analysis")
formality_classifier = pipeline("text-classification", model="svanhvit/formality-classification-icebert")
detoxify_classifier = pipeline('text-classification', model='unitary/toxic-bert', tokenizer='bert-base-uncased', function_to_apply='sigmoid', top_k=None)
politeness_classifier = pipeline("text-classification", model="Genius1237/xlm-roberta-large-tydip")

def replace_encoding(tokens):
    return [token.replace('Ġ', ' ')
                 .replace('ð', 'ð')
                 .replace('é', 'é')
                 .replace('æ', 'æ')
                 .replace('ý', 'ý')
                 .replace('á', 'á')
                 .replace('ú', 'ú')
                 .replace('ÃŃ', 'í')
                 .replace('Ãö', 'ö')
                 .replace('þ', 'þ')
                 .replace('Ãģ', 'Á')
                 .replace('Ãį', 'Ú')
                 .replace('Ãĵ', 'Ó')
                 .replace('ÃĨ', 'Æ')
                 .replace('ÃIJ', 'Ð')
                 .replace('Ãĸ', 'Ö')
                 .replace('Ãī', 'É')
                 .replace('Ãļ', 'ý')
            for token in tokens[1:-1]]

def analyze_with_influence(text, bench):
    explanations = bench.explain(text, target=0)
    influential_words = []
    for explanation in explanations:
        if explanation.explainer == 'Partition SHAP':
            tokens = replace_encoding(explanation.tokens)
            token_score_pairs = zip(tokens, explanation.scores)
            influential_words.extend([(token, score) for token, score in token_score_pairs])
    influential_words_str = "; ".join([f"{token} ({score:.2f})" for token, score in influential_words])
    return influential_words_str

def analyze_text(icelandic_text):
    # Perform translations
    translated_text = translator(icelandic_text, max_length=512)[0]['translation_text']
    
    # Perform initial analysis to get scores
    sentiment_result = sentiment_classifier(icelandic_text)[0]
    formality_result = formality_classifier(icelandic_text)[0]
    # Assuming detoxify_classifier gives a list of dictionaries, we need to adjust how we process this
    # For the sake of example, let's just mock a toxicity score here. Adjust this based on actual model output
    toxicity_mock_score = 0.5  # Placeholder, replace with actual processing of detoxify_classifier output
    politeness_result = politeness_classifier(translated_text)[0]

    # Gather scores and labels
    scores_labels = {
        "Sentiment": (sentiment_result['score'], sentiment_bench),
        "Formality": (formality_result['score'], formality_bench),
        "Toxicity": (toxicity_mock_score, toxicity_bench),  # Use the mock or processed score
        "Politeness": (politeness_result['score'], politeness_bench)
    }

    # Identify the aspect with the lowest score
    lowest_aspect = min(scores_labels, key=lambda x: scores_labels[x][0])

    # Perform Ferret analysis on the aspect with the lowest score
    influential_words = analyze_with_influence(icelandic_text if lowest_aspect in ["Sentiment", "Formality"] else translated_text, scores_labels[lowest_aspect][1])

    analysis_results = f"""
    Translated Text: {translated_text}\n\n
    Lowest Score Aspect: {lowest_aspect}\n
    Influential Words in {lowest_aspect}: {influential_words}
    """
    return analysis_results.strip()


demo = gr.Interface(fn=analyze_text, 
                    inputs=gr.Textbox(lines=2, placeholder="Enter Icelandic Text Here..."), 
                    outputs=gr.Textbox(label="Analysis Results"),
                    title="Icelandic Text Analysis",
                    description="This app translates Icelandic text to English and performs analysis with influential words for the aspect with the lowest score.")

if __name__ == "__main__":
    demo.launch()