File size: 2,985 Bytes
e6d08d8
2d553a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d3ead6
2d553a1
5d3ead6
2d553a1
 
 
 
 
 
e6d08d8
2d553a1
e6d08d8
 
 
 
 
 
5d3ead6
 
e6d08d8
 
 
2d553a1
e6d08d8
 
2d553a1
 
e6d08d8
 
2d553a1
e6d08d8
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from functools import lru_cache

import gradio as gr
import torch
from transformers import (
    MBart50TokenizerFast,
    MBartForConditionalGeneration,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)

# Load the language detection model.
# The tokenizer and classifier are created once, when this module is imported,
# and are shared by every call to detect_language().
# NOTE(review): the name below is presumably a Hugging Face Hub repo id, so the
# first import may need network access — confirm for offline deployments.
lang_detector_name = "Aesopskenya/LanguageDetector"
lang_tokenizer = AutoTokenizer.from_pretrained(lang_detector_name)
lang_model = AutoModelForSequenceClassification.from_pretrained(lang_detector_name)

# Translation checkpoint for each language that can be translated. A detected
# language missing from this table is reported as unsupported.
lang_to_model = dict(
    Gikuyu="Aesopskenya/translator",
    Kalenjin="Aesopskenya/KalenjinTranslator",
    Kamba="Aesopskenya/KambaTranslation",
    Luo="Aesopskenya/LuoTranslator",
    Sheng="Aesopskenya/ShengTranslation",
)

# Class index predicted by the language detector -> language name. The position
# of each name in the sequence is its class index.
reverse_mapper = dict(
    enumerate(
        (
            "English",   # 0
            "Sheng",     # 1
            "Other",     # 2
            "Luhya",     # 3
            "Kamba",     # 4
            "Gikuyu",    # 5
            "Kalenjin",  # 6
            "Luo",       # 7
        )
    )
)

# Function to detect language
def detect_language(text):
    """Return the language name the detector predicts for *text*.

    The text is tokenized (truncated to at most 128 tokens), scored by the
    module-level ``lang_model``, and the highest-scoring class index is
    translated to a name through ``reverse_mapper``.

    Args:
        text: The text to classify. ``.item()`` below only works on a
            single-element tensor, so this is expected to be one string
            rather than a batch.

    Returns:
        The ``reverse_mapper`` entry for the predicted class index.

    Raises:
        KeyError: If the predicted index has no entry in ``reverse_mapper``.
    """
    encoded = lang_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        scores = lang_model(**encoded).logits
        label_id = torch.argmax(scores, dim=-1).item()
    return reverse_mapper[label_id]

# Function to load the appropriate model and tokenizer
@lru_cache(maxsize=2)
def load_model_and_tokenizer(language):
    """Return the ``(tokenizer, model)`` pair that translates *language*.

    Results are memoized so that repeated requests for the same language reuse
    the already-loaded objects instead of calling ``from_pretrained`` again on
    every request. The cache is deliberately small so that only the most
    recently used translation models are kept in memory; raise ``maxsize`` if
    the host has room for more.

    Args:
        language: Language name, as returned by ``detect_language``. It is
            used as the cache key, so it must be hashable.

    Returns:
        ``(tokenizer, model)`` when *language* has an entry in
        ``lang_to_model``, otherwise ``(None, None)``. The returned objects
        are shared between callers and must not be mutated.

    Raises:
        Whatever ``from_pretrained`` raises when a checkpoint cannot be
        loaded. Failed loads are not cached, so the next call retries.
    """
    model_name = lang_to_model.get(language)
    if not model_name:
        return None, None
    tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
    model = MBartForConditionalGeneration.from_pretrained(model_name)
    return tokenizer, model

# Function to translate text
def translate_text(text):
    """Detect the language of *text* and translate it with the matching model.

    Args:
        text: The sentence entered by the user.

    Returns:
        A human-readable string, one of:

        * a prompt to enter text, when *text* is empty or only whitespace;
        * the detected language plus a "not supported" notice, when the
          language has no entry in ``lang_to_model``;
        * ``"Error loading the translation model."`` when the model cannot be
          loaded;
        * the detected language followed by the translation.
    """
    # Nothing to classify or translate: answer directly instead of running
    # both models on an empty string.
    if not text or not text.strip():
        return "Please enter some text to translate."

    # Detect the language
    detected_language = detect_language(text)
    print(f"Detected Language: {detected_language}")  # Print detected language for the app output
    if detected_language not in lang_to_model:
        return f"Detected Language: {detected_language}. Language not supported for translation."

    # Load the appropriate model and tokenizer. A missing or unreachable
    # checkpoint surfaces as an exception rather than as (None, None), so it is
    # caught here to give the user the error message instead of a traceback.
    # NOTE(review): OSError is what from_pretrained is documented to raise for
    # missing files / failed downloads — widen if other failures are observed.
    try:
        tokenizer, model = load_model_and_tokenizer(detected_language)
    except OSError as err:
        print(f"Failed to load translation model for {detected_language}: {err}")
        return "Error loading the translation model."
    if tokenizer is None or model is None:
        return "Error loading the translation model."

    # Tokenize input (truncated to at most 128 tokens)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Generate translation; the attention mask is passed explicitly so that
    # generate() does not have to infer which positions are padding.
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=128,
    )

    # Decode output
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return f"Detected Language: {detected_language}\nTranslation: {translation}"


# Define Gradio interface.
# One free-text input is handed to translate_text(); the string it returns is
# shown in the single text output.
iface = gr.Interface(
    fn=translate_text,
    inputs="text",
    outputs="text",
    title="Multi-Language Translator",
    description="Enter a sentence, and the model will detect its language and translate it into English.",
)

# Launch the app.
# Guarded so that importing this module does not start the web server.
if __name__ == "__main__":
    # 0.0.0.0 listens on all network interfaces rather than only localhost;
    # the server is bound to port 7860.
    iface.launch(server_name="0.0.0.0", server_port=7860)