File size: 1,658 Bytes
da2ee46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36caef8
da2ee46
 
 
 
 
 
 
 
 
 
 
a5d5d37
 
 
 
 
 
 
 
da2ee46
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import gradio as gr
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load model and tokenizer
# Hub repo id of the fine-tuned language-detection classifier.
model_name = "alexneakameni/language_detection"
# Prefer GPU when available; the model's weights are moved to this device,
# so inputs must be moved there too before inference.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Get label mapping
# id2label maps class index -> language label, taken from the model config.
id2label = model.config.id2label

def predict_language(text, top_k=5):
    """Predict the most probable languages for *text*.

    Args:
        text: Input sentence to classify.
        top_k: Number of top predictions to return. May arrive as a float
            from the Gradio slider; it is cast to int and clamped to the
            number of classes the model supports.

    Returns:
        A newline-separated string of "label: probability" lines, ordered
        from most to least probable, or a short prompt if *text* is empty.
    """
    # Guard: an empty/whitespace-only input would still tokenize, but the
    # resulting prediction would be meaningless.
    if not text or not text.strip():
        return "Please enter some text."

    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, padding=True, max_length=512
    ).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits

    # squeeze(0), not squeeze(): only drop the batch dimension, never the
    # class dimension (which bare squeeze would collapse if it were size 1).
    probs = torch.nn.functional.softmax(logits, dim=-1).squeeze(0)

    # torch.topk requires an int k, and k must not exceed the class count
    # (the slider allows up to 10 regardless of how many labels exist).
    k = min(int(top_k), probs.numel())
    top_probs, top_indices = torch.topk(probs, k)

    results = [
        f"{id2label[idx.item()]}: {prob:.4f}"
        for prob, idx in zip(top_probs, top_indices)
    ]
    return "\n".join(results)

# Create Gradio interface
# Example sentences shown below the app; each pairs a text with a top-k of 5.
example_inputs = [
    ["Hello, how are you?", 5],
    ["Bonjour, comment ça va?", 5],
    ["Hola, ¿cómo estás?", 5],
    ["Hallo, wie geht es dir?", 5],
    ["Привет, как дела?", 5],
]

# Input widgets: free-text box plus an integer slider for the k value.
text_input = gr.Textbox(label="Enter text", placeholder="Type a sentence here...")
topk_slider = gr.Slider(1, 10, value=5, step=1, label="Top-k Languages")

demo = gr.Interface(
    fn=predict_language,
    inputs=[text_input, topk_slider],
    outputs=gr.Textbox(label="Predicted Languages"),
    title="🌍 Language Detection",
    description="Detects the language of a given text using a fine-tuned BERT model. Returns the top-k most probable languages.",
    examples=example_inputs,
)

demo.launch()