File size: 3,832 Bytes
2b53f2b
 
3b25c41
 
 
 
4bce69c
3b25c41
 
a0c8166
 
2b53f2b
 
 
 
 
 
 
 
 
3b25c41
2b53f2b
 
 
3b25c41
a0c8166
3b25c41
 
 
a0c8166
3b25c41
a0c8166
3b25c41
2b53f2b
3b25c41
 
 
 
 
 
 
2b53f2b
3b25c41
 
 
 
 
 
 
 
 
2b53f2b
3b25c41
 
 
 
 
 
2b53f2b
a0c8166
2b53f2b
3b25c41
 
 
a0c8166
3b25c41
2b53f2b
3b25c41
a0c8166
3b25c41
2b53f2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0c8166
2b53f2b
 
 
 
 
 
 
 
 
 
 
a0c8166
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b53f2b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# app.py
import streamlit as st
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List
import torch
import asyncio
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from IndicTransToolkit import IndicProcessor
import requests
import json

# Initialize models and processors
MODEL_NAME = "ai4bharat/indictrans2-en-indic-1B"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


@st.cache_resource
def _load_model():
    """Load the seq2seq translation model once and move it to ``DEVICE``.

    Streamlit re-executes this script from the top on every widget
    interaction. Without caching, the checkpoint would be reloaded on each
    button press; ``st.cache_resource`` keeps one loaded instance alive
    across reruns.

    Returns:
        The loaded model, already placed on ``DEVICE``.
    """
    # trust_remote_code=True allows transformers to execute model code that
    # ships with the checkpoint repository; only use it with a trusted repo.
    return AutoModelForSeq2SeqLM.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
    ).to(DEVICE)


model = _load_model()

# The tokenizer and processor are created on every run, as before.
# NOTE(review): both look stateful per call (the tokenizer is switched with
# `as_target_tokenizer()` during decoding), so they are deliberately not
# shared through the cache -- confirm before caching them as well.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)
ip = IndicProcessor(inference=True)

def translate_text(sentences: List[str], target_lang: str):
    """Translate English sentences into the requested target language.

    Args:
        sentences: English input strings, translated together as one batch.
        target_lang: Target language tag handed to the processor
            (for example ``"hin_Deva"``).

    Returns:
        A dict with three keys: ``"translations"`` (list with one output
        string per input sentence), ``"source_language"`` (always
        ``"eng_Latn"``) and ``"target_language"`` (the tag that was passed
        in). An empty ``sentences`` yields an empty ``"translations"`` list
        without touching the model.

    Raises:
        RuntimeError: If any step of the pipeline fails. The message starts
            with ``"Translation failed: "`` and the original exception is
            attached as ``__cause__``.
    """
    src_lang = "eng_Latn"

    # Nothing to translate: skip tokenization/generation, which are not
    # meaningful for an empty batch.
    if not sentences:
        return {
            "translations": [],
            "source_language": src_lang,
            "target_language": target_lang,
        }

    try:
        batch = ip.preprocess_batch(
            sentences,
            src_lang=src_lang,
            tgt_lang=target_lang,
        )

        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode inside the tokenizer's target-side context.
        with tokenizer.as_target_tokenizer():
            decoded = tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

        translations = ip.postprocess_batch(decoded, lang=target_lang)

        return {
            "translations": translations,
            "source_language": src_lang,
            "target_language": target_lang,
        }

    except Exception as e:
        # Boundary wrapper: present one uniform error to callers while
        # keeping the underlying exception chained for debugging.
        raise RuntimeError(f"Translation failed: {e}") from e

# Streamlit interface
def main():
    """Render the Streamlit translator page.

    Shows a text box and a target-language selector, calls
    ``translate_text`` when the "Translate" button is pressed, and then
    renders a static API documentation section.
    """
    st.title("Indic Language Translator")

    # Input text
    text_input = st.text_area("Enter text to translate:", "Hello, how are you?")

    # Language selection: display name -> language tag passed to translate_text
    target_languages = {
        "Hindi": "hin_Deva",
        "Bengali": "ben_Beng",
        "Tamil": "tam_Taml",
        "Telugu": "tel_Telu",
        "Marathi": "mar_Deva",
        "Gujarati": "guj_Gujr",
        "Kannada": "kan_Knda",
        "Malayalam": "mal_Mlym",
        "Punjabi": "pan_Guru",
        "Odia": "ori_Orya",
    }

    target_lang = st.selectbox(
        "Select target language:",
        options=list(target_languages),
    )

    if st.button("Translate"):
        if not text_input.strip():
            # Blank input: there is nothing to send to the model.
            st.warning("Please enter some text to translate.")
        else:
            try:
                result = translate_text(
                    sentences=[text_input],
                    target_lang=target_languages[target_lang],
                )
            except Exception as e:
                # translate_text already prefixes its message with
                # "Translation failed: "; show it as-is so the prefix is
                # not displayed twice.
                st.error(str(e))
            else:
                # Display result
                st.success("Translation:")
                st.write(result["translations"][0])

    # Add API documentation
    # NOTE(review): no /translate route is defined in the code visible in
    # this file -- confirm the endpoint is actually served before
    # advertising it here.
    st.markdown("---")
    st.header("API Documentation")
    st.markdown("""
    To use the translation API, send POST requests to:
    ```
    https://USERNAME-SPACE_NAME.hf.space/translate
    ```
    
    Request body format:
    ```json
    {
        "sentences": ["Your text here"],
        "target_lang": "hin_Deva"
    }
    ```
    
    Available target languages:
    - Hindi: `hin_Deva`
    - Bengali: `ben_Beng`
    - Tamil: `tam_Taml`
    - Telugu: `tel_Telu`
    - Marathi: `mar_Deva`
    - Gujarati: `guj_Gujr`
    - Kannada: `kan_Knda`
    - Malayalam: `mal_Mlym`
    - Punjabi: `pan_Guru`
    - Odia: `ori_Orya`
    """)

# Build the UI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()