File size: 9,381 Bytes
630b8fe
 
06cfbed
64ce29d
 
 
 
630b8fe
06cfbed
630b8fe
06cfbed
 
630b8fe
 
 
64ce29d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
630b8fe
06cfbed
 
64ce29d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
630b8fe
64ce29d
 
630b8fe
64ce29d
 
06cfbed
64ce29d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
06cfbed
 
64ce29d
06cfbed
64ce29d
 
 
 
 
06cfbed
 
 
 
630b8fe
 
06cfbed
630b8fe
64ce29d
 
630b8fe
 
64ce29d
 
 
 
630b8fe
64ce29d
 
630b8fe
64ce29d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
630b8fe
64ce29d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
630b8fe
64ce29d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
06cfbed
64ce29d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
630b8fe
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import re
import pandas as pd
from io import StringIO
import time

# Page configuration: browser-tab title, icon and wide layout.
# This runs at import time and is the first Streamlit call in this file.
st.set_page_config(
    page_title="Hindi to Kangri Translator",
    page_icon="🗣️",
    layout="wide"
)

# Inject custom CSS into the page. `unsafe_allow_html=True` is what lets the
# raw <style> tag through. The rules pad/space `.stAlert` elements and define
# a card-like `.example-text` class (no visible code in this file applies
# that class to an element).
st.markdown("""
    <style>
    .stAlert {
        padding: 10px;
        margin: 10px 0;
    }
    .example-text {
        padding: 10px;
        background-color: #f0f2f6;
        border-radius: 5px;
        margin: 5px 0;
        cursor: pointer;
    }
    </style>
""", unsafe_allow_html=True)

# Sample Hindi sentences keyed by a category label. main() iterates this
# mapping in the "Examples" tab and renders one button per entry, using the
# category in the heading/widget key and the sentence as the button label.
EXAMPLE_TEXTS = {
    "General Conversation": "मैं आज बाजार जा रहा हूं। क्या आप मेरे साथ चलना चाहेंगे?",
    "Cultural": "दिवाली का त्योहार रोशनी और खुशियों का त्योहार है।",
    "Literature": "साहित्य मानव जीवन का दर्पण है। इसमें समाज की हर छवि दिखाई देती है।",
    "Tourism": "हिमाचल प्रदेश की सुंदर पहाड़ियां और हरी-भरी वादियां पर्यटकों को आकर्षित करती हैं।"
}

@st.cache_resource
def _build_translator():
    """Build the Hindi -> Kangri translation pipeline.

    This is the cached part of model loading. It deliberately lets any
    exception propagate: Streamlit only caches a value when the function
    returns, so a failed attempt is retried on the next rerun instead of a
    cached ``None`` disabling the app until the cache is cleared.

    Returns:
        A transformers "translation" pipeline configured with
        ``src_lang="hin_Deva"`` and ``tgt_lang="kang_Deva"``.
    """
    model_name = "cloghost/nllb-200-distilled-600M-hin-kang-v1"

    with st.spinner("Loading model and tokenizer..."):
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # First CUDA device when one is available, otherwise -1 (CPU).
        device = 0 if torch.cuda.is_available() else -1

        return pipeline(
            "translation",
            model=model,
            tokenizer=tokenizer,
            src_lang="hin_Deva",
            tgt_lang="kang_Deva",
            device=device
        )


def load_model():
    """Load the translation pipeline, reporting failures in the UI.

    The pipeline itself is built (and cached across reruns) by
    ``_build_translator``; this wrapper is intentionally *not* cached so that
    a failure result is never memoized.

    Returns:
        The translation pipeline, or ``None`` if loading failed (in which
        case an error box has been shown via ``st.error``).
    """
    try:
        return _build_translator()
    except Exception as e:
        # UI boundary: show the problem to the user rather than crash the app.
        st.error(f"Error loading model: {str(e)}")
        return None

def preprocess_text(text):
    """Clean Hindi input text before it is sent to the translator.

    Steps, in order:

    1. Remove every character that is not in the Devanagari block
       (U+0900-U+097F), whitespace, or one of ``। , . ? !``.
    2. Replace the Devanagari abbreviation sign (``॰``) with ``.``.
    3. Collapse each run of whitespace to a single space and strip both ends.

    Whitespace is normalized *last*: removing characters can itself leave
    leading, trailing or doubled spaces (e.g. when a Latin word is dropped),
    and input made only of removed characters must come back as ``''`` so
    callers can detect that nothing is left to translate.

    Args:
        text: Raw user input. ``None`` or an empty string yields ``''``.

    Returns:
        The cleaned string (possibly empty).
    """
    if not text:
        return ''

    # NOTE(review): this also strips ASCII digits and Latin letters, since
    # only the Devanagari block and the listed punctuation are kept.
    text = re.sub(r'[^\u0900-\u097F\s।,.?!]', '', text)

    # Normalize the Devanagari abbreviation sign to an ASCII period.
    text = text.replace('॰', '.')

    # Collapse whitespace after filtering so gaps left by removed characters
    # do not survive in the output.
    return re.sub(r'\s+', ' ', text).strip()

def _as_clean_text(value):
    """Return *value* as a stripped string; missing values become ''."""
    if isinstance(value, str):
        return value.strip()
    # Blank CSV cells arrive from pandas as NaN (a float), not as a string.
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ''
    return str(value).strip()


def batch_translate(translator, texts):
    """Translate a batch of texts, one result row per input item.

    Args:
        translator: Callable taking a string and returning a list whose first
            element is a dict with a ``'translation_text'`` key.
        texts: Iterable of inputs. Items are normally strings; ``None``/NaN
            are treated as empty and other non-string values are converted
            with ``str()``.

    Returns:
        A DataFrame with columns ``'Source'`` and ``'Translation'`` (present
        even when ``texts`` is empty). Empty or whitespace-only inputs
        produce an empty row; if translating an item fails, its
        ``'Translation'`` cell holds ``'Error: <message>'`` and processing
        continues with the next item.
    """
    results = []
    for raw in texts:
        source = _as_clean_text(raw)
        if not source:
            # Keep a placeholder row so output rows line up with input rows.
            results.append({'Source': '', 'Translation': ''})
            continue

        try:
            # Translate exactly the text that is reported in 'Source'.
            translation = translator(source)
            translated = translation[0]['translation_text']
        except Exception as e:
            # Best effort: one bad item must not abort the whole batch.
            translated = f'Error: {str(e)}'

        results.append({'Source': source, 'Translation': translated})

    return pd.DataFrame(results, columns=['Source', 'Translation'])

def translate_text(translator, text):
    """Preprocess *text* and translate it, surfacing failures in the UI.

    Returns the translated string. Returns ``None`` when the preprocessed
    text is empty, or when preprocessing/translation raises (in which case
    the error is shown with ``st.error``).
    """
    try:
        cleaned = preprocess_text(text)
        # Nothing left after cleaning -> nothing to translate.
        outcome = translator(cleaned)[0]['translation_text'] if cleaned else None
    except Exception as exc:
        st.error(f"Translation Error: {str(exc)}")
        outcome = None
    return outcome

def _load_example(text):
    """Button callback: put *text* into the Hindi input box.

    Runs before the script reruns, which is the point at which Streamlit
    still allows assigning the session-state entry of a keyed widget.
    """
    st.session_state["input_text"] = text


def _read_batch_texts(uploaded_file):
    """Return the list of texts contained in an uploaded CSV or TXT file.

    CSV files are recognised by file extension rather than by the
    browser-reported MIME type, which is not reliably ``text/csv``.

    Raises:
        ValueError: If a CSV file has no ``'text'`` column.
    """
    # The script reruns on every interaction; rewind so a stream that was
    # already consumed by an earlier read is parsed from the start again.
    uploaded_file.seek(0)

    if uploaded_file.name.lower().endswith('.csv'):
        df = pd.read_csv(uploaded_file)
        if 'text' not in df.columns:
            raise ValueError("CSV file must contain a column named 'text'")
        return df['text'].tolist()

    # TXT: 'utf-8-sig' drops a leading BOM; splitlines() handles \r\n and
    # does not create a spurious empty entry after the final newline.
    return uploaded_file.getvalue().decode('utf-8-sig').splitlines()


def main():
    """Render the Streamlit UI: single/batch translation, examples, about."""
    st.title("🗣️ Hindi to Kangri Translator")
    st.markdown("""
    An advanced translation tool for converting Hindi text to Kangri language. 
    Features include single text translation, batch processing, and text preprocessing.
    """)

    # Load model; without it nothing below can work, so halt the script.
    translator = load_model()
    if translator is None:
        st.stop()

    # Create tabs for different features
    tabs = st.tabs(["Single Translation", "Batch Translation", "Examples", "About"])

    # Single Translation Tab
    with tabs[0]:
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("Hindi Text (हिंदी)")
            # The key ties this widget to st.session_state["input_text"],
            # which is how the Examples tab pre-fills it.
            input_text = st.text_area(
                "Enter Hindi text",
                height=200,
                help="Enter the Hindi text you want to translate to Kangri",
                placeholder="यहाँ हिंदी में टेक्स्ट लिखें...",
                key="input_text"
            )

            # Preprocessing options
            # NOTE: translate_text() always preprocesses its input; these
            # checkboxes only control whether the cleaned text is previewed.
            with st.expander("Preprocessing Options"):
                remove_special = st.checkbox("Remove special characters", value=True)
                normalize_chars = st.checkbox("Normalize Hindi characters", value=True)

            if st.button("Translate to Kangri"):
                if input_text:
                    with st.spinner("Translating..."):
                        # Show preprocessing steps
                        if remove_special or normalize_chars:
                            st.info("Preprocessing text...")
                            processed_text = preprocess_text(input_text)
                            st.code(processed_text, language="text")

                        translated_text = translate_text(translator, input_text)

                        if translated_text:
                            with col2:
                                st.subheader("Kangri Translation (कांगड़ी)")
                                st.text_area(
                                    "Kangri translation",
                                    value=translated_text,
                                    height=200,
                                    disabled=True
                                )
                else:
                    st.warning("Please enter some Hindi text to translate.")

    # Batch Translation Tab
    with tabs[1]:
        st.subheader("Batch Translation")
        st.markdown("""
        Upload a CSV or TXT file containing Hindi texts to translate in bulk.
        - For CSV: Include a column named 'text' containing Hindi texts
        - For TXT: Each line should contain one Hindi text to translate
        """)

        uploaded_file = st.file_uploader("Choose a file", type=['csv', 'txt'])

        if uploaded_file is not None:
            try:
                texts = _read_batch_texts(uploaded_file)

                if st.button("Translate Batch"):
                    progress_bar = st.progress(0)
                    with st.spinner("Processing batch translation..."):
                        results_df = batch_translate(translator, texts)
                        progress_bar.progress(100)

                    st.success("Translation completed!")
                    st.dataframe(results_df)

                    # Download button for results
                    csv = results_df.to_csv(index=False)
                    st.download_button(
                        "Download Results",
                        csv,
                        "translation_results.csv",
                        "text/csv",
                        key='download-csv'
                    )
            except Exception as e:
                st.error(f"Error processing file: {str(e)}")

    # Examples Tab
    with tabs[2]:
        st.subheader("Example Texts")
        st.markdown("Click on any example to load it into the translator:")

        for category, text in EXAMPLE_TEXTS.items():
            st.markdown(f"**{category}:**")
            # The callback fills the input box before the rerun triggered by
            # the click. The active tab is not changed from code here, so the
            # message tells the user where to go next.
            if st.button(
                text,
                key=f"example_{category}",
                on_click=_load_example,
                args=(text,)
            ):
                st.success(
                    "Example loaded. Open the 'Single Translation' tab to translate it."
                )

    # About Tab
    with tabs[3]:
        st.subheader("About the Model")
        st.markdown("""
        ### Model Information
        - **Base Model**: NLLB-200 Distilled (600M parameters)
        - **Fine-tuned for**: Hindi (hin_Deva) to Kangri (kang_Deva) translation
        - **Maximum input length**: 512 tokens
        - **Model ID**: `cloghost/nllb-200-distilled-600M-hin-kang-v1`
        
        ### Preprocessing Features
        - Remove special characters while preserving Devanagari script
        - Normalize Hindi character variations
        - Clean extra whitespace and formatting
        
        ### Usage Tips
        1. For best results, input clean Hindi text in Devanagari script
        2. Use batch translation for processing multiple texts efficiently
        3. Check preprocessing options for better translation quality
        4. Refer to example texts for optimal input format
        """)

# Render the app only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()