Spaces:
Sleeping
Sleeping
import streamlit as st | |
import torch | |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline | |
import re | |
import pandas as pd | |
from io import StringIO | |
import time | |
# Page configuration: browser-tab title and icon, and the wide page layout.
st.set_page_config(
    page_title="Hindi to Kangri Translator",
    page_icon="🗣️",
    layout="wide",
)
# Custom CSS: spacing for alert boxes and a card-like look for example text.
# The stylesheet is kept flush-left so the markdown renderer treats it as a
# raw HTML block; unsafe_allow_html is required for the <style> tag to apply.
st.markdown(
    """
<style>
.stAlert {
padding: 10px;
margin: 10px 0;
}
.example-text {
padding: 10px;
background-color: #f0f2f6;
border-radius: 5px;
margin: 5px 0;
cursor: pointer;
}
</style>
""",
    unsafe_allow_html=True,
)
# Sample Hindi sentences offered in the "Examples" tab, keyed by category name.
EXAMPLE_TEXTS = {
    "General Conversation": "मैं आज बाजार जा रहा हूं। क्या आप मेरे साथ चलना चाहेंगे?",
    "Cultural": "दिवाली का त्योहार रोशनी और खुशियों का त्योहार है।",
    "Literature": "साहित्य मानव जीवन का दर्पण है। इसमें समाज की हर छवि दिखाई देती है।",
    "Tourism": "हिमाचल प्रदेश की सुंदर पहाड़ियां और हरी-भरी वादियां पर्यटकों को आकर्षित करती हैं।",
}
@st.cache_resource(show_spinner="Loading model and tokenizer...")
def _build_translator():
    """Build the Hindi -> Kangri translation pipeline.

    Wrapped in ``st.cache_resource`` so the model is loaded once and the same
    pipeline object is reused on later script reruns, instead of being
    reloaded on every widget interaction.

    Exceptions are deliberately NOT handled here: a call that raises stores
    nothing in the cache, so a failed load can be retried on the next rerun.
    """
    model_name = "cloghost/nllb-200-distilled-600M-hin-kang-v1"
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # transformers pipelines take a CUDA device index, or -1 for CPU.
    device = 0 if torch.cuda.is_available() else -1
    return pipeline(
        "translation",
        model=model,
        tokenizer=tokenizer,
        src_lang="hin_Deva",
        tgt_lang="kang_Deva",
        device=device,
    )


def load_model():
    """Load and cache the model and tokenizer.

    Returns:
        The translation pipeline, or ``None`` if loading failed (in which
        case the error has already been shown in the UI via ``st.error``).
    """
    try:
        return _build_translator()
    except Exception as e:  # UI boundary: report the failure, don't crash
        st.error(f"Error loading model: {str(e)}")
        return None
# Anything that is not Devanagari (U+0900-U+097F), whitespace, or basic
# punctuation is treated as a "special character".
_DISALLOWED_CHARS_RE = re.compile(r'[^\u0900-\u097F\s।,.?!]')
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def preprocess_text(text, *, remove_special=True, normalize_chars=True):
    """Clean Hindi input text before translation.

    Args:
        text: Raw input text. ``None`` or an empty string yields ``''``.
        remove_special: Drop every character that is not Devanagari,
            whitespace, or one of ``। , . ? !``.
        normalize_chars: Replace the Devanagari abbreviation sign ``॰``
            with an ASCII full stop.

    Returns:
        The cleaned text with runs of whitespace collapsed to single spaces
        and no leading/trailing whitespace. May be empty.
    """
    if not text:
        return ''
    if normalize_chars:
        text = text.replace('॰', '.')
    if remove_special:
        text = _DISALLOWED_CHARS_RE.sub('', text)
    # Collapse whitespace LAST: removing characters can leave double spaces
    # ("क abc ख" -> "क  ख") or a lone leading/trailing space behind.
    return _WHITESPACE_RUN_RE.sub(' ', text).strip()
def _normalize_batch_item(value):
    """Return *value* as a stripped string; ``None`` and NaN become ``''``."""
    if isinstance(value, str):
        return value.strip()
    # Empty CSV cells arrive from pandas as float NaN (NaN != NaN).
    if value is None or (isinstance(value, float) and value != value):
        return ''
    return str(value).strip()


def batch_translate(translator, texts):
    """Translate a batch of texts.

    Args:
        translator: Callable that takes one string and returns a sequence
            whose first item is a mapping with a ``'translation_text'`` key.
        texts: Iterable of items to translate. Non-string items are
            converted to strings; ``None``/NaN are treated as empty.

    Returns:
        A ``pandas.DataFrame`` with columns ``'Source'`` and
        ``'Translation'``, one row per input item in input order. Empty
        items produce an empty row; an item whose translation raises gets
        ``'Error: <message>'`` as its translation instead of aborting the
        batch.
    """
    results = []
    for raw in texts:
        source = _normalize_batch_item(raw)
        if not source:
            # Keep a placeholder row so output rows line up with input rows.
            results.append({'Source': '', 'Translation': ''})
            continue
        try:
            translation = translator(source)[0]['translation_text']
        except Exception as e:  # one bad item must not abort the batch
            translation = f'Error: {str(e)}'
        results.append({'Source': source, 'Translation': translation})
    # Explicit columns so an empty batch still yields the expected schema.
    return pd.DataFrame(results, columns=['Source', 'Translation'])
def translate_text(translator, text):
    """Preprocess *text* and translate it, reporting failures in the UI.

    Returns the translated string, or ``None`` when the preprocessed text is
    empty or when preprocessing/translation raised (the error is then shown
    with ``st.error``).
    """
    try:
        cleaned = preprocess_text(text)
        if cleaned:
            outputs = translator(cleaned)
            return outputs[0]['translation_text']
        # Nothing left to translate after preprocessing.
        return None
    except Exception as exc:
        st.error(f"Translation Error: {str(exc)}")
        return None
def _load_example(text):
    """Button callback: place an example sentence in the Hindi input box."""
    # Widget callbacks run before the script reruns, which is when a keyed
    # widget's session-state value may still be assigned. Assigning it later
    # in the script (after the text area is created) is rejected by Streamlit.
    st.session_state["input_text"] = text


def _read_uploaded_texts(uploaded_file):
    """Return the list of texts contained in an uploaded CSV or TXT file.

    Raises:
        ValueError: If a CSV file has no ``'text'`` column.
    """
    # Decide by file extension: the browser-reported MIME type for .csv is
    # not reliably 'text/csv'.
    if uploaded_file.name.lower().endswith('.csv'):
        uploaded_file.seek(0)  # the buffer may already have been consumed
        df = pd.read_csv(uploaded_file)
        if 'text' not in df.columns:
            raise ValueError("CSV file must contain a column named 'text'")
        # Empty cells are NaN floats; turn them into empty strings.
        return df['text'].fillna('').astype(str).tolist()
    # TXT: tolerate a UTF-8 BOM and Windows line endings.
    content = uploaded_file.getvalue().decode('utf-8-sig')
    return content.splitlines()


def main():
    """Render the Streamlit UI.

    Tabs: single-text translation, batch translation from an uploaded
    CSV/TXT file, clickable example sentences, and an "About" page.
    """
    st.title("🗣️ Hindi to Kangri Translator")
    st.markdown("""
    An advanced translation tool for converting Hindi text to Kangri language.
    Features include single text translation, batch processing, and text preprocessing.
    """)

    # Load model; load_model() has already reported the error if this is None.
    translator = load_model()
    if translator is None:
        st.stop()

    # Create tabs for different features
    single_tab, batch_tab, examples_tab, about_tab = st.tabs(
        ["Single Translation", "Batch Translation", "Examples", "About"]
    )

    # Single Translation Tab
    with single_tab:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Hindi Text (हिंदी)")
            input_text = st.text_area(
                "Enter Hindi text",
                height=200,
                help="Enter the Hindi text you want to translate to Kangri",
                placeholder="यहाँ हिंदी में टेक्स्ट लिखें...",
                # The key ties this widget to st.session_state["input_text"],
                # which the example buttons fill in via _load_example.
                key="input_text",
            )
            # Preprocessing options
            # NOTE: translate_text() always preprocesses its input; these
            # checkboxes only control whether the preview below is shown.
            with st.expander("Preprocessing Options"):
                remove_special = st.checkbox("Remove special characters", value=True)
                normalize_chars = st.checkbox("Normalize Hindi characters", value=True)

            if st.button("Translate to Kangri"):
                if input_text and input_text.strip():
                    with st.spinner("Translating..."):
                        processed_text = preprocess_text(input_text)
                        # Show preprocessing steps
                        if remove_special or normalize_chars:
                            st.info("Preprocessing text...")
                            st.code(processed_text, language="text")
                        if processed_text.strip():
                            translated_text = translate_text(translator, input_text)
                        else:
                            # Without this the click would silently do nothing.
                            translated_text = None
                            st.warning("No Hindi text left to translate after preprocessing.")
                    if translated_text:
                        with col2:
                            st.subheader("Kangri Translation (कांगड़ी)")
                            st.text_area(
                                "Kangri translation",
                                value=translated_text,
                                height=200,
                                disabled=True,
                            )
                else:
                    st.warning("Please enter some Hindi text to translate.")

    # Batch Translation Tab
    with batch_tab:
        st.subheader("Batch Translation")
        st.markdown("""
        Upload a CSV or TXT file containing Hindi texts to translate in bulk.
        - For CSV: Include a column named 'text' containing Hindi texts
        - For TXT: Each line should contain one Hindi text to translate
        """)
        uploaded_file = st.file_uploader("Choose a file", type=['csv', 'txt'])
        if uploaded_file is not None:
            try:
                texts = _read_uploaded_texts(uploaded_file)
                if st.button("Translate Batch"):
                    progress_bar = st.progress(0)
                    with st.spinner("Processing batch translation..."):
                        results_df = batch_translate(translator, texts)
                    progress_bar.progress(100)
                    st.success("Translation completed!")
                    st.dataframe(results_df)
                    # Download button for results
                    csv = results_df.to_csv(index=False)
                    st.download_button(
                        "Download Results",
                        csv,
                        "translation_results.csv",
                        "text/csv",
                        key='download-csv',
                    )
            except Exception as e:
                st.error(f"Error processing file: {str(e)}")

    # Examples Tab
    with examples_tab:
        st.subheader("Example Texts")
        st.markdown("Click on any example to load it into the translator:")
        for category, text in EXAMPLE_TEXTS.items():
            st.markdown(f"**{category}:**")
            # on_click fills the input box before the rerun that the click
            # triggers, so no explicit rerun call is needed.
            st.button(
                text,
                key=f"example_{category}",
                on_click=_load_example,
                args=(text,),
            )
        st.caption("The example is placed in the input box of the Single Translation tab.")

    # About Tab
    with about_tab:
        st.subheader("About the Model")
        st.markdown("""
        ### Model Information
        - **Base Model**: NLLB-200 Distilled (600M parameters)
        - **Fine-tuned for**: Hindi (hin_Deva) to Kangri (kang_Deva) translation
        - **Maximum input length**: 512 tokens
        - **Model ID**: `cloghost/nllb-200-distilled-600M-hin-kang-v1`
        ### Preprocessing Features
        - Remove special characters while preserving Devanagari script
        - Normalize Hindi character variations
        - Clean extra whitespace and formatting
        ### Usage Tips
        1. For best results, input clean Hindi text in Devanagari script
        2. Use batch translation for processing multiple texts efficiently
        3. Check preprocessing options for better translation quality
        4. Refer to example texts for optimal input format
        """)
# Script entry point: build the UI when the file is executed as a script.
if __name__ == "__main__":
    main()