# NOTE: the lines originally at the top of this file were web-page export
# residue, not program text: a "Spaces: Sleeping" status banner, a
# "File size: 9,381 Bytes" label, a per-line commit-hash gutter
# (630b8fe / 06cfbed / 64ce29d) and a 1-248 line-number gutter.
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import re
import pandas as pd
from io import StringIO
import time
# Page configuration: browser-tab title and icon, plus the wide layout.
_PAGE_SETTINGS = {
    "page_title": "Hindi to Kangri Translator",
    "page_icon": "🗣️",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)
# Custom CSS: padding/margins for alert boxes and a light, rounded,
# clickable-looking box for elements carrying the "example-text" class.
_CUSTOM_CSS = """
<style>
.stAlert {
padding: 10px;
margin: 10px 0;
}
.example-text {
padding: 10px;
background-color: #f0f2f6;
border-radius: 5px;
margin: 5px 0;
cursor: pointer;
}
</style>
"""
# unsafe_allow_html is needed so the <style> tag is emitted as HTML.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# Sample Hindi sentences offered on the "Examples" tab, keyed by the
# category label shown above each one.
EXAMPLE_TEXTS = {
    "General Conversation": "मैं आज बाजार जा रहा हूं। क्या आप मेरे साथ चलना चाहेंगे?",
    "Cultural": "दिवाली का त्योहार रोशनी और खुशियों का त्योहार है।",
    "Literature": "साहित्य मानव जीवन का दर्पण है। इसमें समाज की हर छवि दिखाई देती है।",
    "Tourism": "हिमाचल प्रदेश की सुंदर पहाड़ियां और हरी-भरी वादियां पर्यटकों को आकर्षित करती हैं।",
}
@st.cache_resource
def _build_translator():
    """Build the Hindi -> Kangri translation pipeline; cached across reruns.

    Any exception raised while downloading or constructing the model
    propagates to the caller. Only a successfully built pipeline is
    returned, so only a working pipeline can end up in the resource cache.
    """
    model_name = "cloghost/nllb-200-distilled-600M-hin-kang-v1"
    with st.spinner("Loading model and tokenizer..."):
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    # First CUDA device when one is available, otherwise -1 (CPU).
    device = 0 if torch.cuda.is_available() else -1
    return pipeline(
        "translation",
        model=model,
        tokenizer=tokenizer,
        src_lang="hin_Deva",
        tgt_lang="kang_Deva",
        device=device,
    )


def load_model():
    """Load and cache the model and tokenizer.

    Returns:
        The translation pipeline, or ``None`` if loading failed (an error
        message is shown in the app in that case).

    The error handling lives here, outside the cached function, on purpose:
    catching the exception inside the cached function would make it return
    ``None``, and that ``None`` would be cached and served on every later
    rerun even after the underlying problem (e.g. a network hiccup) is gone.
    """
    try:
        return _build_translator()
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        return None


# Keep ``load_model.clear()`` working for callers that used it while
# ``load_model`` itself carried the cache decorator.
load_model.clear = _build_translator.clear
def preprocess_text(text):
    """Clean Hindi input text before it is sent to the translator.

    Steps, in order:
      1. Drop every character that is not in the Devanagari block
         (U+0900-U+097F), whitespace, or one of ``। , . ? !``.
      2. Replace the Devanagari abbreviation sign ``॰`` with ``.``.
      3. Collapse each run of whitespace to a single space and trim the ends.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string; empty if nothing survives the filtering.
    """
    # Remove special characters except Devanagari and basic punctuation
    text = re.sub(r'[^\u0900-\u097F\s।,.?!]', '', text)
    # Normalize common variations of Hindi characters
    text = text.replace('॰', '.')
    # Whitespace is normalised last: deleting characters can leave doubled
    # or leading/trailing spaces behind (e.g. around a removed Latin word),
    # which an earlier pass would not see.
    return re.sub(r'\s+', ' ', text).strip()
def _clean_batch_entry(value):
    """Return *value* as a stripped string; missing values become ''."""
    # A blank CSV cell read by pandas arrives as NaN (a float), and callers
    # may also pass None; neither has .strip(), so map them to '' up front.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ''
    return str(value).strip()


def batch_translate(translator, texts):
    """Translate a batch of texts.

    Args:
        translator: Callable that takes one string and returns a sequence
            whose first item is a mapping with a ``'translation_text'`` key.
        texts: Iterable of source texts. ``None``/NaN entries are treated as
            empty; other non-string entries are converted with ``str()``.

    Returns:
        A DataFrame with columns ``'Source'`` and ``'Translation'``, one row
        per input in input order. Empty inputs produce an empty row, and a
        failure on one item is recorded as ``'Error: ...'`` in its
        ``'Translation'`` cell without stopping the rest of the batch.
    """
    results = []
    for text in texts:
        source = _clean_batch_entry(text)
        if not source:  # Only translate non-empty texts
            results.append({'Source': '', 'Translation': ''})
            continue
        try:
            # The translator receives the same stripped text that is
            # reported in the 'Source' column.
            translation = translator(source)
            translated = translation[0]['translation_text']
        except Exception as e:
            translated = f'Error: {str(e)}'
        results.append({'Source': source, 'Translation': translated})
    # Explicit columns keep the header present even for an empty batch.
    return pd.DataFrame(results, columns=['Source', 'Translation'])
def translate_text(translator, text):
    """Preprocess *text* and translate it, reporting failures in the UI.

    Returns the translated string, or ``None`` when the preprocessed text is
    empty or when any step raises (the error is shown via ``st.error``).
    """
    try:
        cleaned = preprocess_text(text)
        if cleaned:
            outputs = translator(cleaned)
            return outputs[0]['translation_text']
    except Exception as e:
        st.error(f"Translation Error: {str(e)}")
    # Reached for empty input after preprocessing and after a reported error.
    return None
def _load_example(text):
    """Button callback: place an example sentence in the Hindi input box."""
    # Streamlit runs callbacks before re-executing the script, which is when
    # the value of a keyed widget may still be assigned via session state.
    st.session_state.input_text = text


def _read_batch_texts(uploaded_file):
    """Return the list of texts contained in an uploaded CSV or TXT file."""
    # Rewind first: the script re-runs on every interaction and the upload
    # may already have been read on a previous run.
    uploaded_file.seek(0)
    # Decide by file name; the reported MIME type of a .csv file is not
    # always 'text/csv'.
    if uploaded_file.name.lower().endswith('.csv'):
        df = pd.read_csv(uploaded_file)
        if 'text' not in df.columns:
            raise ValueError("CSV file must contain a column named 'text'")
        # Blank cells are read as NaN; turn them into empty strings.
        return df['text'].fillna('').astype(str).tolist()
    # txt file: 'utf-8-sig' also accepts plain UTF-8 and drops a leading BOM;
    # splitlines() handles both \n and \r\n line endings.
    content = uploaded_file.read().decode('utf-8-sig')
    return content.splitlines()


def _render_single_tab(translator):
    """Render the single-text translation tab."""
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("Hindi Text (हिंदी)")
        input_text = st.text_area(
            "Enter Hindi text",
            height=200,
            help="Enter the Hindi text you want to translate to Kangri",
            placeholder="यहाँ हिंदी में टेक्स्ट लिखें...",
            # The key ties this widget to st.session_state.input_text so the
            # Examples tab can fill it in.
            key="input_text",
        )
        # Preprocessing options
        # NOTE(review): translate_text() always preprocesses its input, so
        # these options currently only decide whether the preprocessed text
        # is previewed.
        with st.expander("Preprocessing Options"):
            remove_special = st.checkbox("Remove special characters", value=True)
            normalize_chars = st.checkbox("Normalize Hindi characters", value=True)
        if not st.button("Translate to Kangri"):
            return
        if not input_text or not input_text.strip():
            st.warning("Please enter some Hindi text to translate.")
            return
        processed_text = preprocess_text(input_text)
        if not processed_text.strip():
            # translate_text() would return None here without any message.
            st.warning("No Hindi (Devanagari) text is left after preprocessing.")
            return
        with st.spinner("Translating..."):
            # Show preprocessing steps
            if remove_special or normalize_chars:
                st.info("Preprocessing text...")
                st.code(processed_text, language="text")
            translated_text = translate_text(translator, input_text)
    if translated_text:
        with col2:
            st.subheader("Kangri Translation (कांगड़ी)")
            st.text_area(
                "Kangri translation",
                value=translated_text,
                height=200,
                disabled=True,
            )


def _render_batch_tab(translator):
    """Render the batch (file upload) translation tab."""
    st.subheader("Batch Translation")
    st.markdown("""
Upload a CSV or TXT file containing Hindi texts to translate in bulk.
- For CSV: Include a column named 'text' containing Hindi texts
- For TXT: Each line should contain one Hindi text to translate
""")
    uploaded_file = st.file_uploader("Choose a file", type=['csv', 'txt'])
    if not uploaded_file:
        return
    try:
        texts = _read_batch_texts(uploaded_file)
        file_id = (uploaded_file.name, uploaded_file.size)
        if st.button("Translate Batch"):
            progress_bar = st.progress(0)
            with st.spinner("Processing batch translation..."):
                results_df = batch_translate(translator, texts)
            progress_bar.progress(100)
            st.success("Translation completed!")
            # Keep the results in session state: the button is only True on
            # the run triggered by its click, so results shown only inside
            # this branch disappear on the next interaction (such as
            # clicking the download button).
            st.session_state["batch_results"] = (file_id, results_df)
        stored = st.session_state.get("batch_results")
        # Only show results that belong to the currently uploaded file.
        if stored is not None and stored[0] == file_id:
            results_df = stored[1]
            st.dataframe(results_df)
            # Download button for results
            csv = results_df.to_csv(index=False)
            st.download_button(
                "Download Results",
                csv,
                "translation_results.csv",
                "text/csv",
                key='download-csv',
            )
    except Exception as e:
        st.error(f"Error processing file: {str(e)}")


def _render_examples_tab():
    """Render the tab listing example sentences."""
    st.subheader("Example Texts")
    st.markdown("Click on any example to load it into the translator:")
    st.caption("The example is placed in the input box on the Single Translation tab.")
    for category, text in EXAMPLE_TEXTS.items():
        st.markdown(f"**{category}:**")
        # The callback fills the input box; the click itself already
        # triggers a rerun, so no explicit rerun call is needed.
        st.button(
            text,
            key=f"example_{category}",
            on_click=_load_example,
            args=(text,),
        )


def _render_about_tab():
    """Render the static model/usage information tab."""
    st.subheader("About the Model")
    st.markdown("""
### Model Information
- **Base Model**: NLLB-200 Distilled (600M parameters)
- **Fine-tuned for**: Hindi (hin_Deva) to Kangri (kang_Deva) translation
- **Maximum input length**: 512 tokens
- **Model ID**: `cloghost/nllb-200-distilled-600M-hin-kang-v1`
### Preprocessing Features
- Remove special characters while preserving Devanagari script
- Normalize Hindi character variations
- Clean extra whitespace and formatting
### Usage Tips
1. For best results, input clean Hindi text in Devanagari script
2. Use batch translation for processing multiple texts efficiently
3. Check preprocessing options for better translation quality
4. Refer to example texts for optimal input format
""")


def main():
    """Render the Hindi -> Kangri translator app.

    Shows the title and introduction, loads the translation pipeline
    (stopping the script if it is unavailable), and renders four tabs:
    single translation, batch translation, examples and about.
    """
    st.title("🗣️ Hindi to Kangri Translator")
    st.markdown("""
An advanced translation tool for converting Hindi text to Kangri language.
Features include single text translation, batch processing, and text preprocessing.
""")
    # Load model
    translator = load_model()
    if not translator:
        st.stop()
    # Create tabs for different features
    single_tab, batch_tab, examples_tab, about_tab = st.tabs(
        ["Single Translation", "Batch Translation", "Examples", "About"]
    )
    with single_tab:
        _render_single_tab(translator)
    with batch_tab:
        _render_batch_tab(translator)
    with examples_tab:
        _render_examples_tab()
    with about_tab:
        _render_about_tab()
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()