Spaces:

cloghost
/

nllb-hind2kangri

Sleeping

App Files Files Community

nllb-hind2kangri / app.py

cloghost

Update app.py

64ce29d verified 4 days ago

raw

history blame

9.38 kB

	import streamlit as st
	import torch
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
	import re
	import pandas as pd
	from io import StringIO
	import time

	# Browser-tab title, icon and wide layout for the app page.
	st.set_page_config(page_title="Hindi to Kangri Translator", page_icon="🗣️", layout="wide")

	# Page-wide CSS overrides: tighter spacing for alert boxes (.stAlert) and a
	# card-like look for elements carrying the .example-text class.
	_CUSTOM_CSS = """
	<style>
	.stAlert {
	padding: 10px;
	margin: 10px 0;
	}
	.example-text {
	padding: 10px;
	background-color: #f0f2f6;
	border-radius: 5px;
	margin: 5px 0;
	cursor: pointer;
	}
	</style>
	"""
	# unsafe_allow_html is required so the <style> tag is emitted as HTML
	# instead of being escaped as text.
	st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

	# Sample Hindi sentences shown in the "Examples" tab, keyed by category label.
	# Insertion order is the display order.
	EXAMPLE_TEXTS = {
	    "General Conversation": (
	        "मैं आज बाजार जा रहा हूं। क्या आप मेरे साथ चलना चाहेंगे?"
	    ),
	    "Cultural": (
	        "दिवाली का त्योहार रोशनी और खुशियों का त्योहार है।"
	    ),
	    "Literature": (
	        "साहित्य मानव जीवन का दर्पण है। इसमें समाज की हर छवि दिखाई देती है।"
	    ),
	    "Tourism": (
	        "हिमाचल प्रदेश की सुंदर पहाड़ियां और हरी-भरी वादियां पर्यटकों को आकर्षित करती हैं।"
	    ),
	}

	@st.cache_resource
	def _load_translation_pipeline():
	    """Build the Hindi -> Kangri translation pipeline; cached across reruns.

	    Exceptions are deliberately NOT caught here. ``st.cache_resource`` stores
	    whatever the function returns, so swallowing the error and returning
	    ``None`` would cache the failure and the app would never retry loading.
	    Letting the exception propagate means nothing is cached on failure.

	    Returns:
	        A ``transformers`` translation pipeline configured with
	        ``src_lang="hin_Deva"`` and ``tgt_lang="kang_Deva"``.
	    """
	    model_name = "cloghost/nllb-200-distilled-600M-hin-kang-v1"

	    with st.spinner("Loading model and tokenizer..."):
	        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
	        tokenizer = AutoTokenizer.from_pretrained(model_name)

	    # transformers pipelines take a device index: 0 = first CUDA GPU, -1 = CPU.
	    device = 0 if torch.cuda.is_available() else -1

	    return pipeline(
	        "translation",
	        model=model,
	        tokenizer=tokenizer,
	        src_lang="hin_Deva",
	        tgt_lang="kang_Deva",
	        device=device,
	    )


	def load_model():
	    """Load and cache the model and tokenizer.

	    Returns:
	        The translation pipeline, or ``None`` if loading failed (an error
	        message is shown in the UI in that case). A failed attempt is not
	        cached, so the next script run tries again.
	    """
	    try:
	        return _load_translation_pipeline()
	    except Exception as e:  # UI boundary: report any load failure to the user
	        st.error(f"Error loading model: {str(e)}")
	        return None

	def preprocess_text(text):
	    """Clean Hindi input text before it is sent to the translator.

	    Steps, in order:

	    1. Remove every character that is not in the Devanagari block
	       (U+0900-U+097F), whitespace, or one of ``, . ? !``. The danda
	       is part of the Devanagari block and is therefore kept.
	    2. Replace the Devanagari abbreviation sign (U+0970) with a full stop.
	    3. Collapse runs of whitespace to a single space and trim both ends.

	    Whitespace is normalised last because step 1 can delete whole words
	    (for example Latin text), which would otherwise leave double spaces or
	    leading/trailing spaces in the result.

	    Args:
	        text: Raw input string.

	    Returns:
	        The cleaned string; empty if nothing translatable remains.
	    """
	    # Remove special characters except Devanagari and basic punctuation
	    text = re.sub(r'[^\u0900-\u097F\s।,.?!]', '', text)

	    # Normalize common variations of Hindi characters
	    text = text.replace('॰', '.')

	    # Remove extra whitespace (including gaps left by the removal above)
	    return re.sub(r'\s+', ' ', text).strip()

	def batch_translate(translator, texts):
	    """Translate a batch of texts and collect the results in a DataFrame.

	    Each input item produces exactly one output row, in order:

	    * blank strings and missing values (``None`` / NaN, e.g. empty CSV
	      cells) produce a row with empty ``Source`` and ``Translation``;
	    * other non-string values are converted with ``str()`` first;
	    * if the translator raises, the row's ``Translation`` holds
	      ``"Error: <message>"`` and processing continues with the next item.

	    Args:
	        translator: Callable taking a string and returning a list whose
	            first element is a dict with a ``'translation_text'`` key.
	        texts: Iterable of items to translate.

	    Returns:
	        ``pandas.DataFrame`` with columns ``Source`` and ``Translation``
	        (the columns are present even when ``texts`` is empty).
	    """
	    results = []
	    for raw in texts:
	        # Normalise the item to a stripped string without assuming it is one;
	        # pandas yields float NaN for empty cells, which has no .strip().
	        if isinstance(raw, str):
	            source = raw.strip()
	        elif pd.api.types.is_scalar(raw) and pd.isna(raw):
	            source = ''
	        else:
	            source = str(raw).strip()

	        if not source:  # Only translate non-empty texts
	            results.append({'Source': '', 'Translation': ''})
	            continue

	        try:
	            translated = translator(source)[0]['translation_text']
	        except Exception as e:  # keep going: one bad row must not abort the batch
	            translated = f'Error: {str(e)}'
	        results.append({'Source': source, 'Translation': translated})

	    return pd.DataFrame(results, columns=['Source', 'Translation'])

	def translate_text(translator, text):
	    """Preprocess ``text`` and translate it, reporting failures in the UI.

	    Args:
	        translator: Callable returning a list whose first element is a dict
	            with a ``'translation_text'`` key.
	        text: Raw input text; it is passed through ``preprocess_text`` first.

	    Returns:
	        The translated string, or ``None`` when the preprocessed text is
	        empty or when any exception occurs (the exception message is shown
	        via ``st.error``).
	    """
	    try:
	        cleaned = preprocess_text(text)
	        if cleaned:
	            outputs = translator(cleaned)
	            return outputs[0]['translation_text']
	        return None
	    except Exception as e:
	        st.error(f"Translation Error: {str(e)}")
	        return None

	def _load_example(text):
	    """Button callback: copy an example sentence into the translator input box.

	    Runs before the script reruns, so writing the widget's session-state key
	    here is allowed (writing it after the widget is created is not).
	    """
	    st.session_state["input_text"] = text


	def main():
	    """Render the Streamlit UI: single translation, batch translation,
	    examples and an about page.

	    Stops the script early (``st.stop``) if the model could not be loaded.
	    """
	    st.title("🗣️ Hindi to Kangri Translator")
	    st.markdown("""
	An advanced translation tool for converting Hindi text to Kangri language.
	Features include single text translation, batch processing, and text preprocessing.
	""")

	    # Load model
	    translator = load_model()
	    # Compare against None explicitly instead of relying on the pipeline
	    # object's truthiness.
	    if translator is None:
	        st.stop()

	    # Create tabs for different features
	    tabs = st.tabs(["Single Translation", "Batch Translation", "Examples", "About"])

	    # Single Translation Tab
	    with tabs[0]:
	        col1, col2 = st.columns(2)

	        with col1:
	            st.subheader("Hindi Text (हिंदी)")
	            # The key ties this widget to st.session_state["input_text"], which
	            # the Examples tab fills in through the _load_example callback.
	            input_text = st.text_area(
	                "Enter Hindi text",
	                height=200,
	                help="Enter the Hindi text you want to translate to Kangri",
	                placeholder="यहाँ हिंदी में टेक्स्ट लिखें...",
	                key="input_text",
	            )

	            # Preprocessing options
	            # NOTE(review): these two options only control whether the
	            # preprocessed preview is shown; translate_text always
	            # preprocesses its input regardless of them.
	            with st.expander("Preprocessing Options"):
	                remove_special = st.checkbox("Remove special characters", value=True)
	                normalize_chars = st.checkbox("Normalize Hindi characters", value=True)

	            if st.button("Translate to Kangri"):
	                processed_text = preprocess_text(input_text) if input_text else ""
	                if processed_text:
	                    with st.spinner("Translating..."):
	                        # Show preprocessing steps
	                        if remove_special or normalize_chars:
	                            st.info("Preprocessing text...")
	                            st.code(processed_text, language="text")

	                        translated_text = translate_text(translator, input_text)

	                    if translated_text:
	                        with col2:
	                            st.subheader("Kangri Translation (कांगड़ी)")
	                            st.text_area(
	                                "Kangri translation",
	                                value=translated_text,
	                                height=200,
	                                disabled=True,
	                            )
	                else:
	                    # Covers both an empty box and input that contains nothing
	                    # translatable after preprocessing.
	                    st.warning("Please enter some Hindi text to translate.")

	    # Batch Translation Tab
	    with tabs[1]:
	        st.subheader("Batch Translation")
	        st.markdown("""
	Upload a CSV or TXT file containing Hindi texts to translate in bulk.
	- For CSV: Include a column named 'text' containing Hindi texts
	- For TXT: Each line should contain one Hindi text to translate
	""")

	        uploaded_file = st.file_uploader("Choose a file", type=['csv', 'txt'])

	        if uploaded_file:
	            try:
	                # Decide by file extension: the browser-reported MIME type for
	                # .csv files is not always 'text/csv'.
	                if uploaded_file.name.lower().endswith('.csv'):
	                    df = pd.read_csv(uploaded_file)
	                    if 'text' not in df.columns:
	                        raise ValueError("CSV file must contain a column named 'text'")
	                    texts = df['text'].tolist()
	                else:  # txt file
	                    # getvalue() ignores the current read position; 'utf-8-sig'
	                    # drops a leading BOM; splitlines() handles CRLF endings.
	                    content = uploaded_file.getvalue().decode('utf-8-sig')
	                    texts = content.splitlines()

	                if st.button("Translate Batch"):
	                    progress_bar = st.progress(0)
	                    with st.spinner("Processing batch translation..."):
	                        results_df = batch_translate(translator, texts)
	                        progress_bar.progress(100)

	                    st.success("Translation completed!")
	                    st.dataframe(results_df)

	                    # Download button for results
	                    csv = results_df.to_csv(index=False)
	                    st.download_button(
	                        "Download Results",
	                        csv,
	                        "translation_results.csv",
	                        "text/csv",
	                        key='download-csv',
	                    )
	            except Exception as e:
	                st.error(f"Error processing file: {str(e)}")

	    # Examples Tab
	    with tabs[2]:
	        st.subheader("Example Texts")
	        st.markdown("Click on any example to load it into the translator:")

	        for category, text in EXAMPLE_TEXTS.items():
	            st.markdown(f"{category}:")
	            # The callback runs before the automatic rerun triggered by the
	            # click, so the text area picks up the example on that rerun and
	            # no explicit rerun call is needed.
	            st.button(
	                text,
	                key=f"example_{category}",
	                on_click=_load_example,
	                args=(text,),
	            )

	    # About Tab
	    with tabs[3]:
	        st.subheader("About the Model")
	        st.markdown("""
	### Model Information
	- Base Model: NLLB-200 Distilled (600M parameters)
	- Fine-tuned for: Hindi (hin_Deva) to Kangri (kang_Deva) translation
	- Maximum input length: 512 tokens
	- Model ID: `cloghost/nllb-200-distilled-600M-hin-kang-v1`

	### Preprocessing Features
	- Remove special characters while preserving Devanagari script
	- Normalize Hindi character variations
	- Clean extra whitespace and formatting

	### Usage Tips
	1. For best results, input clean Hindi text in Devanagari script
	2. Use batch translation for processing multiple texts efficiently
	3. Check preprocessing options for better translation quality
	4. Refer to example texts for optimal input format
	""")

	# Start the UI only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	    main()