# Hugging Face Space: text & document translator built on mBART-50.
import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import fitz  # PyMuPDF, for PDF text extraction
import docx2txt  # DOCX text extraction
from fpdf import FPDF  # PDF generation for translated output

# mBART-50 many-to-many model: translates directly between any pair of its
# 50 supported languages and handles idiomatic text reasonably well.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Prefer GPU when available; the model runs (slowly) on CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
# Display name -> mBART-50 language code. Reduced list focused on major
# global languages plus the Indian languages the app targets.
LANGUAGES = {
    # Major global languages
    "English": "en_XX",
    "Spanish": "es_XX",
    "French": "fr_XX",
    "German": "de_DE",
    "Russian": "ru_RU",
    "Chinese": "zh_CN",
    "Japanese": "ja_XX",
    "Arabic": "ar_AR",
    # Major Indian languages
    "Hindi": "hi_IN",
    "Bengali": "bn_IN",
    "Gujarati": "gu_IN",
    "Marathi": "mr_IN",
    "Tamil": "ta_IN",
    "Telugu": "te_IN",
    "Malayalam": "ml_IN",
    # NOTE(review): "pa_IN" and "kn_IN" do not appear in mBART-50's published
    # language list, so these two will likely fail at generation time with a
    # KeyError in tokenizer.lang_code_to_id — confirm against the tokenizer.
    "Punjabi": "pa_IN",
    "Kannada": "kn_IN",
    "Urdu": "ur_PK"
}
# File extraction functions | |
def extract_text_from_pdf(file_path):
    """Extract all text from a PDF file.

    Args:
        file_path: Path to the PDF on disk.

    Returns:
        The concatenated text of every page, or a human-readable error
        message string if the file cannot be opened or parsed.
    """
    try:
        # BUGFIX: the original never closed the document, leaking the file
        # handle on every upload. fitz.Document supports the context-manager
        # protocol, which guarantees closure even on partial failure.
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        return f"Error extracting PDF text: {str(e)}"
def extract_text_from_docx(file_path):
    """Extract the text content of a DOCX file.

    Returns the extracted text, or an error message string on failure.
    """
    try:
        extracted = docx2txt.process(file_path)
    except Exception as e:
        return f"Error extracting DOCX text: {str(e)}"
    return extracted
def extract_text_from_txt(file_path):
    """Read a plain-text file, trying UTF-8 first, then Latin-1.

    Latin-1 maps every byte to a character, so the fallback pass cannot
    raise a decode error. Returns the file contents, or an error message
    string if the file cannot be read at all.
    """
    for encoding in ('utf-8', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as handle:
                return handle.read()
        except UnicodeDecodeError:
            # Not valid UTF-8 — retry with the permissive Latin-1 codec.
            continue
        except Exception as e:
            return f"Error extracting TXT text: {str(e)}"
def save_as_pdf(text, output_path):
    """Render *text* into a single-column PDF at *output_path*.

    NOTE(review): FPDF's built-in "Arial" font only covers Latin-1, so the
    text is transcoded with replacement — any character outside Latin-1
    (e.g. Devanagari output from the translator) becomes '?'. Registering a
    Unicode TTF via pdf.add_font() would be needed for full coverage;
    confirm whether that limitation is acceptable here.

    Returns:
        output_path, for caller convenience.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    # Transcode so FPDF's Latin-1 core font never raises on unicode input.
    safe_text = text.encode('latin-1', 'replace').decode('latin-1')
    pdf.multi_cell(0, 10, safe_text)
    pdf.output(output_path)
    return output_path
# Translation function | |
def translate(text, source_lang, target_lang, max_length=1024):
    """Translate *text* between two languages named in LANGUAGES.

    Args:
        text: Input text; falsy input returns a notice string.
        source_lang: Display name of the source language (LANGUAGES key).
        target_lang: Display name of the target language (LANGUAGES key).
        max_length: Token cap for both the (truncated) input and the output.

    Returns:
        The translated string, or a human-readable error message.
    """
    if not text:
        return "No text provided for translation."
    try:
        src_lang = LANGUAGES.get(source_lang)
        tgt_lang = LANGUAGES.get(target_lang)
        if not src_lang or not tgt_lang:
            return "Source or target language not supported."

        # mBART-50 requires the source language set on the tokenizer so the
        # proper language token is prepended to the encoded input.
        tokenizer.src_lang = src_lang

        inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                # BUGFIX: the tokenizer attribute is `lang_code_to_id`, not
                # `lang_to_id`. The old name raised AttributeError, so every
                # call fell into the except branch and returned a
                # "Translation error" string instead of a translation.
                forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
                max_length=max_length,
                num_beams=5,
                early_stopping=True
            )

        # batch_decode returns one string per sequence; we sent one input.
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    except Exception as e:
        return f"Translation error: {str(e)}"
# Process uploads and handle translation | |
def process_file(file, source_lang, target_lang):
    """Translate an uploaded document and save the result as a PDF.

    Args:
        file: Uploaded file object exposing a `.name` path (Gradio File).
        source_lang: Display name of the source language (LANGUAGES key).
        target_lang: Display name of the target language (LANGUAGES key).

    Returns:
        (pdf_path, translated_text) on success, or (None, error_message)
        for unsupported formats / processing failures.
    """
    try:
        temp_file_path = file.name
        lowered = temp_file_path.lower()

        # Dispatch extraction on the file extension.
        if lowered.endswith('.pdf'):
            text = extract_text_from_pdf(temp_file_path)
        elif lowered.endswith('.docx'):
            text = extract_text_from_docx(temp_file_path)
        elif lowered.endswith('.txt'):
            text = extract_text_from_txt(temp_file_path)
        else:
            return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."

        translated_text = translate(text, source_lang, target_lang)

        # BUGFIX: replace the extension instead of appending after it, so
        # "report.pdf" yields "report_translated.pdf" rather than the
        # confusing "report.pdf_translated.pdf".
        output_pdf_path = os.path.splitext(temp_file_path)[0] + "_translated.pdf"
        save_as_pdf(translated_text, output_pdf_path)
        return output_pdf_path, translated_text
    except Exception as e:
        return None, f"Error processing file: {str(e)}"
# Gradio interface | |
def gradio_interface():
    """Build and return the Gradio Blocks UI.

    Two tabs: direct text translation (wired to `translate`) and document
    translation (wired to `process_file`, producing a downloadable PDF).
    """
    language_names = list(LANGUAGES.keys())

    with gr.Blocks(title="Indian Language Translator", theme=gr.themes.Soft()) as interface:
        gr.Markdown("# Indian & Global Language Translator")
        gr.Markdown("Translate text with understanding of idioms and cultural expressions")

        # --- Tab 1: translate pasted text ---
        with gr.Tab("Text Translation"):
            with gr.Row():
                source_lang_text = gr.Dropdown(language_names, value="English", label="Source Language")
                target_lang_text = gr.Dropdown(language_names, value="Hindi", label="Target Language")
            with gr.Row():
                input_text = gr.Textbox(label="Enter text to translate", lines=5, placeholder="Type or paste text here...")
                output_text = gr.Textbox(label="Translation", lines=5)
            translate_btn = gr.Button("Translate Text", variant="primary")
            translate_btn.click(
                fn=translate,
                inputs=[input_text, source_lang_text, target_lang_text],
                outputs=output_text
            )

        # --- Tab 2: translate an uploaded document ---
        with gr.Tab("Document Translation"):
            with gr.Row():
                source_lang_doc = gr.Dropdown(language_names, value="English", label="Source Language")
                target_lang_doc = gr.Dropdown(language_names, value="Hindi", label="Target Language")
            file_input = gr.File(label="Upload Document (PDF, DOCX, TXT)", file_types=[".pdf", ".docx", ".txt"])
            with gr.Row():
                output_file = gr.File(label="Translated PDF")
                output_preview = gr.Textbox(label="Translation Preview", lines=8)
            translate_doc_btn = gr.Button("Translate Document", variant="primary")
            translate_doc_btn.click(
                fn=process_file,
                inputs=[file_input, source_lang_doc, target_lang_doc],
                outputs=[output_file, output_preview]
            )

        gr.Markdown("### Supported File Types: PDF, DOCX, TXT")
        gr.Markdown("### Features:")
        gr.Markdown("- Supports major Indian languages including Hindi, Bengali, Tamil, Telugu, Malayalam")
        gr.Markdown("- Context-aware translation that understands idioms and cultural expressions")
        gr.Markdown("- Document translation with PDF output")

    return interface
# Launch the application | |
if __name__ == "__main__":
    demo = gradio_interface()
    # share=True opens a public tunnel URL — handy for demos, but remove
    # it in production deployments.
    demo.launch(share=True)