# Hugging Face Space: text & document translator built on mBART-50.
import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import fitz  # PyMuPDF, for PDF text extraction
import docx2txt  # DOCX text extraction
from fpdf import FPDF  # PDF generation for translated output

# mBART-50 many-to-many model: translates directly between any pair of its
# 50 supported languages and handles idiomatic text reasonably well.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Prefer GPU when available; the model runs (slowly) on CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
# Display name -> mBART-50 language code. Reduced list focused on major
# global languages plus the Indian languages the app targets.
LANGUAGES = {
    # Major global languages
    "English": "en_XX",
    "Spanish": "es_XX",
    "French": "fr_XX",
    "German": "de_DE",
    "Russian": "ru_RU",
    "Chinese": "zh_CN",
    "Japanese": "ja_XX",
    "Arabic": "ar_AR",
    # Major Indian languages
    "Hindi": "hi_IN",
    "Bengali": "bn_IN",
    "Gujarati": "gu_IN",
    "Marathi": "mr_IN",
    "Tamil": "ta_IN",
    "Telugu": "te_IN",
    "Malayalam": "ml_IN",
    # NOTE(review): "pa_IN" and "kn_IN" do not appear in mBART-50's published
    # language list, so these two will likely fail at generation time with a
    # KeyError in tokenizer.lang_code_to_id — confirm against the tokenizer.
    "Punjabi": "pa_IN",
    "Kannada": "kn_IN",
    "Urdu": "ur_PK"
}
# File extraction functions | |
def extract_text_from_pdf(file_path):
    """Extract all text from a PDF file.

    Args:
        file_path: Path to the PDF on disk.

    Returns:
        The concatenated text of every page, or a human-readable error
        message string if the file cannot be opened or parsed.
    """
    try:
        # BUGFIX: the original never closed the document, leaking the file
        # handle on every upload. fitz.Document supports the context-manager
        # protocol, which guarantees closure even on partial failure.
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        return f"Error extracting PDF text: {str(e)}"
def extract_text_from_docx(file_path):
    """Extract the text content of a DOCX file.

    Returns the extracted text, or an error message string on failure.
    """
    try:
        extracted = docx2txt.process(file_path)
    except Exception as e:
        return f"Error extracting DOCX text: {str(e)}"
    return extracted
def extract_text_from_txt(file_path):
    """Read a plain-text file, trying UTF-8 first, then Latin-1.

    Latin-1 maps every byte to a character, so the fallback pass cannot
    raise a decode error. Returns the file contents, or an error message
    string if the file cannot be read at all.
    """
    for encoding in ('utf-8', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as handle:
                return handle.read()
        except UnicodeDecodeError:
            # Not valid UTF-8 — retry with the permissive Latin-1 codec.
            continue
        except Exception as e:
            return f"Error extracting TXT text: {str(e)}"
def save_as_pdf(text, output_path):
    """Render *text* into a single-column PDF at *output_path*.

    NOTE(review): FPDF's built-in "Arial" font only covers Latin-1, so the
    text is transcoded with replacement — any character outside Latin-1
    (e.g. Devanagari output from the translator) becomes '?'. Registering a
    Unicode TTF via pdf.add_font() would be needed for full coverage;
    confirm whether that limitation is acceptable here.

    Returns:
        output_path, for caller convenience.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    # Transcode so FPDF's Latin-1 core font never raises on unicode input.
    safe_text = text.encode('latin-1', 'replace').decode('latin-1')
    pdf.multi_cell(0, 10, safe_text)
    pdf.output(output_path)
    return output_path
# Translation function | |
def translate(text, source_lang, target_lang, max_length=1024):
    """Translate *text* between two languages named in LANGUAGES.

    Args:
        text: Input text; falsy input returns a notice string.
        source_lang: Display name of the source language (LANGUAGES key).
        target_lang: Display name of the target language (LANGUAGES key).
        max_length: Token cap for both the (truncated) input and the output.

    Returns:
        The translated string, or a human-readable error message.
    """
    if not text:
        return "No text provided for translation."
    try:
        src_lang = LANGUAGES.get(source_lang)
        tgt_lang = LANGUAGES.get(target_lang)
        if not src_lang or not tgt_lang:
            return "Source or target language not supported."

        # mBART-50 requires the source language set on the tokenizer so the
        # proper language token is prepended to the encoded input.
        tokenizer.src_lang = src_lang

        inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                # BUGFIX: the tokenizer attribute is `lang_code_to_id`, not
                # `lang_to_id`. The old name raised AttributeError, so every
                # call fell into the except branch and returned a
                # "Translation error" string instead of a translation.
                forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
                max_length=max_length,
                num_beams=5,
                early_stopping=True
            )

        # batch_decode returns one string per sequence; we sent one input.
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    except Exception as e:
        return f"Translation error: {str(e)}"
# Process uploads and handle translation | |
def process_file(file, source_lang, target_lang):
    """Translate an uploaded document and save the result as a PDF.

    Args:
        file: Uploaded file object exposing a `.name` path (Gradio File).
        source_lang: Display name of the source language (LANGUAGES key).
        target_lang: Display name of the target language (LANGUAGES key).

    Returns:
        (pdf_path, translated_text) on success, or (None, error_message)
        for unsupported formats / processing failures.
    """
    try:
        temp_file_path = file.name
        lowered = temp_file_path.lower()

        # Dispatch extraction on the file extension.
        if lowered.endswith('.pdf'):
            text = extract_text_from_pdf(temp_file_path)
        elif lowered.endswith('.docx'):
            text = extract_text_from_docx(temp_file_path)
        elif lowered.endswith('.txt'):
            text = extract_text_from_txt(temp_file_path)
        else:
            return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."

        translated_text = translate(text, source_lang, target_lang)

        # BUGFIX: replace the extension instead of appending after it, so
        # "report.pdf" yields "report_translated.pdf" rather than the
        # confusing "report.pdf_translated.pdf".
        output_pdf_path = os.path.splitext(temp_file_path)[0] + "_translated.pdf"
        save_as_pdf(translated_text, output_pdf_path)
        return output_pdf_path, translated_text
    except Exception as e:
        return None, f"Error processing file: {str(e)}"
# Gradio interface | |
def gradio_interface():
    """Build and return the Gradio Blocks UI.

    Two tabs: direct text translation (wired to `translate`) and document
    translation (wired to `process_file`, producing a downloadable PDF).
    """
    language_names = list(LANGUAGES.keys())

    with gr.Blocks(title="Indian Language Translator", theme=gr.themes.Soft()) as interface:
        gr.Markdown("# Indian & Global Language Translator")
        gr.Markdown("Translate text with understanding of idioms and cultural expressions")

        # --- Tab 1: translate pasted text ---
        with gr.Tab("Text Translation"):
            with gr.Row():
                source_lang_text = gr.Dropdown(language_names, value="English", label="Source Language")
                target_lang_text = gr.Dropdown(language_names, value="Hindi", label="Target Language")
            with gr.Row():
                input_text = gr.Textbox(label="Enter text to translate", lines=5, placeholder="Type or paste text here...")
                output_text = gr.Textbox(label="Translation", lines=5)
            translate_btn = gr.Button("Translate Text", variant="primary")
            translate_btn.click(
                fn=translate,
                inputs=[input_text, source_lang_text, target_lang_text],
                outputs=output_text
            )

        # --- Tab 2: translate an uploaded document ---
        with gr.Tab("Document Translation"):
            with gr.Row():
                source_lang_doc = gr.Dropdown(language_names, value="English", label="Source Language")
                target_lang_doc = gr.Dropdown(language_names, value="Hindi", label="Target Language")
            file_input = gr.File(label="Upload Document (PDF, DOCX, TXT)", file_types=[".pdf", ".docx", ".txt"])
            with gr.Row():
                output_file = gr.File(label="Translated PDF")
                output_preview = gr.Textbox(label="Translation Preview", lines=8)
            translate_doc_btn = gr.Button("Translate Document", variant="primary")
            translate_doc_btn.click(
                fn=process_file,
                inputs=[file_input, source_lang_doc, target_lang_doc],
                outputs=[output_file, output_preview]
            )

        gr.Markdown("### Supported File Types: PDF, DOCX, TXT")
        gr.Markdown("### Features:")
        gr.Markdown("- Supports major Indian languages including Hindi, Bengali, Tamil, Telugu, Malayalam")
        gr.Markdown("- Context-aware translation that understands idioms and cultural expressions")
        gr.Markdown("- Document translation with PDF output")

    return interface
# Launch the application | |
if __name__ == "__main__":
    demo = gradio_interface()
    # share=True opens a public tunnel URL — handy for demos, but remove
    # it in production deployments.
    demo.launch(share=True)