import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import fitz  # PyMuPDF for PDF processing
import docx2txt  # For DOCX processing
from fpdf import FPDF  # For creating PDF outputs

# Load model and tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"  # Powerful translation model that can handle idioms well
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Reduced language list with focus on major languages and Indian languages
LANGUAGES = {
    # Major Global Languages
    "English": "en_XX",
    "Spanish": "es_XX",
    "French": "fr_XX",
    "German": "de_DE",
    "Russian": "ru_RU",
    "Chinese": "zh_CN",
    "Japanese": "ja_XX",
    "Arabic": "ar_AR",
    # Major Indian Languages
    "Hindi": "hi_IN",
    "Bengali": "bn_IN",
    "Gujarati": "gu_IN",
    "Marathi": "mr_IN",
    "Tamil": "ta_IN",
    "Telugu": "te_IN",
    "Malayalam": "ml_IN",
    "Punjabi": "pa_IN",  # Note: not in mBART-50's official code list; selecting it may return an error
    "Kannada": "kn_IN",  # Note: not in mBART-50's official code list; selecting it may return an error
    "Urdu": "ur_PK"
}

# File extraction functions
def extract_text_from_pdf(file_path):
    """Extract text from a PDF file"""
    text = ""
    try:
        doc = fitz.open(file_path)
        for page in doc:
            text += page.get_text()
        return text
    except Exception as e:
        return f"Error extracting PDF text: {str(e)}"

def extract_text_from_docx(file_path):
    """Extract text from a DOCX file"""
    try:
        return docx2txt.process(file_path)
    except Exception as e:
        return f"Error extracting DOCX text: {str(e)}"

def extract_text_from_txt(file_path):
    """Extract text from a TXT file"""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except UnicodeDecodeError:
        try:
            with open(file_path, 'r', encoding='latin-1') as file:
                return file.read()
        except Exception as e:
            return f"Error extracting TXT text: {str(e)}"
    except Exception as e:
        return f"Error extracting TXT text: {str(e)}"

def save_as_pdf(text, output_path):
    """Save text as PDF"""
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    # FPDF's core fonts are Latin-1 only, so characters outside Latin-1
    # (e.g. Devanagari) are replaced with '?'
    encoded_text = text.encode('latin-1', 'replace').decode('latin-1')
    pdf.multi_cell(0, 10, encoded_text)
    pdf.output(output_path)
    return output_path

# Translation function
def translate(text, source_lang, target_lang, max_length=1024):
    """Translate text from source language to target language"""
    if not text:
        return "No text provided for translation."
    try:
        # Set source and target language
        src_lang = LANGUAGES.get(source_lang)
        tgt_lang = LANGUAGES.get(target_lang)
        if not src_lang or not tgt_lang:
            return "Source or target language not supported."
        # Set tokenizer source language
        tokenizer.src_lang = src_lang

        # Prepare input
        inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate translation
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
                max_length=max_length,
                num_beams=5,
                early_stopping=True
            )

        # Decode translation
        translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        return translation
    except Exception as e:
        return f"Translation error: {str(e)}"

# Process uploads and handle translation
def process_file(file, source_lang, target_lang):
    """Process uploaded file and translate its content"""
    try:
        # Get the uploaded file's path (newer Gradio versions pass a path string,
        # older ones pass a tempfile-like object with a .name attribute)
        temp_file_path = file.name if hasattr(file, "name") else file

        # Extract text based on file type
        if temp_file_path.lower().endswith('.pdf'):
            text = extract_text_from_pdf(temp_file_path)
        elif temp_file_path.lower().endswith('.docx'):
            text = extract_text_from_docx(temp_file_path)
        elif temp_file_path.lower().endswith('.txt'):
            text = extract_text_from_txt(temp_file_path)
        else:
            return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."

        # Translate the extracted text
        translated_text = translate(text, source_lang, target_lang)

        # Save translation as PDF
        output_pdf_path = temp_file_path + "_translated.pdf"
        save_as_pdf(translated_text, output_pdf_path)

        return output_pdf_path, translated_text
    except Exception as e:
        return None, f"Error processing file: {str(e)}"

# Gradio interface
def gradio_interface():
    with gr.Blocks(title="Indian Language Translator", theme=gr.themes.Soft()) as interface:
        gr.Markdown("# Indian & Global Language Translator")
        gr.Markdown("Translate text with understanding of idioms and cultural expressions")

        with gr.Tab("Text Translation"):
            with gr.Row():
                source_lang_text = gr.Dropdown(list(LANGUAGES.keys()), value="English", label="Source Language")
                target_lang_text = gr.Dropdown(list(LANGUAGES.keys()), value="Hindi", label="Target Language")
            with gr.Row():
                input_text = gr.Textbox(label="Enter text to translate", lines=5, placeholder="Type or paste text here...")
                output_text = gr.Textbox(label="Translation", lines=5)
            translate_btn = gr.Button("Translate Text", variant="primary")
            translate_btn.click(
                fn=translate,
                inputs=[input_text, source_lang_text, target_lang_text],
                outputs=output_text
            )

        with gr.Tab("Document Translation"):
            with gr.Row():
                source_lang_doc = gr.Dropdown(list(LANGUAGES.keys()), value="English", label="Source Language")
                target_lang_doc = gr.Dropdown(list(LANGUAGES.keys()), value="Hindi", label="Target Language")
            file_input = gr.File(label="Upload Document (PDF, DOCX, TXT)", file_types=[".pdf", ".docx", ".txt"])
            with gr.Row():
                output_file = gr.File(label="Translated PDF")
                output_preview = gr.Textbox(label="Translation Preview", lines=8)
            translate_doc_btn = gr.Button("Translate Document", variant="primary")
            translate_doc_btn.click(
                fn=process_file,
                inputs=[file_input, source_lang_doc, target_lang_doc],
                outputs=[output_file, output_preview]
            )

        gr.Markdown("### Supported File Types: PDF, DOCX, TXT")
        gr.Markdown("### Features:")
        gr.Markdown("- Supports major Indian languages including Hindi, Bengali, Tamil, Telugu, Malayalam")
        gr.Markdown("- Context-aware translation that understands idioms and cultural expressions")
        gr.Markdown("- Document translation with PDF output")

    return interface

# Launch the application
if __name__ == "__main__":
    app = gradio_interface()
    app.launch(share=True)  # Remove share=True in production
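
# A minimal sketch of Unicode-capable PDF output, left commented out because it assumes
# a DejaVuSans.ttf file is available next to the script; the font path and helper name
# are illustrative, not part of the original app. The Latin-1 fallback in save_as_pdf
# above turns non-Latin scripts such as Devanagari into '?', so registering a TTF font
# is needed for readable output in most of the supported target languages.
#
# def save_as_unicode_pdf(text, output_path, font_path="DejaVuSans.ttf"):
#     pdf = FPDF()
#     pdf.add_page()
#     pdf.add_font("DejaVu", "", font_path, uni=True)  # register a Unicode TTF font
#     pdf.set_font("DejaVu", size=12)
#     pdf.multi_cell(0, 10, text)  # no Latin-1 re-encoding needed
#     pdf.output(output_path)
#     return output_path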