Spaces:
Running
Running
File size: 7,696 Bytes
9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 e1983d6 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import fitz # PyMuPDF for PDF processing
import docx2txt # For DOCX processing
from fpdf import FPDF # For creating PDF outputs
# Load model and tokenizer once at import time.
# NOTE(review): this downloads multi-GB weights on first run and blocks startup.
model_name = "facebook/mbart-large-50-many-to-many-mmt" # Powerful translation model that can handle idioms well
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Set device: prefer GPU when available; inputs are moved to the same device in translate().
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
# Reduced language list with focus on major languages and Indian languages.
# Maps human-readable names -> mBART-50 language codes.
# BUGFIX: Punjabi ("pa_IN") and Kannada ("kn_IN") were removed — they are not
# among mBART-50's supported language codes, so selecting them always failed
# at generation time with a lookup error.
LANGUAGES = {
    # Major Global Languages
    "English": "en_XX",
    "Spanish": "es_XX",
    "French": "fr_XX",
    "German": "de_DE",
    "Russian": "ru_RU",
    "Chinese": "zh_CN",
    "Japanese": "ja_XX",
    "Arabic": "ar_AR",
    # Major Indian Languages (all verified against the mBART-50 code list)
    "Hindi": "hi_IN",
    "Bengali": "bn_IN",
    "Gujarati": "gu_IN",
    "Marathi": "mr_IN",
    "Tamil": "ta_IN",
    "Telugu": "te_IN",
    "Malayalam": "ml_IN",
    "Urdu": "ur_PK",
}
# File extraction functions
def extract_text_from_pdf(file_path):
    """Extract and concatenate the text of every page of a PDF file.

    Args:
        file_path: Path to the PDF on disk.

    Returns:
        The extracted text, or a human-readable error string on failure
        (matching the other extractors' error convention).
    """
    try:
        # BUGFIX: the original returned from inside the try without closing
        # the document, leaking the file handle; the context manager closes
        # it even if page extraction raises.
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        return f"Error extracting PDF text: {str(e)}"
def extract_text_from_docx(file_path):
    """Return the plain text of a DOCX file.

    Falls back to a human-readable error string if extraction fails,
    matching the other extractors' error convention.
    """
    try:
        extracted = docx2txt.process(file_path)
    except Exception as e:
        return f"Error extracting DOCX text: {str(e)}"
    return extracted
def extract_text_from_txt(file_path):
    """Read a plain-text file, trying UTF-8 first and Latin-1 as a fallback.

    Latin-1 maps every byte to a character, so the fallback cannot raise
    UnicodeDecodeError; any other failure (e.g. a missing file) is reported
    as an error string, matching the other extractors' error convention.

    Args:
        file_path: Path to the text file on disk.

    Returns:
        The file contents, or an error string on failure.
    """
    # De-duplicates the original's nested try/except blocks: same behavior,
    # one code path per encoding attempt.
    for encoding in ("utf-8", "latin-1"):
        try:
            with open(file_path, "r", encoding=encoding) as fh:
                return fh.read()
        except UnicodeDecodeError:
            # UTF-8 failed on a byte sequence; retry with Latin-1.
            continue
        except Exception as e:
            return f"Error extracting TXT text: {str(e)}"
    # Unreachable in practice (Latin-1 accepts any byte), kept defensively.
    return "Error extracting TXT text: could not decode file"
def save_as_pdf(text, output_path):
    """Render `text` into a PDF written at `output_path`.

    FPDF's built-in fonts only cover Latin-1, so characters outside that
    range are substituted rather than crashing the export.

    Returns:
        The path the PDF was written to.
    """
    document = FPDF()
    document.add_page()
    document.set_font("Arial", size=12)
    # Core FPDF fonts are Latin-1 only; replace unsupported characters.
    printable = text.encode('latin-1', 'replace').decode('latin-1')
    document.multi_cell(0, 10, printable)
    document.output(output_path)
    return output_path
# Translation function
def translate(text, source_lang, target_lang, max_length=1024):
    """Translate `text` from `source_lang` to `target_lang` with mBART-50.

    Args:
        text: Source text; tokenization truncates it to `max_length` tokens.
        source_lang: Human-readable name, must be a key of LANGUAGES.
        target_lang: Human-readable name, must be a key of LANGUAGES.
        max_length: Token budget for both the input and the generated output.

    Returns:
        The translated string, or a human-readable error message.
    """
    if not text:
        return "No text provided for translation."
    try:
        # Map display names to mBART-50 language codes
        src_lang = LANGUAGES.get(source_lang)
        tgt_lang = LANGUAGES.get(target_lang)
        if not src_lang or not tgt_lang:
            return "Source or target language not supported."
        # mBART-50 requires the source language to be set before tokenizing
        tokenizer.src_lang = src_lang
        inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                # BUGFIX: the tokenizer has no `lang_to_id` attribute, so this
                # line always raised (masked by the broad except below). The
                # documented attribute was `lang_code_to_id`, which newer
                # transformers releases removed; convert_tokens_to_ids works
                # across versions because language codes are special tokens.
                forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
                max_length=max_length,
                num_beams=5,
                early_stopping=True
            )
        # First (only) sequence of the batch is the translation
        translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        return translation
    except Exception as e:
        return f"Translation error: {str(e)}"
# Process uploads and handle translation
def process_file(file, source_lang, target_lang):
    """Translate an uploaded document and export the result as a PDF.

    Args:
        file: Gradio file object exposing a `.name` path, or None when the
            user clicks the button without uploading anything.
        source_lang: Human-readable source language name (key of LANGUAGES).
        target_lang: Human-readable target language name (key of LANGUAGES).

    Returns:
        (path_to_translated_pdf, translated_text) on success,
        (None, error_message) on failure.
    """
    # ROBUSTNESS FIX: the button can be clicked before a file is chosen;
    # previously this surfaced as an opaque AttributeError message.
    if file is None:
        return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."
    try:
        # Gradio stores the upload on disk; `.name` is its temp path
        temp_file_path = file.name
        lowered = temp_file_path.lower()
        # Extract text based on file type
        if lowered.endswith('.pdf'):
            text = extract_text_from_pdf(temp_file_path)
        elif lowered.endswith('.docx'):
            text = extract_text_from_docx(temp_file_path)
        elif lowered.endswith('.txt'):
            text = extract_text_from_txt(temp_file_path)
        else:
            return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."
        # Translate the extracted text
        translated_text = translate(text, source_lang, target_lang)
        # Save translation as PDF next to the uploaded temp file
        output_pdf_path = temp_file_path + "_translated.pdf"
        save_as_pdf(translated_text, output_pdf_path)
        return output_pdf_path, translated_text
    except Exception as e:
        return None, f"Error processing file: {str(e)}"
# Gradio interface
def gradio_interface():
    """Build and return the Gradio Blocks UI (two tabs: text and document).

    Layout follows construction order inside the context managers, so the
    statement order here is load-bearing. Returns the Blocks object; the
    caller is responsible for launching it.
    """
    with gr.Blocks(title="Indian Language Translator", theme=gr.themes.Soft()) as interface:
        gr.Markdown("# Indian & Global Language Translator")
        gr.Markdown("Translate text with understanding of idioms and cultural expressions")
        # --- Tab 1: free-text translation, wired to translate() ---
        with gr.Tab("Text Translation"):
            with gr.Row():
                source_lang_text = gr.Dropdown(list(LANGUAGES.keys()), value="English", label="Source Language")
                target_lang_text = gr.Dropdown(list(LANGUAGES.keys()), value="Hindi", label="Target Language")
            with gr.Row():
                input_text = gr.Textbox(label="Enter text to translate", lines=5, placeholder="Type or paste text here...")
                output_text = gr.Textbox(label="Translation", lines=5)
            translate_btn = gr.Button("Translate Text", variant="primary")
            # NOTE: input order must match translate(text, source_lang, target_lang)
            translate_btn.click(
                fn=translate,
                inputs=[input_text, source_lang_text, target_lang_text],
                outputs=output_text
            )
        # --- Tab 2: document upload, wired to process_file() ---
        with gr.Tab("Document Translation"):
            with gr.Row():
                source_lang_doc = gr.Dropdown(list(LANGUAGES.keys()), value="English", label="Source Language")
                target_lang_doc = gr.Dropdown(list(LANGUAGES.keys()), value="Hindi", label="Target Language")
            file_input = gr.File(label="Upload Document (PDF, DOCX, TXT)", file_types=[".pdf", ".docx", ".txt"])
            with gr.Row():
                output_file = gr.File(label="Translated PDF")
                output_preview = gr.Textbox(label="Translation Preview", lines=8)
            translate_doc_btn = gr.Button("Translate Document", variant="primary")
            # process_file returns (pdf_path_or_None, preview_or_error_text)
            translate_doc_btn.click(
                fn=process_file,
                inputs=[file_input, source_lang_doc, target_lang_doc],
                outputs=[output_file, output_preview]
            )
        gr.Markdown("### Supported File Types: PDF, DOCX, TXT")
        gr.Markdown("### Features:")
        gr.Markdown("- Supports major Indian languages including Hindi, Bengali, Tamil, Telugu, Malayalam")
        gr.Markdown("- Context-aware translation that understands idioms and cultural expressions")
        gr.Markdown("- Document translation with PDF output")
    return interface
# Launch the application
if __name__ == "__main__":
    app = gradio_interface()
    # share=True creates a temporary public Gradio URL; remove in production
    app.launch(share=True)