# garrry / app.py
# akarshan11's picture
# Update app.py
# 895c980 verified
# raw
# history blame
# 7.7 kB
import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import fitz # PyMuPDF for PDF processing
import docx2txt # For DOCX processing
from fpdf import FPDF # For creating PDF outputs
# Checkpoint used for every translation request (mBART-50 many-to-many)
model_name = "facebook/mbart-large-50-many-to-many-mmt"
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the tokenizer, then load the seq2seq model and move it onto the chosen device
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
# Display name -> mBART-50 language code.
# Only codes that exist in the mBART-50 vocabulary are listed: Punjabi ("pa_IN")
# and Kannada ("kn_IN") are not part of the mBART-50 language set, so offering
# them in the UI could only ever produce a translation error.
LANGUAGES = {
    # Major Global Languages
    "English": "en_XX",
    "Spanish": "es_XX",
    "French": "fr_XX",
    "German": "de_DE",
    "Russian": "ru_RU",
    "Chinese": "zh_CN",
    "Japanese": "ja_XX",
    "Arabic": "ar_AR",
    # Major Indian Languages
    "Hindi": "hi_IN",
    "Bengali": "bn_IN",
    "Gujarati": "gu_IN",
    "Marathi": "mr_IN",
    "Tamil": "ta_IN",
    "Telugu": "te_IN",
    "Malayalam": "ml_IN",
    "Urdu": "ur_PK",
}
# File extraction functions
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The concatenated page text, or a string starting with
        "Error extracting PDF text:" when the file cannot be read.
    """
    try:
        # The context manager closes the document even when a page fails,
        # so the underlying file handle is never leaked.
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        return f"Error extracting PDF text: {str(e)}"
def extract_text_from_docx(file_path):
    """Return the text content of the DOCX file at *file_path*.

    Failures are not raised; they are reported as a string that starts with
    "Error extracting DOCX text:".
    """
    try:
        content = docx2txt.process(file_path)
    except Exception as exc:
        return "Error extracting DOCX text: " + str(exc)
    return content
def extract_text_from_txt(file_path):
    """Read a plain-text file, trying UTF-8 first and latin-1 second.

    Failures are not raised; they are reported as a string that starts with
    "Error extracting TXT text:".
    """

    def _read_with(encoding):
        # Read the whole file using the given text encoding
        with open(file_path, 'r', encoding=encoding) as handle:
            return handle.read()

    try:
        return _read_with('utf-8')
    except UnicodeDecodeError:
        # Not valid UTF-8: fall through to the latin-1 attempt below
        pass
    except Exception as exc:
        return f"Error extracting TXT text: {str(exc)}"

    try:
        return _read_with('latin-1')
    except Exception as exc:
        return f"Error extracting TXT text: {str(exc)}"
def save_as_pdf(text, output_path):
    """Write *text* into a one-font PDF at *output_path* and return that path.

    The text is round-tripped through latin-1 with the 'replace' error
    handler, so every character outside latin-1 ends up as '?' in the PDF.
    """
    document = FPDF()
    document.add_page()
    document.set_font("Arial", size=12)
    # Round-trip through latin-1; characters it cannot encode become '?'
    latin1_bytes = text.encode('latin-1', 'replace')
    printable_text = latin1_bytes.decode('latin-1')
    # Width 0 lets the cell span to the right margin; 10 is the line height
    document.multi_cell(0, 10, printable_text)
    document.output(output_path)
    return output_path
# Translation function
def translate(text, source_lang, target_lang, max_length=1024):
    """Translate text from the source language to the target language.

    Args:
        text: Text to translate.
        source_lang: Display name of the source language (a key of LANGUAGES).
        target_lang: Display name of the target language (a key of LANGUAGES).
        max_length: Token limit applied both to the tokenized input (longer
            input is truncated) and to the generated output.

    Returns:
        The translated text, or a human-readable message when there is no
        input, a language is unsupported, or translation fails. Errors are
        returned as text rather than raised so the UI can display them.
    """
    # Treat missing and whitespace-only input the same way
    if not text or not text.strip():
        return "No text provided for translation."
    try:
        # Map display names to mBART language codes
        src_lang = LANGUAGES.get(source_lang)
        tgt_lang = LANGUAGES.get(target_lang)
        if not src_lang or not tgt_lang:
            return "Source or target language not supported."
        # Language codes are special tokens in the tokenizer vocabulary.
        # Resolving them through convert_tokens_to_ids avoids relying on a
        # tokenizer-specific mapping attribute; a code the tokenizer does not
        # know resolves to the unknown-token id and is rejected here.
        unknown_id = tokenizer.unk_token_id
        src_lang_id = tokenizer.convert_tokens_to_ids(src_lang)
        tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang)
        if unknown_id in (src_lang_id, tgt_lang_id):
            return "Source or target language not supported."
        # Set tokenizer source language
        tokenizer.src_lang = src_lang
        # Prepare input (truncated to max_length tokens) on the model's device
        inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        # Generate translation; the forced first token selects the output language
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=tgt_lang_id,
                max_length=max_length,
                num_beams=5,
                early_stopping=True,
            )
        # Decode translation (single input, so take the first result)
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    except Exception as e:
        return f"Translation error: {str(e)}"
# Process uploads and handle translation
def process_file(file, source_lang, target_lang):
    """Translate the content of an uploaded document and save it as a PDF.

    Args:
        file: The upload from the Gradio File component: either a path string
            or an object exposing the path as ``.name``. ``None`` when nothing
            was uploaded.
        source_lang: Display name of the source language.
        target_lang: Display name of the target language.

    Returns:
        A ``(pdf_path, text)`` tuple. On success ``pdf_path`` is the generated
        PDF and ``text`` the translation; on failure ``pdf_path`` is ``None``
        and ``text`` is a message describing the problem.
    """
    # Clicking the button without an upload passes None
    if file is None:
        return None, "No file uploaded. Please upload a PDF, DOCX, or TXT file."
    try:
        # Accept both a plain path string and a temp-file wrapper with .name
        temp_file_path = file if isinstance(file, str) else file.name
        lowered_path = temp_file_path.lower()
        # Extract text based on file type
        if lowered_path.endswith('.pdf'):
            text = extract_text_from_pdf(temp_file_path)
        elif lowered_path.endswith('.docx'):
            text = extract_text_from_docx(temp_file_path)
        elif lowered_path.endswith('.txt'):
            text = extract_text_from_txt(temp_file_path)
        else:
            return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."
        # Nothing to translate (e.g. an empty file): report it instead of
        # producing a PDF that only contains a placeholder message.
        if not text or not text.strip():
            return None, "No text could be extracted from the uploaded file."
        # NOTE: the extractors report failures as text, which is translated
        # like any other content.
        translated_text = translate(text, source_lang, target_lang)
        # Save translation as PDF next to the uploaded file
        output_pdf_path = temp_file_path + "_translated.pdf"
        save_as_pdf(translated_text, output_pdf_path)
        return output_pdf_path, translated_text
    except Exception as e:
        return None, f"Error processing file: {str(e)}"
# Gradio interface
def gradio_interface():
    """Build and return the Gradio Blocks UI.

    The UI has a text-translation tab wired to ``translate`` and a
    document-translation tab wired to ``process_file``, followed by a short
    feature summary.
    """

    def language_dropdown(label, default):
        # Each dropdown lists the display names defined in LANGUAGES
        return gr.Dropdown(list(LANGUAGES), value=default, label=label)

    with gr.Blocks(title="Indian Language Translator", theme=gr.themes.Soft()) as ui:
        gr.Markdown("# Indian & Global Language Translator")
        gr.Markdown("Translate text with understanding of idioms and cultural expressions")

        # Tab 1: translate text typed or pasted by the user
        with gr.Tab("Text Translation"):
            with gr.Row():
                text_source = language_dropdown("Source Language", "English")
                text_target = language_dropdown("Target Language", "Hindi")
            with gr.Row():
                text_in = gr.Textbox(
                    label="Enter text to translate",
                    lines=5,
                    placeholder="Type or paste text here...",
                )
                text_out = gr.Textbox(label="Translation", lines=5)
            text_button = gr.Button("Translate Text", variant="primary")
            text_button.click(
                fn=translate,
                inputs=[text_in, text_source, text_target],
                outputs=text_out,
            )

        # Tab 2: translate an uploaded document and offer the result as a PDF
        with gr.Tab("Document Translation"):
            with gr.Row():
                doc_source = language_dropdown("Source Language", "English")
                doc_target = language_dropdown("Target Language", "Hindi")
            upload = gr.File(
                label="Upload Document (PDF, DOCX, TXT)",
                file_types=[".pdf", ".docx", ".txt"],
            )
            with gr.Row():
                translated_file = gr.File(label="Translated PDF")
                preview = gr.Textbox(label="Translation Preview", lines=8)
            doc_button = gr.Button("Translate Document", variant="primary")
            doc_button.click(
                fn=process_file,
                inputs=[upload, doc_source, doc_target],
                outputs=[translated_file, preview],
            )

        # Static feature summary, rendered in the listed order
        for note in (
            "### Supported File Types: PDF, DOCX, TXT",
            "### Features:",
            "- Supports major Indian languages including Hindi, Bengali, Tamil, Telugu, Malayalam",
            "- Context-aware translation that understands idioms and cultural expressions",
            "- Document translation with PDF output",
        ):
            gr.Markdown(note)
    return ui
# Launch the application
# The guard keeps the UI from being built and launched when this module is
# imported rather than run directly.
if __name__ == "__main__":
    # Build the Blocks UI, then start the server
    app = gradio_interface()
    app.launch(share=True) # Remove share=True in production