# Spaces: Running
import gradio as gr | |
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM | |
import fitz # PyMuPDF for PDF handling | |
from io import BytesIO | |
# Load the IndicTrans model for Indian languages once at import time so that
# every request reuses the same tokenizer and weights.
model_name = "ai4bharat/indictrans2-en-indic-1b"  # Supports multiple Indian languages

# NOTE(review): per the IndicTrans2 model card, the checkpoint relies on custom
# tokenizer/model code hosted in the model repository, which transformers only
# loads when trust_remote_code=True. This runs code downloaded from that
# repository, so keep it limited to repositories you trust and consider
# pinning `revision=` to a reviewed commit.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)

# Shared English -> Indic translation pipeline used by translate_text().
translator = pipeline("translation", model=model, tokenizer=tokenizer)
# Maps the language name shown in the UI to the two-letter code used to pick
# and drive a translation model. Insertion order is the order in which the
# names are listed in the target-language dropdown.
language_map = {
    # Non-Indian languages (translated with an OPUS model)
    "French": "fr",
    "Spanish": "es",
    "German": "de",
    # Indian languages (translated with IndicTrans)
    "Hindi": "hi",
    "Tamil": "ta",
    "Telugu": "te",
    "Bengali": "bn",
    "Gujarati": "gu",
    "Marathi": "mr",
    "Kannada": "kn",
    "Malayalam": "ml",
    "Punjabi": "pa",
}
# Target codes routed to the IndicTrans pipeline; every other code goes to OPUS.
_INDIC_LANG_CODES = frozenset({"hi", "ta", "te", "bn", "gu", "mr", "kn", "ml", "pa"})

# OPUS pipelines keyed by target code, so each model is loaded at most once
# per process instead of once per request.
_opus_translators = {}


def _get_opus_translator(target_lang_code):
    """Return the cached English -> *target_lang_code* OPUS pipeline."""
    if target_lang_code not in _opus_translators:
        _opus_translators[target_lang_code] = pipeline(
            "translation", model=f"Helsinki-NLP/opus-mt-en-{target_lang_code}"
        )
    return _opus_translators[target_lang_code]


def translate_text(input_text, target_language):
    """Translate English text into the requested target language.

    Args:
        input_text: Source text to translate.
        target_language: Display name of the target language; must be a key
            of ``language_map``.

    Returns:
        The translated string. Blank input (``None``, empty or
        whitespace-only) is returned as-is (``None`` becomes ``""``) without
        invoking any model.

    Raises:
        KeyError: If ``target_language`` is not a key of ``language_map``
            (only checked when there is text to translate).
    """
    # Nothing to translate: skip the (expensive) model call entirely.
    if not input_text or not input_text.strip():
        return input_text or ""

    target_lang_code = language_map[target_language]

    if target_lang_code in _INDIC_LANG_CODES:
        # For Indian languages, use IndicTrans.
        # NOTE(review): the IndicTrans2 model card uses FLORES-style tags
        # (e.g. "eng_Latn", "hin_Deva") plus a dedicated preprocessing step;
        # confirm that the two-letter codes passed here are accepted.
        result = translator(input_text, src_lang="en", tgt_lang=target_lang_code)
    else:
        # For non-Indian languages, switch to the matching OPUS model.
        result = _get_opus_translator(target_lang_code)(input_text)

    return result[0]['translation_text']
def process_input(input_data, target_language):
    """Translate text or a PDF upload and return the result as an in-memory PDF.

    Args:
        input_data: Either the text to translate (``str``) or a binary
            file-like object exposing ``read()`` whose content is a PDF.
        target_language: Display name of the target language, forwarded to
            ``translate_text``.

    Returns:
        A ``BytesIO`` positioned at offset 0 that holds a single-page PDF
        containing the translated text.

    Raises:
        ValueError: If ``input_data`` is ``None`` (nothing typed or uploaded).
    """
    if input_data is None:
        raise ValueError("No input provided: enter text or upload a PDF document.")

    if isinstance(input_data, str):  # Direct text input
        text = input_data
    else:  # File input (assuming text-based document)
        # The context manager closes the source document once its text has
        # been extracted, even if extraction fails part-way.
        with fitz.open(stream=input_data.read(), filetype="pdf") as doc:
            text = "".join(page.get_text() for page in doc)

    # Translate the extracted text
    translated_text = translate_text(text, target_language)

    # Create the PDF output and serialise it before the document is closed.
    pdf_bytes = BytesIO()
    with fitz.open() as pdf_output:
        page = pdf_output.new_page()
        # NOTE(review): insert_text starts at a single point and does not wrap
        # or paginate, so long translations can run off the page; the default
        # font may also lack glyphs for Indic scripts - confirm the rendering.
        page.insert_text((50, 50), translated_text)
        pdf_output.save(pdf_bytes)

    # Rewind so that callers can read the PDF from the start.
    pdf_bytes.seek(0)
    return pdf_bytes
# Gradio Interface
with gr.Blocks(title="Context-Aware Translator with Indian Languages") as demo:
    gr.Markdown("# Context-Aware Language Translator")
    gr.Markdown(
        "Translate text or upload a document into Indian languages or others, and get a PDF output."
    )

    with gr.Row():
        with gr.Column():
            input_type = gr.Radio(["Text", "Document"], label="Input Type", value="Text")
            text_input = gr.Textbox(lines=5, label="Enter Text", visible=True)
            # type="binary" hands the upload to the callback as raw bytes, so
            # the handler does not depend on how Gradio stores uploaded files.
            file_input = gr.File(label="Upload Document", visible=False, type="binary")
            target_lang = gr.Dropdown(
                choices=list(language_map.keys()),
                label="Target Language",
                value="Hindi",
            )
            submit_btn = gr.Button("Translate")
        with gr.Column():
            output_pdf = gr.File(label="Download Translated PDF")

    # Dynamic visibility based on input type
    def update_visibility(choice):
        """Show the text box for "Text" and the uploader for "Document"."""
        return (
            gr.update(visible=(choice == "Text")),
            gr.update(visible=(choice == "Document")),
        )

    input_type.change(
        fn=update_visibility,
        inputs=input_type,
        outputs=[text_input, file_input],
    )

    def run_translation(choice, text, uploaded, language):
        """Pick the active input, translate it and return a PDF file path.

        Args:
            choice: Selected input type, "Text" or "Document".
            text: Current content of the text box.
            uploaded: Raw bytes of the uploaded file, or None if no upload.
            language: Selected target language name.

        Returns:
            Path of a temporary PDF file for the download component.
        """
        import tempfile

        if choice == "Text":
            source = text
        else:
            if uploaded is None:
                raise gr.Error("Please upload a PDF document.")
            # process_input expects a file-like object with read().
            source = BytesIO(uploaded)

        pdf_stream = process_input(source, language)

        # The File output component is given a path on disk, so persist the
        # in-memory PDF to a temporary file and return that path.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp.write(pdf_stream.getvalue())
        return tmp.name

    # Process the input and generate output. The component values are passed
    # directly as inputs; the handler chooses between text and upload.
    submit_btn.click(
        fn=run_translation,
        inputs=[input_type, text_input, file_input, target_lang],
        outputs=output_pdf,
    )

demo.launch()