garrry / app.py
akarshan11's picture
Update app.py
f7aa0f6 verified
raw
history blame
3.92 kB
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import fitz # PyMuPDF for PDF handling
from io import BytesIO
# Load the IndicTrans2 English -> Indic checkpoint once at import time so every
# request reuses the same weights (example direction: English to Hindi).
model_name = "ai4bharat/indictrans2-en-indic-1b"  # Supports multiple Indian languages

# The checkpoint ships its own tokenizer/model classes in the Hub repository, so
# the Auto* loaders refuse to instantiate it unless remote code is explicitly
# trusted. SECURITY: this executes Python from the model repo; pin a `revision`
# if the deployment needs a reproducible, audited snapshot.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)

# NOTE(review): the generic "translation" pipeline is kept for compatibility with
# the rest of this file; the model card drives this checkpoint through its own
# pre/post-processing step, so output quality should be confirmed end to end.
translator = pipeline("translation", model=model, tokenizer=tokenizer)
# Display name -> two-letter language code, grouped by the backend that serves it.
# Insertion order matters: it is the order the names appear in the UI dropdown.
_OPUS_LANGUAGES = {
    "French": "fr",
    "Spanish": "es",
    "German": "de",
}
_INDIC_LANGUAGES = {
    "Hindi": "hi",
    "Tamil": "ta",
    "Telugu": "te",
    "Bengali": "bn",
    "Gujarati": "gu",
    "Marathi": "mr",
    "Kannada": "kn",
    "Malayalam": "ml",
    "Punjabi": "pa",
}

# Non-Indian languages (served by OPUS models) come first, then the Indian ones.
language_map = {**_OPUS_LANGUAGES, **_INDIC_LANGUAGES}
# Target codes routed to the IndicTrans pipeline; any other code goes to OPUS-MT.
_INDIC_LANG_CODES = frozenset({"hi", "ta", "te", "bn", "gu", "mr", "kn", "ml", "pa"})

# OPUS-MT pipelines keyed by target code. Building a pipeline loads a full model,
# so each one is created on first use and then reused for later requests.
_opus_translators = {}


def _get_opus_translator(target_lang_code):
    """Return the English -> target OPUS-MT pipeline, loading it on first use."""
    opus_translator = _opus_translators.get(target_lang_code)
    if opus_translator is None:
        opus_translator = pipeline(
            "translation", model=f"Helsinki-NLP/opus-mt-en-{target_lang_code}"
        )
        _opus_translators[target_lang_code] = opus_translator
    return opus_translator


def translate_text(input_text, target_language):
    """Translate English text into the requested target language.

    Args:
        input_text: Source text to translate.
        target_language: Display name of the target language; must be a key
            of ``language_map``.

    Returns:
        The translated text, or an empty string when ``input_text`` is empty
        or contains only whitespace.

    Raises:
        KeyError: If ``target_language`` is not a key of ``language_map``.
    """
    target_lang_code = language_map[target_language]

    # Nothing to translate: skip the model call instead of feeding it blanks.
    if not input_text or not input_text.strip():
        return ""

    if target_lang_code in _INDIC_LANG_CODES:
        # For Indian languages, use IndicTrans.
        # NOTE(review): IndicTrans2 may expect FLORES-style tags (e.g. "eng_Latn",
        # "hin_Deva") rather than two-letter codes -- confirm against the model card.
        result = translator(input_text, src_lang="en", tgt_lang=target_lang_code)
    else:
        # For non-Indian languages, switch to the matching OPUS model.
        result = _get_opus_translator(target_lang_code)(input_text)
    return result[0]['translation_text']
# Layout used when writing the translated PDF. Values assume PyMuPDF's default
# page size and default insert_text font size with a 50pt margin -- TODO confirm
# if either default is overridden.
_PDF_TEXT_ORIGIN = (50, 50)
_PDF_WRAP_WIDTH = 90
_PDF_LINES_PER_PAGE = 50


def process_input(input_data, target_language):
    """Translate direct text or a PDF document and return the result as a PDF.

    Args:
        input_data: Either a ``str`` holding the text to translate, or a
            binary file-like object (anything with ``read()``) containing a PDF.
        target_language: Display name of the target language, passed through
            to ``translate_text``.

    Returns:
        A ``BytesIO`` positioned at offset 0 that holds the generated PDF.

    Raises:
        ValueError: If ``input_data`` is ``None``.
    """
    import textwrap  # local import keeps this block self-contained

    if input_data is None:
        raise ValueError("No input provided: pass text or an open PDF file object.")

    if isinstance(input_data, str):  # Direct text input
        text = input_data
    else:  # File input (assuming text-based document)
        # The context manager closes the source document even if extraction fails.
        with fitz.open(stream=input_data.read(), filetype="pdf") as doc:
            text = "".join(page.get_text() for page in doc)

    # Translate the extracted text
    translated_text = translate_text(text, target_language)

    # insert_text does not wrap or paginate by itself, so long output would run
    # off the page. Wrap each line and spread the result over as many pages as
    # needed; blank lines are preserved as empty entries.
    wrapped_lines = []
    for line in translated_text.splitlines():
        wrapped_lines.extend(textwrap.wrap(line, width=_PDF_WRAP_WIDTH) or [""])
    if not wrapped_lines:
        # A PDF needs at least one page, even for an empty translation.
        wrapped_lines = [""]

    # NOTE(review): the default built-in font may lack glyphs for Indic scripts;
    # verify rendering and supply a suitable fontfile if characters are missing.
    pdf_bytes = BytesIO()
    with fitz.open() as pdf_output:
        for start in range(0, len(wrapped_lines), _PDF_LINES_PER_PAGE):
            page = pdf_output.new_page()
            page_lines = wrapped_lines[start:start + _PDF_LINES_PER_PAGE]
            page.insert_text(_PDF_TEXT_ORIGIN, "\n".join(page_lines))
        # Save PDF to bytes
        pdf_output.save(pdf_bytes)
    pdf_bytes.seek(0)
    return pdf_bytes
# Gradio Interface
with gr.Blocks(title="Context-Aware Translator with Indian Languages") as demo:
    gr.Markdown("# Context-Aware Language Translator")
    gr.Markdown("Translate text or upload a document into Indian languages or others, and get a PDF output.")

    with gr.Row():
        with gr.Column():
            input_type = gr.Radio(["Text", "Document"], label="Input Type", value="Text")
            text_input = gr.Textbox(lines=5, label="Enter Text", visible=True)
            # Only PDFs are accepted: process_input opens uploads with filetype="pdf".
            file_input = gr.File(label="Upload Document", visible=False, file_types=[".pdf"])
            target_lang = gr.Dropdown(
                choices=list(language_map.keys()),
                label="Target Language",
                value="Hindi"
            )
            submit_btn = gr.Button("Translate")
        with gr.Column():
            output_pdf = gr.File(label="Download Translated PDF")

    # Dynamic visibility based on input type
    def update_visibility(choice):
        """Show the textbox for "Text" and the uploader for "Document"."""
        return (
            gr.update(visible=(choice == "Text")),
            gr.update(visible=(choice == "Document"))
        )

    input_type.change(
        fn=update_visibility,
        inputs=input_type,
        outputs=[text_input, file_input]
    )

    def run_translation(choice, text_value, file_value, language):
        """Translate whichever input is active and return a path to the PDF.

        Args:
            choice: Current value of the input-type radio ("Text" or "Document").
            text_value: Contents of the textbox.
            file_value: Uploaded file as delivered by ``gr.File`` (a path string
                or an object exposing the path as ``.name``), or ``None``.
            language: Display name selected in the target-language dropdown.

        Returns:
            Filesystem path of the generated PDF, as the file output expects.

        Raises:
            gr.Error: If the active input is empty, so the user sees a message
                instead of a stack trace.
        """
        import tempfile  # local import keeps this block self-contained

        if choice == "Text":
            if not text_value or not text_value.strip():
                raise gr.Error("Please enter some text to translate.")
            pdf_stream = process_input(text_value, language)
        else:
            if file_value is None:
                raise gr.Error("Please upload a PDF document to translate.")
            # Always reopen the upload by path: a str path would otherwise be
            # mistaken for text, and a wrapper's own handle is not guaranteed
            # to point at the uploaded bytes.
            upload_path = file_value if isinstance(file_value, str) else getattr(file_value, "name", None)
            if not upload_path:
                raise gr.Error("Could not locate the uploaded document.")
            with open(upload_path, "rb") as upload:
                pdf_stream = process_input(upload, language)

        # The file output component serves files from disk, so persist the
        # in-memory PDF to a temporary file and hand back its path.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as out_file:
            out_file.write(pdf_stream.getvalue())
        return out_file.name

    # Process the input and generate output. The real components are passed as
    # inputs so Gradio delivers their current values to the handler.
    submit_btn.click(
        fn=run_translation,
        inputs=[input_type, text_input, file_input, target_lang],
        outputs=output_pdf
    )

demo.launch()