# Hugging Face Space: context-aware English -> Indic/European translator app.
# (Page-scrape artifacts — Space status, file size, git hashes, line-number
# gutter — removed so the module parses.)
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import fitz # PyMuPDF for PDF handling
from io import BytesIO
# Load the IndicTrans2 English->Indic model once at import time.
# The ai4bharat/indictrans2 checkpoints ship custom modeling/tokenizer code,
# so the Auto* loaders must be told to execute it via trust_remote_code=True;
# without it, from_pretrained fails for this repository.
model_name = "ai4bharat/indictrans2-en-indic-1b"  # supports multiple Indian languages
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
# Shared pipeline used by translate_text() for all Indian target languages.
translator = pipeline("translation", model=model, tokenizer=tokenizer)
# ISO 639-1 codes for every supported target language.  The Indic entries are
# served by the preloaded IndicTrans pipeline; the European ones fall back to
# per-language Helsinki-NLP OPUS models.
language_map = dict(
    French="fr",  # handled by the OPUS fallback, not IndicTrans
    Spanish="es",
    German="de",
    Hindi="hi",
    Tamil="ta",
    Telugu="te",
    Bengali="bn",
    Gujarati="gu",
    Marathi="mr",
    Kannada="kn",
    Malayalam="ml",
    Punjabi="pa",
)
def translate_text(input_text, target_language):
    """Translate English *input_text* into *target_language*.

    Indian languages go through the preloaded IndicTrans ``translator``
    pipeline; any other language lazily loads — and caches — the matching
    Helsinki-NLP OPUS model.

    Raises:
        KeyError: if *target_language* is not a key of ``language_map``.
    """
    target_lang_code = language_map[target_language]
    # Codes served by IndicTrans; set membership instead of a list scan.
    indic_codes = {"hi", "ta", "te", "bn", "gu", "mr", "kn", "ml", "pa"}
    if target_lang_code in indic_codes:
        # NOTE(review): IndicTrans2 normally expects flores-style tags such as
        # "eng_Latn"/"hin_Deva"; confirm this checkpoint accepts bare "en"/"hi".
        result = translator(input_text, src_lang="en", tgt_lang=target_lang_code)
        translated = result[0]["translation_text"]
    else:
        # Cache OPUS pipelines on the function object: constructing a pipeline
        # loads (and possibly downloads) a full model, so the original
        # per-call construction was a large hidden cost on every request.
        cache = getattr(translate_text, "_opus_cache", None)
        if cache is None:
            cache = translate_text._opus_cache = {}
        if target_lang_code not in cache:
            cache[target_lang_code] = pipeline(
                "translation", model=f"Helsinki-NLP/opus-mt-en-{target_lang_code}"
            )
        translated = cache[target_lang_code](input_text)[0]["translation_text"]
    return translated
def process_input(input_data, target_language):
    """Translate text or a PDF upload and return the result as an in-memory PDF.

    Parameters
    ----------
    input_data : str | file-like | path-like
        Raw text to translate, an open binary file object containing a PDF,
        or a path to a PDF on disk (Gradio file components hand back
        temp-file wrappers exposing ``.name``).
    target_language : str
        A key of ``language_map``.

    Returns
    -------
    io.BytesIO
        A single-page PDF of the translated text, positioned at offset 0.
    """
    if isinstance(input_data, str):  # direct text input
        text = input_data
    else:
        # Accept either a readable stream or a path-like object.
        if hasattr(input_data, "read"):
            doc = fitz.open(stream=input_data.read(), filetype="pdf")
        else:
            doc = fitz.open(getattr(input_data, "name", input_data))
        try:
            text = "".join(page.get_text() for page in doc)
        finally:
            doc.close()  # original leaked the MuPDF handle

    translated_text = translate_text(text, target_language)

    # Build a one-page PDF.  insert_textbox wraps long text inside the rect;
    # the original insert_text drew a single unwrapped line off the page.
    pdf_output = fitz.open()
    try:
        page = pdf_output.new_page()
        margin = 50
        rect = fitz.Rect(margin, margin, page.rect.width - margin, page.rect.height - margin)
        # NOTE(review): text longer than one page is truncated here — confirm
        # whether multi-page output is needed.
        page.insert_textbox(rect, translated_text)
        pdf_bytes = BytesIO()
        pdf_output.save(pdf_bytes)
    finally:
        pdf_output.close()
    pdf_bytes.seek(0)
    return pdf_bytes
# Gradio interface ----------------------------------------------------------
with gr.Blocks(title="Context-Aware Translator with Indian Languages") as demo:
    gr.Markdown("# Context-Aware Language Translator")
    gr.Markdown("Translate text or upload a document into Indian languages or others, and get a PDF output.")
    with gr.Row():
        with gr.Column():
            input_type = gr.Radio(["Text", "Document"], label="Input Type", value="Text")
            text_input = gr.Textbox(lines=5, label="Enter Text", visible=True)
            file_input = gr.File(label="Upload Document", visible=False)
            target_lang = gr.Dropdown(
                choices=list(language_map.keys()),
                label="Target Language",
                value="Hindi",
            )
            submit_btn = gr.Button("Translate")
        with gr.Column():
            output_pdf = gr.File(label="Download Translated PDF")

    def update_visibility(choice):
        """Show the widget matching the selected input type; hide the other."""
        return (
            gr.update(visible=(choice == "Text")),
            gr.update(visible=(choice == "Document")),
        )

    input_type.change(
        fn=update_visibility,
        inputs=input_type,
        outputs=[text_input, file_input],
    )

    def run_translation(choice, text, file, lang):
        """Route the active input through process_input; return a PDF filepath.

        Replaces the original gr.State(_js=...) DOM-scraping hack, which is
        not a supported Gradio API (gr.State accepts no ``_js`` argument and
        the selectors referenced element ids that were never assigned).
        A gr.File output expects a path on disk, so the in-memory PDF from
        process_input is spooled to a named temporary file.
        """
        import tempfile

        source = text if choice == "Text" else file
        pdf_bytes = process_input(source, lang)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(pdf_bytes.read())
            return tmp.name

    submit_btn.click(
        fn=run_translation,
        inputs=[input_type, text_input, file_input, target_lang],
        outputs=output_pdf,
    )

demo.launch()