File size: 3,923 Bytes
895c980
f7aa0f6
 
 
9dcbee4
f7aa0f6
 
 
 
 
9dcbee4
f7aa0f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
476dd48
f7aa0f6
 
 
b113724
f7aa0f6
 
 
 
 
 
 
 
 
b113724
f7aa0f6
 
 
 
 
 
 
b113724
f7aa0f6
b113724
f7aa0f6
 
 
 
 
 
 
 
 
 
 
 
 
 
9dcbee4
f7aa0f6
 
 
 
d2bdf71
f7aa0f6
 
 
 
 
 
 
 
 
 
 
895c980
f7aa0f6
 
 
 
 
 
 
 
 
d2bdf71
f7aa0f6
 
 
 
476dd48
f7aa0f6
 
 
 
 
 
476dd48
9dcbee4
f7aa0f6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import fitz  # PyMuPDF for PDF handling
from io import BytesIO

# Load the IndicTrans2 English -> Indic checkpoint once at import time so that
# every request reuses the same tokenizer, weights and pipeline.
model_name = "ai4bharat/indictrans2-en-indic-1b"  # Supports multiple Indian languages
# This checkpoint ships its own tokenizer/model classes on the Hub, so
# transformers has to be allowed to load that custom code; without
# trust_remote_code=True, from_pretrained refuses to load it.
# NOTE(review): this executes code from the model repository -- consider
# pinning a `revision=` once a known-good commit has been chosen.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
translator = pipeline("translation", model=model, tokenizer=tokenizer)

# Display name -> two-letter language code for every target offered in the UI.
# Non-Indian targets (translated with the per-language OPUS models).
_OPUS_LANGUAGES = {"French": "fr", "Spanish": "es", "German": "de"}

# Indian-language targets.
_INDIC_LANGUAGES = {
    "Hindi": "hi",
    "Tamil": "ta",
    "Telugu": "te",
    "Bengali": "bn",
    "Gujarati": "gu",
    "Marathi": "mr",
    "Kannada": "kn",
    "Malayalam": "ml",
    "Punjabi": "pa",
}

# Merge order matters: dicts keep insertion order, so the non-Indian entries
# come first, followed by the Indian ones.
language_map = {**_OPUS_LANGUAGES, **_INDIC_LANGUAGES}

# Target codes routed to the IndicTrans pipeline; every other code goes to OPUS.
_INDIC_LANG_CODES = frozenset({"hi", "ta", "te", "bn", "gu", "mr", "kn", "ml", "pa"})

# OPUS pipelines keyed by target language code. They are built lazily and
# reused so the model is not reloaded on every request.
_opus_translators = {}


def _get_opus_translator(target_lang_code):
    """Return the English -> ``target_lang_code`` OPUS pipeline, loading it on first use."""
    if target_lang_code not in _opus_translators:
        _opus_translators[target_lang_code] = pipeline(
            "translation", model=f"Helsinki-NLP/opus-mt-en-{target_lang_code}"
        )
    return _opus_translators[target_lang_code]


def translate_text(input_text, target_language):
    """Translate English text into the requested target language.

    Args:
        input_text: English text to translate.
        target_language: Display name of the target language; must be a key
            of ``language_map`` unless ``input_text`` is blank.

    Returns:
        The translated text, or an empty string when ``input_text`` is empty
        or contains only whitespace.

    Raises:
        KeyError: If ``input_text`` is not blank and ``target_language`` is
            not a key of ``language_map``.
    """
    # Blank input short-circuits before any lookup or model call: there is
    # nothing to translate.
    if not input_text or not input_text.strip():
        return ""

    target_lang_code = language_map[target_language]

    if target_lang_code in _INDIC_LANG_CODES:
        # NOTE(review): the IndicTrans2 model card uses FLORES-style tags
        # (e.g. "hin_Deva") plus a preprocessing step; confirm that the
        # two-letter codes passed here are accepted by this pipeline.
        results = translator(input_text, src_lang="en", tgt_lang=target_lang_code)
    else:
        results = _get_opus_translator(target_lang_code)(input_text)

    return results[0]['translation_text']

def process_input(input_data, target_language):
    """Translate direct text or a PDF's text and return the result as a PDF.

    Args:
        input_data: Either a ``str`` holding the text to translate, or a
            binary file-like object (anything with ``read()``) whose content
            is a PDF document.
        target_language: Display name of the target language, forwarded
            unchanged to ``translate_text``.

    Returns:
        io.BytesIO: An in-memory single-page PDF containing the translated
        text, with the stream position reset to the start.

    Raises:
        ValueError: If ``input_data`` is ``None``.
    """
    if input_data is None:
        raise ValueError("No input provided: pass text or an uploaded PDF file.")

    if isinstance(input_data, str):  # Direct text input
        text = input_data
    else:  # File input: extract the text of every page
        # The context manager closes the source document once its text has
        # been read, instead of leaving it open until garbage collection.
        with fitz.open(stream=input_data.read(), filetype="pdf") as source_doc:
            text = "".join(page.get_text() for page in source_doc)

    translated_text = translate_text(text, target_language)

    pdf_bytes = BytesIO()
    with fitz.open() as pdf_output:  # empty document, closed after saving
        page = pdf_output.new_page()
        # NOTE(review): insert_text writes from a single point without
        # wrapping to the page, and the default font may lack glyphs for
        # Indic scripts -- confirm the rendered output for long/Indic text.
        page.insert_text((50, 50), translated_text)
        pdf_output.save(pdf_bytes)

    # Rewind so callers can read the PDF from the beginning.
    pdf_bytes.seek(0)
    return pdf_bytes

# Gradio Interface
with gr.Blocks(title="Context-Aware Translator with Indian Languages") as demo:
    gr.Markdown("# Context-Aware Language Translator")
    gr.Markdown("Translate text or upload a document into Indian languages or others, and get a PDF output.")

    with gr.Row():
        with gr.Column():
            input_type = gr.Radio(["Text", "Document"], label="Input Type", value="Text")
            text_input = gr.Textbox(lines=5, label="Enter Text", visible=True)
            file_input = gr.File(label="Upload Document", visible=False)
            target_lang = gr.Dropdown(
                choices=list(language_map.keys()),
                label="Target Language",
                value="Hindi"
            )
            submit_btn = gr.Button("Translate")

        with gr.Column():
            output_pdf = gr.File(label="Download Translated PDF")

    # Dynamic visibility based on input type
    def update_visibility(choice):
        """Show the text box for "Text" input and the file picker for "Document"."""
        return (
            gr.update(visible=(choice == "Text")),
            gr.update(visible=(choice == "Document"))
        )

    input_type.change(
        fn=update_visibility,
        inputs=input_type,
        outputs=[text_input, file_input]
    )

    def run_translation(choice, text_value, file_value, language):
        """Feed the active input to ``process_input`` and return a PDF file path.

        Args:
            choice: Selected input type, "Text" or "Document".
            text_value: Current content of the text box.
            file_value: Current value of the upload component, or ``None``.
            language: Selected target language name.

        Returns:
            Path of a temporary ``.pdf`` file holding the translated output.

        Raises:
            gr.Error: If the active input is empty.
        """
        import tempfile  # local import keeps this block self-contained

        if choice == "Text":
            if not text_value or not text_value.strip():
                raise gr.Error("Please enter some text to translate.")
            pdf_buffer = process_input(text_value, language)
        else:
            if file_value is None:
                raise gr.Error("Please upload a PDF document to translate.")
            # Depending on the Gradio version the upload arrives as a path
            # string or as a temp-file wrapper exposing the path via `.name`;
            # open the path so process_input always gets a readable stream.
            file_path = file_value if isinstance(file_value, str) else file_value.name
            with open(file_path, "rb") as uploaded:
                pdf_buffer = process_input(uploaded, language)

        # A File output component is given a path on disk, so the in-memory
        # PDF is written to a temporary file that outlives this call.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp.write(pdf_buffer.getvalue())
        return tmp.name

    # Process the input and generate output. Component values are passed as
    # regular event inputs; the handler picks the one matching the input type.
    submit_btn.click(
        fn=run_translation,
        inputs=[input_type, text_input, file_input, target_lang],
        outputs=output_pdf
    )

demo.launch()