File size: 7,696 Bytes
9dcbee4
895c980
 
 
 
 
 
9dcbee4
895c980
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9dcbee4
895c980
 
 
 
 
 
 
 
 
 
 
 
9dcbee4
895c980
 
 
 
 
 
 
 
 
 
 
9dcbee4
895c980
 
 
 
 
 
9dcbee4
895c980
 
 
 
 
 
9dcbee4
895c980
 
9dcbee4
895c980
 
 
9dcbee4
895c980
 
 
 
 
 
 
 
 
 
 
 
 
9dcbee4
895c980
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9dcbee4
895c980
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9dcbee4
895c980
 
 
 
 
 
 
 
 
 
 
 
 
 
e1983d6
895c980
 
 
 
 
 
 
 
 
 
 
 
 
9dcbee4
895c980
 
 
 
 
 
 
 
 
 
9dcbee4
895c980
 
 
9dcbee4
895c980
 
 
 
 
 
 
 
 
 
 
9dcbee4
895c980
 
 
 
9dcbee4
895c980
 
 
 
 
 
9dcbee4
895c980
 
 
 
 
 
 
9dcbee4
895c980
9dcbee4
895c980
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import fitz  # PyMuPDF for PDF processing
import docx2txt  # For DOCX processing
from fpdf import FPDF  # For creating PDF outputs

# Load the tokenizer and seq2seq model once at import time so every request
# handled by this module reuses the same objects.
model_name = "facebook/mbart-large-50-many-to-many-mmt"  # mBART-50 many-to-many translation checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is also used by translate() to move the input tensors.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Display name -> mBART-50 language code for the languages offered in the UI.
# Every code listed here must be one of the language tokens known to
# facebook/mbart-large-50-many-to-many-mmt. Punjabi ("pa_IN") and Kannada
# ("kn_IN") are intentionally absent: mBART-50 defines no such language
# codes, so offering them could never produce a translation.
LANGUAGES = {
    # Major Global Languages
    "English": "en_XX",
    "Spanish": "es_XX",
    "French": "fr_XX",
    "German": "de_DE",
    "Russian": "ru_RU",
    "Chinese": "zh_CN",
    "Japanese": "ja_XX",
    "Arabic": "ar_AR",

    # Major Indian Languages
    "Hindi": "hi_IN",
    "Bengali": "bn_IN",
    "Gujarati": "gu_IN",
    "Marathi": "mr_IN",
    "Tamil": "ta_IN",
    "Telugu": "te_IN",
    "Malayalam": "ml_IN",
    "Urdu": "ur_PK",
}

# File extraction functions
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The concatenated text of all pages, in page order. On any failure
        the function does not raise; it returns a string starting with
        "Error extracting PDF text: " followed by the exception message.
    """
    try:
        # The context manager closes the document even if a page fails to
        # parse, so the underlying file handle is never leaked.
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        return f"Error extracting PDF text: {str(e)}"

def extract_text_from_docx(file_path):
    """Return the text content of a DOCX file via docx2txt.

    Failures are not raised: they are reported as a string that starts
    with "Error extracting DOCX text: " followed by the exception message.
    """
    try:
        content = docx2txt.process(file_path)
    except Exception as exc:
        return f"Error extracting DOCX text: {str(exc)}"
    return content

def extract_text_from_txt(file_path):
    """Read a plain-text file, trying UTF-8 first and Latin-1 second.

    Latin-1 is attempted only when UTF-8 decoding fails. Any other failure
    (for example a missing file) is not raised; it is reported as a string
    starting with "Error extracting TXT text: ".
    """
    candidate_encodings = ('utf-8', 'latin-1')
    final_encoding = candidate_encodings[-1]

    for encoding in candidate_encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as handle:
                return handle.read()
        except UnicodeDecodeError as exc:
            # A decode failure only moves on to the next encoding; once the
            # last candidate has failed too, report it like any other error.
            if encoding == final_encoding:
                return f"Error extracting TXT text: {str(exc)}"
        except Exception as exc:
            return f"Error extracting TXT text: {str(exc)}"

def save_as_pdf(text, output_path):
    """Write `text` to a single-flow PDF at `output_path` and return that path.

    The text is forced through Latin-1 before it is written, so every
    character that Latin-1 cannot represent is replaced with "?" in the
    resulting PDF.
    """
    # Round-trip through Latin-1 with 'replace' so unsupported characters
    # become "?" instead of raising during encoding.
    latin1_safe = text.encode('latin-1', 'replace').decode('latin-1')

    document = FPDF()
    document.add_page()
    document.set_font("Arial", size=12)

    # Width 0 lets the cell span to the right margin; 10 is the line height.
    document.multi_cell(0, 10, latin1_safe)
    document.output(output_path)

    return output_path

# Translation function
def _lang_token_id(lang_code):
    """Return the tokenizer id of a language-code token, or None if the tokenizer does not know it."""
    token_id = tokenizer.convert_tokens_to_ids(lang_code)
    # Unknown tokens are mapped to the unknown-token id rather than raising.
    if token_id is None or token_id == tokenizer.unk_token_id:
        return None
    return token_id


def translate(text, source_lang, target_lang, max_length=1024):
    """Translate text from source language to target language.

    Args:
        text: Text to translate. None, empty and whitespace-only input are
            rejected without calling the model.
        source_lang: Display name of the source language (a key of LANGUAGES).
        target_lang: Display name of the target language (a key of LANGUAGES).
        max_length: Maximum number of tokens for both the tokenized input and
            the generated output. Longer input is truncated by the tokenizer,
            so only the leading part of a long text is translated.

    Returns:
        The translated string. Failures are never raised; they are reported
        as one of these strings:
        "No text provided for translation.",
        "Source or target language not supported.", or
        "Translation error: <exception message>".

    Note:
        Sets `tokenizer.src_lang` on the shared module-level tokenizer.
    """
    if not text or (isinstance(text, str) and not text.strip()):
        return "No text provided for translation."

    try:
        # Map the display names to model language codes.
        src_lang = LANGUAGES.get(source_lang)
        tgt_lang = LANGUAGES.get(target_lang)

        if not src_lang or not tgt_lang:
            return "Source or target language not supported."

        # Make sure the tokenizer really has both language tokens; a code it
        # does not know would otherwise silently resolve to the unknown token.
        tgt_lang_id = _lang_token_id(tgt_lang)
        if _lang_token_id(src_lang) is None or tgt_lang_id is None:
            return "Source or target language not supported."

        # The tokenizer prefixes the input with the source-language token.
        tokenizer.src_lang = src_lang

        # Prepare input and move every tensor to the model's device.
        inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate translation; forcing the first generated token to the
        # target-language token selects the output language.
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=tgt_lang_id,
                max_length=max_length,
                num_beams=5,
                early_stopping=True
            )

        # Decode translation (single input, so take the first sequence).
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    except Exception as e:
        return f"Translation error: {str(e)}"

# Process uploads and handle translation
def process_file(file, source_lang, target_lang):
    """Extract the text of an uploaded document, translate it and save a PDF.

    Args:
        file: The upload handed over by the UI: either an object exposing the
            temporary file path as `.name`, a plain path string, or None when
            nothing was uploaded.
        source_lang: Display name of the source language.
        target_lang: Display name of the target language.

    Returns:
        A tuple `(pdf_path, message)`. On success `pdf_path` is the path of
        the generated PDF (the input path plus "_translated.pdf") and
        `message` is the translated text. On any failure `pdf_path` is None
        and `message` explains what went wrong; no PDF is written.
    """
    if file is None:
        return None, "No file uploaded. Please upload a PDF, DOCX, or TXT file."

    try:
        # Accept both file-like uploads (path in .name) and plain path strings.
        temp_file_path = getattr(file, "name", file)
        path_lower = temp_file_path.lower()

        # Extract text based on file type. The extractors report failures as
        # strings with a fixed prefix instead of raising.
        if path_lower.endswith('.pdf'):
            text = extract_text_from_pdf(temp_file_path)
            error_prefix = "Error extracting PDF text: "
        elif path_lower.endswith('.docx'):
            text = extract_text_from_docx(temp_file_path)
            error_prefix = "Error extracting DOCX text: "
        elif path_lower.endswith('.txt'):
            text = extract_text_from_txt(temp_file_path)
            error_prefix = "Error extracting TXT text: "
        else:
            return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."

        if not text or not text.strip():
            return None, "No extractable text found in the document."
        if text.startswith(error_prefix):
            # Do not translate an extraction error as if it were the document.
            return None, text

        # Translate the extracted text
        translated_text = translate(text, source_lang, target_lang)

        # translate() also reports failures as strings; surface them instead
        # of saving an error message as the "translated" document.
        translation_failed = (
            not translated_text
            or translated_text.startswith("Translation error: ")
            or translated_text in (
                "No text provided for translation.",
                "Source or target language not supported.",
            )
        )
        if translation_failed:
            return None, translated_text or "Translation produced no output."

        # Save translation as PDF
        output_pdf_path = temp_file_path + "_translated.pdf"
        save_as_pdf(translated_text, output_pdf_path)

        return output_pdf_path, translated_text

    except Exception as e:
        return None, f"Error processing file: {str(e)}"

# Gradio interface
def gradio_interface():
    """Assemble the Gradio Blocks UI with a text tab and a document tab.

    The text tab is wired to translate() and the document tab to
    process_file(). The UI is only built here, not launched.

    Returns:
        The gr.Blocks instance describing the whole interface.
    """
    def language_dropdown(label, default):
        # Every dropdown offers the display names stored in LANGUAGES.
        return gr.Dropdown(list(LANGUAGES.keys()), value=default, label=label)

    with gr.Blocks(title="Indian Language Translator", theme=gr.themes.Soft()) as interface:
        gr.Markdown("# Indian & Global Language Translator")
        gr.Markdown("Translate text with understanding of idioms and cultural expressions")

        # Tab 1: free-text translation.
        with gr.Tab("Text Translation"):
            with gr.Row():
                text_source = language_dropdown("Source Language", "English")
                text_target = language_dropdown("Target Language", "Hindi")

            with gr.Row():
                text_in = gr.Textbox(label="Enter text to translate", lines=5, placeholder="Type or paste text here...")
                text_out = gr.Textbox(label="Translation", lines=5)

            text_button = gr.Button("Translate Text", variant="primary")
            text_button.click(
                fn=translate,
                inputs=[text_in, text_source, text_target],
                outputs=text_out,
            )

        # Tab 2: document upload, translated PDF download and text preview.
        with gr.Tab("Document Translation"):
            with gr.Row():
                doc_source = language_dropdown("Source Language", "English")
                doc_target = language_dropdown("Target Language", "Hindi")

            upload = gr.File(label="Upload Document (PDF, DOCX, TXT)", file_types=[".pdf", ".docx", ".txt"])
            with gr.Row():
                translated_pdf = gr.File(label="Translated PDF")
                preview = gr.Textbox(label="Translation Preview", lines=8)

            doc_button = gr.Button("Translate Document", variant="primary")
            doc_button.click(
                fn=process_file,
                inputs=[upload, doc_source, doc_target],
                outputs=[translated_pdf, preview],
            )

        # Static footer notes shown below the tabs.
        footer_notes = (
            "### Supported File Types: PDF, DOCX, TXT",
            "### Features:",
            "- Supports major Indian languages including Hindi, Bengali, Tamil, Telugu, Malayalam",
            "- Context-aware translation that understands idioms and cultural expressions",
            "- Document translation with PDF output",
        )
        for note in footer_notes:
            gr.Markdown(note)

    return interface

# Build and launch the UI only when this file is run as a script, so that
# importing the module does not start a server.
if __name__ == "__main__":
    app = gradio_interface()
    # NOTE(review): share=True asks Gradio for a publicly reachable share
    # link; remove it for production deployments.
    app.launch(share=True)  # Remove share=True in production