Spaces:
Running
Running
File size: 7,696 Bytes
9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 e1983d6 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 9dcbee4 895c980 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import fitz # PyMuPDF for PDF processing
import docx2txt # For DOCX processing
from fpdf import FPDF # For creating PDF outputs
# Load model and tokenizer once at import time.
# NOTE(review): this downloads multi-GB weights on first run and blocks startup.
model_name = "facebook/mbart-large-50-many-to-many-mmt" # Powerful translation model that can handle idioms well
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Set device: prefer GPU when available; inputs are moved to the same device in translate().
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
# Reduced language list with focus on major languages and Indian languages.
# Maps human-readable names -> mBART-50 language codes.
# BUGFIX: Punjabi ("pa_IN") and Kannada ("kn_IN") were removed — they are not
# among mBART-50's supported language codes, so selecting them always failed
# at generation time with a lookup error.
LANGUAGES = {
    # Major Global Languages
    "English": "en_XX",
    "Spanish": "es_XX",
    "French": "fr_XX",
    "German": "de_DE",
    "Russian": "ru_RU",
    "Chinese": "zh_CN",
    "Japanese": "ja_XX",
    "Arabic": "ar_AR",
    # Major Indian Languages (all verified against the mBART-50 code list)
    "Hindi": "hi_IN",
    "Bengali": "bn_IN",
    "Gujarati": "gu_IN",
    "Marathi": "mr_IN",
    "Tamil": "ta_IN",
    "Telugu": "te_IN",
    "Malayalam": "ml_IN",
    "Urdu": "ur_PK",
}
# File extraction functions
def extract_text_from_pdf(file_path):
    """Extract and concatenate the text of every page of a PDF file.

    Args:
        file_path: Path to the PDF on disk.

    Returns:
        The extracted text, or a human-readable error string on failure
        (matching the other extractors' error convention).
    """
    try:
        # BUGFIX: the original returned from inside the try without closing
        # the document, leaking the file handle; the context manager closes
        # it even if page extraction raises.
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        return f"Error extracting PDF text: {str(e)}"
def extract_text_from_docx(file_path):
    """Return the plain text of a DOCX file.

    Falls back to a human-readable error string if extraction fails,
    matching the other extractors' error convention.
    """
    try:
        extracted = docx2txt.process(file_path)
    except Exception as e:
        return f"Error extracting DOCX text: {str(e)}"
    return extracted
def extract_text_from_txt(file_path):
    """Read a plain-text file, trying UTF-8 first and Latin-1 as a fallback.

    Latin-1 maps every byte to a character, so the fallback cannot raise
    UnicodeDecodeError; any other failure (e.g. a missing file) is reported
    as an error string, matching the other extractors' error convention.

    Args:
        file_path: Path to the text file on disk.

    Returns:
        The file contents, or an error string on failure.
    """
    # De-duplicates the original's nested try/except blocks: same behavior,
    # one code path per encoding attempt.
    for encoding in ("utf-8", "latin-1"):
        try:
            with open(file_path, "r", encoding=encoding) as fh:
                return fh.read()
        except UnicodeDecodeError:
            # UTF-8 failed on a byte sequence; retry with Latin-1.
            continue
        except Exception as e:
            return f"Error extracting TXT text: {str(e)}"
    # Unreachable in practice (Latin-1 accepts any byte), kept defensively.
    return "Error extracting TXT text: could not decode file"
def save_as_pdf(text, output_path):
    """Render `text` into a PDF written at `output_path`.

    FPDF's built-in fonts only cover Latin-1, so characters outside that
    range are substituted rather than crashing the export.

    Returns:
        The path the PDF was written to.
    """
    document = FPDF()
    document.add_page()
    document.set_font("Arial", size=12)
    # Core FPDF fonts are Latin-1 only; replace unsupported characters.
    printable = text.encode('latin-1', 'replace').decode('latin-1')
    document.multi_cell(0, 10, printable)
    document.output(output_path)
    return output_path
# Translation function
def translate(text, source_lang, target_lang, max_length=1024):
    """Translate `text` from `source_lang` to `target_lang` with mBART-50.

    Args:
        text: Source text; tokenization truncates it to `max_length` tokens.
        source_lang: Human-readable name, must be a key of LANGUAGES.
        target_lang: Human-readable name, must be a key of LANGUAGES.
        max_length: Token budget for both the input and the generated output.

    Returns:
        The translated string, or a human-readable error message.
    """
    if not text:
        return "No text provided for translation."
    try:
        # Map display names to mBART-50 language codes
        src_lang = LANGUAGES.get(source_lang)
        tgt_lang = LANGUAGES.get(target_lang)
        if not src_lang or not tgt_lang:
            return "Source or target language not supported."
        # mBART-50 requires the source language to be set before tokenizing
        tokenizer.src_lang = src_lang
        inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                # BUGFIX: the tokenizer has no `lang_to_id` attribute, so this
                # line always raised (masked by the broad except below). The
                # documented attribute was `lang_code_to_id`, which newer
                # transformers releases removed; convert_tokens_to_ids works
                # across versions because language codes are special tokens.
                forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
                max_length=max_length,
                num_beams=5,
                early_stopping=True
            )
        # First (only) sequence of the batch is the translation
        translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        return translation
    except Exception as e:
        return f"Translation error: {str(e)}"
# Process uploads and handle translation
def process_file(file, source_lang, target_lang):
    """Translate an uploaded document and export the result as a PDF.

    Args:
        file: Gradio file object exposing a `.name` path, or None when the
            user clicks the button without uploading anything.
        source_lang: Human-readable source language name (key of LANGUAGES).
        target_lang: Human-readable target language name (key of LANGUAGES).

    Returns:
        (path_to_translated_pdf, translated_text) on success,
        (None, error_message) on failure.
    """
    # ROBUSTNESS FIX: the button can be clicked before a file is chosen;
    # previously this surfaced as an opaque AttributeError message.
    if file is None:
        return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."
    try:
        # Gradio stores the upload on disk; `.name` is its temp path
        temp_file_path = file.name
        lowered = temp_file_path.lower()
        # Extract text based on file type
        if lowered.endswith('.pdf'):
            text = extract_text_from_pdf(temp_file_path)
        elif lowered.endswith('.docx'):
            text = extract_text_from_docx(temp_file_path)
        elif lowered.endswith('.txt'):
            text = extract_text_from_txt(temp_file_path)
        else:
            return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."
        # Translate the extracted text
        translated_text = translate(text, source_lang, target_lang)
        # Save translation as PDF next to the uploaded temp file
        output_pdf_path = temp_file_path + "_translated.pdf"
        save_as_pdf(translated_text, output_pdf_path)
        return output_pdf_path, translated_text
    except Exception as e:
        return None, f"Error processing file: {str(e)}"
# Gradio interface
def gradio_interface():
    """Build and return the Gradio Blocks UI (two tabs: text and document).

    Layout follows construction order inside the context managers, so the
    statement order here is load-bearing. Returns the Blocks object; the
    caller is responsible for launching it.
    """
    with gr.Blocks(title="Indian Language Translator", theme=gr.themes.Soft()) as interface:
        gr.Markdown("# Indian & Global Language Translator")
        gr.Markdown("Translate text with understanding of idioms and cultural expressions")
        # --- Tab 1: free-text translation, wired to translate() ---
        with gr.Tab("Text Translation"):
            with gr.Row():
                source_lang_text = gr.Dropdown(list(LANGUAGES.keys()), value="English", label="Source Language")
                target_lang_text = gr.Dropdown(list(LANGUAGES.keys()), value="Hindi", label="Target Language")
            with gr.Row():
                input_text = gr.Textbox(label="Enter text to translate", lines=5, placeholder="Type or paste text here...")
                output_text = gr.Textbox(label="Translation", lines=5)
            translate_btn = gr.Button("Translate Text", variant="primary")
            # NOTE: input order must match translate(text, source_lang, target_lang)
            translate_btn.click(
                fn=translate,
                inputs=[input_text, source_lang_text, target_lang_text],
                outputs=output_text
            )
        # --- Tab 2: document upload, wired to process_file() ---
        with gr.Tab("Document Translation"):
            with gr.Row():
                source_lang_doc = gr.Dropdown(list(LANGUAGES.keys()), value="English", label="Source Language")
                target_lang_doc = gr.Dropdown(list(LANGUAGES.keys()), value="Hindi", label="Target Language")
            file_input = gr.File(label="Upload Document (PDF, DOCX, TXT)", file_types=[".pdf", ".docx", ".txt"])
            with gr.Row():
                output_file = gr.File(label="Translated PDF")
                output_preview = gr.Textbox(label="Translation Preview", lines=8)
            translate_doc_btn = gr.Button("Translate Document", variant="primary")
            # process_file returns (pdf_path_or_None, preview_or_error_text)
            translate_doc_btn.click(
                fn=process_file,
                inputs=[file_input, source_lang_doc, target_lang_doc],
                outputs=[output_file, output_preview]
            )
        gr.Markdown("### Supported File Types: PDF, DOCX, TXT")
        gr.Markdown("### Features:")
        gr.Markdown("- Supports major Indian languages including Hindi, Bengali, Tamil, Telugu, Malayalam")
        gr.Markdown("- Context-aware translation that understands idioms and cultural expressions")
        gr.Markdown("- Document translation with PDF output")
    return interface
# Launch the application
if __name__ == "__main__":
    app = gradio_interface()
    # share=True creates a temporary public Gradio URL; remove in production
    app.launch(share=True)