Spaces:

akarshan11
/

garrry

Running

App Files Files Community

akarshan11 committed on Apr 1

Commit

895c980

verified ·

1 Parent(s): e1983d6

Update app.py

Browse files

Files changed (1) hide show

app.py +186 -153

app.py CHANGED Viewed

@@ -1,174 +1,207 @@
-import gradio as gr
-import sys
-import pkg_resources
-import tempfile
 import os
-from pathlib import Path
-def check_dependencies():
-    required_packages = {
-        'gradio': ['gradio'],
-        'transformers': ['transformers'],
-        'python-docx': ['python-docx', 'python_docx', 'docx'],
-        'PyPDF2': ['PyPDF2', 'pypdf2', 'pypdf'],
-        'torch': ['torch'],
-        'sentencepiece': ['sentencepiece'],
-        'tf-keras': ['tf-keras']
-    }
-    installed = {pkg.key.lower() for pkg in pkg_resources.working_set}
-    missing = []
-    for package, variations in required_packages.items():
-        if not any(variation.lower() in installed for variation in variations):
-            missing.append(package)
-    if missing:
-        print("Missing required packages. Please install:")
-        for pkg in missing:
-            print(f"pip install {pkg}")
-        sys.exit(1)
-# Check dependencies before importing
-check_dependencies()
-import torch
-from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
-import docx
-import PyPDF2
-import io
-class DocumentTranslator:
-    def __init__(self):
         try:
-            # Initialize translation models
-            self.romance_translator = pipeline(
-                "translation",
-                model="Helsinki-NLP/opus-mt-en-ROMANCE",
-                framework="pt"
-            )
-            # Initialize Hindi translator
-            self.hindi_translator = pipeline(
-                "translation",
-                model="Helsinki-NLP/opus-mt-en-hi",
-                framework="pt"
-            )
-            # Supported languages
-            self.languages = {
-                "English": "en",
-                "French": "fr",
-                "Spanish": "es",
-                "Portuguese": "pt",
-                "Italian": "it",
-                "Hindi": "hi"  # Added Hindi support
-            }
         except Exception as e:
-            print(f"Error initializing translator: {str(e)}")
-            print("Please make sure all required packages are installed:")
-            print("pip install transformers torch sentencepiece python-docx PyPDF2 gradio tf-keras")
-            raise
-    def extract_text_from_docx(self, file):
-        doc = docx.Document(file)
-        text = []
-        for paragraph in doc.paragraphs:
-            text.append(paragraph.text)
-        return "\n".join(text)
-    def extract_text_from_pdf(self, file):
-        pdf_reader = PyPDF2.PdfReader(file)
-        text = []
-        for page in pdf_reader.pages:
-            text.append(page.extract_text())
-        return "\n".join(text)
-    def create_translated_docx(self, original_text, translated_text, output_filename):
-        doc = docx.Document()
-        paragraphs = translated_text.split("\n")
-        for para in paragraphs:
-            if para.strip():
-                doc.add_paragraph(para)
-        doc.save(output_filename)
-        return output_filename
-    def translate_text(self, text, target_lang):
-        # Choose appropriate translator based on target language
-        if target_lang == "hi":
-            return self.hindi_translator(text)[0]['translation_text']
         else:
-            return self.romance_translator(text)[0]['translation_text']
-    def translate_document(self, file, source_lang, target_lang):
-        try:
-            # Create temporary directory for output
-            temp_dir = tempfile.mkdtemp()
-            output_filename = os.path.join(temp_dir, "translated_document.docx")
-            # Extract text based on file type
-            if file.name.endswith('.docx'):
-                text = self.extract_text_from_docx(file)
-            elif file.name.endswith('.pdf'):
-                text = self.extract_text_from_pdf(file)
-            else:
-                return None, "Unsupported file format. Please use .docx or .pdf"
-            # Split text into chunks to handle long documents
-            chunk_size = 500
-            chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
-            # Translate chunks
-            translated_chunks = []
-            for chunk in chunks:
-                translation = self.translate_text(chunk, self.languages[target_lang])
-                translated_chunks.append(translation)
-            translated_text = " ".join(translated_chunks)
-            # Create new document with translation
-            output_file = self.create_translated_docx(text, translated_text, output_filename)
-            return output_file, "Translation completed successfully!"
-        except Exception as e:
-            return None, f"Error during translation: {str(e)}"
-def create_translation_interface():
-    try:
-        translator = DocumentTranslator()
-        def translate_file(file, source_lang, target_lang):
-            if file is None:
-                return None, "Please upload a file"
-            return translator.translate_document(file, source_lang, target_lang)
-        iface = gr.Interface(
-            fn=translate_file,
-            inputs=[
-                gr.File(label="Upload Document (.docx or .pdf)"),
-                gr.Dropdown(choices=list(translator.languages.keys()), label="Source Language"),
-                gr.Dropdown(choices=list(translator.languages.keys()), label="Target Language")
-            ],
-            outputs=[
-                gr.File(label="Download Translated Document"),
-                gr.Textbox(label="Status")
-            ],
-            title="Document Translation System",
-            description="Upload a document (.docx or .pdf) and select source and target languages for translation.",
-            theme="default"
-        )
-        return iface
-    except Exception as e:
-        print(f"Error creating interface: {str(e)}")
-        sys.exit(1)
 if __name__ == "__main__":
-    print("Initializing translation system...")
-    print("Checking dependencies...")
-    check_dependencies()
-    print("Starting Gradio interface...")
-    iface = create_translation_interface()
-    iface.launch(share=True)

 import os
+import gradio as gr
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import fitz  # PyMuPDF for PDF processing
+import docx2txt  # For DOCX processing
+from fpdf import FPDF  # For creating PDF outputs
+# Load model and tokenizer
+# These statements sit at module level, so they run when the module is
+# imported, before any interface is built.
+model_name = "facebook/mbart-large-50-many-to-many-mmt"  # Powerful translation model that can handle idioms well
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+# Set device
+# Use CUDA when PyTorch reports it as available, otherwise stay on the CPU;
+# translate() moves its input tensors to this same device.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = model.to(device)
+# Languages offered in the UI, mapped from display name to the language code
+# handed to the mBART-50 tokenizer. The keys populate the dropdowns and the
+# values are looked up by translate(), so every value must be a code that the
+# loaded checkpoint actually defines.
+LANGUAGES = {
+    # Major Global Languages
+    "English": "en_XX",
+    "Spanish": "es_XX",
+    "French": "fr_XX",
+    "German": "de_DE",
+    "Russian": "ru_RU",
+    "Chinese": "zh_CN",
+    "Japanese": "ja_XX",
+    "Arabic": "ar_AR",
+    # Major Indian Languages
+    "Hindi": "hi_IN",
+    "Bengali": "bn_IN",
+    "Gujarati": "gu_IN",
+    "Marathi": "mr_IN",
+    "Tamil": "ta_IN",
+    "Telugu": "te_IN",
+    "Malayalam": "ml_IN",
+    "Urdu": "ur_PK",
+    # Punjabi ("pa_IN") and Kannada ("kn_IN") are intentionally absent: the
+    # mBART-50 checkpoint defines no language code for either, so offering
+    # them in the menu could only ever end in a translation error.
+}
+# File extraction functions
+def extract_text_from_pdf(file_path):
+    """Extract the text of every page of a PDF file.
+
+    Args:
+        file_path: Path of the PDF to read.
+
+    Returns:
+        The text of all pages concatenated in page order, or a string
+        starting with "Error extracting PDF text: " if reading failed.
+    """
+    try:
+        # Open the document as a context manager so it is closed again even
+        # when extracting a page raises; previously it was never closed.
+        with fitz.open(file_path) as doc:
+            return "".join(page.get_text() for page in doc)
+    except Exception as e:
+        return f"Error extracting PDF text: {str(e)}"
+def extract_text_from_docx(file_path):
+    """Return the text of a DOCX file.
+
+    Any failure is reported as a string beginning with
+    "Error extracting DOCX text: " instead of being raised.
+    """
+    try:
+        content = docx2txt.process(file_path)
+    except Exception as err:
+        return f"Error extracting DOCX text: {str(err)}"
+    return content
+def extract_text_from_txt(file_path):
+    """Read a text file as UTF-8, retrying as Latin-1 if decoding fails.
+
+    Any other failure is reported as a string beginning with
+    "Error extracting TXT text: " instead of being raised.
+    """
+    def _read_as(encoding):
+        # Read the whole file using the given text encoding.
+        with open(file_path, 'r', encoding=encoding) as handle:
+            return handle.read()
+
+    try:
+        try:
+            return _read_as('utf-8')
+        except UnicodeDecodeError:
+            # Not valid UTF-8: fall back to the single-byte encoding.
+            return _read_as('latin-1')
+    except Exception as err:
+        return f"Error extracting TXT text: {str(err)}"
+def save_as_pdf(text, output_path):
+    """Write *text* into a one-font PDF at *output_path* and return that path.
+
+    NOTE: the text is round-tripped through Latin-1 with errors="replace",
+    so every character Latin-1 cannot represent is written as "?".
+    """
+    # Force the text into the Latin-1 range before handing it to FPDF.
+    latin1_text = text.encode('latin-1', 'replace').decode('latin-1')
+    document = FPDF()
+    document.add_page()
+    document.set_font("Arial", size=12)
+    # Width 0 with a line height of 10, as a single wrapped cell.
+    document.multi_cell(0, 10, latin1_text)
+    document.output(output_path)
+    return output_path
+# Translation function
+def translate(text, source_lang, target_lang, max_length=1024):
+    """Translate text between two of the languages listed in LANGUAGES.
+
+    Args:
+        text: Text to translate.
+        source_lang: Display name of the input language (a LANGUAGES key).
+        target_lang: Display name of the output language (a LANGUAGES key).
+        max_length: Token limit applied both to the tokenized input, which is
+            truncated beyond it, and to the generated output.
+
+    Returns:
+        The translated text, or a human-readable message when the input is
+        empty, a language is unknown, or translation fails.
+    """
+    # Treat whitespace-only input like empty input instead of running the model.
+    if not text or (isinstance(text, str) and not text.strip()):
+        return "No text provided for translation."
+    src_lang = LANGUAGES.get(source_lang)
+    tgt_lang = LANGUAGES.get(target_lang)
+    if not src_lang or not tgt_lang:
+        return "Source or target language not supported."
+    try:
+        # Set tokenizer source language
+        tokenizer.src_lang = src_lang
+        # Prepare input; anything past max_length tokens is cut off here.
+        inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+        # Resolve the target language code to its token id through the
+        # generic vocabulary lookup. The tokenizer has no `lang_to_id`
+        # attribute, so the previous lookup raised AttributeError on every
+        # call and the function could only return "Translation error".
+        target_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)
+        # Generate translation
+        with torch.no_grad():
+            generated_tokens = model.generate(
+                **inputs,
+                forced_bos_token_id=target_token_id,
+                max_length=max_length,
+                num_beams=5,
+                early_stopping=True
+            )
+        # Decode translation
+        translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+        return translation
+    except Exception as e:
+        return f"Translation error: {str(e)}"
+# Process uploads and handle translation
+def process_file(file, source_lang, target_lang):
+    """Translate an uploaded document and save the translation as a PDF.
+
+    Args:
+        file: The upload, either an object exposing the path as `.name` or
+            the path string itself; None when nothing was uploaded.
+        source_lang: Display name of the document's language.
+        target_lang: Display name of the language to translate into.
+
+    Returns:
+        A `(pdf_path, message)` pair. On success `pdf_path` is the generated
+        PDF and `message` is the translated text; on any failure `pdf_path`
+        is None and `message` explains what went wrong.
+    """
+    if file is None:
+        return None, "Please upload a file."
+    try:
+        # Accept both an object carrying the path in `.name` and a bare path.
+        temp_file_path = getattr(file, "name", file)
+        # Extract text based on file type
+        if temp_file_path.lower().endswith('.pdf'):
+            text = extract_text_from_pdf(temp_file_path)
+        elif temp_file_path.lower().endswith('.docx'):
+            text = extract_text_from_docx(temp_file_path)
+        elif temp_file_path.lower().endswith('.txt'):
+            text = extract_text_from_txt(temp_file_path)
         else:
+            return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."
+        # The extractors report failures as message strings rather than
+        # raising; surface those directly instead of translating them.
+        extraction_errors = (
+            "Error extracting PDF text: ",
+            "Error extracting DOCX text: ",
+            "Error extracting TXT text: ",
+        )
+        if isinstance(text, str) and text.startswith(extraction_errors):
+            return None, text
+        if not text or not text.strip():
+            return None, "No text could be extracted from the file."
+        # Translate the extracted text
+        translated_text = translate(text, source_lang, target_lang)
+        # translate() also reports failures as strings; do not save those as
+        # if they were a finished translation.
+        if translated_text.startswith("Translation error: ") or translated_text == "Source or target language not supported.":
+            return None, translated_text
+        # Save translation as PDF
+        output_pdf_path = temp_file_path + "_translated.pdf"
+        save_as_pdf(translated_text, output_pdf_path)
+        return output_pdf_path, translated_text
+    except Exception as e:
+        return None, f"Error processing file: {str(e)}"
+# Gradio interface
+def gradio_interface():
+    """Build and return the Gradio Blocks UI without launching it.
+
+    The UI has two tabs: one that sends typed text to translate(), and one
+    that sends an uploaded file to process_file(). Both tabs take their
+    language choices from the keys of LANGUAGES.
+    """
+    with gr.Blocks(title="Indian Language Translator", theme=gr.themes.Soft()) as interface:
+        gr.Markdown("# Indian & Global Language Translator")
+        gr.Markdown("Translate text with understanding of idioms and cultural expressions")
+        # Tab 1: free-text translation.
+        with gr.Tab("Text Translation"):
+            with gr.Row():
+                source_lang_text = gr.Dropdown(list(LANGUAGES.keys()), value="English", label="Source Language")
+                target_lang_text = gr.Dropdown(list(LANGUAGES.keys()), value="Hindi", label="Target Language")
+            with gr.Row():
+                input_text = gr.Textbox(label="Enter text to translate", lines=5, placeholder="Type or paste text here...")
+                output_text = gr.Textbox(label="Translation", lines=5)
+            translate_btn = gr.Button("Translate Text", variant="primary")
+            # The three inputs map positionally onto translate(text,
+            # source_lang, target_lang); max_length keeps its default.
+            translate_btn.click(
+                fn=translate,
+                inputs=[input_text, source_lang_text, target_lang_text],
+                outputs=output_text
+            )
+        # Tab 2: document translation with a downloadable PDF result.
+        with gr.Tab("Document Translation"):
+            with gr.Row():
+                source_lang_doc = gr.Dropdown(list(LANGUAGES.keys()), value="English", label="Source Language")
+                target_lang_doc = gr.Dropdown(list(LANGUAGES.keys()), value="Hindi", label="Target Language")
+            file_input = gr.File(label="Upload Document (PDF, DOCX, TXT)", file_types=[".pdf", ".docx", ".txt"])
+            with gr.Row():
+                output_file = gr.File(label="Translated PDF")
+                output_preview = gr.Textbox(label="Translation Preview", lines=8)
+            translate_doc_btn = gr.Button("Translate Document", variant="primary")
+            # process_file returns a (path, text) pair, matching the two
+            # output components in order.
+            translate_doc_btn.click(
+                fn=process_file,
+                inputs=[file_input, source_lang_doc, target_lang_doc],
+                outputs=[output_file, output_preview]
+            )
+        # Static help text shown below the tabs.
+        gr.Markdown("### Supported File Types: PDF, DOCX, TXT")
+        gr.Markdown("### Features:")
+        gr.Markdown("- Supports major Indian languages including Hindi, Bengali, Tamil, Telugu, Malayalam")
+        gr.Markdown("- Context-aware translation that understands idioms and cultural expressions")
+        gr.Markdown("- Document translation with PDF output")
+    return interface
+# Launch the application
+# Only when executed as a script: build the Blocks UI, then start serving it.
 if __name__ == "__main__":
+    app = gradio_interface()
+    app.launch(share=True)  # Remove share=True in production