Spaces:

akarshan11
/

garrry

Running

App Files Files Community

akarshan11 committed on Apr 1

Commit

b113724

verified ·

1 Parent(s): 476dd48

Update app.py

Browse files

Files changed (1) hide show

app.py +235 -86

app.py CHANGED Viewed

@@ -1,117 +1,266 @@
 import os
 import gradio as gr
 import torch
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-# First, let's create a simpler interface without complex schema handling
-# Define languages
-LANGUAGES = {
-    "English": "en_XX",
-    "Hindi": "hi_IN",
-    "Bengali": "bn_IN",
-    "Tamil": "ta_IN",
-    "Telugu": "te_IN",
-    "Malayalam": "ml_IN",
-    "Urdu": "ur_PK"
 }
-# Initialize model and tokenizer
-model_name = "facebook/mbart-large-50-many-to-many-mmt"
-tokenizer = None
-model = None
-def load_model():
-    global tokenizer, model
-    if tokenizer is None:
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-    if model is None:
-        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-        if torch.cuda.is_available():
-            model = model.to("cuda")
 def translate_text(text, source_lang, target_lang):
-    """Simple translation function"""
     if not text:
-        return "Please enter some text to translate."
     try:
-        load_model()
-        # Get language codes
-        src_lang = LANGUAGES.get(source_lang)
-        tgt_lang = LANGUAGES.get(target_lang)
-        # Set source language
-        tokenizer.src_lang = src_lang
-        # Tokenize
-        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
-        if torch.cuda.is_available():
-            inputs = {k: v.to("cuda") for k, v in inputs.items()}
-        # Generate translation
-        with torch.no_grad():
-            generated_tokens = model.generate(
-                **inputs,
-                forced_bos_token_id=tokenizer.lang_to_id[tgt_lang],
-                max_length=512,
-                num_beams=4,
-                early_stopping=True
-            )
-        # Decode
-        translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
-        return translation
     except Exception as e:
-        return f"Translation Error: {str(e)}"
 # Create Gradio interface
-with gr.Blocks(title="Simple Language Translator") as demo:
-    gr.Markdown("# Simple Language Translator")
-    with gr.Row():
-        with gr.Column():
-            input_text = gr.Textbox(
-                label="Input Text",
-                placeholder="Enter text to translate...",
-                lines=5
-            )
-            source_lang = gr.Dropdown(
-                choices=list(LANGUAGES.keys()),
-                value="English",
-                label="Source Language"
-            )
-            target_lang = gr.Dropdown(
-                choices=list(LANGUAGES.keys()),
-                value="Hindi",
-                label="Target Language"
-            )
-            translate_btn = gr.Button("Translate")
-        with gr.Column():
-            output_text = gr.Textbox(
-                label="Translation",
-                lines=5
-            )
-    # Set up translation event
     translate_btn.click(
         fn=translate_text,
         inputs=[input_text, source_lang, target_lang],
         outputs=output_text
     )
-    # Add examples
-    gr.Examples(
-        examples=[
-            ["Hello, how are you?", "English", "Hindi"],
-            ["नमस्ते, कैसे हैं आप?", "Hindi", "English"],
-        ],
-        inputs=[input_text, source_lang, target_lang],
-        outputs=output_text,
-        fn=translate_text,
-        cache_examples=True,
     )
 if __name__ == "__main__":

 import os
 import gradio as gr
 import torch
+from transformers import (
+    MarianMTModel, MarianTokenizer,
+    T5Tokenizer, T5ForConditionalGeneration,
+    pipeline
+)
+import fitz  # PyMuPDF
+import docx2txt
+from fpdf import FPDF
+from transformers import AutoModelForSeq2SeqLegacy, AutoTokenizer
+import spacy
+import re
+# Language mappings for MarianMT models
+# Keys are "<Source>-<Target>" display names, the same format that
+# load_model_for_pair builds; values are Hugging Face Hub checkpoint ids.
+# NOTE(review): confirm every checkpoint id exists on the Hub, the Tamil and
+# Telugu ids in particular. A failed load is caught in load_model_for_pair.
+LANGUAGE_PAIRS = {
+    "English-Hindi": "Helsinki-NLP/opus-mt-en-hi",
+    "Hindi-English": "Helsinki-NLP/opus-mt-hi-en",
+    "English-Tamil": "Helsinki-NLP/opus-mt-en-tam",
+    "Tamil-English": "Helsinki-NLP/opus-mt-tam-en",
+    "English-Telugu": "Helsinki-NLP/opus-mt-en-tel",
+    "Telugu-English": "Helsinki-NLP/opus-mt-tel-en",
 }
+# Initialize models dictionary
+# Lazily filled per-pair caches, keyed by the same "<Source>-<Target>" string.
+models = {}
+tokenizers = {}
+def load_model_for_pair(source_lang, target_lang):
+    """Load appropriate model for language pair"""
+    pair = f"{source_lang}-{target_lang}"
+    if pair not in models:
+        try:
+            model_name = LANGUAGE_PAIRS.get(pair)
+            if model_name:
+                tokenizers[pair] = MarianTokenizer.from_pretrained(model_name)
+                models[pair] = MarianMTModel.from_pretrained(model_name)
+                if torch.cuda.is_available():
+                    models[pair] = models[pair].to("cuda")
+            else:
+                # Fallback to T5 for unsupported language pairs
+                tokenizers[pair] = T5Tokenizer.from_pretrained("t5-base")
+                models[pair] = T5ForConditionalGeneration.from_pretrained("t5-base")
+                if torch.cuda.is_available():
+                    models[pair] = models[pair].to("cuda")
+        except Exception as e:
+            print(f"Error loading model for {pair}: {str(e)}")
+            return None, None
+    return models.get(pair), tokenizers.get(pair)
+# Text extraction functions
+def extract_text_from_pdf(file_path):
+    """Extract text from PDF while preserving structure"""
+    try:
+        text_blocks = []
+        doc = fitz.open(file_path)
+        for page in doc:
+            # Get text blocks with position information
+            blocks = page.get_text("blocks")
+            # Sort blocks by vertical position then horizontal
+            blocks.sort(key=lambda b: (b[1], b[0]))
+            for b in blocks:
+                text_blocks.append(b[4])  # b[4] contains the text
+        return "\n\n".join(text_blocks)
+    except Exception as e:
+        return f"Error extracting PDF text: {str(e)}"
+def extract_text_from_docx(file_path):
+    """Extract text from DOCX with structure preservation"""
+    try:
+        text = docx2txt.process(file_path)
+        # Clean up excessive newlines while preserving paragraphs
+        text = re.sub(r'\n\s*\n', '\n\n', text)
+        return text
+    except Exception as e:
+        return f"Error extracting DOCX text: {str(e)}"
+def save_as_pdf(text, output_path):
+    """Save translated text as PDF with formatting"""
+    try:
+        pdf = FPDF()
+        pdf.add_page()
+        pdf.set_auto_page_break(auto=True, margin=15)
+        pdf.set_font("Arial", size=12)
+        # Split text into paragraphs
+        paragraphs = text.split('\n\n')
+        for para in paragraphs:
+            # Add paragraph with spacing
+            pdf.multi_cell(0, 10, para.strip())
+            pdf.ln(5)  # Add some space between paragraphs
+        pdf.output(output_path)
+        return output_path
+    except Exception as e:
+        return f"Error creating PDF: {str(e)}"
+def preprocess_text(text):
+    """Preprocess text to handle idioms and maintain context"""
+    # Split into manageable chunks while preserving context
+    chunks = []
+    sentences = text.split('.')
+    current_chunk = []
+    current_length = 0
+    for sentence in sentences:
+        sentence = sentence.strip()
+        if not sentence:
+            continue
+        if current_length + len(sentence) < 512:
+            current_chunk.append(sentence)
+            current_length += len(sentence)
+        else:
+            if current_chunk:
+                chunks.append('. '.join(current_chunk) + '.')
+            current_chunk = [sentence]
+            current_length = len(sentence)
+    if current_chunk:
+        chunks.append('. '.join(current_chunk) + '.')
+    return chunks
 def translate_text(text, source_lang, target_lang):
+    """Translate text with context preservation"""
     if not text:
+        return "Please provide text to translate."
     try:
+        model, tokenizer = load_model_for_pair(source_lang, target_lang)
+        if not model or not tokenizer:
+            return "Translation model not available for this language pair."
+        # Preprocess and chunk the text
+        chunks = preprocess_text(text)
+        translated_chunks = []
+        for chunk in chunks:
+            # Prepare input
+            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
+            if torch.cuda.is_available():
+                inputs = {k: v.to("cuda") for k, v in inputs.items()}
+            # Generate translation
+            with torch.no_grad():
+                outputs = model.generate(**inputs, max_length=512, num_beams=4, early_stopping=True)
+            # Decode translation
+            translated_chunk = tokenizer.decode(outputs[0], skip_special_tokens=True)
+            translated_chunks.append(translated_chunk)
+        # Combine translations
+        return " ".join(translated_chunks)
+    except Exception as e:
+        return f"Translation Error: {str(e)}"
+def process_document(file, source_lang, target_lang):
+    """Process and translate document"""
+    if file is None:
+        return None, "No file uploaded."
+    try:
+        # Extract text based on file type
+        file_path = file.name
+        if file_path.lower().endswith('.pdf'):
+            text = extract_text_from_pdf(file_path)
+        elif file_path.lower().endswith('.docx'):
+            text = extract_text_from_docx(file_path)
+        elif file_path.lower().endswith('.txt'):
+            with open(file_path, 'r', encoding='utf-8') as f:
+                text = f.read()
+        else:
+            return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."
+        # Translate the extracted text
+        translated_text = translate_text(text, source_lang, target_lang)
+        # Save translation as PDF
+        output_path = os.path.join(os.path.dirname(file_path),
+                                 f"translated_{os.path.basename(file_path)}.pdf")
+        result = save_as_pdf(translated_text, output_path)
+        if isinstance(result, str) and result.startswith("Error"):
+            return None, result
+        return output_path, translated_text
     except Exception as e:
+        return None, f"Error processing document: {str(e)}"
 # Create Gradio interface
+with gr.Blocks(title="Document and Text Translator") as demo:
+    gr.Markdown("# Advanced Document and Text Translator")
+    with gr.Tabs():
+        with gr.TabItem("Text Translation"):
+            with gr.Row():
+                with gr.Column():
+                    input_text = gr.Textbox(
+                        label="Input Text",
+                        placeholder="Enter text to translate...",
+                        lines=5
+                    )
+                    source_lang = gr.Dropdown(
+                        choices=list(set(lang.split('-')[0] for lang in LANGUAGE_PAIRS.keys())),
+                        value="English",
+                        label="Source Language"
+                    )
+                    target_lang = gr.Dropdown(
+                        choices=list(set(lang.split('-')[1] for lang in LANGUAGE_PAIRS.keys())),
+                        value="Hindi",
+                        label="Target Language"
+                    )
+                    translate_btn = gr.Button("Translate")
+                with gr.Column():
+                    output_text = gr.Textbox(
+                        label="Translation",
+                        lines=5
+                    )
+        with gr.TabItem("Document Translation"):
+            with gr.Row():
+                with gr.Column():
+                    file_input = gr.File(
+                        label="Upload Document",
+                        file_types=[".pdf", ".docx", ".txt"]
+                    )
+                    doc_source_lang = gr.Dropdown(
+                        choices=list(set(lang.split('-')[0] for lang in LANGUAGE_PAIRS.keys())),
+                        value="English",
+                        label="Source Language"
+                    )
+                    doc_target_lang = gr.Dropdown(
+                        choices=list(set(lang.split('-')[1] for lang in LANGUAGE_PAIRS.keys())),
+                        value="Hindi",
+                        label="Target Language"
+                    )
+                    translate_doc_btn = gr.Button("Translate Document")
+                with gr.Column():
+                    output_file = gr.File(label="Translated PDF")
+                    output_preview = gr.Textbox(
+                        label="Translation Preview",
+                        lines=8
+                    )
+    # Set up event handlers
     translate_btn.click(
         fn=translate_text,
         inputs=[input_text, source_lang, target_lang],
         outputs=output_text
     )
+    translate_doc_btn.click(
+        fn=process_document,
+        inputs=[file_input, doc_source_lang, doc_target_lang],
+        outputs=[output_file, output_preview]
     )
 if __name__ == "__main__":