Spaces:

akarshan11
/

garrry

Running

App Files Files Community

akarshan11 committed on Apr 1

Commit

f7aa0f6

verified ·

1 Parent(s): a784937

Update app.py

Browse files

Files changed (1) hide show

app.py +92 -258

app.py CHANGED Viewed

@@ -1,273 +1,107 @@
-import os
 import gradio as gr
-import torch
-from transformers import (
-    MarianMTModel, MarianTokenizer,
-    T5Tokenizer, T5ForConditionalGeneration,
-    pipeline,
-    AutoModelForSeq2SeqLM,  # Changed from AutoModelForSeq2SeqLegacy
-    AutoTokenizer
-)
-import fitz  # PyMuPDF
-import docx2txt
-from fpdf import FPDF
-import spacy
-import re
-# Language mappings for MarianMT models
-# Keys are "<source>-<target>" language names; values are the checkpoint names
-# handed to from_pretrained() by load_model_for_pair().
-# NOTE(review): confirm that the Tamil and Telugu checkpoint names below
-# actually exist on the Hugging Face Hub.
-LANGUAGE_PAIRS = {
-    "English-Hindi": "Helsinki-NLP/opus-mt-en-hi",
-    "Hindi-English": "Helsinki-NLP/opus-mt-hi-en",
-    "English-Tamil": "Helsinki-NLP/opus-mt-en-tam",
-    "Tamil-English": "Helsinki-NLP/opus-mt-tam-en",
-    "English-Telugu": "Helsinki-NLP/opus-mt-en-tel",
-    "Telugu-English": "Helsinki-NLP/opus-mt-tel-en",
-}
-# Initialize models dictionary
-# Both caches are filled lazily by load_model_for_pair() and are keyed by the
-# same "<source>-<target>" string as LANGUAGE_PAIRS.
-models = {}
-tokenizers = {}
-def load_model_for_pair(source_lang, target_lang):
-    """Return the cached (model, tokenizer) for a language pair, loading on first use.
-
-    Pairs listed in LANGUAGE_PAIRS use their MarianMT checkpoint; any other
-    pair falls back to "t5-base". The model is moved to CUDA when available.
-    Returns (None, None) if anything raises while loading.
-    """
-    pair = f"{source_lang}-{target_lang}"
-    if pair in models:
-        return models.get(pair), tokenizers.get(pair)
-    try:
-        checkpoint = LANGUAGE_PAIRS.get(pair)
-        if checkpoint:
-            tokenizer_cls, model_cls = MarianTokenizer, MarianMTModel
-        else:
-            # Fallback to T5 for unsupported language pairs
-            checkpoint = "t5-base"
-            tokenizer_cls, model_cls = T5Tokenizer, T5ForConditionalGeneration
-        # Tokenizer first, then model: same cache-fill order as before
-        tokenizers[pair] = tokenizer_cls.from_pretrained(checkpoint)
-        models[pair] = model_cls.from_pretrained(checkpoint)
-        if torch.cuda.is_available():
-            models[pair] = models[pair].to("cuda")
-    except Exception as err:
-        print(f"Error loading model for {pair}: {str(err)}")
-        return None, None
-    return models.get(pair), tokenizers.get(pair)
-# Text extraction functions
-def extract_text_from_pdf(file_path):
-    """Extract text from a PDF, block by block, in reading order per page.
-
-    Args:
-        file_path: Path of the PDF to open with PyMuPDF.
-
-    Returns:
-        The text of all blocks joined by blank lines, or a string starting
-        with "Error extracting PDF text:" if anything raises.
-    """
-    doc = None
-    try:
-        text_blocks = []
-        doc = fitz.open(file_path)
         for page in doc:
-            # Get text blocks with position information
-            blocks = page.get_text("blocks")
-            # Sort blocks by vertical position then horizontal
-            blocks.sort(key=lambda b: (b[1], b[0]))
-            # b[4] contains the text
-            text_blocks.extend(b[4] for b in blocks)
-        return "\n\n".join(text_blocks)
-    except Exception as e:
-        return f"Error extracting PDF text: {str(e)}"
-    finally:
-        # Release the document even when extraction fails part-way through;
-        # previously it was never closed.
-        if doc is not None:
-            doc.close()
-def extract_text_from_docx(file_path):
-    """Return the text of a DOCX file with blank-line runs collapsed.
-
-    Any run of newline / whitespace / newline becomes exactly one blank line.
-    On failure a string starting with "Error extracting DOCX text:" is
-    returned instead of raising.
-    """
-    try:
-        raw_text = docx2txt.process(file_path)
-        # Clean up excessive newlines while preserving paragraphs
-        return re.sub(r'\n\s*\n', '\n\n', raw_text)
-    except Exception as err:
-        return f"Error extracting DOCX text: {str(err)}"
-def save_as_pdf(text, output_path):
-    """Write *text* to a PDF at *output_path*, one multi_cell per paragraph.
-
-    Paragraphs are the pieces of *text* separated by blank lines. A paragraph
-    that fails to render is reported on stdout and skipped. Returns
-    *output_path* on success, or a string starting with "Error creating PDF:"
-    if setup or the final write raises.
-    """
-    try:
-        document = FPDF()
-        document.add_page()
-        document.set_auto_page_break(auto=True, margin=15)
-        document.add_font('DejaVu', '', '/usr/share/fonts/truetype/dejavu/DejaVuSansCondensed.ttf', uni=True)
-        document.set_font('DejaVu', size=12)
-        for paragraph in text.split('\n\n'):
-            try:
-                document.multi_cell(0, 10, paragraph.strip())
-                # Add some space between paragraphs
-                document.ln(5)
-            except Exception as para_err:
-                # Best effort: skip the paragraph that could not be written
-                print(f"Error writing paragraph: {str(para_err)}")
-        document.output(output_path)
-        return output_path
-    except Exception as pdf_err:
-        return f"Error creating PDF: {str(pdf_err)}"
-def preprocess_text(text):
-    """Split *text* into chunks of whole sentences for translation.
-
-    Sentences are the non-empty, stripped pieces between "." characters.
-    They are packed greedily: a sentence starts a new chunk once the summed
-    sentence lengths of the current chunk would reach 512 characters. Each
-    chunk is its sentences joined by ". " with a trailing ".".
-    """
-    chunks = []
-    pending = []
-    pending_length = 0
-    for piece in (part.strip() for part in text.split('.')):
-        if not piece:
-            continue
-        if pending_length + len(piece) >= 512:
-            # Current chunk is full: flush it (if any) and start afresh
-            if pending:
-                chunks.append('. '.join(pending) + '.')
-            pending, pending_length = [], 0
-        pending.append(piece)
-        pending_length += len(piece)
-    if pending:
-        chunks.append('. '.join(pending) + '.')
-    return chunks
-def translate_text(text, source_lang, target_lang):
-    """Translate *text* chunk by chunk and return the joined translation.
-
-    Returns a fixed message when *text* is empty or when no model/tokenizer
-    could be loaded for the pair, and a "Translation Error: ..." string if
-    anything raises during translation.
-    """
-    if not text:
-        return "Please provide text to translate."
-    try:
-        model, tokenizer = load_model_for_pair(source_lang, target_lang)
-        if not (model and tokenizer):
-            return "Translation model not available for this language pair."
-
-        def _translate_chunk(chunk):
-            # Tokenize, move to the GPU when there is one, generate, decode
-            encoded = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
-            if torch.cuda.is_available():
-                encoded = {name: tensor.to("cuda") for name, tensor in encoded.items()}
-            with torch.no_grad():
-                generated = model.generate(**encoded, max_length=512, num_beams=4, early_stopping=True)
-            return tokenizer.decode(generated[0], skip_special_tokens=True)
-
-        # Preprocess and chunk the text, then combine the translations
-        return " ".join([_translate_chunk(chunk) for chunk in preprocess_text(text)])
-    except Exception as err:
-        return f"Translation Error: {str(err)}"
-def process_document(file, source_lang, target_lang):
-    """Extract, translate and re-export an uploaded document as a PDF.
-
-    Args:
-        file: Uploaded file object; only its ``name`` (a path) is used.
-        source_lang: Source language name passed to translate_text().
-        target_lang: Target language name passed to translate_text().
-
-    Returns:
-        ``(output_path, translated_text)`` on success, otherwise
-        ``(None, message)`` describing what went wrong.
-    """
-    if file is None:
-        return None, "No file uploaded."
-    try:
-        # Extract text based on file type
-        file_path = file.name
-        lowered_path = file_path.lower()
-        if lowered_path.endswith('.pdf'):
-            text = extract_text_from_pdf(file_path)
-        elif lowered_path.endswith('.docx'):
-            text = extract_text_from_docx(file_path)
-        elif lowered_path.endswith('.txt'):
-            with open(file_path, 'r', encoding='utf-8') as f:
-                text = f.read()
-        else:
-            return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."
-        # The extractors signal failure by returning an "Error extracting ..."
-        # string; report it instead of translating the error message.
-        if text.startswith("Error extracting"):
-            return None, text
-        # Translate the extracted text
-        translated_text = translate_text(text, source_lang, target_lang)
-        # translate_text() also reports problems as plain strings; do not
-        # render those into a PDF as if they were a translation.
-        failure_prefixes = (
-            "Translation Error:",
-            "Translation model not available",
-            "Please provide text to translate.",
-        )
-        if translated_text.startswith(failure_prefixes):
-            return None, translated_text
-        # Save translation as PDF
-        output_path = os.path.join(os.path.dirname(file_path),
-                                 f"translated_{os.path.basename(file_path)}.pdf")
-        result = save_as_pdf(translated_text, output_path)
-        if isinstance(result, str) and result.startswith("Error"):
-            return None, result
-        return output_path, translated_text
-    except Exception as e:
-        return None, f"Error processing document: {str(e)}"
-# Create Gradio interface
-# Derive the dropdown choices once and sort them: iterating a set of strings
-# has no stable order, so unsorted lists could change between runs.
-source_choices = sorted({pair.split('-')[0] for pair in LANGUAGE_PAIRS})
-target_choices = sorted({pair.split('-')[1] for pair in LANGUAGE_PAIRS})
-with gr.Blocks(title="Document and Text Translator") as demo:
-    gr.Markdown("# Advanced Document and Text Translator")
-    with gr.Tabs():
-        # Tab 1: translate text typed into a textbox
-        with gr.TabItem("Text Translation"):
-            with gr.Row():
-                with gr.Column():
-                    input_text = gr.Textbox(
-                        label="Input Text",
-                        placeholder="Enter text to translate...",
-                        lines=5
-                    )
-                    source_lang = gr.Dropdown(
-                        choices=source_choices,
-                        value="English",
-                        label="Source Language"
-                    )
-                    target_lang = gr.Dropdown(
-                        choices=target_choices,
-                        value="Hindi",
-                        label="Target Language"
-                    )
-                    translate_btn = gr.Button("Translate")
-                with gr.Column():
-                    output_text = gr.Textbox(
-                        label="Translation",
-                        lines=5
-                    )
-        # Tab 2: translate an uploaded PDF / DOCX / TXT file
-        with gr.TabItem("Document Translation"):
-            with gr.Row():
-                with gr.Column():
-                    file_input = gr.File(
-                        label="Upload Document",
-                        file_types=[".pdf", ".docx", ".txt"]
-                    )
-                    doc_source_lang = gr.Dropdown(
-                        choices=source_choices,
-                        value="English",
-                        label="Source Language"
-                    )
-                    doc_target_lang = gr.Dropdown(
-                        choices=target_choices,
-                        value="Hindi",
-                        label="Target Language"
-                    )
-                    translate_doc_btn = gr.Button("Translate Document")
-                with gr.Column():
-                    output_file = gr.File(label="Translated PDF")
-                    output_preview = gr.Textbox(
-                        label="Translation Preview",
-                        lines=8
-                    )
-    # Set up event handlers
-    translate_btn.click(
-        fn=translate_text,
-        inputs=[input_text, source_lang, target_lang],
-        outputs=output_text
     )
-    translate_doc_btn.click(
-        fn=process_document,
-        inputs=[file_input, doc_source_lang, doc_target_lang],
-        outputs=[output_file, output_preview]
     )
-if __name__ == "__main__":
-    demo.launch(share=True)

 import gradio as gr
+from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+import fitz  # PyMuPDF for PDF handling
+from io import BytesIO
+# Load IndicTrans model for Indian languages (example: English to Hindi)
+model_name = "ai4bharat/indictrans2-en-indic-1b"  # Supports multiple Indian languages
+# The IndicTrans2 checkpoints ship their own tokenizer/model code on the Hub,
+# so from_pretrained() has to be told to trust and run it.
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
+translator = pipeline("translation", model=model, tokenizer=tokenizer)
+# NOTE(review): translate_text() passes the two-letter codes below (and "en")
+# to this pipeline as src_lang/tgt_lang. Confirm against the model card which
+# language tags and pre/post-processing this checkpoint expects.
+# Language mapping for Indian languages and others
+# Keys are the names shown in the UI dropdown; values are language codes.
+language_map = {
+    "French": "fr",  # Using OPUS model for non-Indian languages
+    "Spanish": "es",
+    "German": "de",
+    "Hindi": "hi",
+    "Tamil": "ta",
+    "Telugu": "te",
+    "Bengali": "bn",
+    "Gujarati": "gu",
+    "Marathi": "mr",
+    "Kannada": "kn",
+    "Malayalam": "ml",
+    "Punjabi": "pa",
+}
+# OPUS-MT pipelines that have already been built, keyed by target language
+# code, so each checkpoint is loaded once instead of on every request.
+_opus_translators = {}
+def translate_text(input_text, target_language):
+    """Translate English *input_text* into *target_language*.
+
+    Args:
+        input_text: Text to translate.
+        target_language: A key of ``language_map`` (e.g. "Hindi", "French").
+
+    Returns:
+        The translated text, or "" when *input_text* is empty or whitespace.
+
+    Raises:
+        KeyError: If *target_language* is not a key of ``language_map``.
+    """
+    target_lang_code = language_map[target_language]
+    # Nothing to translate: skip the model call entirely
+    if not input_text or not input_text.strip():
+        return ""
+    # For Indian languages, use IndicTrans
+    if target_lang_code in ("hi", "ta", "te", "bn", "gu", "mr", "kn", "ml", "pa"):
+        # NOTE(review): confirm that this checkpoint accepts these codes as
+        # src_lang/tgt_lang through the generic translation pipeline.
+        translated = translator(input_text, src_lang="en", tgt_lang=target_lang_code)[0]['translation_text']
+    else:
+        # For non-Indian languages, switch to OPUS model (example: English to French)
+        opus_translator = _opus_translators.get(target_lang_code)
+        if opus_translator is None:
+            opus_translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_lang_code}")
+            _opus_translators[target_lang_code] = opus_translator
+        translated = opus_translator(input_text)[0]['translation_text']
+    return translated
+def process_input(input_data, target_language):
+    """Translate text or a PDF upload and write the result to a PDF file.
+
+    Args:
+        input_data: Either the text to translate (``str``) or a file-like
+            object whose ``read()`` returns the bytes of a PDF.
+        target_language: A key of ``language_map``, forwarded to translate_text().
+
+    Returns:
+        Path of a temporary ".pdf" file holding the translation. A path is
+        returned rather than an in-memory buffer because the result feeds a
+        ``gr.File`` output component.
+    """
+    import tempfile  # local import: only needed to create the output file
+
+    if isinstance(input_data, str):  # Direct text input
+        text = input_data
+    else:  # File input (assuming text-based document)
+        # NOTE(review): relies on the upload being a file-like object with
+        # read(); confirm what this Gradio version passes for gr.File inputs.
+        doc = fitz.open(stream=input_data.read(), filetype="pdf")
+        page_texts = []
         for page in doc:
+            page_texts.append(page.get_text())
+        doc.close()
+        text = "".join(page_texts)
+    # Translate the extracted text
+    translated_text = translate_text(text, target_language)
+    # Create PDF output
+    pdf_output = fitz.open()
+    page = pdf_output.new_page()
+    # NOTE(review): insert_text() starts at one point and does not wrap, and
+    # the default font may lack glyphs for Indic scripts; verify the rendering.
+    page.insert_text((50, 50), translated_text)
+    # Save PDF to a temporary file and hand back its path
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
+        output_path = tmp_file.name
+    pdf_output.save(output_path)
+    pdf_output.close()
+    return output_path
+# Gradio Interface
+with gr.Blocks(title="Context-Aware Translator with Indian Languages") as demo:
+    gr.Markdown("# Context-Aware Language Translator")
+    gr.Markdown("Translate text or upload a document into Indian languages or others, and get a PDF output.")
+    with gr.Row():
+        with gr.Column():
+            input_type = gr.Radio(["Text", "Document"], label="Input Type", value="Text")
+            text_input = gr.Textbox(lines=5, label="Enter Text", visible=True)
+            file_input = gr.File(label="Upload Document", visible=False)
+            target_lang = gr.Dropdown(
+                choices=list(language_map.keys()),
+                label="Target Language",
+                value="Hindi"
+            )
+            submit_btn = gr.Button("Translate")
+        with gr.Column():
+            output_pdf = gr.File(label="Download Translated PDF")
+    # Dynamic visibility based on input type
+    def update_visibility(choice):
+        """Show the textbox for "Text" and the file picker for "Document"."""
+        return (
+            gr.update(visible=(choice == "Text")),
+            gr.update(visible=(choice == "Document"))
+        )
+    input_type.change(
+        fn=update_visibility,
+        inputs=input_type,
+        outputs=[text_input, file_input]
     )
+    # Process the input and generate output
+    # gr.State takes no `_js` argument, so the active input cannot be read
+    # through a JavaScript snippet on a State. Pass the real components and
+    # choose between them in Python instead.
+    def run_translation(choice, text_value, file_value, language):
+        """Forward whichever input matches the selected type to process_input()."""
+        selected_input = text_value if choice == "Text" else file_value
+        if selected_input is None:
+            raise gr.Error("Please upload a document to translate.")
+        return process_input(selected_input, language)
+    submit_btn.click(
+        fn=run_translation,
+        inputs=[input_type, text_input, file_input, target_lang],
+        outputs=output_pdf
     )
+demo.launch()