Spaces:

akarshan11
/

garrry

Running

App Files Files Community

akarshan11 committed on Apr 1

Commit

14515e7

verified ·

1 Parent(s): e68c325

Update app.py

Browse files

Files changed (1) hide show

app.py +146 -114

app.py CHANGED Viewed

@@ -5,118 +5,7 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import fitz  # PyMuPDF for PDF processing
 import docx2txt  # For DOCX processing
 from fpdf import FPDF  # For creating PDF outputs
-import typing
-from typing import Any, Union
-# Add modified JSON schema handling functions
-def get_type(schema: Union[dict, bool]) -> str:
-    """Get the type of a JSON schema.
-    Args:
-        schema: JSON schema object or boolean
-    Returns:
-        str: Type of the schema
-    """
-    if isinstance(schema, bool):
-        return "boolean"
-    if not isinstance(schema, dict):
-        return "any"
-    if "const" in schema:
-        return "const"
-    if "enum" in schema:
-        return "enum"
-    elif "type" in schema:
-        return schema["type"]
-    elif schema.get("$ref"):
-        return "$ref"
-    elif schema.get("oneOf"):
-        return "oneOf"
-    elif schema.get("anyOf"):
-        return "anyOf"
-    elif schema.get("allOf"):
-        return "allOf"
-    return "any"
-def _json_schema_to_python_type(schema: Any, defs: Any) -> str:
-    """Convert JSON schema to Python type hint.
-    Args:
-        schema: JSON schema
-        defs: Schema definitions
-    Returns:
-        str: Python type hint
-    """
-    if schema == {}:
-        return "Any"
-    type_ = get_type(schema)
-    if type_ == "boolean":
-        return "bool"
-    elif type_ == "any":
-        if isinstance(schema, dict) and "description" in schema and "json" in schema["description"]:
-            return "str | float | bool | list | dict"
-        return "Any"
-    elif type_ == "$ref":
-        return _json_schema_to_python_type(defs[schema["$ref"].split("/")[-1]], defs)
-    elif type_ == "null":
-        return "None"
-    elif type_ == "const":
-        return f"Literal[{schema['const']}]"
-    elif type_ == "enum":
-        return "Literal[" + ", ".join([f"'{str(v)}'" for v in schema["enum"]]) + "]"
-    elif type_ == "integer":
-        return "int"
-    elif type_ == "string":
-        return "str"
-    elif type_ == "boolean":
-        return "bool"
-    elif type_ == "number":
-        return "float"
-    elif type_ == "array":
-        items = schema.get("items", {})
-        if isinstance(items, bool):
-            return "list[Any]"
-        if "prefixItems" in items:
-            elements = ", ".join(
-                [_json_schema_to_python_type(i, defs) for i in items["prefixItems"]]
-            )
-            return f"tuple[{elements}]"
-        elif "prefixItems" in schema:
-            elements = ", ".join(
-                [_json_schema_to_python_type(i, defs) for i in schema["prefixItems"]]
-            )
-            return f"tuple[{elements}]"
-        else:
-            elements = _json_schema_to_python_type(items, defs)
-            return f"list[{elements}]"
-    elif type_ == "object":
-        props = schema.get("properties", {})
-        def get_desc(v):
-            return f" ({v.get('description')})" if isinstance(v, dict) and v.get("description") else ""
-        des = [
-            f"{n}: {_json_schema_to_python_type(v, defs)}{get_desc(v)}"
-            for n, v in props.items()
-            if n != "$defs"
-        ]
-        if "additionalProperties" in schema:
-            additional_properties = schema["additionalProperties"]
-            if isinstance(additional_properties, bool):
-                if additional_properties:
-                    des += ["str, Any"]
-            else:
-                des += [f"str, {_json_schema_to_python_type(additional_properties, defs)}"]
-        des = ", ".join(des)
-        return f"dict({des})"
-    else:
-        return "Any"
-# The rest of your original code remains the same
 # Load model and tokenizer
 model_name = "facebook/mbart-large-50-many-to-many-mmt"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -126,7 +15,7 @@ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model = model.to(device)
-# Your LANGUAGES dictionary and other functions remain the same
 LANGUAGES = {
     # Major Global Languages
     "English": "en_XX",
@@ -149,8 +38,145 @@ LANGUAGES = {
     "Urdu": "ur_PK"
 }
-# Your file handling and translation functions remain the same
-# ... (rest of your original code)
 # Create Gradio interface
 with gr.Blocks(title="Indian Language Translator") as demo:
@@ -189,6 +215,12 @@ with gr.Blocks(title="Indian Language Translator") as demo:
             inputs=[file_input, source_lang_doc, target_lang_doc],
             outputs=[output_file, output_preview]
         )
 if __name__ == "__main__":
     demo.launch(share=True)

 import fitz  # PyMuPDF for PDF processing
 import docx2txt  # For DOCX processing
 from fpdf import FPDF  # For creating PDF outputs
 # Load model and tokenizer
 model_name = "facebook/mbart-large-50-many-to-many-mmt"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model = model.to(device)
+# Language mappings
 LANGUAGES = {
     # Major Global Languages
     "English": "en_XX",
     "Urdu": "ur_PK"
 }
+# Define translation function first
+def translate(text: str, source_lang: str, target_lang: str, max_length: int = 1024) -> str:
+    """
+    Translate text from source language to target language.
+
+    Problems are reported through the returned string rather than raised,
+    so callers always get a ``str`` back.
+    Args:
+        text: Text to translate
+        source_lang: Source language name (looked up in LANGUAGES)
+        target_lang: Target language name (looked up in LANGUAGES)
+        max_length: Token limit used both when tokenizing the input
+            (longer input is truncated) and when generating the output
+    Returns:
+        str: Translated text, or a human-readable message when the input is
+        empty/blank, a language name is unknown, or translation raises
+    """
+    if not text or not text.strip():
+        return "No text provided for translation."
+    try:
+        # Map display names to the model's language codes
+        src_lang = LANGUAGES.get(source_lang)
+        tgt_lang = LANGUAGES.get(target_lang)
+        if not src_lang or not tgt_lang:
+            return "Source or target language not supported."
+        # The tokenizer needs the source language before encoding
+        tokenizer.src_lang = src_lang
+        inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+        # The target language is selected by forcing its language-code token
+        # as the first generated token. Resolve the id through the generic
+        # token lookup: the earlier `tokenizer.lang_to_id[...]` access is not
+        # an attribute the MBart-50 tokenizers document (they document
+        # `lang_code_to_id`), so it would fail on every call.
+        target_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)
+        with torch.no_grad():
+            generated_tokens = model.generate(
+                **inputs,
+                forced_bos_token_id=target_token_id,
+                max_length=max_length,
+                num_beams=5,
+                early_stopping=True
+            )
+        # Only one input sequence is submitted, so take the single decoded result
+        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+    except Exception as e:
+        return f"Translation error: {str(e)}"
+# File handling functions
+def extract_text_from_pdf(file_path: str) -> str:
+    """Extract text from a PDF file.
+
+    Args:
+        file_path: Path of the PDF to read
+    Returns:
+        str: Text of all pages concatenated in page order, or a message
+        starting with "Error extracting PDF text:" if anything raises
+    """
+    try:
+        # The context manager closes the document even if a page fails to
+        # parse; previously the handle was never closed.
+        with fitz.open(file_path) as doc:
+            return "".join(page.get_text() for page in doc)
+    except Exception as e:
+        return f"Error extracting PDF text: {str(e)}"
+def extract_text_from_docx(file_path: str) -> str:
+    """Return the text docx2txt extracts from the DOCX at ``file_path``.
+
+    Any exception is converted into a message starting with
+    "Error extracting DOCX text:" instead of propagating.
+    """
+    try:
+        extracted = docx2txt.process(file_path)
+    except Exception as exc:
+        return f"Error extracting DOCX text: {exc}"
+    return extracted
+def extract_text_from_txt(file_path: str) -> str:
+    """Read a text file, preferring UTF-8 and falling back to Latin-1.
+
+    Returns the file contents, or a message starting with
+    "Error extracting TXT text:" if reading fails for any other reason.
+    """
+    def _slurp(encoding: str) -> str:
+        # Read the whole file using the given text encoding
+        with open(file_path, 'r', encoding=encoding) as handle:
+            return handle.read()
+    try:
+        try:
+            return _slurp('utf-8')
+        except UnicodeDecodeError:
+            # Not valid UTF-8: retry with the single-byte fallback encoding
+            return _slurp('latin-1')
+    except Exception as exc:
+        return f"Error extracting TXT text: {exc}"
+def save_as_pdf(text: str, output_path: str) -> str:
+    """Write ``text`` to a single-flow PDF at ``output_path``.
+
+    Returns ``output_path`` on success, otherwise a message starting with
+    "Error creating PDF:" or "Error saving PDF:".
+    """
+    document = FPDF()
+    document.add_page()
+    document.set_font("Arial", size=12)
+    try:
+        # First attempt: hand the text to FPDF unchanged
+        document.multi_cell(0, 10, text)
+    except Exception:
+        # Second attempt: substitute every character Latin-1 cannot encode.
+        # NOTE(review): with the "Arial" font selected above, non-Latin
+        # scripts presumably reach this path and come out as "?" - confirm
+        # whether a Unicode font should be registered instead.
+        try:
+            latin1_safe = text.encode('latin-1', 'replace').decode('latin-1')
+            document.multi_cell(0, 10, latin1_safe)
+        except Exception as exc:
+            return f"Error creating PDF: {exc}"
+    try:
+        document.output(output_path)
+    except Exception as exc:
+        return f"Error saving PDF: {exc}"
+    return output_path
+def process_file(file, source_lang: str, target_lang: str) -> tuple[str | None, str]:
+    """Process uploaded file and translate its content.
+
+    Args:
+        file: Uploaded file object; only its ``name`` attribute (used as a
+            filesystem path) is read. ``None`` means nothing was uploaded.
+        source_lang: Source language name passed through to translate()
+        target_lang: Target language name passed through to translate()
+    Returns:
+        tuple: ``(output_pdf_path, translated_text)`` on success, or
+        ``(None, message)`` when no file was given, the extension is not
+        supported, or extraction, translation or PDF creation failed
+    """
+    if file is None:
+        return None, "No file uploaded."
+    try:
+        temp_file_path = file.name
+        lowered_path = temp_file_path.lower()
+        # Extract text based on file type
+        if lowered_path.endswith('.pdf'):
+            text = extract_text_from_pdf(temp_file_path)
+        elif lowered_path.endswith('.docx'):
+            text = extract_text_from_docx(temp_file_path)
+        elif lowered_path.endswith('.txt'):
+            text = extract_text_from_txt(temp_file_path)
+        else:
+            return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."
+        # The extractors signal failure by returning an "Error extracting ..."
+        # message; surface it instead of translating it as document content.
+        if isinstance(text, str) and text.startswith("Error extracting"):
+            return None, text
+        # Translate the extracted text
+        translated_text = translate(text, source_lang, target_lang)
+        # translate() also reports problems through its return value; do not
+        # save those status messages into a PDF as if they were a translation.
+        if translated_text.startswith("Translation error:") or translated_text in (
+            "No text provided for translation.",
+            "Source or target language not supported.",
+        ):
+            return None, translated_text
+        # Save translation as PDF next to the uploaded file
+        output_pdf_path = os.path.join(os.path.dirname(temp_file_path),
+                                     f"translated_{os.path.basename(temp_file_path)}.pdf")
+        result = save_as_pdf(translated_text, output_pdf_path)
+        if isinstance(result, str) and result.startswith("Error"):
+            return None, result
+        return output_pdf_path, translated_text
+    except Exception as e:
+        return None, f"Error processing file: {str(e)}"
 # Create Gradio interface
 with gr.Blocks(title="Indian Language Translator") as demo:
             inputs=[file_input, source_lang_doc, target_lang_doc],
             outputs=[output_file, output_preview]
         )
+    gr.Markdown("### Supported File Types: PDF, DOCX, TXT")
+    gr.Markdown("### Features:")
+    gr.Markdown("- Supports major Indian languages including Hindi, Bengali, Tamil, Telugu, Malayalam")
+    gr.Markdown("- Context-aware translation that understands idioms and cultural expressions")
+    gr.Markdown("- Document translation with PDF output")
 if __name__ == "__main__":
     demo.launch(share=True)