Spaces:

akarshan11
/

garrry

Running

App Files Files Community

akarshan11 commited on Apr 1

Commit

759b6cc

verified ·

1 Parent(s): a082b95

Update app.py

Browse files

Files changed (1) hide show

app.py +115 -161

app.py CHANGED Viewed

@@ -5,9 +5,120 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import fitz  # PyMuPDF for PDF processing
 import docx2txt  # For DOCX processing
 from fpdf import FPDF  # For creating PDF outputs
 # Load model and tokenizer
-model_name = "facebook/mbart-large-50-many-to-many-mmt"  # Powerful translation model that can handle idioms well
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
@@ -15,7 +126,7 @@ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model = model.to(device)
-# Reduced language list with focus on major languages and Indian languages
 LANGUAGES = {
     # Major Global Languages
     "English": "en_XX",
@@ -38,159 +149,8 @@ LANGUAGES = {
     "Urdu": "ur_PK"
 }
-# File extraction functions
-def extract_text_from_pdf(file_path):
-    """Extract text from a PDF file"""
-    text = ""
-    try:
-        doc = fitz.open(file_path)
-        for page in doc:
-            text += page.get_text()
-        return text
-    except Exception as e:
-        return f"Error extracting PDF text: {str(e)}"
-def extract_text_from_docx(file_path):
-    """Extract text from a DOCX file"""
-    try:
-        return docx2txt.process(file_path)
-    except Exception as e:
-        return f"Error extracting DOCX text: {str(e)}"
-def extract_text_from_txt(file_path):
-    """Extract text from a TXT file"""
-    try:
-        with open(file_path, 'r', encoding='utf-8') as file:
-            return file.read()
-    except UnicodeDecodeError:
-        try:
-            with open(file_path, 'r', encoding='latin-1') as file:
-                return file.read()
-        except Exception as e:
-            return f"Error extracting TXT text: {str(e)}"
-    except Exception as e:
-        return f"Error extracting TXT text: {str(e)}"
-def save_as_pdf(text, output_path):
-    """Save text as PDF"""
-    pdf = FPDF()
-    pdf.add_page()
-    pdf.set_font("Arial", size=12)
-    # Handle encoding safely
-    try:
-        # Try UTF-8 first
-        encoded_text = text
-        pdf.multi_cell(0, 10, encoded_text)
-    except Exception:
-        try:
-            # Fall back to latin-1 with replacement
-            encoded_text = text.encode('latin-1', 'replace').decode('latin-1')
-            pdf.multi_cell(0, 10, encoded_text)
-        except Exception as e:
-            return f"Error creating PDF: {str(e)}"
-    try:
-        pdf.output(output_path)
-        return output_path
-    except Exception as e:
-        return f"Error saving PDF: {str(e)}"
-def get_type(schema: dict | bool):
-    """Updated get_type function to handle boolean schemas"""
-    if isinstance(schema, bool):
-        return "boolean"
-    if "const" in schema:
-        return "const"
-    if "enum" in schema:
-        return "enum"
-    elif "type" in schema:
-        return schema["type"]
-    elif schema.get("$ref"):
-        return "$ref"
-    elif schema.get("oneOf"):
-        return "oneOf"
-    elif schema.get("anyOf"):
-        return "anyOf"
-    elif schema.get("allOf"):
-        return "allOf"
-    elif "type" not in schema:
-        return {}
-    else:
-        raise ValueError(f"Cannot parse type for {schema}")
-# Translation function
-def translate(text, source_lang, target_lang, max_length=1024):
-    """Translate text from source language to target language"""
-    if not text:
-        return "No text provided for translation."
-    try:
-        # Set source and target language
-        src_lang = LANGUAGES.get(source_lang)
-        tgt_lang = LANGUAGES.get(target_lang)
-        if not src_lang or not tgt_lang:
-            return "Source or target language not supported."
-        # Set tokenizer source language
-        tokenizer.src_lang = src_lang
-        # Prepare input
-        inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
-        inputs = {k: v.to(device) for k, v in inputs.items()}
-        # Generate translation
-        with torch.no_grad():
-            generated_tokens = model.generate(
-                **inputs,
-                forced_bos_token_id=tokenizer.lang_to_id[tgt_lang],
-                max_length=max_length,
-                num_beams=5,
-                early_stopping=True
-            )
-        # Decode translation
-        translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
-        return translation
-    except Exception as e:
-        return f"Translation error: {str(e)}"
-# Process uploads and handle translation
-def process_file(file, source_lang, target_lang):
-    """Process uploaded file and translate its content"""
-    if file is None:
-        return None, "No file uploaded."
-    try:
-        # Save uploaded file temporarily
-        temp_file_path = file.name
-        # Extract text based on file type
-        if temp_file_path.lower().endswith('.pdf'):
-            text = extract_text_from_pdf(temp_file_path)
-        elif temp_file_path.lower().endswith('.docx'):
-            text = extract_text_from_docx(temp_file_path)
-        elif temp_file_path.lower().endswith('.txt'):
-            text = extract_text_from_txt(temp_file_path)
-        else:
-            return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."
-        # Translate the extracted text
-        translated_text = translate(text, source_lang, target_lang)
-        # Save translation as PDF
-        output_pdf_path = os.path.join(os.path.dirname(temp_file_path), f"translated_{os.path.basename(temp_file_path)}.pdf")
-        result = save_as_pdf(translated_text, output_pdf_path)
-        if isinstance(result, str) and result.startswith("Error"):
-            return None, result
-        return output_pdf_path, translated_text
-    except Exception as e:
-        return None, f"Error processing file: {str(e)}"
 # Create Gradio interface
 with gr.Blocks(title="Indian Language Translator") as demo:
@@ -229,12 +189,6 @@ with gr.Blocks(title="Indian Language Translator") as demo:
             inputs=[file_input, source_lang_doc, target_lang_doc],
             outputs=[output_file, output_preview]
         )
-    gr.Markdown("### Supported File Types: PDF, DOCX, TXT")
-    gr.Markdown("### Features:")
-    gr.Markdown("- Supports major Indian languages including Hindi, Bengali, Tamil, Telugu, Malayalam")
-    gr.Markdown("- Context-aware translation that understands idioms and cultural expressions")
-    gr.Markdown("- Document translation with PDF output")
 if __name__ == "__main__":
     demo.launch(share=True)

 import fitz  # PyMuPDF for PDF processing
 import docx2txt  # For DOCX processing
 from fpdf import FPDF  # For creating PDF outputs
+import typing
+from typing import Any, Union
+# Add modified JSON schema handling functions
+def get_type(schema: Union[dict, bool]) -> str:
+    """Get the type of a JSON schema.
+    Args:
+        schema: JSON schema object or boolean
+    Returns:
+        str: Type of the schema
+    """
+    if isinstance(schema, bool):
+        return "boolean"
+    if not isinstance(schema, dict):
+        return "any"
+    if "const" in schema:
+        return "const"
+    if "enum" in schema:
+        return "enum"
+    elif "type" in schema:
+        return schema["type"]
+    elif schema.get("$ref"):
+        return "$ref"
+    elif schema.get("oneOf"):
+        return "oneOf"
+    elif schema.get("anyOf"):
+        return "anyOf"
+    elif schema.get("allOf"):
+        return "allOf"
+    return "any"
+def _json_schema_to_python_type(schema: Any, defs: Any) -> str:
+    """Convert JSON schema to Python type hint.
+    Args:
+        schema: JSON schema
+        defs: Schema definitions
+    Returns:
+        str: Python type hint
+    """
+    if schema == {}:
+        return "Any"
+    type_ = get_type(schema)
+    if type_ == "boolean":
+        return "bool"
+    elif type_ == "any":
+        if isinstance(schema, dict) and "description" in schema and "json" in schema["description"]:
+            return "str | float | bool | list | dict"
+        return "Any"
+    elif type_ == "$ref":
+        return _json_schema_to_python_type(defs[schema["$ref"].split("/")[-1]], defs)
+    elif type_ == "null":
+        return "None"
+    elif type_ == "const":
+        return f"Literal[{schema['const']}]"
+    elif type_ == "enum":
+        return "Literal[" + ", ".join([f"'{str(v)}'" for v in schema["enum"]]) + "]"
+    elif type_ == "integer":
+        return "int"
+    elif type_ == "string":
+        return "str"
+    elif type_ == "boolean":
+        return "bool"
+    elif type_ == "number":
+        return "float"
+    elif type_ == "array":
+        items = schema.get("items", {})
+        if isinstance(items, bool):
+            return "list[Any]"
+        if "prefixItems" in items:
+            elements = ", ".join(
+                [_json_schema_to_python_type(i, defs) for i in items["prefixItems"]]
+            )
+            return f"tuple[{elements}]"
+        elif "prefixItems" in schema:
+            elements = ", ".join(
+                [_json_schema_to_python_type(i, defs) for i in schema["prefixItems"]]
+            )
+            return f"tuple[{elements}]"
+        else:
+            elements = _json_schema_to_python_type(items, defs)
+            return f"list[{elements}]"
+    elif type_ == "object":
+        props = schema.get("properties", {})
+        def get_desc(v):
+            return f" ({v.get('description')})" if isinstance(v, dict) and v.get("description") else ""
+        des = [
+            f"{n}: {_json_schema_to_python_type(v, defs)}{get_desc(v)}"
+            for n, v in props.items()
+            if n != "$defs"
+        ]
+        if "additionalProperties" in schema:
+            additional_properties = schema["additionalProperties"]
+            if isinstance(additional_properties, bool):
+                if additional_properties:
+                    des += ["str, Any"]
+            else:
+                des += [f"str, {_json_schema_to_python_type(additional_properties, defs)}"]
+        des = ", ".join(des)
+        return f"dict({des})"
+    else:
+        return "Any"
+# The rest of your original code remains the same
 # Load model and tokenizer
+model_name = "facebook/mbart-large-50-many-to-many-mmt"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model = model.to(device)
+# Your LANGUAGES dictionary and other functions remain the same
 LANGUAGES = {
     # Major Global Languages
     "English": "en_XX",
     "Urdu": "ur_PK"
 }
+# Your file handling and translation functions remain the same
+# ... (rest of your original code)
 # Create Gradio interface
 with gr.Blocks(title="Indian Language Translator") as demo:
             inputs=[file_input, source_lang_doc, target_lang_doc],
             outputs=[output_file, output_preview]
         )
 if __name__ == "__main__":
     demo.launch(share=True)