Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import uuid
|
3 |
+
import tempfile
|
4 |
+
import re
|
5 |
+
import requests
|
6 |
+
import pandas as pd
|
7 |
+
from tika import parser
|
8 |
+
from docx import Document
|
9 |
+
from sentence_transformers import SentenceTransformer, util
|
10 |
+
import torch
|
11 |
+
import streamlit as st
|
12 |
+
from io import BytesIO
|
13 |
+
|
14 |
+
# Load the pre-trained embedding model for semantic matching.
# NOTE(review): loaded once at module import time — every Streamlit rerun
# reuses it, but the first start downloads the weights; presumably acceptable
# for this deployment — confirm. all-MiniLM-L6-v2 is a compact
# sentence-embedding model used by apply_semantic_glossary below.
model = SentenceTransformer('all-MiniLM-L6-v2')
16 |
+
|
17 |
+
# -----------------------------
|
18 |
+
# Glossary Loader and Enforcement
|
19 |
+
# -----------------------------
|
20 |
+
def load_glossary(glossary_file) -> dict:
    """
    Load the company glossary from an Excel file.

    Expects columns 'English' and 'CanadianFrench'; rows where either cell
    is missing are skipped.

    Args:
        glossary_file: Path or file-like object accepted by
            ``pd.read_excel`` (e.g. a Streamlit upload).

    Returns:
        dict mapping lowercased English terms to Canadian French terms.

    Raises:
        Exception: if the file cannot be read or parsed.
    """
    try:
        df = pd.read_excel(glossary_file)
        return {
            # str() guards against non-string cells (e.g. a numeric term),
            # which would otherwise crash on .strip().
            str(row['English']).strip().lower(): str(row['CanadianFrench']).strip()
            for _, row in df.iterrows()
            if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench'])
        }
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise Exception(f"Error loading glossary: {str(e)}") from e
36 |
+
|
37 |
+
def apply_glossary(text: str, glossary: dict) -> str:
    """
    Replace exact (word-boundary) occurrences of glossary terms with their
    preferred Canadian French equivalents.

    Terms are applied longest-first so a short entry ("cat") cannot corrupt
    a longer phrase entry that contains it ("cat food") by being replaced
    first — the original dict-order iteration had exactly that bug.

    Args:
        text: Text to post-process (typically machine-translated output).
        glossary: Mapping of lowercased English terms to French terms.

    Returns:
        The text with glossary terms substituted (case-insensitive match).
    """
    # Longest terms first: multi-word phrases win over their sub-terms.
    for eng_term in sorted(glossary, key=len, reverse=True):
        pattern = r'\b' + re.escape(eng_term) + r'\b'
        text = re.sub(pattern, glossary[eng_term], text, flags=re.IGNORECASE)
    return text
45 |
+
|
46 |
+
# -----------------------------
|
47 |
+
# Semantic Glossary Enforcement
|
48 |
+
# -----------------------------
|
49 |
+
def compute_glossary_embeddings(glossary: dict):
    """
    Precompute sentence embeddings for every glossary key.

    Args:
        glossary: Mapping of English terms to Canadian French terms.

    Returns:
        A ``(terms, embeddings)`` pair: the list of glossary keys and the
        tensor of their embeddings from the module-level model.
    """
    terms = [term for term in glossary]
    term_embeddings = model.encode(terms, convert_to_tensor=True)
    return terms, term_embeddings
56 |
+
|
57 |
+
def apply_semantic_glossary(text: str, glossary: dict, threshold: float = 0.8) -> str:
    """
    Enhance glossary enforcement using semantic similarity.

    Splits the text into sentences, embeds them all in ONE batch (the
    original called the model once per sentence), and for each sentence
    whose best-matching glossary term scores at or above ``threshold``
    performs a case-insensitive word-boundary replacement of that term.

    Args:
        text: Text to post-process.
        glossary: Mapping of English terms to Canadian French terms.
        threshold: Minimum cosine similarity for a replacement.

    Returns:
        The processed text, sentences re-joined with '. '.
    """
    # Empty glossary: nothing to enforce, and encoding an empty term list
    # would make torch.max fail below.
    if not glossary:
        return text
    glossary_terms, glossary_embeddings = compute_glossary_embeddings(glossary)
    # NOTE: naive split — abbreviations like "e.g." count as sentence ends,
    # matching the original behavior. Blank fragments are dropped as before.
    sentences = [s for s in text.split('.') if s.strip()]
    if not sentences:
        return text
    # One batched forward pass instead of len(sentences) separate calls.
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(sentence_embeddings, glossary_embeddings)
    max_scores, max_indices = torch.max(cos_scores, dim=1)
    updated_sentences = []
    for sentence, score, idx in zip(sentences, max_scores, max_indices):
        if score.item() >= threshold:
            term = glossary_terms[idx]
            pattern = r'\b' + re.escape(term) + r'\b'
            sentence = re.sub(pattern, glossary[term], sentence, flags=re.IGNORECASE)
        updated_sentences.append(sentence.strip())
    return '. '.join(updated_sentences)
80 |
+
|
81 |
+
# -----------------------------
|
82 |
+
# Translation using Azure Translator API
|
83 |
+
# -----------------------------
|
84 |
+
def translate_text_azure(text: str) -> str:
    """
    Translate text to Canadian French (fr-CA) via the Azure Translator API.

    Credentials are read from the AZURE_TRANSLATOR_KEY and
    AZURE_TRANSLATOR_REGION environment variables.

    Args:
        text: Source (English) text to translate.

    Returns:
        The translated text.

    Raises:
        Exception: if credentials are missing or the API returns a
            non-200 response.
    """
    subscription_key = os.getenv("AZURE_TRANSLATOR_KEY")
    region = os.getenv("AZURE_TRANSLATOR_REGION")
    if not subscription_key or not region:
        raise Exception("Azure Translator credentials not set.")

    endpoint = "https://api.cognitive.microsofttranslator.com/translate"
    params = {"api-version": "3.0", "to": "fr-CA"}
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key,
        "Ocp-Apim-Subscription-Region": region,
        "Content-type": "application/json",
        # Unique trace id lets failures be correlated with Azure-side logs.
        "X-ClientTraceId": str(uuid.uuid4())
    }
    body = [{"text": text}]
    # timeout= prevents the Streamlit worker from hanging forever when the
    # service is unreachable (the original had no timeout at all).
    response = requests.post(endpoint, params=params, headers=headers,
                             json=body, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Translation API error: {response.text}")
    result = response.json()
    translated_text = result[0]['translations'][0]['text']
    return translated_text
108 |
+
|
109 |
+
# -----------------------------
|
110 |
+
# Document Parsing & Reconstruction
|
111 |
+
# -----------------------------
|
112 |
+
def parse_document(file_path: str) -> str:
    """
    Extract the text content of a document via Apache Tika.

    Args:
        file_path: Path of the file to parse (PDF, DOC, DOCX, ...).

    Returns:
        The extracted text.

    Raises:
        Exception: when Tika yields no text content for the document.
    """
    extracted = parser.from_file(file_path).get("content", "")
    if extracted:
        return extracted
    raise Exception("No text content found in the document.")
121 |
+
|
122 |
+
def rebuild_document(text: str) -> bytes:
    """
    Build a fresh DOCX document from the provided text.

    Each non-blank line of ``text`` becomes one paragraph; blank lines are
    dropped.

    Args:
        text: The (translated) plain text.

    Returns:
        The serialized .docx file as raw bytes.
    """
    doc = Document()
    for paragraph_text in (line for line in text.split("\n") if line.strip()):
        doc.add_paragraph(paragraph_text)
    buffer = BytesIO()
    doc.save(buffer)
    # getvalue() returns the whole buffer regardless of stream position.
    return buffer.getvalue()
135 |
+
|
136 |
+
# -----------------------------
|
137 |
+
# Processing Pipeline
|
138 |
+
# -----------------------------
|
139 |
+
def process_translation(doc_file, glossary_file) -> bytes:
    """
    Run the full pipeline: persist the upload, parse it, translate it,
    enforce the glossary (exact then semantic), and rebuild a DOCX.

    Args:
        doc_file: Uploaded document (file-like; Streamlit UploadedFile).
        glossary_file: Uploaded Excel glossary (file-like).

    Returns:
        The translated document as DOCX bytes, or None on failure (the
        error is surfaced in the Streamlit UI via st.error).
    """
    doc_path = None
    try:
        # Keep the upload's real extension so Tika gets a correct hint;
        # the original hard-coded ".pdf" even for Word uploads.
        suffix = os.path.splitext(getattr(doc_file, "name", ""))[1] or ".pdf"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_doc:
            tmp_doc.write(doc_file.read())
            doc_path = tmp_doc.name

        # Load glossary from the uploaded Excel file.
        glossary = load_glossary(glossary_file)

        # Parse, translate, then enforce the glossary twice: exact match
        # first, semantic similarity second.
        raw_text = parse_document(doc_path)
        translated_text = translate_text_azure(raw_text)
        final_text = apply_glossary(translated_text, glossary)
        final_text = apply_semantic_glossary(final_text, glossary, threshold=0.8)

        return rebuild_document(final_text)
    except Exception as e:
        st.error(f"Error: {str(e)}")
        return None
    finally:
        # Remove the temp file on success AND failure — the original only
        # unlinked on the success path, leaking a file per failed run.
        if doc_path and os.path.exists(doc_path):
            os.unlink(doc_path)
170 |
+
|
171 |
+
# -----------------------------
|
172 |
+
# Streamlit App UI
|
173 |
+
# -----------------------------
|
174 |
+
def main():
    """Streamlit UI: collect the document and glossary uploads, run the
    translation pipeline, and offer the result as a DOCX download."""
    st.title("English to Canadian Quebec French Translator")
    st.write("Upload an English document (Word or PDF) and your company glossary (Excel) to translate.")

    doc_file = st.file_uploader("Upload English Document", type=["doc", "docx", "pdf"])
    glossary_file = st.file_uploader("Upload Company Glossary (Excel)", type=["xlsx"])

    # Guard clauses: bail out early instead of nesting the happy path.
    if not st.button("Translate Document"):
        return
    if doc_file is None or glossary_file is None:
        st.error("Please upload both the document and glossary files.")
        return
    with st.spinner("Translating..."):
        result = process_translation(doc_file, glossary_file)
        if result is not None:
            st.download_button(
                label="Download Translated DOCX",
                data=result,
                file_name="translated.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            )
194 |
+
|
195 |
+
# Script entry point: build the Streamlit UI when executed directly
# (e.g. `streamlit run app.py`).
if __name__ == "__main__":
    main()