import os
import uuid
import tempfile
import re
import requests
import pandas as pd
from tika import parser
from docx import Document
from sentence_transformers import SentenceTransformer, util
import torch
import streamlit as st
from io import BytesIO
# Load the pre-trained embedding model for semantic matching.
model = SentenceTransformer('all-MiniLM-L6-v2')
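# (The model weights are downloaded and cached by sentence-transformers on first run.)
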
# -----------------------------
# Glossary Loader and Enforcement
# -----------------------------
def load_glossary(glossary_file) -> dict:
    """
    Load the company glossary from an Excel file.
    Expects columns: 'English' and 'CanadianFrench'.
    """
    try:
        # Use pandas to read directly from the uploaded file (BytesIO)
        df = pd.read_excel(glossary_file)
        glossary = {
            row['English'].strip().lower(): row['CanadianFrench'].strip()
            for _, row in df.iterrows()
            if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench'])
        }
        return glossary
    except Exception as e:
        raise Exception(f"Error loading glossary: {str(e)}")
def apply_glossary(text: str, glossary: dict) -> str:
    """
    Replace occurrences of glossary terms (exact word match) with the
    preferred Canadian French terms.
    """
    for eng_term, fr_term in glossary.items():
        pattern = r'\b' + re.escape(eng_term) + r'\b'
        text = re.sub(pattern, fr_term, text, flags=re.IGNORECASE)
    return text
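
# For example, with the hypothetical entry {"invoice": "facture"}:
#   apply_glossary("Send the invoice today.", {"invoice": "facture"})
#   returns "Send the facture today."
# The \b word boundaries prevent partial matches such as "invoices".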
# -----------------------------
# Semantic Glossary Enforcement
# -----------------------------
def compute_glossary_embeddings(glossary: dict):
    """
    Precompute embeddings for the glossary keys.
    """
    glossary_terms = list(glossary.keys())
    embeddings = model.encode(glossary_terms, convert_to_tensor=True)
    return glossary_terms, embeddings
def apply_semantic_glossary(text: str, glossary: dict, threshold: float = 0.8) -> str:
    """
    Enhance glossary enforcement using semantic similarity.
    Splits the text into sentences, computes an embedding for each, and if a
    sentence is semantically similar to a glossary term (above the threshold),
    performs the replacement within that sentence.
    """
    glossary_terms, glossary_embeddings = compute_glossary_embeddings(glossary)
    sentences = text.split('.')
    updated_sentences = []
    for sentence in sentences:
        if not sentence.strip():
            continue
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
        max_score, max_idx = torch.max(cos_scores, dim=1)
        if max_score.item() >= threshold:
            # .item() converts the one-element index tensor to a plain int.
            term = glossary_terms[max_idx.item()]
            replacement = glossary[term]
            pattern = r'\b' + re.escape(term) + r'\b'
            sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
        # Keep every non-empty sentence, whether or not a replacement fired.
        updated_sentences.append(sentence.strip())
    final_text = '. '.join(updated_sentences)
    return final_text
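
# Tuning note: 0.8 is a conservative default; lowering the threshold (e.g. to
# 0.6) catches looser paraphrases but risks false replacements. Also note the
# glossary embeddings are recomputed on every call; if that becomes a
# bottleneck, the result of compute_glossary_embeddings could be cached (for
# instance with st.cache_resource in this Streamlit app).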
# -----------------------------
# Translation using Azure Translator API
# -----------------------------
def translate_text_azure(text: str) -> str:
    """
    Translate text to Canadian French using the Azure Translator API.
    """
    subscription_key = os.getenv("AZURE_TRANSLATOR_KEY")
    region = os.getenv("AZURE_TRANSLATOR_REGION")
    if not subscription_key or not region:
        raise Exception("Azure Translator credentials not set.")
    endpoint = "https://api.cognitive.microsofttranslator.com/translate"
    params = {"api-version": "3.0", "to": "fr-CA"}
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key,
        "Ocp-Apim-Subscription-Region": region,
        "Content-type": "application/json",
        "X-ClientTraceId": str(uuid.uuid4())
    }
    body = [{"text": text}]
    response = requests.post(endpoint, params=params, headers=headers, json=body)
    if response.status_code != 200:
        raise Exception(f"Translation API error: {response.text}")
    result = response.json()
    translated_text = result[0]['translations'][0]['text']
    return translated_text
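
# Azure Translator caps the size of a single request, so very long documents
# may need to be sent in pieces. A minimal sketch (the 10,000-character chunk
# size is an assumption; check the current service limits):
def translate_text_azure_chunked(text: str, max_chars: int = 10000) -> str:
    """Translate text in newline-aligned chunks and rejoin the results."""
    chunks, current = [], ""
    for line in text.split("\n"):
        if current and len(current) + len(line) + 1 > max_chars:
            chunks.append(current)
            current = line
        else:
            current = f"{current}\n{line}" if current else line
    if current:
        chunks.append(current)
    return "\n".join(translate_text_azure(chunk) for chunk in chunks)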
# -----------------------------
# Document Parsing & Reconstruction
# -----------------------------
def parse_document(file_path: str) -> str:
    """
    Extract text content from a document using Apache Tika.
    """
    parsed = parser.from_file(file_path)
    text = parsed.get("content", "")
    if not text:
        raise Exception("No text content found in the document.")
    return text
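
# Note: the tika package talks to an Apache Tika server; on first use it
# downloads the server jar and starts it locally, which requires Java.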
def rebuild_document(text: str) -> bytes:
    """
    Rebuild a DOCX document from the provided text.
    Returns the document as bytes.
    """
    document = Document()
    for line in text.split("\n"):
        if line.strip():
            document.add_paragraph(line)
    bio = BytesIO()
    document.save(bio)
    bio.seek(0)
    return bio.getvalue()
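
# Usage sketch: rebuild_document("Bonjour\nMonde") yields DOCX bytes with two
# paragraphs, suitable for st.download_button. Only plain text is preserved;
# styles, tables, and images from the source document are not.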
# -----------------------------
# Processing Pipeline
# -----------------------------
def process_translation(doc_file, glossary_file) -> bytes:
    try:
        # Write the uploaded document to a temporary file, preserving the
        # original extension so the parser can detect the format.
        suffix = os.path.splitext(doc_file.name)[1] or ".tmp"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_doc:
            tmp_doc.write(doc_file.read())
            doc_path = tmp_doc.name
        # Load glossary from the uploaded Excel file
        glossary = load_glossary(glossary_file)
        # Parse document text
        raw_text = parse_document(doc_path)
        # Translate text via Azure Translator
        translated_text = translate_text_azure(raw_text)
        # Apply exact glossary enforcement
        final_text = apply_glossary(translated_text, glossary)
        # Apply semantic glossary enforcement
        final_text = apply_semantic_glossary(final_text, glossary, threshold=0.8)
        # Rebuild document to DOCX and get bytes
        output_bytes = rebuild_document(final_text)
        # Clean up temporary file
        os.unlink(doc_path)
        return output_bytes
    except Exception as e:
        st.error(f"Error: {str(e)}")
        return None
# -----------------------------
# Streamlit App UI
# -----------------------------
def main():
    st.title("English to Canadian French (Quebec) Translator")
    st.write("Upload an English document (Word or PDF) and your company glossary (Excel) to translate.")
    doc_file = st.file_uploader("Upload English Document", type=["doc", "docx", "pdf"])
    glossary_file = st.file_uploader("Upload Company Glossary (Excel)", type=["xlsx"])
    if st.button("Translate Document"):
        if doc_file is None or glossary_file is None:
            st.error("Please upload both the document and glossary files.")
        else:
            with st.spinner("Translating..."):
                result = process_translation(doc_file, glossary_file)
            if result is not None:
                st.download_button(
                    label="Download Translated DOCX",
                    data=result,
                    file_name="translated.docx",
                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                )

if __name__ == "__main__":
    main()