import os
import uuid
import tempfile
import re
import requests
import pandas as pd
from tika import parser
from docx import Document
from sentence_transformers import SentenceTransformer, util
import torch
import streamlit as st
from io import BytesIO
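
# Assumed environment (not pinned anywhere in this file): streamlit, pandas,
# requests, tika, python-docx, sentence-transformers, torch, and openpyxl
# (the engine pandas uses to read .xlsx files). Note that tika-python starts
# a local Apache Tika server on first use, so a Java runtime must be installed.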

# Load the pre-trained embedding model for semantic matching, cached with
# st.cache_resource so Streamlit reruns do not reload it on every interaction.
@st.cache_resource
def _load_model() -> SentenceTransformer:
    return SentenceTransformer('all-MiniLM-L6-v2')

model = _load_model()

# -----------------------------
# Glossary Loader and Enforcement
# -----------------------------
def load_glossary(glossary_file) -> dict:
    """
    Load the company glossary from an Excel file.
    Expects columns: 'English' and 'CanadianFrench'
    """
    try:
        # Use pandas to read directly from the uploaded file (BytesIO)
        df = pd.read_excel(glossary_file)
        glossary = {
            str(row['English']).strip().lower(): str(row['CanadianFrench']).strip()
            for _, row in df.iterrows()
            if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench'])
        }
        return glossary
    except Exception as e:
        raise RuntimeError(f"Error loading glossary: {e}") from e

def apply_glossary(text: str, glossary: dict) -> str:
    """
    Replace occurrences of glossary terms (exact word match) with preferred Canadian French terms.
    """
    for eng_term, fr_term in glossary.items():
        pattern = r'\b' + re.escape(eng_term) + r'\b'
        text = re.sub(pattern, fr_term, text, flags=re.IGNORECASE)
    return text

# -----------------------------
# Semantic Glossary Enforcement
# -----------------------------
def compute_glossary_embeddings(glossary: dict):
    """
    Precompute embeddings for the glossary keys.
    """
    glossary_terms = list(glossary.keys())
    embeddings = model.encode(glossary_terms, convert_to_tensor=True)
    return glossary_terms, embeddings

def apply_semantic_glossary(text: str, glossary: dict, threshold: float = 0.8) -> str:
    """
    Enhance glossary enforcement using semantic similarity.
    Splits text into sentences, embeds them in one batch, and if a sentence
    is semantically similar to a glossary term (above threshold), performs
    the replacement within that sentence.
    """
    glossary_terms, glossary_embeddings = compute_glossary_embeddings(glossary)
    sentences = [s for s in text.split('.') if s.strip()]
    if not sentences:
        return text
    # Encode all sentences in a single batch rather than one call per sentence.
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(sentence_embeddings, glossary_embeddings)
    max_scores, max_indices = torch.max(cos_scores, dim=1)
    updated_sentences = []
    for sentence, score, idx in zip(sentences, max_scores, max_indices):
        if score.item() >= threshold:
            term = glossary_terms[idx.item()]
            replacement = glossary[term]
            pattern = r'\b' + re.escape(term) + r'\b'
            sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
        updated_sentences.append(sentence.strip())
    return '. '.join(updated_sentences)
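
# Optional sketch: splitting on '.' above drops '?' and '!' boundaries and
# breaks on abbreviations such as "e.g." or "M.". A slightly more robust
# heuristic (still no substitute for a real sentence tokenizer) would be:
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

def split_sentences(text: str) -> list:
    """Split on sentence-ending punctuation followed by whitespace."""
    return [s for s in _SENTENCE_BOUNDARY.split(text) if s.strip()]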

# -----------------------------
# Translation using Azure Translator API
# -----------------------------
def translate_text_azure(text: str) -> str:
    """
    Translate text to Canadian French using the Azure Translator API.
    """
    subscription_key = os.getenv("AZURE_TRANSLATOR_KEY")
    region = os.getenv("AZURE_TRANSLATOR_REGION")
    if not subscription_key or not region:
        raise RuntimeError(
            "Azure Translator credentials not set. Define the AZURE_TRANSLATOR_KEY "
            "and AZURE_TRANSLATOR_REGION environment variables."
        )
    
    endpoint = "https://api.cognitive.microsofttranslator.com/translate"
    params = {"api-version": "3.0", "to": "fr-CA"}
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key,
        "Ocp-Apim-Subscription-Region": region,
        "Content-type": "application/json",
        "X-ClientTraceId": str(uuid.uuid4())
    }
    body = [{"text": text}]
    response = requests.post(endpoint, params=params, headers=headers, json=body, timeout=60)
    if response.status_code != 200:
        raise RuntimeError(f"Translation API error: {response.text}")
    result = response.json()
    translated_text = result[0]['translations'][0]['text']
    return translated_text
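
# Optional sketch: Azure Translator v3.0 caps the characters per request
# (50,000 at the time of writing), so very long documents may need to be
# translated in chunks. This hypothetical helper splits on paragraph
# boundaries and translates each chunk separately.
def translate_text_azure_chunked(text: str, max_chars: int = 40000) -> str:
    chunks, current = [], ""
    for paragraph in text.split("\n"):
        # Start a new chunk once adding this paragraph would exceed the cap.
        if current and len(current) + len(paragraph) + 1 > max_chars:
            chunks.append(current)
            current = ""
        current += paragraph + "\n"
    if current:
        chunks.append(current)
    return "".join(translate_text_azure(chunk) for chunk in chunks)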

# -----------------------------
# Document Parsing & Reconstruction
# -----------------------------
def parse_document(file_path: str) -> str:
    """
    Extract text content from a document using Apache Tika.
    """
    parsed = parser.from_file(file_path)
    text = parsed.get("content", "")
    if not text:
        raise Exception("No text content found in the document.")
    return text

def rebuild_document(text: str) -> bytes:
    """
    Rebuild a DOCX document from the provided text, one paragraph per line.
    Only plain text is preserved; the original styling, tables, and images
    are not reconstructed. Returns the document as bytes.
    """
    document = Document()
    for line in text.split("\n"):
        if line.strip():
            document.add_paragraph(line)
    bio = BytesIO()
    document.save(bio)
    bio.seek(0)
    return bio.getvalue()

# -----------------------------
# Processing Pipeline
# -----------------------------
def process_translation(doc_file, glossary_file) -> bytes:
    doc_path = None
    try:
        # Write the uploaded document to a temporary file, keeping its
        # original extension (Tika sniffs the type from content anyway).
        suffix = os.path.splitext(doc_file.name)[1] or ".tmp"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_doc:
            tmp_doc.write(doc_file.read())
            doc_path = tmp_doc.name

        # Load glossary from the uploaded Excel file
        glossary = load_glossary(glossary_file)

        # Parse document text
        raw_text = parse_document(doc_path)

        # Translate text via Azure Translator
        translated_text = translate_text_azure(raw_text)

        # Apply exact glossary enforcement
        final_text = apply_glossary(translated_text, glossary)

        # Apply semantic glossary enforcement
        final_text = apply_semantic_glossary(final_text, glossary, threshold=0.8)

        # Rebuild document to DOCX and return its bytes
        return rebuild_document(final_text)
    except Exception as e:
        st.error(f"Error: {e}")
        return None
    finally:
        # Remove the temporary file even if a step above failed.
        if doc_path and os.path.exists(doc_path):
            os.unlink(doc_path)

# -----------------------------
# Streamlit App UI
# -----------------------------
def main():
    st.title("English to Canadian Quebec French Translator")
    st.write("Upload an English document (Word or PDF) and your company glossary (Excel) to translate.")
    
    doc_file = st.file_uploader("Upload English Document", type=["doc", "docx", "pdf"])
    glossary_file = st.file_uploader("Upload Company Glossary (Excel)", type=["xlsx"])
    
    if st.button("Translate Document"):
        if doc_file is None or glossary_file is None:
            st.error("Please upload both the document and glossary files.")
        else:
            with st.spinner("Translating..."):
                result = process_translation(doc_file, glossary_file)
                if result is not None:
                    st.download_button(
                        label="Download Translated DOCX",
                        data=result,
                        file_name="translated.docx",
                        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                    )

if __name__ == "__main__":
    main()
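
# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py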