Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import uuid
|
3 |
+
import tempfile
|
4 |
+
import re
|
5 |
+
import requests
|
6 |
+
import pandas as pd
|
7 |
+
from tika import parser
|
8 |
+
from docx import Document
|
9 |
+
from sentence_transformers import SentenceTransformer, util
|
10 |
+
import torch
|
11 |
+
import streamlit as st
|
12 |
+
from io import BytesIO
|
13 |
+
|
14 |
+
# Load the pre-trained embedding model for semantic matching.
# NOTE(review): loaded once at module import time — every Streamlit rerun
# reuses it, but the first start downloads the weights; presumably acceptable
# for this deployment — confirm. all-MiniLM-L6-v2 is a compact
# sentence-embedding model used by apply_semantic_glossary below.
model = SentenceTransformer('all-MiniLM-L6-v2')
16 |
+
|
17 |
+
# -----------------------------
|
18 |
+
# Glossary Loader and Enforcement
|
19 |
+
# -----------------------------
|
20 |
+
def load_glossary(glossary_file) -> dict:
    """
    Load the company glossary from an Excel file.

    Expects columns 'English' and 'CanadianFrench'; rows where either cell
    is missing are skipped.

    Args:
        glossary_file: Path or file-like object accepted by
            ``pd.read_excel`` (e.g. a Streamlit upload).

    Returns:
        dict mapping lowercased English terms to Canadian French terms.

    Raises:
        Exception: if the file cannot be read or parsed.
    """
    try:
        df = pd.read_excel(glossary_file)
        return {
            # str() guards against non-string cells (e.g. a numeric term),
            # which would otherwise crash on .strip().
            str(row['English']).strip().lower(): str(row['CanadianFrench']).strip()
            for _, row in df.iterrows()
            if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench'])
        }
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise Exception(f"Error loading glossary: {str(e)}") from e
36 |
+
|
37 |
+
def apply_glossary(text: str, glossary: dict) -> str:
    """
    Replace exact (word-boundary) occurrences of glossary terms with their
    preferred Canadian French equivalents.

    Terms are applied longest-first so a short entry ("cat") cannot corrupt
    a longer phrase entry that contains it ("cat food") by being replaced
    first — the original dict-order iteration had exactly that bug.

    Args:
        text: Text to post-process (typically machine-translated output).
        glossary: Mapping of lowercased English terms to French terms.

    Returns:
        The text with glossary terms substituted (case-insensitive match).
    """
    # Longest terms first: multi-word phrases win over their sub-terms.
    for eng_term in sorted(glossary, key=len, reverse=True):
        pattern = r'\b' + re.escape(eng_term) + r'\b'
        text = re.sub(pattern, glossary[eng_term], text, flags=re.IGNORECASE)
    return text
45 |
+
|
46 |
+
# -----------------------------
|
47 |
+
# Semantic Glossary Enforcement
|
48 |
+
# -----------------------------
|
49 |
+
def compute_glossary_embeddings(glossary: dict):
    """
    Precompute sentence embeddings for every glossary key.

    Args:
        glossary: Mapping of English terms to Canadian French terms.

    Returns:
        A ``(terms, embeddings)`` pair: the list of glossary keys and the
        tensor of their embeddings from the module-level model.
    """
    terms = [term for term in glossary]
    term_embeddings = model.encode(terms, convert_to_tensor=True)
    return terms, term_embeddings
56 |
+
|
57 |
+
def apply_semantic_glossary(text: str, glossary: dict, threshold: float = 0.8) -> str:
    """
    Enhance glossary enforcement using semantic similarity.

    Splits the text into sentences, embeds them all in ONE batch (the
    original called the model once per sentence), and for each sentence
    whose best-matching glossary term scores at or above ``threshold``
    performs a case-insensitive word-boundary replacement of that term.

    Args:
        text: Text to post-process.
        glossary: Mapping of English terms to Canadian French terms.
        threshold: Minimum cosine similarity for a replacement.

    Returns:
        The processed text, sentences re-joined with '. '.
    """
    # Empty glossary: nothing to enforce, and encoding an empty term list
    # would make torch.max fail below.
    if not glossary:
        return text
    glossary_terms, glossary_embeddings = compute_glossary_embeddings(glossary)
    # NOTE: naive split — abbreviations like "e.g." count as sentence ends,
    # matching the original behavior. Blank fragments are dropped as before.
    sentences = [s for s in text.split('.') if s.strip()]
    if not sentences:
        return text
    # One batched forward pass instead of len(sentences) separate calls.
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(sentence_embeddings, glossary_embeddings)
    max_scores, max_indices = torch.max(cos_scores, dim=1)
    updated_sentences = []
    for sentence, score, idx in zip(sentences, max_scores, max_indices):
        if score.item() >= threshold:
            term = glossary_terms[idx]
            pattern = r'\b' + re.escape(term) + r'\b'
            sentence = re.sub(pattern, glossary[term], sentence, flags=re.IGNORECASE)
        updated_sentences.append(sentence.strip())
    return '. '.join(updated_sentences)
80 |
+
|
81 |
+
# -----------------------------
|
82 |
+
# Translation using Azure Translator API
|
83 |
+
# -----------------------------
|
84 |
+
def translate_text_azure(text: str) -> str:
    """
    Translate text to Canadian French (fr-CA) via the Azure Translator API.

    Credentials are read from the AZURE_TRANSLATOR_KEY and
    AZURE_TRANSLATOR_REGION environment variables.

    Args:
        text: Source (English) text to translate.

    Returns:
        The translated text.

    Raises:
        Exception: if credentials are missing or the API returns a
            non-200 response.
    """
    subscription_key = os.getenv("AZURE_TRANSLATOR_KEY")
    region = os.getenv("AZURE_TRANSLATOR_REGION")
    if not subscription_key or not region:
        raise Exception("Azure Translator credentials not set.")

    endpoint = "https://api.cognitive.microsofttranslator.com/translate"
    params = {"api-version": "3.0", "to": "fr-CA"}
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key,
        "Ocp-Apim-Subscription-Region": region,
        "Content-type": "application/json",
        # Unique trace id lets failures be correlated with Azure-side logs.
        "X-ClientTraceId": str(uuid.uuid4())
    }
    body = [{"text": text}]
    # timeout= prevents the Streamlit worker from hanging forever when the
    # service is unreachable (the original had no timeout at all).
    response = requests.post(endpoint, params=params, headers=headers,
                             json=body, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Translation API error: {response.text}")
    result = response.json()
    translated_text = result[0]['translations'][0]['text']
    return translated_text
108 |
+
|
109 |
+
# -----------------------------
|
110 |
+
# Document Parsing & Reconstruction
|
111 |
+
# -----------------------------
|
112 |
+
def parse_document(file_path: str) -> str:
    """
    Extract the text content of a document via Apache Tika.

    Args:
        file_path: Path of the file to parse (PDF, DOC, DOCX, ...).

    Returns:
        The extracted text.

    Raises:
        Exception: when Tika yields no text content for the document.
    """
    extracted = parser.from_file(file_path).get("content", "")
    if extracted:
        return extracted
    raise Exception("No text content found in the document.")
121 |
+
|
122 |
+
def rebuild_document(text: str) -> bytes:
    """
    Build a fresh DOCX document from the provided text.

    Each non-blank line of ``text`` becomes one paragraph; blank lines are
    dropped.

    Args:
        text: The (translated) plain text.

    Returns:
        The serialized .docx file as raw bytes.
    """
    doc = Document()
    for paragraph_text in (line for line in text.split("\n") if line.strip()):
        doc.add_paragraph(paragraph_text)
    buffer = BytesIO()
    doc.save(buffer)
    # getvalue() returns the whole buffer regardless of stream position.
    return buffer.getvalue()
135 |
+
|
136 |
+
# -----------------------------
|
137 |
+
# Processing Pipeline
|
138 |
+
# -----------------------------
|
139 |
+
def process_translation(doc_file, glossary_file) -> bytes:
    """
    Run the full pipeline: persist the upload, parse it, translate it,
    enforce the glossary (exact then semantic), and rebuild a DOCX.

    Args:
        doc_file: Uploaded document (file-like; Streamlit UploadedFile).
        glossary_file: Uploaded Excel glossary (file-like).

    Returns:
        The translated document as DOCX bytes, or None on failure (the
        error is surfaced in the Streamlit UI via st.error).
    """
    doc_path = None
    try:
        # Keep the upload's real extension so Tika gets a correct hint;
        # the original hard-coded ".pdf" even for Word uploads.
        suffix = os.path.splitext(getattr(doc_file, "name", ""))[1] or ".pdf"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_doc:
            tmp_doc.write(doc_file.read())
            doc_path = tmp_doc.name

        # Load glossary from the uploaded Excel file.
        glossary = load_glossary(glossary_file)

        # Parse, translate, then enforce the glossary twice: exact match
        # first, semantic similarity second.
        raw_text = parse_document(doc_path)
        translated_text = translate_text_azure(raw_text)
        final_text = apply_glossary(translated_text, glossary)
        final_text = apply_semantic_glossary(final_text, glossary, threshold=0.8)

        return rebuild_document(final_text)
    except Exception as e:
        st.error(f"Error: {str(e)}")
        return None
    finally:
        # Remove the temp file on success AND failure — the original only
        # unlinked on the success path, leaking a file per failed run.
        if doc_path and os.path.exists(doc_path):
            os.unlink(doc_path)
170 |
+
|
171 |
+
# -----------------------------
|
172 |
+
# Streamlit App UI
|
173 |
+
# -----------------------------
|
174 |
+
def main():
    """Streamlit UI: collect the document and glossary uploads, run the
    translation pipeline, and offer the result as a DOCX download."""
    st.title("English to Canadian Quebec French Translator")
    st.write("Upload an English document (Word or PDF) and your company glossary (Excel) to translate.")

    doc_file = st.file_uploader("Upload English Document", type=["doc", "docx", "pdf"])
    glossary_file = st.file_uploader("Upload Company Glossary (Excel)", type=["xlsx"])

    # Guard clauses: bail out early instead of nesting the happy path.
    if not st.button("Translate Document"):
        return
    if doc_file is None or glossary_file is None:
        st.error("Please upload both the document and glossary files.")
        return
    with st.spinner("Translating..."):
        result = process_translation(doc_file, glossary_file)
        if result is not None:
            st.download_button(
                label="Download Translated DOCX",
                data=result,
                file_name="translated.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            )
194 |
+
|
195 |
+
# Script entry point: build the Streamlit UI when executed directly
# (e.g. `streamlit run app.py`).
if __name__ == "__main__":
    main()