garyd1 commited on
Commit
3cf0975
·
verified ·
1 Parent(s): f3085d7

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +196 -0
app.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uuid
3
+ import tempfile
4
+ import re
5
+ import requests
6
+ import pandas as pd
7
+ from tika import parser
8
+ from docx import Document
9
+ from sentence_transformers import SentenceTransformer, util
10
+ import torch
11
+ import streamlit as st
12
+ from io import BytesIO
13
+
14
# Load the pre-trained embedding model for semantic matching.
# NOTE(review): loaded at import time — first run downloads the model weights;
# confirm the deployment environment has network access or a local cache.
model = SentenceTransformer('all-MiniLM-L6-v2')
16
+
17
+ # -----------------------------
18
+ # Glossary Loader and Enforcement
19
+ # -----------------------------
20
def glossary_from_df(df: pd.DataFrame) -> dict:
    """Build a term mapping from a glossary DataFrame (module-internal helper).

    Expects 'English' and 'CanadianFrench' columns. Rows with a missing value
    in either column are skipped. Cell values are coerced with str() first —
    Excel cells are not guaranteed to be strings (e.g. numeric entries), and
    the original .strip() call raised AttributeError on them.
    """
    missing = {'English', 'CanadianFrench'} - set(df.columns)
    if missing:
        raise Exception(f"Glossary is missing required column(s): {', '.join(sorted(missing))}")
    return {
        str(row['English']).strip().lower(): str(row['CanadianFrench']).strip()
        for _, row in df.iterrows()
        if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench'])
    }


def load_glossary(glossary_file) -> dict:
    """
    Load the company glossary from an Excel file.

    Expects columns: 'English' and 'CanadianFrench'. Keys are lowercased
    English terms; values are the preferred Canadian French translations.

    Raises:
        Exception: wrapping any parse/validation failure, so the UI layer
        can show a single readable message.
    """
    try:
        # pandas reads directly from the uploaded file object (BytesIO).
        df = pd.read_excel(glossary_file)
        return glossary_from_df(df)
    except Exception as e:
        raise Exception(f"Error loading glossary: {str(e)}")
36
+
37
def apply_glossary(text: str, glossary: dict) -> str:
    """
    Replace occurrences of glossary terms (exact word match, case-insensitive)
    with the preferred Canadian French terms.

    Matching happens in a single pass with terms tried longest-first. This
    fixes three defects of the original per-term re.sub loop:
      * a short key could clobber a longer multi-word key that contains it
        ("credit" breaking "credit card"), depending on dict order;
      * text substituted by one term could be re-matched by a later term;
      * French replacement text containing backslashes would have been
        interpreted as regex group references.
    """
    if not glossary:
        return text
    # Longest-first so "credit card" wins over "credit" inside the alternation.
    terms = sorted(glossary, key=len, reverse=True)
    pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(term) for term in terms) + r')\b',
        flags=re.IGNORECASE,
    )
    # Callable replacement: the matched text is looked up (lowercased, since
    # glossary keys are lowercase) and inserted literally, never as a regex.
    return pattern.sub(lambda m: glossary.get(m.group(0).lower(), m.group(0)), text)
45
+
46
+ # -----------------------------
47
+ # Semantic Glossary Enforcement
48
+ # -----------------------------
49
def compute_glossary_embeddings(glossary: dict):
    """
    Encode every glossary key once with the shared sentence-transformer model.

    Returns:
        tuple: (list of glossary terms, tensor of their embeddings), index-aligned.
    """
    terms = [term for term in glossary]
    return terms, model.encode(terms, convert_to_tensor=True)
56
+
57
def apply_semantic_glossary(text: str, glossary: dict, threshold: float = 0.8) -> str:
    """
    Enhance glossary enforcement using semantic similarity.

    Splits the text into rough sentences on '.', embeds each one, and when a
    sentence is semantically close to some glossary term (max cosine
    similarity >= threshold) substitutes that term's preferred translation
    within the sentence.

    Args:
        text: Text to post-process (typically machine-translated output).
        glossary: Mapping of lowercase English terms to Canadian French terms.
        threshold: Minimum cosine similarity for a replacement to trigger.

    Returns:
        The processed text. Sentence separators are normalized to '. ' and
        empty segments are dropped (same as the original implementation).
    """
    if not glossary:
        # model.encode([]) yields an empty tensor and torch.max would fail
        # on it; with no terms there is nothing to enforce anyway.
        return text
    glossary_terms, glossary_embeddings = compute_glossary_embeddings(glossary)
    updated_sentences = []
    for sentence in text.split('.'):
        if not sentence.strip():
            continue
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
        max_score, max_idx = torch.max(cos_scores, dim=1)
        if max_score.item() >= threshold:
            # .item() converts the one-element index tensor to a plain int;
            # indexing a Python list with a tensor is fragile.
            term = glossary_terms[max_idx.item()]
            pattern = r'\b' + re.escape(term) + r'\b'
            sentence = re.sub(pattern, glossary[term], sentence, flags=re.IGNORECASE)
        updated_sentences.append(sentence.strip())
    return '. '.join(updated_sentences)
80
+
81
+ # -----------------------------
82
+ # Translation using Azure Translator API
83
+ # -----------------------------
84
def translate_text_azure(text: str, timeout: float = 60.0) -> str:
    """
    Translate text to Canadian French using the Azure Translator API.

    Args:
        text: English source text.
        timeout: Seconds to wait for the HTTP request. The original call had
            no timeout and could block the Streamlit worker indefinitely.

    Returns:
        The translated text.

    Raises:
        Exception: If credentials are missing, the request fails at the
        transport level, or the API returns a non-200 response.
    """
    subscription_key = os.getenv("AZURE_TRANSLATOR_KEY")
    region = os.getenv("AZURE_TRANSLATOR_REGION")
    if not subscription_key or not region:
        raise Exception("Azure Translator credentials not set.")

    endpoint = "https://api.cognitive.microsofttranslator.com/translate"
    params = {"api-version": "3.0", "to": "fr-CA"}
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key,
        "Ocp-Apim-Subscription-Region": region,
        "Content-type": "application/json",
        # Unique trace id lets support correlate this request in Azure logs.
        "X-ClientTraceId": str(uuid.uuid4())
    }
    body = [{"text": text}]
    # NOTE(review): the v3 API caps request size (~50k chars); very large
    # documents may need chunking — confirm expected document sizes.
    try:
        response = requests.post(endpoint, params=params, headers=headers,
                                 json=body, timeout=timeout)
    except requests.RequestException as e:
        raise Exception(f"Translation API request failed: {e}")
    if response.status_code != 200:
        raise Exception(f"Translation API error: {response.text}")
    result = response.json()
    translated_text = result[0]['translations'][0]['text']
    return translated_text
108
+
109
+ # -----------------------------
110
+ # Document Parsing & Reconstruction
111
+ # -----------------------------
112
def parse_document(file_path: str) -> str:
    """
    Extract the text content of a document via Apache Tika.

    Raises:
        Exception: If Tika returns no text content for the file.
    """
    # Tika may report the "content" key as None for image-only documents,
    # so both a missing key and an empty/None value are treated as failure.
    content = parser.from_file(file_path).get("content", "")
    if not content:
        raise Exception("No text content found in the document.")
    return content
121
+
122
def rebuild_document(text: str) -> bytes:
    """
    Build a fresh DOCX document from the given text and return it as bytes.

    Each non-blank line becomes one paragraph; blank lines are skipped.
    Formatting from the source document is not preserved.
    """
    doc = Document()
    paragraphs = (line for line in text.split("\n") if line.strip())
    for paragraph in paragraphs:
        doc.add_paragraph(paragraph)
    buffer = BytesIO()
    doc.save(buffer)
    return buffer.getvalue()
135
+
136
+ # -----------------------------
137
+ # Processing Pipeline
138
+ # -----------------------------
139
def process_translation(doc_file, glossary_file) -> bytes:
    """
    Full pipeline: persist the upload, parse it, translate via Azure, enforce
    the glossary (exact match, then semantic), and rebuild a DOCX.

    Args:
        doc_file: Uploaded document file object (Streamlit UploadedFile).
        glossary_file: Uploaded Excel glossary file object.

    Returns:
        DOCX file contents as bytes, or None on failure (the error is
        reported through the Streamlit UI rather than raised).
    """
    doc_path = None
    try:
        # Keep the upload's real extension when available (Tika sniffs
        # content anyway, but a correct suffix avoids confusion); the
        # original hard-coded ".pdf" even for Word uploads.
        suffix = os.path.splitext(getattr(doc_file, "name", ""))[1] or ".pdf"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_doc:
            tmp_doc.write(doc_file.read())
            doc_path = tmp_doc.name

        glossary = load_glossary(glossary_file)
        raw_text = parse_document(doc_path)
        translated_text = translate_text_azure(raw_text)
        final_text = apply_glossary(translated_text, glossary)
        final_text = apply_semantic_glossary(final_text, glossary, threshold=0.8)
        return rebuild_document(final_text)
    except Exception as e:
        st.error(f"Error: {str(e)}")
        return None
    finally:
        # Always remove the temp file — the original only unlinked it on the
        # success path and leaked it whenever any pipeline step raised.
        if doc_path and os.path.exists(doc_path):
            os.unlink(doc_path)
170
+
171
+ # -----------------------------
172
+ # Streamlit App UI
173
+ # -----------------------------
174
def main():
    """Streamlit UI: collect the document and glossary uploads, then run the pipeline."""
    st.title("English to Canadian Quebec French Translator")
    st.write("Upload an English document (Word or PDF) and your company glossary (Excel) to translate.")

    doc_file = st.file_uploader("Upload English Document", type=["doc", "docx", "pdf"])
    glossary_file = st.file_uploader("Upload Company Glossary (Excel)", type=["xlsx"])

    # Guard clauses instead of nested if/else: bail out until the button is
    # pressed with both files present.
    if not st.button("Translate Document"):
        return
    if doc_file is None or glossary_file is None:
        st.error("Please upload both the document and glossary files.")
        return
    with st.spinner("Translating..."):
        result = process_translation(doc_file, glossary_file)
        if result is not None:
            st.download_button(
                label="Download Translated DOCX",
                data=result,
                file_name="translated.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            )
194
+
195
# Launch the Streamlit UI when executed as a script.
if __name__ == "__main__":
    main()