import os
import re
import tempfile
import uuid
from io import BytesIO
from typing import Optional

import pandas as pd
import requests
import streamlit as st
import torch
from docx import Document
from sentence_transformers import SentenceTransformer, util
from tika import parser

# Load the pre-trained embedding model used for semantic glossary matching.
model = SentenceTransformer('all-MiniLM-L6-v2')


# -----------------------------
# Glossary Loader and Enforcement
# -----------------------------
def load_glossary(glossary_file) -> dict:
    """
    Load the company glossary from an Excel file.
    Expects columns: 'English' and 'CanadianFrench'.
    """
    try:
        # pandas can read directly from the uploaded file object (BytesIO).
        df = pd.read_excel(glossary_file)
        glossary = {
            str(row['English']).strip().lower(): str(row['CanadianFrench']).strip()
            for _, row in df.iterrows()
            if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench'])
        }
        return glossary
    except Exception as e:
        raise Exception(f"Error loading glossary: {e}") from e


def apply_glossary(text: str, glossary: dict) -> str:
    """
    Replace whole-word, case-insensitive occurrences of glossary terms
    with the preferred Canadian French terms.
    """
    for eng_term, fr_term in glossary.items():
        pattern = r'\b' + re.escape(eng_term) + r'\b'
        text = re.sub(pattern, fr_term, text, flags=re.IGNORECASE)
    return text


# -----------------------------
# Semantic Glossary Enforcement
# -----------------------------
def compute_glossary_embeddings(glossary: dict):
    """
    Precompute embeddings for the glossary keys.
    """
    glossary_terms = list(glossary.keys())
    embeddings = model.encode(glossary_terms, convert_to_tensor=True)
    return glossary_terms, embeddings


def apply_semantic_glossary(text: str, glossary: dict, threshold: float = 0.8) -> str:
    """
    Enhance glossary enforcement using semantic similarity.
    Splits the text into sentences (naively, on '.'), embeds each sentence,
    and if a sentence is semantically similar to a glossary term (above
    threshold), performs a whole-word replacement of that term in the sentence.
    """
    glossary_terms, glossary_embeddings = compute_glossary_embeddings(glossary)
    sentences = text.split('.')
    updated_sentences = []
    for sentence in sentences:
        if not sentence.strip():
            continue
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
        max_score, max_idx = torch.max(cos_scores, dim=1)
        if max_score.item() >= threshold:
            # max_idx is a one-element tensor; convert it to a plain int index.
            term = glossary_terms[max_idx.item()]
            replacement = glossary[term]
            pattern = r'\b' + re.escape(term) + r'\b'
            sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
        updated_sentences.append(sentence.strip())
    return '. '.join(updated_sentences)
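
# Illustrative example (hypothetical glossary entry, not part of the app):
# with glossary = {"purchase order": "bon de commande"} and the default
# threshold,
#     apply_semantic_glossary("Veuillez signer le purchase order.", glossary)
# embeds the sentence, finds "purchase order" as its closest glossary term,
# and (only if the cosine similarity reaches 0.8) rewrites the sentence to
# "Veuillez signer le bon de commande."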
""" subscription_key = os.getenv("AZURE_TRANSLATOR_KEY") region = os.getenv("AZURE_TRANSLATOR_REGION") if not subscription_key or not region: raise Exception("Azure Translator credentials not set.") endpoint = "https://api.cognitive.microsofttranslator.com/translate" params = {"api-version": "3.0", "to": "fr-CA"} headers = { "Ocp-Apim-Subscription-Key": subscription_key, "Ocp-Apim-Subscription-Region": region, "Content-type": "application/json", "X-ClientTraceId": str(uuid.uuid4()) } body = [{"text": text}] response = requests.post(endpoint, params=params, headers=headers, json=body) if response.status_code != 200: raise Exception(f"Translation API error: {response.text}") result = response.json() translated_text = result[0]['translations'][0]['text'] return translated_text # ----------------------------- # Document Parsing & Reconstruction # ----------------------------- def parse_document(file_path: str) -> str: """ Extract text content from a document using Apache Tika. """ parsed = parser.from_file(file_path) text = parsed.get("content", "") if not text: raise Exception("No text content found in the document.") return text def rebuild_document(text: str) -> bytes: """ Rebuild a DOCX document from the provided text. Returns the document as bytes. """ document = Document() for line in text.split("\n"): if line.strip(): document.add_paragraph(line) bio = BytesIO() document.save(bio) bio.seek(0) return bio.getvalue() # ----------------------------- # Processing Pipeline # ----------------------------- def process_translation(doc_file, glossary_file) -> bytes: try: # Write uploaded document to a temporary file with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_doc: tmp_doc.write(doc_file.read()) doc_path = tmp_doc.name # Load glossary from the uploaded Excel file glossary = load_glossary(glossary_file) # Parse document text raw_text = parse_document(doc_path) # Translate text via Azure Translator translated_text = translate_text_azure(raw_text) # Apply exact glossary enforcement final_text = apply_glossary(translated_text, glossary) # Apply semantic glossary enforcement final_text = apply_semantic_glossary(final_text, glossary, threshold=0.8) # Rebuild document to DOCX and get bytes output_bytes = rebuild_document(final_text) # Clean up temporary file os.unlink(doc_path) return output_bytes except Exception as e: st.error(f"Error: {str(e)}") return None # ----------------------------- # Streamlit App UI # ----------------------------- def main(): st.title("English to Canadian Quebec French Translator") st.write("Upload an English document (Word or PDF) and your company glossary (Excel) to translate.") doc_file = st.file_uploader("Upload English Document", type=["doc", "docx", "pdf"]) glossary_file = st.file_uploader("Upload Company Glossary (Excel)", type=["xlsx"]) if st.button("Translate Document"): if doc_file is None or glossary_file is None: st.error("Please upload both the document and glossary files.") else: with st.spinner("Translating..."): result = process_translation(doc_file, glossary_file) if result is not None: st.download_button( label="Download Translated DOCX", data=result, file_name="translated.docx", mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document" ) if __name__ == "__main__": main()