import os
import uuid
import tempfile
import re
import requests
import pandas as pd
from tika import parser
from docx import Document
from sentence_transformers import SentenceTransformer, util
import torch
import streamlit as st
from io import BytesIO

# Load the pre-trained embedding model for semantic matching.
model = SentenceTransformer('all-MiniLM-L6-v2')
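# Note: Streamlit re-executes this script on every user interaction, so the
# module-level load above repeats on each rerun. A minimal sketch of a cached
# loader (assuming a Streamlit version that provides st.cache_resource):
#
#     @st.cache_resource
#     def get_model():
#         return SentenceTransformer('all-MiniLM-L6-v2')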
# -----------------------------
# Glossary Loader and Enforcement
# -----------------------------
def load_glossary(glossary_file) -> dict:
    """
    Load the company glossary from an Excel file.
    Expects columns: 'English' and 'CanadianFrench'.
    """
    try:
        # Use pandas to read directly from the uploaded file (BytesIO).
        df = pd.read_excel(glossary_file)
        glossary = {
            row['English'].strip().lower(): row['CanadianFrench'].strip()
            for _, row in df.iterrows()
            if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench'])
        }
        return glossary
    except Exception as e:
        raise Exception(f"Error loading glossary: {str(e)}")
def apply_glossary(text: str, glossary: dict) -> str:
    """
    Replace occurrences of glossary terms (exact word match) with the
    preferred Canadian French terms.
    """
    for eng_term, fr_term in glossary.items():
        pattern = r'\b' + re.escape(eng_term) + r'\b'
        text = re.sub(pattern, fr_term, text, flags=re.IGNORECASE)
    return text
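# Worked example (hypothetical glossary): with {"invoice": "facture"},
# apply_glossary("La invoice est prête.", glossary) yields
# "La facture est prête." The \b word boundaries keep "invoices" or
# "invoiced" from matching, and re.IGNORECASE also catches "Invoice".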
# -----------------------------
# Semantic Glossary Enforcement
# -----------------------------
def compute_glossary_embeddings(glossary: dict):
    """
    Precompute embeddings for the glossary keys.
    """
    glossary_terms = list(glossary.keys())
    embeddings = model.encode(glossary_terms, convert_to_tensor=True)
    return glossary_terms, embeddings
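# compute_glossary_embeddings is invoked inside apply_semantic_glossary, so
# the embeddings are recomputed on each call. That is cheap for a small
# glossary and a single document, but could be hoisted out and reused if the
# same glossary is applied across many documents.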
def apply_semantic_glossary(text: str, glossary: dict, threshold: float = 0.8) -> str:
    """
    Enhance glossary enforcement using semantic similarity.
    Splits the text into sentences, computes embeddings, and, when a
    sentence is semantically similar to a glossary term (above the
    threshold), performs the replacement.
    """
    glossary_terms, glossary_embeddings = compute_glossary_embeddings(glossary)
    sentences = text.split('.')
    updated_sentences = []
    for sentence in sentences:
        if not sentence.strip():
            continue
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
        max_score, max_idx = torch.max(cos_scores, dim=1)
        if max_score.item() >= threshold:
            # .item() converts the index tensor to a plain int so it can
            # index the Python list of terms.
            term = glossary_terms[max_idx.item()]
            replacement = glossary[term]
            pattern = r'\b' + re.escape(term) + r'\b'
            sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
        updated_sentences.append(sentence.strip())
    final_text = '. '.join(updated_sentences)
    return final_text
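# Note on behavior: the regex substitution above still requires the literal
# English term to appear in the sentence, so this pass effectively re-applies
# the exact-match replacement, restricted to sentences that score >= threshold
# against some glossary term; sentences below the threshold pass through
# untouched. Raising the threshold makes the pass more conservative.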
# -----------------------------
# Translation using Azure Translator API
# -----------------------------
def translate_text_azure(text: str) -> str:
    """
    Translate text to Canadian French using the Azure Translator API.
    """
    subscription_key = os.getenv("AZURE_TRANSLATOR_KEY")
    region = os.getenv("AZURE_TRANSLATOR_REGION")
    if not subscription_key or not region:
        raise Exception("Azure Translator credentials not set.")
    endpoint = "https://api.cognitive.microsofttranslator.com/translate"
    params = {"api-version": "3.0", "to": "fr-CA"}
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key,
        "Ocp-Apim-Subscription-Region": region,
        "Content-type": "application/json",
        "X-ClientTraceId": str(uuid.uuid4())
    }
    body = [{"text": text}]
    response = requests.post(endpoint, params=params, headers=headers, json=body)
    if response.status_code != 200:
        raise Exception(f"Translation API error: {response.text}")
    result = response.json()
    translated_text = result[0]['translations'][0]['text']
    return translated_text
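# Azure Translator enforces a per-request character limit, so a very large
# document can fail in a single call. A minimal chunked wrapper is sketched
# below; the 45_000 budget and the paragraph-based splitting are assumptions,
# not part of the original pipeline, which calls translate_text_azure directly.
def translate_text_azure_chunked(text: str, max_chars: int = 45_000) -> str:
    chunks, current = [], ""
    for paragraph in text.split("\n"):
        # Start a new chunk when adding this paragraph would exceed the budget.
        if current and len(current) + len(paragraph) + 1 > max_chars:
            chunks.append(current)
            current = ""
        current += paragraph + "\n"
    if current:
        chunks.append(current)
    return "".join(translate_text_azure(chunk) for chunk in chunks)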
# -----------------------------
# Document Parsing & Reconstruction
# -----------------------------
def parse_document(file_path: str) -> str:
    """
    Extract text content from a document using Apache Tika.
    """
    parsed = parser.from_file(file_path)
    text = parsed.get("content", "")
    if not text:
        raise Exception("No text content found in the document.")
    return text
def rebuild_document(text: str) -> bytes:
    """
    Rebuild a DOCX document from the provided text.
    Returns the document as bytes.
    """
    document = Document()
    for line in text.split("\n"):
        if line.strip():
            document.add_paragraph(line)
    bio = BytesIO()
    document.save(bio)
    bio.seek(0)
    return bio.getvalue()
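# Caveat: Tika extraction returns plain text, so the rebuilt DOCX contains one
# plain paragraph per non-empty line; headings, tables, and styling from the
# source document are not preserved.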
# -----------------------------
# Processing Pipeline
# -----------------------------
def process_translation(doc_file, glossary_file) -> bytes:
    try:
        # Write the uploaded document to a temporary file, preserving its
        # original extension (a hard-coded ".pdf" suffix would mislabel
        # Word uploads; Tika sniffs content, but the real suffix is safer).
        suffix = os.path.splitext(doc_file.name)[1] or ".pdf"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_doc:
            tmp_doc.write(doc_file.read())
            doc_path = tmp_doc.name
        # Load the glossary from the uploaded Excel file
        glossary = load_glossary(glossary_file)
        # Parse the document text
        raw_text = parse_document(doc_path)
        # Translate the text via Azure Translator
        translated_text = translate_text_azure(raw_text)
        # Apply exact glossary enforcement
        final_text = apply_glossary(translated_text, glossary)
        # Apply semantic glossary enforcement
        final_text = apply_semantic_glossary(final_text, glossary, threshold=0.8)
        # Rebuild the document as DOCX bytes
        output_bytes = rebuild_document(final_text)
        # Clean up the temporary file
        os.unlink(doc_path)
        return output_bytes
    except Exception as e:
        st.error(f"Error: {str(e)}")
        return None
# -----------------------------
# Streamlit App UI
# -----------------------------
def main():
    st.title("English to Canadian French (Quebec) Translator")
    st.write("Upload an English document (Word or PDF) and your company glossary (Excel) to translate.")
    doc_file = st.file_uploader("Upload English Document", type=["doc", "docx", "pdf"])
    glossary_file = st.file_uploader("Upload Company Glossary (Excel)", type=["xlsx"])
    if st.button("Translate Document"):
        if doc_file is None or glossary_file is None:
            st.error("Please upload both the document and the glossary file.")
        else:
            with st.spinner("Translating..."):
                result = process_translation(doc_file, glossary_file)
            if result is not None:
                st.download_button(
                    label="Download Translated DOCX",
                    data=result,
                    file_name="translated.docx",
                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                )

if __name__ == "__main__":
    main()