# Translator / app.py
import os
import uuid
import tempfile
import re
import requests
import pandas as pd
from tika import parser
from docx import Document
from sentence_transformers import SentenceTransformer, util
import torch
import streamlit as st
from io import BytesIO
# Load the pre-trained embedding model for semantic matching.
model = SentenceTransformer('all-MiniLM-L6-v2')
# -----------------------------
# Glossary Loader and Enforcement
# -----------------------------
def load_glossary(glossary_file) -> dict:
    """
    Load the company glossary from an Excel file.
    Expects columns: 'English' and 'CanadianFrench'.
    """
    try:
        # pandas reads directly from the uploaded file object (BytesIO).
        df = pd.read_excel(glossary_file)
        glossary = {
            # Cast to str so numeric-looking cells don't break .strip().
            str(row['English']).strip().lower(): str(row['CanadianFrench']).strip()
            for _, row in df.iterrows()
            if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench'])
        }
        return glossary
    except Exception as e:
        raise Exception(f"Error loading glossary: {str(e)}")

def apply_glossary(text: str, glossary: dict) -> str:
    """
    Replace whole-word occurrences of glossary terms with the preferred
    Canadian French terms.
    """
    # Replace longer terms first so a multi-word entry is not clobbered by
    # one of its shorter substrings.
    for eng_term in sorted(glossary, key=len, reverse=True):
        pattern = r'\b' + re.escape(eng_term) + r'\b'
        text = re.sub(pattern, glossary[eng_term], text, flags=re.IGNORECASE)
    return text
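# Minimal usage sketch (illustrative values, not part of the app flow):
#   glossary = {"purchase order": "bon de commande", "order": "commande"}
#   apply_glossary("Send the purchase order today.", glossary)
#   # -> "Send the bon de commande today."
# Because longer terms are substituted first, the multi-word entry
# "purchase order" wins over the bare "order" entry.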
# -----------------------------
# Semantic Glossary Enforcement
# -----------------------------
def compute_glossary_embeddings(glossary: dict):
    """
    Precompute embeddings for the glossary keys.
    """
    glossary_terms = list(glossary.keys())
    embeddings = model.encode(glossary_terms, convert_to_tensor=True)
    return glossary_terms, embeddings
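# Performance note: apply_semantic_glossary below re-encodes the glossary on
# every call. One possible fix (a sketch with a hypothetical helper, assuming
# the glossary is stable for a session) is to memoize on the terms:
#   @functools.lru_cache(maxsize=8)
#   def _cached_embeddings(terms: tuple):
#       return model.encode(list(terms), convert_to_tensor=True)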
def apply_semantic_glossary(text: str, glossary: dict, threshold: float = 0.8) -> str:
    """
    Enhance glossary enforcement using semantic similarity.
    Splits text into sentences, embeds each one, and if a sentence is
    semantically close to a glossary term (cosine similarity above the
    threshold), performs a whole-word replacement of that term.
    """
    glossary_terms, glossary_embeddings = compute_glossary_embeddings(glossary)
    # Naive sentence split: abbreviations such as "e.g." are split too.
    sentences = text.split('.')
    updated_sentences = []
    for sentence in sentences:
        if not sentence.strip():
            continue
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
        max_score, max_idx = torch.max(cos_scores, dim=1)
        if max_score.item() >= threshold:
            # max_idx is a tensor; convert to a plain int before list indexing.
            term = glossary_terms[max_idx.item()]
            replacement = glossary[term]
            pattern = r'\b' + re.escape(term) + r'\b'
            sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
        updated_sentences.append(sentence.strip())
    final_text = '. '.join(updated_sentences)
    return final_text
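# Illustrative sketch of the gating behavior (the values are assumptions;
# actual similarity scores depend on the MiniLM model): with
#   glossary = {"invoice": "facture"}
# a sentence like "Please pay the invoice" may score above the 0.8 threshold
# against "invoice" and receive the whole-word substitution, while an
# unrelated sentence scores lower and passes through unchanged. The embedding
# only gates *whether* a replacement is attempted; the replacement itself is
# still an exact word match, as in apply_glossary.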
# -----------------------------
# Translation using Azure Translator API
# -----------------------------
def translate_text_azure(text: str) -> str:
    """
    Translate text to Canadian French using the Azure Translator API.
    """
    subscription_key = os.getenv("AZURE_TRANSLATOR_KEY")
    region = os.getenv("AZURE_TRANSLATOR_REGION")
    if not subscription_key or not region:
        raise Exception("Azure Translator credentials not set.")
    endpoint = "https://api.cognitive.microsofttranslator.com/translate"
    params = {"api-version": "3.0", "to": "fr-CA"}
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key,
        "Ocp-Apim-Subscription-Region": region,
        "Content-type": "application/json",
        "X-ClientTraceId": str(uuid.uuid4())
    }
    # Note: the Translator API caps the characters per request, so very long
    # documents may need to be chunked before calling this function.
    body = [{"text": text}]
    response = requests.post(endpoint, params=params, headers=headers, json=body)
    if response.status_code != 200:
        raise Exception(f"Translation API error: {response.text}")
    result = response.json()
    translated_text = result[0]['translations'][0]['text']
    return translated_text
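# For reference, a successful v3.0 /translate response has roughly this shape
# (values illustrative):
#   [{"translations": [{"text": "Bonjour le monde", "to": "fr-CA"}]}]
# which is why the code above indexes result[0]['translations'][0]['text'].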
# -----------------------------
# Document Parsing & Reconstruction
# -----------------------------
def parse_document(file_path: str) -> str:
    """
    Extract text content from a document using Apache Tika.
    Note: tika-python spins up a local Tika server on first use, which
    requires a Java runtime to be available.
    """
    parsed = parser.from_file(file_path)
    # Tika returns {"metadata": ..., "content": ...}; "content" may be None.
    text = parsed.get("content", "")
    if not text:
        raise Exception("No text content found in the document.")
    return text

def rebuild_document(text: str) -> bytes:
    """
    Rebuild a DOCX document from the provided text and return it as bytes.
    Each non-empty line becomes a plain paragraph; the source document's
    styling (headings, tables, fonts) is not preserved.
    """
    document = Document()
    for line in text.split("\n"):
        if line.strip():
            document.add_paragraph(line)
    bio = BytesIO()
    document.save(bio)
    bio.seek(0)
    return bio.getvalue()
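# Usage sketch (illustrative): rebuild_document("Bonjour\nMerci") yields the
# bytes of a .docx containing two paragraphs, ready for st.download_button.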
# -----------------------------
# Processing Pipeline
# -----------------------------
def process_translation(doc_file, glossary_file) -> bytes:
    try:
        # Write the uploaded document to a temporary file, preserving its
        # original extension so Tika gets a useful filename hint.
        suffix = os.path.splitext(doc_file.name)[1] or ".pdf"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_doc:
            tmp_doc.write(doc_file.read())
            doc_path = tmp_doc.name
        try:
            # Load glossary from the uploaded Excel file
            glossary = load_glossary(glossary_file)
            # Parse document text
            raw_text = parse_document(doc_path)
            # Translate text via Azure Translator
            translated_text = translate_text_azure(raw_text)
            # Apply exact glossary enforcement
            final_text = apply_glossary(translated_text, glossary)
            # Apply semantic glossary enforcement
            final_text = apply_semantic_glossary(final_text, glossary, threshold=0.8)
            # Rebuild document to DOCX and get bytes
            output_bytes = rebuild_document(final_text)
        finally:
            # Clean up the temporary file even if a step above fails.
            os.unlink(doc_path)
        return output_bytes
    except Exception as e:
        st.error(f"Error: {str(e)}")
        return None
# -----------------------------
# Streamlit App UI
# -----------------------------
def main():
    st.title("English to Canadian French (Quebec) Translator")
    st.write("Upload an English document (Word or PDF) and your company glossary (Excel) to translate.")
    doc_file = st.file_uploader("Upload English Document", type=["doc", "docx", "pdf"])
    glossary_file = st.file_uploader("Upload Company Glossary (Excel)", type=["xlsx"])
    if st.button("Translate Document"):
        if doc_file is None or glossary_file is None:
            st.error("Please upload both the document and glossary files.")
        else:
            with st.spinner("Translating..."):
                result = process_translation(doc_file, glossary_file)
            if result is not None:
                st.download_button(
                    label="Download Translated DOCX",
                    data=result,
                    file_name="translated.docx",
                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                )

if __name__ == "__main__":
    main()