import os
import uuid
import tempfile
import re
import requests
import pandas as pd
from tika import parser
from docx import Document
from sentence_transformers import SentenceTransformer, util
import torch
import streamlit as st
from io import BytesIO
# Load the pre-trained embedding model for semantic matching.
model = SentenceTransformer('all-MiniLM-L6-v2')
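# (The model weights are downloaded and cached by sentence-transformers on first run.)
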
# -----------------------------
# Glossary Loader and Enforcement
# -----------------------------
def load_glossary(glossary_file) -> dict:
    """
    Load the company glossary from an Excel file.
    Expects columns: 'English' and 'CanadianFrench'.
    """
    try:
        # Use pandas to read directly from the uploaded file (BytesIO)
        df = pd.read_excel(glossary_file)
        glossary = {
            row['English'].strip().lower(): row['CanadianFrench'].strip()
            for _, row in df.iterrows()
            if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench'])
        }
        return glossary
    except Exception as e:
        raise Exception(f"Error loading glossary: {str(e)}")
def apply_glossary(text: str, glossary: dict) -> str:
    """
    Replace occurrences of glossary terms (exact word match) with the
    preferred Canadian French terms.
    """
    for eng_term, fr_term in glossary.items():
        pattern = r'\b' + re.escape(eng_term) + r'\b'
        text = re.sub(pattern, fr_term, text, flags=re.IGNORECASE)
    return text
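
# For example, with the hypothetical entry {"invoice": "facture"}:
#   apply_glossary("Send the invoice today.", {"invoice": "facture"})
#   returns "Send the facture today."
# The \b word boundaries prevent partial matches such as "invoices".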
# -----------------------------
# Semantic Glossary Enforcement
# -----------------------------
def compute_glossary_embeddings(glossary: dict):
    """
    Precompute embeddings for the glossary keys.
    """
    glossary_terms = list(glossary.keys())
    embeddings = model.encode(glossary_terms, convert_to_tensor=True)
    return glossary_terms, embeddings
def apply_semantic_glossary(text: str, glossary: dict, threshold: float = 0.8) -> str:
    """
    Enhance glossary enforcement using semantic similarity.
    Splits the text into sentences, computes an embedding for each, and if a
    sentence is semantically similar to a glossary term (above the threshold),
    performs the replacement within that sentence.
    """
    glossary_terms, glossary_embeddings = compute_glossary_embeddings(glossary)
    sentences = text.split('.')
    updated_sentences = []
    for sentence in sentences:
        if not sentence.strip():
            continue
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
        max_score, max_idx = torch.max(cos_scores, dim=1)
        if max_score.item() >= threshold:
            # .item() converts the one-element index tensor to a plain int.
            term = glossary_terms[max_idx.item()]
            replacement = glossary[term]
            pattern = r'\b' + re.escape(term) + r'\b'
            sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
        # Keep every non-empty sentence, whether or not a replacement fired.
        updated_sentences.append(sentence.strip())
    final_text = '. '.join(updated_sentences)
    return final_text
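
# Tuning note: 0.8 is a conservative default; lowering the threshold (e.g. to
# 0.6) catches looser paraphrases but risks false replacements. Also note the
# glossary embeddings are recomputed on every call; if that becomes a
# bottleneck, the result of compute_glossary_embeddings could be cached (for
# instance with st.cache_resource in this Streamlit app).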
# -----------------------------
# Translation using Azure Translator API
# -----------------------------
def translate_text_azure(text: str) -> str:
    """
    Translate text to Canadian French using the Azure Translator API.
    """
    subscription_key = os.getenv("AZURE_TRANSLATOR_KEY")
    region = os.getenv("AZURE_TRANSLATOR_REGION")
    if not subscription_key or not region:
        raise Exception("Azure Translator credentials not set.")
    endpoint = "https://api.cognitive.microsofttranslator.com/translate"
    params = {"api-version": "3.0", "to": "fr-CA"}
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key,
        "Ocp-Apim-Subscription-Region": region,
        "Content-type": "application/json",
        "X-ClientTraceId": str(uuid.uuid4())
    }
    body = [{"text": text}]
    response = requests.post(endpoint, params=params, headers=headers, json=body)
    if response.status_code != 200:
        raise Exception(f"Translation API error: {response.text}")
    result = response.json()
    translated_text = result[0]['translations'][0]['text']
    return translated_text
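
# Azure Translator caps the size of a single request, so very long documents
# may need to be sent in pieces. A minimal sketch (the 10,000-character chunk
# size is an assumption; check the current service limits):
def translate_text_azure_chunked(text: str, max_chars: int = 10000) -> str:
    """Translate text in newline-aligned chunks and rejoin the results."""
    chunks, current = [], ""
    for line in text.split("\n"):
        if current and len(current) + len(line) + 1 > max_chars:
            chunks.append(current)
            current = line
        else:
            current = f"{current}\n{line}" if current else line
    if current:
        chunks.append(current)
    return "\n".join(translate_text_azure(chunk) for chunk in chunks)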
# -----------------------------
# Document Parsing & Reconstruction
# -----------------------------
def parse_document(file_path: str) -> str:
    """
    Extract text content from a document using Apache Tika.
    """
    parsed = parser.from_file(file_path)
    text = parsed.get("content", "")
    if not text:
        raise Exception("No text content found in the document.")
    return text
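
# Note: the tika package talks to an Apache Tika server; on first use it
# downloads the server jar and starts it locally, which requires Java.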
def rebuild_document(text: str) -> bytes:
    """
    Rebuild a DOCX document from the provided text.
    Returns the document as bytes.
    """
    document = Document()
    for line in text.split("\n"):
        if line.strip():
            document.add_paragraph(line)
    bio = BytesIO()
    document.save(bio)
    bio.seek(0)
    return bio.getvalue()
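
# Usage sketch: rebuild_document("Bonjour\nMonde") yields DOCX bytes with two
# paragraphs, suitable for st.download_button. Only plain text is preserved;
# styles, tables, and images from the source document are not.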
# -----------------------------
# Processing Pipeline
# -----------------------------
def process_translation(doc_file, glossary_file) -> bytes:
    try:
        # Write the uploaded document to a temporary file, preserving the
        # original extension so the parser can detect the format.
        suffix = os.path.splitext(doc_file.name)[1] or ".tmp"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_doc:
            tmp_doc.write(doc_file.read())
            doc_path = tmp_doc.name
        # Load glossary from the uploaded Excel file
        glossary = load_glossary(glossary_file)
        # Parse document text
        raw_text = parse_document(doc_path)
        # Translate text via Azure Translator
        translated_text = translate_text_azure(raw_text)
        # Apply exact glossary enforcement
        final_text = apply_glossary(translated_text, glossary)
        # Apply semantic glossary enforcement
        final_text = apply_semantic_glossary(final_text, glossary, threshold=0.8)
        # Rebuild document to DOCX and get bytes
        output_bytes = rebuild_document(final_text)
        # Clean up temporary file
        os.unlink(doc_path)
        return output_bytes
    except Exception as e:
        st.error(f"Error: {str(e)}")
        return None
# -----------------------------
# Streamlit App UI
# -----------------------------
def main():
    st.title("English to Canadian French (Quebec) Translator")
    st.write("Upload an English document (Word or PDF) and your company glossary (Excel) to translate.")
    doc_file = st.file_uploader("Upload English Document", type=["doc", "docx", "pdf"])
    glossary_file = st.file_uploader("Upload Company Glossary (Excel)", type=["xlsx"])
    if st.button("Translate Document"):
        if doc_file is None or glossary_file is None:
            st.error("Please upload both the document and glossary files.")
        else:
            with st.spinner("Translating..."):
                result = process_translation(doc_file, glossary_file)
            if result is not None:
                st.download_button(
                    label="Download Translated DOCX",
                    data=result,
                    file_name="translated.docx",
                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                )

if __name__ == "__main__":
    main()