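"""AI-powered English-to-Canadian-French translator (Streamlit app).

The pipeline enforces a user-supplied glossary in four steps: regex marking
before translation, LLM translation, regex replacement after translation, and
a semantic-similarity pass for anything the regex steps missed.

Usage (assuming this file is saved as app.py):
    streamlit run app.py

Requires OPENAI_API_KEY in the environment; pandas needs openpyxl installed
to read .xlsx glossaries.
"""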
import re
import streamlit as st
import pandas as pd
import torch
import nltk
import time
import subprocess
from concurrent.futures import ThreadPoolExecutor

from langchain_openai import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from sentence_transformers import SentenceTransformer, util

# Ensure necessary NLP models are available
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    print("Downloading NLTK punkt tokenizer...")
    nltk.download("punkt")

try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
except ImportError:
    nlp = None  # spaCy not installed; fall back to NLTK handling below
except OSError:
    print("Downloading spaCy model...")
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
    nlp = spacy.load("en_core_web_sm")

# Load AI models
translator = ChatOpenAI(model="gpt-3.5-turbo")
model = SentenceTransformer('all-MiniLM-L6-v2')

@st.cache_data
def load_glossary_from_excel(glossary_file_bytes) -> dict:
    """Load glossary from an Excel file, apply lemmatization, and sort by length."""
    df = pd.read_excel(glossary_file_bytes)
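    # The workbook is expected to contain "English" and "CanadianFrench"
    # columns; e.g. a row ("firewall", "pare-feu") -- illustrative terms only.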
    glossary = {}

    for _, row in df.iterrows():
        if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench']):
            english_term = str(row['English']).strip().lower()
            french_term = str(row['CanadianFrench']).strip()
            # Lemmatize with spaCy when available so inflected forms still match.
            lemmatized_term = " ".join(token.lemma_ for token in nlp(english_term)) if nlp else english_term
            glossary[lemmatized_term] = french_term

    return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))

@st.cache_data
def compute_glossary_embeddings_cached(glossary_items: tuple):
    """Compute cached embeddings for glossary terms."""
    glossary = dict(glossary_items)
    glossary_terms = list(glossary.keys())
    embeddings = model.encode(glossary_terms, convert_to_tensor=True)
    return glossary_terms, embeddings

def enforce_glossary_pre_translation(text: str, glossary: dict) -> str:
    """Forces glossary terms in the English text before translation."""
    for eng_term, fr_term in glossary.items():
        pattern = r'\b' + re.escape(eng_term) + r'\b'
        # Uppercase the term so the model is less likely to translate it;
        # enforce_glossary_post_translation swaps in the French term afterward.
        text = re.sub(pattern, eng_term.upper(), text, flags=re.IGNORECASE)
    return text

def retry_translate_text(text: str, max_retries=3) -> str:
    """Retries translation in case of API failure."""
    for attempt in range(max_retries):
        try:
            messages = [
                SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning. Leave any ALL-CAPS terms exactly as written; they will be replaced with approved terminology afterward."),
                HumanMessage(content=text)
            ]
            response = translator.invoke(messages)
            return response.content.strip()
        except Exception as e:
            print(f"Error in translation (attempt {attempt+1}): {e}")
            time.sleep(2)
    return "Translation failed. Please try again later."

def enforce_glossary_post_translation(text: str, glossary: dict) -> str:
    """Ensures glossary terms are applied after translation."""
    for eng_term, fr_term in glossary.items():
        pattern = r'\b' + re.escape(eng_term.upper()) + r'\b'
        text = re.sub(pattern, fr_term, text, flags=re.IGNORECASE)
    return text

def enforce_glossary_with_semantics(text: str, glossary: dict, threshold: float) -> str:
    """Applies glossary replacements based on semantic similarity."""
    glossary_items = tuple(sorted(glossary.items()))
    glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)

    sentences = nltk.tokenize.sent_tokenize(text) if not nlp else [sent.text for sent in nlp(text).sents]

    def process_sentence(sentence):
        """Processes a single sentence with glossary enforcement."""
        if not sentence.strip():
            return sentence
        
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
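        # cos_scores has shape (1, num_glossary_terms); keep the best match.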
        max_score, max_idx = torch.max(cos_scores, dim=1)

        if max_score.item() >= threshold:
            term = glossary_terms[max_idx.item()]
            replacement = glossary[term]
            pattern = r'\b' + re.escape(term) + r'\b'
            sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)

        return sentence.strip()

    with ThreadPoolExecutor() as executor:
        updated_sentences = list(executor.map(process_sentence, sentences))

    return " ".join(updated_sentences)

# Streamlit UI
st.title("AI-Powered English to Canadian French Translator")
st.write("This version ensures glossary priority, improves enforcement, and validates meaning.")

input_text = st.text_area("Enter text to translate:")
glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.85)

if st.button("Translate"):
    if not input_text.strip():
        st.error("Please enter text to translate.")
    elif glossary_file is None:
        st.error("Glossary file is required.")
    else:
        glossary = load_glossary_from_excel(glossary_file)

        # Step 1: Enforce Glossary Before Translation
        pre_translated_text = enforce_glossary_pre_translation(input_text, glossary)

        # Step 2: Translate Text with OpenAI
        translated_text = retry_translate_text(pre_translated_text)

        # Step 3: Enforce Glossary After Translation
        post_translated_text = enforce_glossary_post_translation(translated_text, glossary)

        # Step 4: Apply Semantic Matching to Catch Any Missed Glossary Terms
        glossary_enforced_text = enforce_glossary_with_semantics(post_translated_text, glossary, threshold)

        st.subheader("Final Translated Text:")
        st.write(glossary_enforced_text)
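
# Illustrative walk-through, assuming a glossary entry "firewall" -> "pare-feu":
#   1. Pre-translation:  "Restart the firewall." becomes "Restart the FIREWALL."
#   2. Translation:      the model leaves FIREWALL untouched per the system prompt.
#   3. Post-translation: "Redémarrez le FIREWALL." becomes "Redémarrez le pare-feu."
#   4. Semantic pass:    replaces glossary terms the exact-match steps missed.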