File size: 4,756 Bytes
b7c9c63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import re
import openai
import streamlit as st
import pandas as pd
import torch
import nltk

from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from sentence_transformers import SentenceTransformer, util

# Streamlit re-executes this script on every widget interaction. The heavy NLP
# resources are therefore loaded through st.cache_resource so that reruns reuse
# the already-loaded objects instead of loading them again each time.
@st.cache_resource
def _load_spacy_pipeline():
    """Return the spaCy "en_core_web_sm" pipeline, or None if it cannot be loaded.

    Note that a failed load is cached as well, so the NLTK fallback stays in
    effect until the Streamlit resource cache is cleared or the app restarts.
    """
    try:
        import spacy
        return spacy.load("en_core_web_sm")
    except Exception:
        # Deliberately broad: any import or load failure selects the NLTK fallback.
        return None


@st.cache_resource
def _load_sentence_encoder():
    """Return the SentenceTransformer used for glossary similarity matching."""
    return SentenceTransformer('all-MiniLM-L6-v2')


# Try to load spaCy for advanced NLP processing
nlp = _load_spacy_pipeline()
use_spacy = nlp is not None
if not use_spacy:
    st.warning("SpaCy model not found, falling back to NLTK for tokenization.")
    nltk.download("punkt")

# Load AI models
translator = ChatOpenAI(model="gpt-3.5-turbo")
model = _load_sentence_encoder()

@st.cache_data
def load_glossary_from_excel(glossary_file_bytes) -> dict:
    """Load an English -> Canadian French glossary from an Excel workbook.

    The workbook is read with ``pandas.read_excel`` and must contain the
    columns ``English`` and ``CanadianFrench``. Rows where either cell is
    null, or blank after stripping whitespace, are skipped. English terms are
    lower-cased and, when spaCy is available, lemmatized token by token.

    Args:
        glossary_file_bytes: Anything ``pandas.read_excel`` accepts (the app
            passes the uploaded file object).

    Returns:
        A dict mapping the normalized English term to its French replacement,
        ordered from the longest key to the shortest.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    df = pd.read_excel(glossary_file_bytes)
    glossary = {}

    for english_cell, french_cell in zip(df['English'], df['CanadianFrench']):
        if pd.isnull(english_cell) or pd.isnull(french_cell):
            continue

        # str() keeps numeric or date cells from crashing on .strip().
        english_term = str(english_cell).strip().lower()
        french_term = str(french_cell).strip()

        # A blank key would later turn into the regex r'\b\b', which matches
        # at every word boundary; treat blank cells like missing ones.
        if not english_term or not french_term:
            continue

        if use_spacy:
            lemmatized_term = " ".join(token.lemma_ for token in nlp(english_term))
        else:
            lemmatized_term = english_term
        glossary[lemmatized_term] = french_term

    return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))

@st.cache_data
def compute_glossary_embeddings_cached(glossary_items: tuple):
    """Encode every glossary term with the module-level sentence model.

    Args:
        glossary_items: ``(term, replacement)`` pairs; passed as a tuple so
            the argument can serve as a cache key.

    Returns:
        A ``(terms, embeddings)`` pair: the list of terms and the tensor
        produced by ``model.encode`` for that list.
    """
    # Round-tripping through dict collapses repeated terms and keeps their order.
    terms = list(dict(glossary_items))
    return terms, model.encode(terms, convert_to_tensor=True)

def translate_text(text: str) -> str:
    """Translate ``text`` to Canadian French via the module-level chat model.

    Args:
        text: The source text, sent as the human message.

    Returns:
        The content of the model's reply with surrounding whitespace removed.
    """
    system_prompt = SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and context.")
    user_prompt = HumanMessage(content=text)
    reply = translator([system_prompt, user_prompt])
    return reply.content.strip()

def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
    """Apply glossary replacements to ``text`` based on semantic similarity.

    The text is split into sentences (spaCy when available, otherwise NLTK).
    Each sentence is embedded and compared with every glossary term; when the
    best cosine similarity reaches ``threshold``, whole-word, case-insensitive
    occurrences of that single best-matching term are replaced by its
    glossary value. At most one term is applied per sentence.

    Args:
        text: The text to post-process.
        glossary: Mapping of term -> replacement string.
        threshold: Minimum cosine similarity required to apply a replacement.

    Returns:
        The processed sentences, stripped and joined with single spaces. With
        an empty glossary the input is returned unchanged.
    """
    if not glossary:
        # Nothing to enforce; this also avoids encoding and reducing over an
        # empty term list, which would otherwise raise.
        return text

    glossary_items = tuple(sorted(glossary.items()))
    glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)

    if use_spacy:
        sentences = [sent.text for sent in nlp(text).sents]
    else:
        sentences = nltk.tokenize.sent_tokenize(text)

    updated_sentences = []
    for sentence in sentences:
        if not sentence.strip():
            continue
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
        max_score, max_idx = torch.max(cos_scores, dim=1)

        if max_score.item() >= threshold:
            term = glossary_terms[max_idx.item()]
            replacement = glossary[term]
            pattern = r'\b' + re.escape(term) + r'\b'
            # A callable replacement inserts the glossary value literally; a
            # plain string would have backslashes and group references
            # (e.g. "\1", "\g<0>") interpreted by re.sub.
            sentence = re.sub(pattern, lambda _match: replacement, sentence, flags=re.IGNORECASE)

        updated_sentences.append(sentence.strip())

    return " ".join(updated_sentences)

def validate_translation(original_text, final_text):
    """Ask the module-level chat model whether the translation keeps the meaning.

    Args:
        original_text: The text before translation.
        final_text: The translation to be checked.

    Returns:
        The content of the model's reply with surrounding whitespace removed.
    """
    proofreader_prompt = SystemMessage(content="You are an AI proofreader. Compare the original and final translation. Does the final translation retain the original meaning?")
    comparison = HumanMessage(content=f"Original Text: {original_text}\nFinal Translation: {final_text}\n")
    verdict = translator([proofreader_prompt, comparison])
    return verdict.content.strip()

# Streamlit UI
# Flow: collect the text, glossary file and threshold; on "Translate" run
# translation -> glossary enforcement -> meaning validation and show the result.
st.title("AI-Powered English to Canadian French Translator")
st.write("This app uses AI agents for translation, glossary enforcement, and meaning validation.")

input_text = st.text_area("Enter text to translate:")
glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.8)

if st.button("Translate"):
    if not input_text.strip():
        st.error("Please enter text to translate.")
    elif glossary_file is None:
        st.error("Glossary file is required.")
    else:
        try:
            glossary = load_glossary_from_excel(glossary_file)
        except (KeyError, ValueError) as err:
            # KeyError: a required column is missing from the sheet.
            # ValueError: pandas could not interpret the uploaded workbook.
            st.error(f"Could not load the glossary file: {err}")
        else:
            if not glossary:
                # Stop before calling the model: there is nothing to enforce.
                st.error("The glossary file contains no usable terms.")
            else:
                translated_text = translate_text(input_text)
                glossary_enforced_text = enforce_glossary(translated_text, glossary, threshold)
                validation_result = validate_translation(input_text, glossary_enforced_text)

                st.subheader("Final Translated Text:")
                st.write(glossary_enforced_text)

                st.subheader("Validation Check:")
                st.write(validation_result)