File size: 4,058 Bytes
b7c9c63
 
 
 
 
 
 
57ec4e3
fbf0833
57ec4e3
b7c9c63
6361829
b7c9c63
 
 
fbf0833
 
 
 
 
 
 
b7c9c63
 
 
fbf0833
 
 
 
b7c9c63
 
 
 
 
 
 
8447d74
b7c9c63
 
 
 
 
8447d74
b7c9c63
8447d74
 
 
 
 
 
 
 
 
 
b7c9c63
57ec4e3
 
 
 
 
 
fbf0833
b7c9c63
8447d74
fd72a6e
 
8447d74
 
 
b7c9c63
fbf0833
fd72a6e
57ec4e3
b7c9c63
 
 
 
fd72a6e
b7c9c63
 
8447d74
b7c9c63
57ec4e3
 
 
 
b7c9c63
 
 
 
fd72a6e
8447d74
b7c9c63
 
 
8447d74
b7c9c63
 
 
 
 
 
 
 
 
8447d74
 
b7c9c63
8447d74
 
fd72a6e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import os
import re
import subprocess
import sys
import time
from concurrent.futures import ThreadPoolExecutor

import nltk
import openai
import pandas as pd
import streamlit as st
import torch
from langchain.schema import SystemMessage, HumanMessage
from langchain_openai import ChatOpenAI
from sentence_transformers import SentenceTransformer, util

# Ensure necessary NLP models are available.
# The NLTK punkt tokenizer is looked up first and downloaded only if missing.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    print("Downloading NLTK punkt tokenizer...")
    nltk.download("punkt")

# spaCy's English pipeline is loaded into the module-level name `nlp`; when
# the model package is not installed (spacy.load raises OSError) it is
# downloaded once and loading is retried.
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading SpaCy model...")
    # Run the download with the interpreter executing this script so the model
    # is installed into the same environment; a bare "python" on PATH may be a
    # different installation. check=True surfaces a failed download right away
    # instead of a second, less informative OSError from spacy.load below.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

# Load AI models.
# Streamlit re-executes this script on every widget interaction, so the model
# objects are built inside st.cache_resource loaders: they are constructed once
# and the same instances are reused on later reruns.
@st.cache_resource(show_spinner=False)
def _load_translator():
    """Return the chat model used for translation (created once, then reused)."""
    return ChatOpenAI(model="gpt-3.5-turbo")


@st.cache_resource(show_spinner=False)
def _load_embedding_model():
    """Return the sentence-embedding model (created once, then reused)."""
    return SentenceTransformer('all-MiniLM-L6-v2')


translator = _load_translator()
model = _load_embedding_model()

@st.cache_data
def load_glossary_from_excel(glossary_file_bytes) -> dict:
    """Load an English -> Canadian French glossary from an Excel file.

    The sheet must contain the columns ``English`` and ``CanadianFrench``.
    Rows where either cell is missing or blank are skipped.

    Args:
        glossary_file_bytes: Anything ``pandas.read_excel`` accepts (path,
            bytes buffer, uploaded file object).

    Returns:
        A dict mapping the stripped, lower-cased English term to the stripped
        Canadian French term. Later rows overwrite earlier rows with the same
        English term.

    Raises:
        KeyError: If either required column is absent from the sheet.
    """
    df = pd.read_excel(glossary_file_bytes)
    glossary = {}

    # dropna() keeps only rows where both cells are non-null.
    pairs = df[['English', 'CanadianFrench']].dropna()
    for english, french in pairs.itertuples(index=False, name=None):
        # Cells may hold non-string values (e.g. numbers); coerce before
        # stripping so such rows do not raise AttributeError.
        term = str(english).strip().lower()
        translation = str(french).strip()
        # A whitespace-only cell becomes "" after stripping. An empty key is
        # never a usable glossary term (str.replace("", x) would insert x
        # between every character), so such rows are dropped.
        if term and translation:
            glossary[term] = translation

    return glossary

def retry_translate_text(text: str, glossary: dict, max_retries=3, *, retry_delay: float = 2.0) -> str:
    """Translate *text* to Canadian French with the chat model, retrying on error.

    The glossary is sent in the system message, one ``english -> french`` pair
    per line, and *text* is sent as the human message.

    Args:
        text: The text to translate.
        glossary: Mapping of English terms to their required French translation.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait between failed attempts.

    Returns:
        The stripped model response, or the fixed string
        ``"Translation failed. Please try again later."`` when every attempt
        raised.
    """
    # Separate each term from its translation; without a delimiter the pair is
    # fused into one token ("helloallo") and the model cannot tell them apart.
    glossary_prompt = "\n".join(f"{eng} -> {fr}" for eng, fr in glossary.items())

    messages = [
        SystemMessage(content=f"Translate the following text to Canadian French while ensuring strict glossary replacements.\n\nGlossary:\n{glossary_prompt}"),
        HumanMessage(content=text)
    ]

    for attempt in range(max_retries):
        try:
            # invoke() is the supported entry point; calling the model object
            # directly is deprecated in LangChain.
            response = translator.invoke(messages)
            return response.content.strip()
        except Exception as e:
            # Deliberately broad: any failure of the call is retried.
            print(f"Error in translation (attempt {attempt+1}): {e}")
            # No point in waiting after the last attempt has already failed.
            if attempt + 1 < max_retries:
                time.sleep(retry_delay)

    return "Translation failed. Please try again later."

def enforce_glossary_with_semantics(text: str, glossary: dict, threshold: float) -> str:
    """Apply glossary replacements to sentences that are semantically close to a term.

    The text is split into sentences. Each sentence and each glossary key is
    embedded with the module-level sentence-embedding model; for every sentence
    the glossary key with the highest cosine similarity is selected, and if that
    score reaches *threshold*, occurrences of the key in the sentence are
    replaced (case-insensitively) by its glossary value.

    Args:
        text: The text to post-process.
        glossary: Mapping of glossary term -> replacement.
        threshold: Minimum cosine similarity for a replacement to be applied.

    Returns:
        The processed sentences, each stripped and joined with single spaces.
        An empty string when *text* yields no sentences.
    """
    sentences = nltk.tokenize.sent_tokenize(text) if not nlp else [sent.text for sent in nlp(text).sents]
    if not sentences:
        return ""

    glossary_terms = list(glossary.keys())
    if not glossary_terms:
        # Nothing to match against; taking a max over an empty similarity
        # matrix would fail, so return the sentences normalised as usual.
        return " ".join(sentence.strip() for sentence in sentences)

    # Encode all terms and all sentences in one batch each instead of one
    # encode() call per sentence.
    glossary_embeddings = model.encode(glossary_terms, convert_to_tensor=True)
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)

    # cos_scores[i][j] is the similarity of sentence i to glossary term j.
    cos_scores = util.pytorch_cos_sim(sentence_embeddings, glossary_embeddings)
    best_scores, best_indices = torch.max(cos_scores, dim=1)

    updated_sentences = []
    # tolist() yields plain floats/ints, so the list is indexed with an int
    # rather than a tensor.
    for sentence, score, term_index in zip(sentences, best_scores.tolist(), best_indices.tolist()):
        if score >= threshold:
            term = glossary_terms[term_index]
            replacement = glossary[term]
            # Glossary keys are typically lower-cased, so match regardless of
            # case. The callable replacement keeps backslashes in the glossary
            # value from being interpreted as regex escapes.
            sentence = re.sub(re.escape(term), lambda _match: replacement, sentence, flags=re.IGNORECASE)
        updated_sentences.append(sentence.strip())

    return " ".join(updated_sentences)

# Streamlit UI

# Must stay identical to the fallback string retry_translate_text returns when
# every attempt fails, so a failure is reported instead of shown as a result.
_TRANSLATION_FAILED_MESSAGE = "Translation failed. Please try again later."


def _translate_and_render(text, uploaded_glossary, similarity_threshold):
    """Load the glossary, translate *text*, enforce the glossary, and render the result.

    Any problem (missing glossary columns, no usable glossary rows, failed
    translation) is reported with st.error and nothing else is rendered.
    """
    try:
        glossary = load_glossary_from_excel(uploaded_glossary)
    except KeyError as exc:
        # Raised when a required column is absent from the uploaded sheet.
        st.error(f"Glossary file must contain 'English' and 'CanadianFrench' columns (missing: {exc}).")
        return

    if not glossary:
        # An empty glossary gives the later steps nothing to enforce.
        st.error("Glossary file contains no usable entries.")
        return

    # Step 1: Translate Text with GPT (Forcing Glossary)
    translated_text = retry_translate_text(text, glossary)
    if translated_text == _TRANSLATION_FAILED_MESSAGE:
        st.error(translated_text)
        return

    # Step 2: Apply Semantic Matching to Guarantee Glossary
    glossary_enforced_text = enforce_glossary_with_semantics(translated_text, glossary, similarity_threshold)

    st.subheader("Final Translated Text:")
    st.write(glossary_enforced_text)


st.title("AI-Powered English to Canadian French Translator")
st.write("This version guarantees glossary enforcement.")

input_text = st.text_area("Enter text to translate:")
glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.75)

if st.button("Translate"):
    if not input_text.strip():
        st.error("Please enter text to translate.")
    elif glossary_file is None:
        st.error("Glossary file is required.")
    else:
        _translate_and_render(input_text, glossary_file, threshold)