File size: 5,985 Bytes
b7c9c63
 
 
 
 
 
 
57ec4e3
 
b7c9c63
 
 
 
 
57ec4e3
b7c9c63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57ec4e3
b7c9c63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57ec4e3
 
 
 
 
 
 
 
 
 
 
 
 
 
b7c9c63
 
57ec4e3
b7c9c63
 
 
 
57ec4e3
 
 
b7c9c63
57ec4e3
 
 
 
 
 
b7c9c63
 
 
 
57ec4e3
b7c9c63
 
 
 
 
57ec4e3
 
 
 
 
b7c9c63
 
 
 
 
 
 
 
 
 
 
 
57ec4e3
 
 
 
 
 
 
 
 
b7c9c63
57ec4e3
 
b7c9c63
 
 
 
 
 
 
 
 
 
 
 
57ec4e3
b7c9c63
57ec4e3
 
b7c9c63
 
57ec4e3
b7c9c63
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import os
import re
import openai
import streamlit as st
import pandas as pd
import torch
import nltk
import time
from concurrent.futures import ThreadPoolExecutor

from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from sentence_transformers import SentenceTransformer, util

# Load NLP libraries
#
# Streamlit re-executes this script from the top on every widget interaction.
# The heavy loaders are wrapped in st.cache_resource so the spaCy pipeline and
# the sentence-embedding model are created once and then reused instead of
# being reloaded on each rerun.
@st.cache_resource
def _load_spacy_pipeline():
    """Load and return the `en_core_web_sm` spaCy pipeline."""
    return spacy.load("en_core_web_sm")


@st.cache_resource
def _load_embedding_model():
    """Load and return the `all-MiniLM-L6-v2` sentence-embedding model."""
    return SentenceTransformer('all-MiniLM-L6-v2')


try:
    import spacy
    nlp = _load_spacy_pipeline()
    use_spacy = True
except Exception:
    # Deliberate best-effort: any import or model-load failure switches the
    # app to NLTK sentence tokenization instead of aborting. `nlp` is left
    # undefined in that case, so every use of it must be guarded by `use_spacy`.
    st.warning("SpaCy model not found, falling back to NLTK for tokenization.")
    nltk.download("punkt")
    use_spacy = False

# Load AI models
translator = ChatOpenAI(model="gpt-3.5-turbo")
model = _load_embedding_model()

@st.cache_data
def load_glossary_from_excel(glossary_file_bytes) -> dict:
    """Load an English -> Canadian French glossary from an Excel file.

    The sheet is read with ``pd.read_excel`` and must contain the columns
    ``English`` and ``CanadianFrench``. Rows where either cell is null or
    blank after stripping are skipped. English terms are lower-cased and,
    when spaCy is available (``use_spacy``), replaced by the space-joined
    lemmas of their tokens. If two rows produce the same key, the later row
    wins.

    Args:
        glossary_file_bytes: Anything ``pd.read_excel`` accepts (the app
            passes the uploaded file object).

    Returns:
        A dict mapping the normalized English term to the stripped French
        term, ordered from the longest key to the shortest.

    Raises:
        KeyError: If the ``English`` or ``CanadianFrench`` column is missing.
    """
    df = pd.read_excel(glossary_file_bytes)
    glossary = {}

    for english, french in zip(df['English'], df['CanadianFrench']):
        if pd.isnull(english) or pd.isnull(french):
            continue

        # Cells may be parsed as numbers or dates; coerce to str so .strip()
        # cannot raise AttributeError on non-string values.
        english_term = str(english).strip().lower()
        french_term = str(french).strip()

        # A blank key would later turn into the regex r'\b\b', which matches
        # at every word boundary; a blank value would silently delete text.
        if not english_term or not french_term:
            continue

        if use_spacy:
            key = " ".join(token.lemma_ for token in nlp(english_term))
        else:
            key = english_term
        glossary[key] = french_term

    # Longest keys first, so multi-word terms come before their sub-terms.
    return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))

@st.cache_data
def compute_glossary_embeddings_cached(glossary_items: tuple):
    """Encode the glossary keys with the module-level embedding model.

    Args:
        glossary_items: Tuple of ``(term, translation)`` pairs. A tuple is
            used so the argument can serve as a cache key.

    Returns:
        A ``(terms, embeddings)`` pair: the list of glossary keys and the
        tensor produced by ``model.encode`` for those keys, in the same order.
    """
    # Going through a dict collapses duplicate keys while keeping the order in
    # which each key first appears.
    terms = list(dict(glossary_items))
    term_embeddings = model.encode(terms, convert_to_tensor=True)
    return terms, term_embeddings

def retry_translate_text(text: str, max_retries=3) -> str:
    """Translate ``text`` to Canadian French, retrying on failure.

    Each attempt sends a system prompt plus ``text`` to the module-level
    ``translator``. Any exception is printed and the call is retried after a
    2-second pause, up to ``max_retries`` attempts in total.

    Args:
        text: The text to translate.
        max_retries: Maximum number of attempts. With ``0`` (or less) no
            attempt is made and the failure message is returned.

    Returns:
        The stripped model response, or the literal string
        ``"Translation failed. Please try again later."`` when every attempt
        failed. The function does not raise on translation errors.
    """
    for attempt in range(max_retries):
        try:
            # Message construction stays inside the try block so that a
            # failure here is reported through the same path as an API error.
            messages = [
                SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and context."),
                HumanMessage(content=text)
            ]
            response = translator(messages)
            return response.content.strip()
        except Exception as e:
            print(f"Error in translation (attempt {attempt+1}): {e}")
            # Only pause when another attempt follows; sleeping after the
            # final failure would just delay the error result.
            if attempt < max_retries - 1:
                time.sleep(2)  # Wait before retrying
    return "Translation failed. Please try again later."

def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
    """Apply glossary replacements to ``text`` based on semantic similarity.

    The text is split into sentences (spaCy when ``use_spacy`` is true,
    otherwise NLTK). All sentences are embedded in a single batch and compared
    against the cached glossary-term embeddings. For each sentence, only the
    single best-scoring glossary term is considered: if its cosine similarity
    reaches the required threshold, whole-word, case-insensitive occurrences
    of that term in the sentence are replaced by its glossary value.

    The required threshold is derived from ``threshold``: sentences longer
    than 10 whitespace-separated words need ``threshold + 0.05``, shorter ones
    ``threshold - 0.05`` (both clamped to ``[0.0, 1.0]``). With
    ``threshold=0.8`` this gives the previously hard-coded 0.85 / 0.75.

    Args:
        text: The text to process.
        glossary: Mapping of term -> replacement.
        threshold: Base similarity threshold.

    Returns:
        The processed sentences, each stripped and joined by single spaces.
        If ``glossary`` is empty, ``text`` is returned unchanged.
    """
    if not glossary:
        # Nothing to enforce; this also avoids taking a max over an empty
        # similarity matrix further down.
        return text

    # Sorting gives a stable, hashable cache key for the embedding cache.
    glossary_items = tuple(sorted(glossary.items()))
    glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)

    if use_spacy:
        sentences = [sent.text for sent in nlp(text).sents]
    else:
        sentences = nltk.tokenize.sent_tokenize(text)
    if not sentences:
        return ""

    length_adjustment = 0.05
    long_sentence_threshold = min(1.0, threshold + length_adjustment)
    short_sentence_threshold = max(0.0, threshold - length_adjustment)

    # One batched encode call replaces per-sentence encoding in worker threads.
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(sentence_embeddings, glossary_embeddings)
    best_scores, best_indices = torch.max(cos_scores, dim=1)

    updated_sentences = []
    for sentence, score, term_index in zip(
        sentences, best_scores.tolist(), best_indices.tolist()
    ):
        if not sentence.strip():
            updated_sentences.append(sentence)
            continue

        if len(sentence.split()) > 10:
            required_score = long_sentence_threshold
        else:
            required_score = short_sentence_threshold

        if score >= required_score:
            term = glossary_terms[term_index]
            replacement = glossary[term]
            pattern = r'\b' + re.escape(term) + r'\b'
            # A callable replacement inserts the glossary value literally; a
            # plain string would have backslashes and group references
            # (e.g. "\1") interpreted by re.sub.
            sentence = re.sub(
                pattern, lambda _match: replacement, sentence, flags=re.IGNORECASE
            )

        updated_sentences.append(sentence.strip())

    return " ".join(updated_sentences)

def validate_translation(original_text, final_text):
    """Ask the chat model whether the final translation keeps the original meaning.

    Args:
        original_text: The source text.
        final_text: The translation to check.

    Returns:
        The model's reply with surrounding whitespace removed.
    """
    proofreader_prompt = SystemMessage(content="You are an AI proofreader. Compare the original and final translation. Does the final translation retain the original meaning?")
    comparison_request = HumanMessage(
        content=f"Original Text: {original_text}\nFinal Translation: {final_text}\n"
    )
    verdict = translator([proofreader_prompt, comparison_request])
    return verdict.content.strip()

def grammar_correction(text: str) -> str:
    """Send ``text`` to the chat model with a French grammar-correction prompt.

    Args:
        text: The text to correct.

    Returns:
        The model's reply with surrounding whitespace removed.
    """
    grammar_prompt = SystemMessage(content="You are a French grammar expert. Correct any grammatical mistakes in the following text.")
    text_to_fix = HumanMessage(content=text)
    corrected = translator([grammar_prompt, text_to_fix])
    return corrected.content.strip()

# Streamlit UI
st.title("Optimized AI-Powered English to Canadian French Translator")
st.write("This version includes retries, batch processing, glossary tuning, and grammar correction.")

input_text = st.text_area("Enter text to translate:")
glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.8)

if st.button("Translate"):
    if not input_text.strip():
        st.error("Please enter text to translate.")
    elif glossary_file is None:
        st.error("Glossary file is required.")
    else:
        # Top-level boundary: show a readable message instead of a raw
        # traceback when the workbook is unreadable or lacks the expected
        # columns.
        try:
            glossary = load_glossary_from_excel(glossary_file)
        except Exception as exc:
            st.error(f"Could not load glossary file: {exc}")
            st.stop()

        translated_text = retry_translate_text(input_text)

        # retry_translate_text reports failure by returning this exact
        # placeholder rather than raising. Stop here so the placeholder is not
        # run through glossary enforcement, grammar correction, and validation
        # as if it were a translation.
        if translated_text == "Translation failed. Please try again later.":
            st.error(translated_text)
            st.stop()

        glossary_enforced_text = enforce_glossary(translated_text, glossary, threshold)
        corrected_text = grammar_correction(glossary_enforced_text)
        validation_result = validate_translation(input_text, corrected_text)

        st.subheader("Final Translated Text:")
        st.write(corrected_text)

        st.subheader("Validation Check:")
        st.write(validation_result)