# Streamlit app: translate English text to Canadian French with an OpenAI chat
# model, apply glossary replacements chosen by sentence-embedding similarity,
# and ask the chat model to check that the result keeps the original meaning.

import os
import re

import nltk
import openai
import pandas as pd
import streamlit as st
import torch
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from sentence_transformers import SentenceTransformer, util

# Column headers the glossary spreadsheet must contain.
_GLOSSARY_COLUMNS = ("English", "CanadianFrench")


@st.cache_resource
def _load_spacy_pipeline():
    """Return the spaCy ``en_core_web_sm`` pipeline, or ``None`` if unavailable.

    Cached as a resource so the pipeline is loaded once per server process
    instead of on every Streamlit rerun. When spaCy cannot be used, the NLTK
    sentence-tokenizer data is requested so the fallback path can run.
    """
    try:
        import spacy

        return spacy.load("en_core_web_sm")
    except Exception:
        # Deliberate best-effort: whatever goes wrong while importing or
        # loading spaCy, the app falls back to NLTK rather than crashing.
        nltk.download("punkt")
        # NOTE(review): recent NLTK releases appear to read sentence-tokenizer
        # data from "punkt_tab" instead of "punkt" -- confirm against the
        # NLTK version this app is deployed with.
        nltk.download("punkt_tab")
        return None


@st.cache_resource
def _load_translator():
    """Build the chat model client once per server process."""
    return ChatOpenAI(model="gpt-3.5-turbo")


@st.cache_resource
def _load_embedding_model():
    """Load the sentence-embedding model once per server process."""
    return SentenceTransformer('all-MiniLM-L6-v2')


# Try to load spaCy for advanced NLP processing
nlp = _load_spacy_pipeline()
use_spacy = nlp is not None
if not use_spacy:
    st.warning("SpaCy model not found, falling back to NLTK for tokenization.")

# Load AI models
translator = _load_translator()
model = _load_embedding_model()


@st.cache_data
def load_glossary_from_excel(glossary_file_bytes) -> dict:
    """Load an English -> Canadian French glossary from an Excel file.

    The sheet must have ``English`` and ``CanadianFrench`` columns. English
    terms are stripped and lower-cased, and lemmatized with spaCy when it is
    available; French terms are stripped. Rows where either cell is missing or
    blank are skipped.

    Args:
        glossary_file_bytes: Anything ``pandas.read_excel`` accepts (the UI
            passes the uploaded file object).

    Returns:
        A dict mapping the normalized English term to its French term, ordered
        from the longest English key to the shortest.

    Raises:
        KeyError: If a required column is absent from the sheet.
    """
    df = pd.read_excel(glossary_file_bytes)

    missing = [column for column in _GLOSSARY_COLUMNS if column not in df.columns]
    if missing:
        raise KeyError(
            "Glossary file is missing required column(s): " + ", ".join(missing)
        )

    glossary = {}
    for english, french in zip(df["English"], df["CanadianFrench"]):
        if pd.isnull(english) or pd.isnull(french):
            continue

        # str() tolerates numeric cells, which have no .strip() of their own.
        english_term = str(english).strip().lower()
        french_term = str(french).strip()

        # A blank key would later become the pattern r'\b\b', which matches at
        # every word boundary and would splice the replacement in everywhere.
        if not english_term or not french_term:
            continue

        if use_spacy:
            key = " ".join(token.lemma_ for token in nlp(english_term))
        else:
            key = english_term
        if key:
            glossary[key] = french_term

    return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))


@st.cache_data
def compute_glossary_embeddings_cached(glossary_items: tuple):
    """Compute cached embeddings for glossary terms.

    Args:
        glossary_items: Tuple of ``(term, replacement)`` pairs. A tuple is used
            (rather than a dict) by the caller in this file.

    Returns:
        ``(glossary_terms, embeddings)`` where ``glossary_terms`` is the list of
        terms and ``embeddings`` is the tensor returned by ``model.encode`` for
        those terms, in the same order.
    """
    glossary_terms = list(dict(glossary_items))
    embeddings = model.encode(glossary_terms, convert_to_tensor=True)
    return glossary_terms, embeddings


def translate_text(text: str) -> str:
    """Use the chat model to translate ``text`` to Canadian French.

    Returns the model's reply with surrounding whitespace removed.
    """
    messages = [
        SystemMessage(
            content=(
                "You are a professional translator. "
                "Translate the following text to Canadian French while "
                "preserving its meaning and context."
            )
        ),
        HumanMessage(content=text),
    ]
    response = translator(messages)
    return response.content.strip()


def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
    """Apply glossary replacements chosen by semantic similarity.

    The text is split into sentences. For each sentence, the glossary term
    whose embedding has the highest cosine similarity to the sentence is
    looked up; if that similarity is at least ``threshold``, whole-word,
    case-insensitive occurrences of the term in the sentence are replaced by
    its glossary value. At most one term is applied per sentence.

    NOTE(review): glossary keys are normalized English terms, while the UI in
    this file passes the already-translated text. A replacement therefore only
    fires when the English term still appears verbatim in that text -- confirm
    this is the intended behavior.

    Args:
        text: Text to process.
        glossary: Mapping of term -> replacement.
        threshold: Minimum cosine similarity for a replacement to be attempted.

    Returns:
        The processed sentences, stripped and joined with single spaces. If
        ``glossary`` is empty, ``text`` is returned unchanged.
    """
    if not glossary:
        # Nothing to enforce; also avoids taking a max over an empty tensor.
        return text

    glossary_items = tuple(sorted(glossary.items()))
    glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(
        glossary_items
    )

    if use_spacy:
        sentences = [sent.text for sent in nlp(text).sents]
    else:
        sentences = nltk.tokenize.sent_tokenize(text)

    updated_sentences = []
    for sentence in sentences:
        if not sentence.strip():
            continue

        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
        max_score, max_idx = torch.max(cos_scores, dim=1)

        if max_score.item() >= threshold:
            term = glossary_terms[max_idx.item()]
            replacement = glossary[term]
            pattern = r'\b' + re.escape(term) + r'\b'
            # A callable replacement is inserted literally; passing the string
            # itself would make re.sub interpret backslashes and group
            # references inside the glossary text.
            sentence = re.sub(
                pattern, lambda _match: replacement, sentence, flags=re.IGNORECASE
            )

        updated_sentences.append(sentence.strip())

    return " ".join(updated_sentences)


def validate_translation(original_text, final_text):
    """Ask the chat model whether the final translation keeps the original meaning.

    Returns the model's reply with surrounding whitespace removed.
    """
    messages = [
        SystemMessage(
            content=(
                "You are an AI proofreader. Compare the original and final translation. "
                "Does the final translation retain the original meaning?"
            )
        ),
        HumanMessage(
            content=f"Original Text: {original_text}\nFinal Translation: {final_text}\n"
        ),
    ]
    response = translator(messages)
    return response.content.strip()


# Streamlit UI
st.title("AI-Powered English to Canadian French Translator")
st.write("This app uses AI agents for translation, glossary enforcement, and meaning validation.")

input_text = st.text_area("Enter text to translate:")
glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.8)

if st.button("Translate"):
    if not input_text.strip():
        st.error("Please enter text to translate.")
    elif glossary_file is None:
        st.error("Glossary file is required.")
    else:
        try:
            glossary = load_glossary_from_excel(glossary_file)
        except (KeyError, ValueError) as exc:
            # Report an unusable glossary in the page instead of a traceback.
            detail = exc.args[0] if exc.args else exc
            st.error(f"Could not read glossary file: {detail}")
        else:
            translated_text = translate_text(input_text)
            glossary_enforced_text = enforce_glossary(translated_text, glossary, threshold)
            validation_result = validate_translation(input_text, glossary_enforced_text)

            st.subheader("Final Translated Text:")
            st.write(glossary_enforced_text)

            st.subheader("Validation Check:")
            st.write(validation_result)