import re
import sys
import time
import subprocess
from concurrent.futures import ThreadPoolExecutor

import streamlit as st
import pandas as pd
import torch
import nltk
from langchain_openai import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from sentence_transformers import SentenceTransformer, util

# Ensure necessary NLP models are available
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    print("Downloading NLTK punkt tokenizer...")
    nltk.download("punkt")

try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading SpaCy model...")
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
    nlp = spacy.load("en_core_web_sm")

# Load AI models
translator = ChatOpenAI(model="gpt-3.5-turbo")
model = SentenceTransformer("all-MiniLM-L6-v2")


@st.cache_data
def load_glossary_from_excel(glossary_file_bytes) -> dict:
    """Load an English -> Canadian French glossary from an Excel file
    containing 'English' and 'CanadianFrench' columns."""
    df = pd.read_excel(glossary_file_bytes)
    glossary = {}
    for _, row in df.iterrows():
        if pd.notnull(row["English"]) and pd.notnull(row["CanadianFrench"]):
            glossary[row["English"].strip().lower()] = row["CanadianFrench"].strip()
    return glossary


def retry_translate_text(text: str, glossary: dict, max_retries: int = 3) -> str:
    """Translate text with GPT, instructing it via a system message to apply
    the glossary strictly; retry on transient API errors."""
    glossary_prompt = "\n".join(f"{eng} → {fr}" for eng, fr in glossary.items())
    messages = [
        SystemMessage(
            content=(
                "Translate the following text to Canadian French while ensuring "
                f"strict glossary replacements.\n\nGlossary:\n{glossary_prompt}"
            )
        ),
        HumanMessage(content=text),
    ]
    for attempt in range(max_retries):
        try:
            # .invoke() replaces the deprecated direct call translator(messages)
            response = translator.invoke(messages)
            return response.content.strip()
        except Exception as e:
            print(f"Error in translation (attempt {attempt + 1}): {e}")
            time.sleep(2)
    return "Translation failed. Please try again later."
def enforce_glossary_with_semantics(text: str, glossary: dict, threshold: float) -> str:
    """Post-process the translation: for each sentence, find the closest glossary
    term by embedding similarity and, above the threshold, force the replacement."""
    glossary_terms = list(glossary.keys())
    glossary_embeddings = model.encode(glossary_terms, convert_to_tensor=True)

    # Prefer spaCy sentence segmentation when available; otherwise fall back to NLTK.
    if nlp is not None:
        sentences = [sent.text for sent in nlp(text).sents]
    else:
        sentences = nltk.tokenize.sent_tokenize(text)

    def process_sentence(sentence: str) -> str:
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
        max_score, max_idx = torch.max(cos_scores, dim=1)
        if max_score.item() >= threshold:
            term = glossary_terms[max_idx.item()]  # .item() converts the index tensor to an int
            replacement = glossary[term]
            # Case-insensitive replacement, since glossary keys are stored lowercased.
            sentence = re.sub(re.escape(term), replacement, sentence, flags=re.IGNORECASE)
        return sentence.strip()

    with ThreadPoolExecutor() as executor:
        updated_sentences = list(executor.map(process_sentence, sentences))

    return " ".join(updated_sentences)


# Streamlit UI
st.title("AI-Powered English to Canadian French Translator")
st.write("Glossary terms are enforced via the GPT prompt and a semantic post-processing pass.")

input_text = st.text_area("Enter text to translate:")
glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.75)

if st.button("Translate"):
    if not input_text.strip():
        st.error("Please enter text to translate.")
    elif glossary_file is None:
        st.error("Glossary file is required.")
    else:
        glossary = load_glossary_from_excel(glossary_file)

        # Step 1: translate with GPT, with the glossary injected into the system prompt
        translated_text = retry_translate_text(input_text, glossary)

        # Step 2: apply semantic matching to guarantee glossary replacements
        glossary_enforced_text = enforce_glossary_with_semantics(translated_text, glossary, threshold)

        st.subheader("Final Translated Text:")
        st.write(glossary_enforced_text)
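
# --- Usage notes (a minimal sketch; the file name and glossary rows below are
# --- illustrative assumptions, not part of the original script) ---
#
# The uploaded Excel glossary must contain two columns named exactly
# 'English' and 'CanadianFrench', e.g.:
#
#     English   | CanadianFrench
#     invoice   | facture
#     shipping  | expédition
#
# Reading .xlsx files with pandas requires the openpyxl package, and ChatOpenAI
# picks up the API key from the OPENAI_API_KEY environment variable.
# Assuming this script is saved as app.py, launch it with:
#
#     streamlit run app.py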