# text_translator / app.py
import os
import re
import openai
import streamlit as st
import pandas as pd
import torch
import nltk
import time
import subprocess
from concurrent.futures import ThreadPoolExecutor
from langchain_openai import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from sentence_transformers import SentenceTransformer, util
# Ensure necessary NLP models are available
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    print("Downloading NLTK punkt tokenizer...")
    nltk.download("punkt")

try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading SpaCy model...")
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")
# Load AI models
translator = ChatOpenAI(model="gpt-3.5-turbo")
model = SentenceTransformer('all-MiniLM-L6-v2')
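# Note: langchain's ChatOpenAI reads the OpenAI API key from the OPENAI_API_KEY
# environment variable, so it must be set (for example as a Space secret or via
# os.environ) before the translator above is invoked, otherwise translation calls
# will fail at runtime.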
@st.cache_data
def load_glossary_from_excel(glossary_file_bytes) -> dict:
    """Load glossary from an Excel file."""
    df = pd.read_excel(glossary_file_bytes)
    glossary = {}
    for _, row in df.iterrows():
        if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench']):
            glossary[row['English'].strip().lower()] = row['CanadianFrench'].strip()
    return glossary
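# Illustrative glossary layout (hypothetical example terms), matching the
# 'English' / 'CanadianFrench' columns the loader above expects:
#
#   English   | CanadianFrench
#   ----------|---------------
#   invoice   | facture
#   deadline  | échéance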
def retry_translate_text(text: str, glossary: dict, max_retries=3) -> str:
    """Ensures GPT prioritizes glossary terms using system messages."""
    glossary_prompt = "\n".join([f"{eng} -> {fr}" for eng, fr in glossary.items()])
    messages = [
        SystemMessage(content=f"Translate the following text to Canadian French while ensuring strict glossary replacements.\n\nGlossary:\n{glossary_prompt}"),
        HumanMessage(content=text)
    ]
    for attempt in range(max_retries):
        try:
            response = translator.invoke(messages)
            return response.content.strip()
        except Exception as e:
            print(f"Error in translation (attempt {attempt+1}): {e}")
            time.sleep(2)  # brief back-off before retrying
    return "Translation failed. Please try again later."
def enforce_glossary_with_semantics(text: str, glossary: dict, threshold: float) -> str:
    """Uses embeddings to enforce glossary replacement intelligently."""
    glossary_terms = list(glossary.keys())
    glossary_embeddings = model.encode(glossary_terms, convert_to_tensor=True)

    # Prefer SpaCy sentence segmentation when available; fall back to NLTK.
    sentences = nltk.tokenize.sent_tokenize(text) if not nlp else [sent.text for sent in nlp(text).sents]

    def process_sentence(sentence):
        # Find the glossary term whose embedding is closest to this sentence.
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
        max_score, max_idx = torch.max(cos_scores, dim=1)
        if max_score.item() >= threshold:
            term = glossary_terms[max_idx.item()]
            replacement = glossary[term]
            # Case-insensitive replacement, since glossary keys are lowercased on load.
            sentence = re.sub(re.escape(term), replacement, sentence, flags=re.IGNORECASE)
        return sentence.strip()

    with ThreadPoolExecutor() as executor:
        updated_sentences = list(executor.map(process_sentence, sentences))
    return " ".join(updated_sentences)
# Streamlit UI
st.title("AI-Powered English to Canadian French Translator")
st.write("This version guarantees glossary enforcement.")
input_text = st.text_area("Enter text to translate:")
glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.75)
if st.button("Translate"):
if not input_text.strip():
st.error("Please enter text to translate.")
elif glossary_file is None:
st.error("Glossary file is required.")
else:
glossary = load_glossary_from_excel(glossary_file)
# Step 1: Translate Text with GPT (Forcing Glossary)
translated_text = retry_translate_text(input_text, glossary)
# Step 2: Apply Semantic Matching to Guarantee Glossary
glossary_enforced_text = enforce_glossary_with_semantics(translated_text, glossary, threshold)
st.subheader("Final Translated Text:")
st.write(glossary_enforced_text)