# text_translator / app.py
# Hugging Face file view: last commit "Update app.py" by garyd1 (6361829, verified); file size 5.98 kB.
import os
import re
import openai
import streamlit as st
import pandas as pd
import torch
import nltk
import time
from concurrent.futures import ThreadPoolExecutor
from langchain_openai import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from sentence_transformers import SentenceTransformer, util
# Load NLP libraries
#
# Streamlit re-executes this script from the top on every widget interaction,
# so the heavyweight loaders below are wrapped in st.cache_resource to be built
# once and reused across reruns instead of being reloaded each time.
@st.cache_resource
def _load_spacy_pipeline():
    """Return the `en_core_web_sm` spaCy pipeline (cached across reruns)."""
    return spacy.load("en_core_web_sm")


@st.cache_resource
def _load_sentence_model():
    """Return the `all-MiniLM-L6-v2` sentence-embedding model (cached across reruns)."""
    return SentenceTransformer('all-MiniLM-L6-v2')


try:
    import spacy
    nlp = _load_spacy_pipeline()
    use_spacy = True
except Exception:
    # Deliberate best-effort fallback: any failure to import spaCy or to load
    # its model switches the app to NLTK sentence tokenization.
    st.warning("SpaCy model not found, falling back to NLTK for tokenization.")
    nltk.download("punkt")
    # Recent NLTK releases read the Punkt data from the "punkt_tab" package;
    # fetching both keeps sent_tokenize working on old and new versions.
    nltk.download("punkt_tab")
    nlp = None  # defined explicitly so the name always exists; guarded by use_spacy
    use_spacy = False

# Load AI models
translator = ChatOpenAI(model="gpt-3.5-turbo")
model = _load_sentence_model()
@st.cache_data
def load_glossary_from_excel(glossary_file_bytes) -> dict:
    """Load an English -> Canadian French glossary from an Excel sheet.

    The sheet must contain the columns ``English`` and ``CanadianFrench``.
    Rows where either cell is null or blank are skipped. English terms are
    stripped and lower-cased, then lemmatized with spaCy when it is
    available; French terms are stripped.

    Args:
        glossary_file_bytes: Anything ``pandas.read_excel`` accepts (the app
            passes the Streamlit uploaded file object).

    Returns:
        A dict mapping the normalized English term to its French term,
        ordered from the longest key to the shortest.

    Raises:
        KeyError: If a required column is missing from the sheet.
    """
    df = pd.read_excel(glossary_file_bytes)

    missing = [col for col in ("English", "CanadianFrench") if col not in df.columns]
    if missing:
        raise KeyError(f"Glossary file is missing required column(s): {', '.join(missing)}")

    glossary = {}
    for english, french in zip(df["English"], df["CanadianFrench"]):
        if pd.isnull(english) or pd.isnull(french):
            continue
        # Cells may hold numbers or other non-string values; coerce before
        # calling string methods so such rows do not crash the load.
        english_term = str(english).strip().lower()
        french_term = str(french).strip()
        # A blank key would later become a pattern that matches at every word
        # boundary, and a blank value would delete the term from the text.
        if not english_term or not french_term:
            continue
        if use_spacy:
            key = " ".join(token.lemma_ for token in nlp(english_term))
        else:
            key = english_term
        glossary[key] = french_term

    # Longest keys first.
    return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))
@st.cache_data
def compute_glossary_embeddings_cached(glossary_items: tuple):
    """Encode the glossary keys and return them alongside their embeddings.

    Args:
        glossary_items: Tuple of ``(term, translation)`` pairs; a tuple is
            used so the argument can serve as a cache key.

    Returns:
        ``(terms, embeddings)`` where ``terms`` is the list of glossary keys
        and ``embeddings`` is what ``model.encode`` returns for that list
        with ``convert_to_tensor=True``.
    """
    # Going through a dict keeps one entry per key, in first-seen order.
    unique_terms = [*dict(glossary_items)]
    term_vectors = model.encode(unique_terms, convert_to_tensor=True)
    return unique_terms, term_vectors
def retry_translate_text(text: str, max_retries=3) -> str:
    """Translate ``text`` to Canadian French, retrying on failure.

    Args:
        text: The text to translate.
        max_retries: Maximum number of attempts. With a value of zero or
            less no attempt is made.

    Returns:
        The model's reply with surrounding whitespace stripped, or the fixed
        string ``"Translation failed. Please try again later."`` when every
        attempt raised.
    """
    for attempt in range(max_retries):
        try:
            messages = [
                SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and context."),
                HumanMessage(content=text)
            ]
            # invoke() is the supported entry point; calling the model object
            # directly is deprecated in LangChain.
            response = translator.invoke(messages)
            return response.content.strip()
        except Exception as e:
            print(f"Error in translation (attempt {attempt+1}): {e}")
            # Only wait when another attempt will follow; sleeping after the
            # last failure just delays the error the user is about to see.
            if attempt + 1 < max_retries:
                time.sleep(2)
    return "Translation failed. Please try again later."
def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
    """Apply glossary replacements to ``text`` sentence by sentence.

    The text is split into sentences (spaCy when available, otherwise NLTK).
    Each sentence is embedded and compared with the glossary-term embeddings;
    when the best cosine score reaches the sentence's threshold, whole-word,
    case-insensitive occurrences of that single best-matching term are
    replaced by its glossary value.

    Args:
        text: Text to process.
        glossary: Mapping of term -> replacement.
        threshold: Base similarity threshold. Sentences longer than ten words
            use ``threshold + 0.05`` and shorter ones ``threshold - 0.05``
            (both clamped to ``[0, 1]``), so a base of 0.8 gives the 0.85 /
            0.75 pair this function previously hard-coded.

    Returns:
        The processed sentences joined with single spaces. ``text`` is
        returned unchanged when the glossary is empty or the text is blank.
    """
    # Nothing to enforce; also avoids taking a max over an empty score row.
    if not glossary or not text.strip():
        return text

    glossary_items = tuple(sorted(glossary.items()))
    glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)

    if use_spacy:
        sentences = [sent.text for sent in nlp(text).sents]
    else:
        sentences = nltk.tokenize.sent_tokenize(text)

    # Derive the per-length thresholds from the caller-supplied value.
    long_threshold = min(round(threshold + 0.05, 4), 1.0)
    short_threshold = max(round(threshold - 0.05, 4), 0.0)

    def process_sentence(sentence):
        """Processes a single sentence with glossary enforcement."""
        if not sentence.strip():
            return sentence

        # Dynamic threshold adjustment based on sentence length
        sentence_length = len(sentence.split())
        dynamic_threshold = long_threshold if sentence_length > 10 else short_threshold

        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
        scores = cos_scores[0]  # single query sentence -> one row of scores
        best_idx = int(scores.argmax())  # plain int so it can index the list

        if scores[best_idx].item() >= dynamic_threshold:
            term = glossary_terms[best_idx]
            replacement = glossary[term]
            pattern = r'\b' + re.escape(term) + r'\b'
            # A callable replacement is inserted literally, so backslashes or
            # "\1"-style sequences in a glossary value are not interpreted.
            sentence = re.sub(pattern, lambda _match: replacement, sentence, flags=re.IGNORECASE)
        return sentence.strip()

    # Process sentences concurrently; executor.map preserves input order.
    with ThreadPoolExecutor() as executor:
        updated_sentences = list(executor.map(process_sentence, sentences))
    return " ".join(updated_sentences)
def validate_translation(original_text, final_text):
    """Ask the chat model whether the final translation keeps the original meaning.

    Args:
        original_text: The source text.
        final_text: The translated text to check against it.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    messages = [
        SystemMessage(content="You are an AI proofreader. Compare the original and final translation. Does the final translation retain the original meaning?"),
        HumanMessage(content=f"Original Text: {original_text}\nFinal Translation: {final_text}\n")
    ]
    # invoke() is the supported entry point; calling the model object
    # directly is deprecated in LangChain.
    response = translator.invoke(messages)
    return response.content.strip()
def grammar_correction(text: str) -> str:
    """Ask the chat model to correct grammatical mistakes in ``text``.

    Args:
        text: The translated text to correct.

    Returns:
        The model's reply with surrounding whitespace stripped. Blank input
        yields an empty string without contacting the model.
    """
    # There is nothing to correct in blank text, so skip the API round-trip.
    if not text.strip():
        return ""

    messages = [
        SystemMessage(content="You are a French grammar expert. Correct any grammatical mistakes in the following text."),
        HumanMessage(content=text)
    ]
    # invoke() is the supported entry point; calling the model object
    # directly is deprecated in LangChain.
    response = translator.invoke(messages)
    return response.content.strip()
# Streamlit UI
st.title("Optimized AI-Powered English to Canadian French Translator")
st.write("This version includes retries, batch processing, glossary tuning, and grammar correction.")

input_text = st.text_area("Enter text to translate:")
glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.8)

if st.button("Translate"):
    if not input_text.strip():
        st.error("Please enter text to translate.")
    elif glossary_file is None:
        st.error("Glossary file is required.")
    else:
        # UI boundary: report an unreadable or malformed glossary as a
        # message instead of letting the exception surface as a traceback.
        try:
            glossary = load_glossary_from_excel(glossary_file)
        except Exception as exc:
            st.error(f"Could not read the glossary file: {exc}")
            st.stop()

        translated_text = retry_translate_text(input_text)
        # retry_translate_text signals exhaustion of its retries by returning
        # this fixed string; stop here rather than running glossary
        # enforcement, grammar correction and validation on an error message.
        if translated_text == "Translation failed. Please try again later.":
            st.error(translated_text)
            st.stop()

        glossary_enforced_text = enforce_glossary(translated_text, glossary, threshold)
        corrected_text = grammar_correction(glossary_enforced_text)
        validation_result = validate_translation(input_text, corrected_text)

        st.subheader("Final Translated Text:")
        st.write(corrected_text)
        st.subheader("Validation Check:")
        st.write(validation_result)