Spaces:

garyd1
/

text_translator

Sleeping

File size: 5,985 Bytes

import os
import re
import openai
import streamlit as st
import pandas as pd
import torch
import nltk
import time
from concurrent.futures import ThreadPoolExecutor

from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from sentence_transformers import SentenceTransformer, util

# Load NLP libraries
# spaCy is the preferred backend. If importing it or loading its English model
# fails for any reason, the app falls back to NLTK sentence tokenization
# instead of crashing at start-up; `use_spacy` records which backend is active.
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
    use_spacy = True
except Exception:
    st.warning("SpaCy model not found, falling back to NLTK for tokenization.")
    # NLTK 3.9+ loads sentence-tokenizer data from "punkt_tab", older releases
    # from "punkt". Fetch both so sent_tokenize works on either version.
    for _nltk_resource in ("punkt", "punkt_tab"):
        nltk.download(_nltk_resource)
    use_spacy = False

# Load AI models
# Both objects are created at module level and shared by the functions below.
# Chat model used for translation, grammar correction and validation.
# NOTE(review): presumably reads the OpenAI API key from the environment — confirm.
translator = ChatOpenAI(model="gpt-3.5-turbo")
# Sentence-embedding model used to compare sentences against glossary terms.
model = SentenceTransformer('all-MiniLM-L6-v2')

@st.cache_data
def load_glossary_from_excel(glossary_file_bytes) -> dict:
    """Load an English -> Canadian French glossary from an Excel file.

    The sheet must contain ``English`` and ``CanadianFrench`` columns. Rows
    with a missing or blank value in either column are skipped. English terms
    are stripped and lower-cased and, when spaCy is available, lemmatized
    before being used as keys; French terms are only stripped.

    Args:
        glossary_file_bytes: Whatever ``pandas.read_excel`` accepts as its
            first argument (the UI passes the uploaded file object).

    Returns:
        A dict mapping normalized English term to French term, ordered from
        the longest key to the shortest.

    Raises:
        KeyError: If a required column is missing from the sheet.
    """
    df = pd.read_excel(glossary_file_bytes)
    glossary = {}

    for english, french in zip(df['English'], df['CanadianFrench']):
        if pd.isnull(english) or pd.isnull(french):
            continue

        # str() guards against numeric/date cells, which have no .strip().
        english_term = str(english).strip().lower()
        french_term = str(french).strip()
        if not english_term or not french_term:
            # A blank key would turn into the regex r'\b\b' during glossary
            # enforcement and match at every word boundary; a blank value
            # would silently delete the term. Neither is a usable entry.
            continue

        if use_spacy:
            lemmatized_term = " ".join(token.lemma_ for token in nlp(english_term))
        else:
            lemmatized_term = english_term
        glossary[lemmatized_term] = french_term

    return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))

@st.cache_data
def compute_glossary_embeddings_cached(glossary_items: tuple):
    """Encode the glossary keys and return them with their embeddings.

    ``glossary_items`` is a tuple of ``(term, translation)`` pairs. The result
    is ``(terms, embeddings)`` where ``terms`` is the list of distinct keys and
    ``embeddings`` is what ``model.encode`` produced for that list as a tensor.
    """
    # Passing through dict() collapses duplicate keys, keeping first-seen order.
    terms = list(dict(glossary_items))
    term_vectors = model.encode(terms, convert_to_tensor=True)
    return terms, term_vectors

def retry_translate_text(text: str, max_retries=3, retry_delay: float = 2) -> str:
    """Translate *text* to Canadian French, retrying when the call fails.

    Args:
        text: Source text to translate.
        max_retries: Maximum number of attempts.
        retry_delay: Seconds to wait between a failed attempt and the next one.

    Returns:
        The model's reply with surrounding whitespace stripped, or the literal
        message "Translation failed. Please try again later." when every
        attempt fails. No exception is propagated to the caller.
    """
    for attempt in range(max_retries):
        try:
            messages = [
                SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and context."),
                HumanMessage(content=text)
            ]
            response = translator(messages)
            return response.content.strip()
        except Exception as e:
            print(f"Error in translation (attempt {attempt+1}): {e}")
            # Only pause when another attempt follows; sleeping after the last
            # failure would just delay the error result.
            if attempt + 1 < max_retries:
                time.sleep(retry_delay)
    return "Translation failed. Please try again later."

def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
    """Apply glossary replacements to *text*, one best-matching term per sentence.

    The text is split into sentences (spaCy when available, otherwise NLTK).
    Each sentence is embedded and compared against every glossary key. When
    the best cosine similarity reaches the sentence's threshold, whole-word,
    case-insensitive occurrences of that key are replaced by its glossary value.

    Args:
        text: Text to post-process.
        glossary: Mapping of term to its enforced replacement.
        threshold: Base similarity threshold. Sentences longer than 10 words
            use ``threshold + 0.05`` (capped at 1.0) and shorter ones use
            ``threshold - 0.05``; a base of 0.8 yields 0.85 / 0.75.

    Returns:
        The processed sentences joined by single spaces. ``text`` is returned
        unchanged when the glossary is empty.
    """
    if not glossary:
        # No terms to compare against, so there is nothing to enforce.
        return text

    glossary_items = tuple(sorted(glossary.items()))
    glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)

    sentences = nltk.tokenize.sent_tokenize(text) if not use_spacy else [sent.text for sent in nlp(text).sents]

    # Length-dependent cut-offs derived from the caller's threshold.
    length_margin = 0.05
    long_sentence_threshold = min(1.0, threshold + length_margin)
    short_sentence_threshold = threshold - length_margin

    def process_sentence(sentence):
        """Processes a single sentence with glossary enforcement."""
        if not sentence.strip():
            return sentence

        # Dynamic threshold adjustment based on sentence length
        sentence_length = len(sentence.split())
        dynamic_threshold = long_sentence_threshold if sentence_length > 10 else short_sentence_threshold

        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
        max_score, max_idx = torch.max(cos_scores, dim=1)

        if max_score.item() >= dynamic_threshold:
            term = glossary_terms[max_idx.item()]
            replacement = glossary[term]
            pattern = r'\b' + re.escape(term) + r'\b'
            # A callable replacement is inserted verbatim; a plain string
            # would have its backslashes and group references interpreted.
            sentence = re.sub(pattern, lambda _match: replacement, sentence, flags=re.IGNORECASE)

        return sentence.strip()

    # Process sentences in parallel for speed
    with ThreadPoolExecutor() as executor:
        updated_sentences = list(executor.map(process_sentence, sentences))

    return " ".join(updated_sentences)

def validate_translation(original_text, final_text):
    """Ask the chat model whether the final translation keeps the original meaning.

    Sends both texts to the model with a proofreading prompt and returns its
    reply with surrounding whitespace removed.
    """
    proofreader_prompt = SystemMessage(content="You are an AI proofreader. Compare the original and final translation. Does the final translation retain the original meaning?")
    comparison = HumanMessage(content=f"Original Text: {original_text}\nFinal Translation: {final_text}\n")
    verdict = translator([proofreader_prompt, comparison])
    return verdict.content.strip()

def grammar_correction(text: str) -> str:
    """Ask the chat model to correct grammatical mistakes in *text*.

    Returns the model's reply with surrounding whitespace removed.
    """
    instructions = SystemMessage(content="You are a French grammar expert. Correct any grammatical mistakes in the following text.")
    draft = HumanMessage(content=text)
    reply = translator([instructions, draft])
    return reply.content.strip()

# Streamlit UI
# Collects the text, glossary and threshold, then runs the pipeline:
# translate -> enforce glossary -> grammar correction -> validation.
st.title("Optimized AI-Powered English to Canadian French Translator")
st.write("This version includes retries, batch processing, glossary tuning, and grammar correction.")

input_text = st.text_area("Enter text to translate:")
glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.8)

# retry_translate_text returns this exact string (instead of raising) when
# every attempt fails.
TRANSLATION_FAILED_MESSAGE = "Translation failed. Please try again later."

if st.button("Translate"):
    if not input_text.strip():
        st.error("Please enter text to translate.")
    elif glossary_file is None:
        st.error("Glossary file is required.")
    else:
        glossary = load_glossary_from_excel(glossary_file)
        translated_text = retry_translate_text(input_text)

        if translated_text == TRANSLATION_FAILED_MESSAGE:
            # Stop here: running the remaining steps on an error message would
            # waste two model calls and show a "corrected" error as the result.
            st.error(translated_text)
        else:
            glossary_enforced_text = enforce_glossary(translated_text, glossary, threshold)
            corrected_text = grammar_correction(glossary_enforced_text)
            validation_result = validate_translation(input_text, corrected_text)

            st.subheader("Final Translated Text:")
            st.write(corrected_text)

            st.subheader("Validation Check:")
            st.write(validation_result)