Spaces:
Sleeping
Sleeping
import os | |
import re | |
import openai | |
import streamlit as st | |
import pandas as pd | |
import torch | |
import nltk | |
import time | |
from concurrent.futures import ThreadPoolExecutor | |
from langchain_openai import ChatOpenAI | |
from langchain.schema import SystemMessage, HumanMessage | |
from sentence_transformers import SentenceTransformer, util | |
# Load NLP libraries.
# spaCy is the preferred backend (sentence splitting + lemmatization); NLTK
# sentence tokenization is the fallback when spaCy or its model is unavailable.
try:
    import spacy

    nlp = spacy.load("en_core_web_sm")
    use_spacy = True
except Exception:
    # Deliberate best-effort: any failure to import spaCy or load the model
    # switches the app to the NLTK code path instead of crashing.
    st.warning("SpaCy model not found, falling back to NLTK for tokenization.")
    nltk.download("punkt")
    # Recent NLTK releases (3.9+) read the sentence tokenizer data from the
    # "punkt_tab" resource rather than "punkt"; fetch both so sent_tokenize
    # works regardless of the installed NLTK version.
    nltk.download("punkt_tab")
    # Keep the name bound so an accidental reference fails on `None` rather
    # than with a NameError; callers must check `use_spacy` first.
    nlp = None
    use_spacy = False

# Load AI models
translator = ChatOpenAI(model="gpt-3.5-turbo")
model = SentenceTransformer('all-MiniLM-L6-v2')
def load_glossary_from_excel(glossary_file_bytes) -> dict:
    """Load an English -> Canadian French glossary from an Excel workbook.

    The sheet must provide ``English`` and ``CanadianFrench`` columns. Rows in
    which either cell is missing or blank are skipped. English terms are
    lower-cased and, when spaCy is available (``use_spacy``), replaced by the
    space-joined lemmas of their tokens; otherwise the lower-cased term is
    used as-is.

    Args:
        glossary_file_bytes: Anything ``pandas.read_excel`` accepts (path,
            bytes buffer, or file-like object such as an uploaded file).

    Returns:
        A dict mapping the normalized English term to its French translation,
        ordered from the longest key to the shortest.

    Raises:
        KeyError: If the ``English`` or ``CanadianFrench`` column is absent.
    """
    df = pd.read_excel(glossary_file_bytes)
    glossary = {}
    for english, french in zip(df['English'], df['CanadianFrench']):
        if pd.isnull(english) or pd.isnull(french):
            continue
        # Excel cells may hold numbers or other non-string values; coerce
        # before using string methods so such rows do not raise.
        english_term = str(english).strip().lower()
        french_term = str(french).strip()
        # A blank key would later build the regex r'\b\b', which matches at
        # every word boundary, so blank entries are dropped here.
        if not english_term or not french_term:
            continue
        if use_spacy:
            english_term = " ".join(token.lemma_ for token in nlp(english_term))
        glossary[english_term] = french_term
    # Longest terms first, so multi-word entries precede their sub-terms.
    return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))
# Streamlit re-executes this script on every interaction, so the cache has to
# live in Streamlit (st.cache_data) rather than in a module-level memo that
# would be rebuilt on each rerun.
@st.cache_data
def compute_glossary_embeddings_cached(glossary_items: tuple):
    """Return the glossary terms and their sentence-transformer embeddings.

    Results are cached by Streamlit keyed on ``glossary_items``, so the same
    glossary is only encoded once.

    Args:
        glossary_items: Tuple of ``(term, translation)`` pairs. A tuple is
            used (instead of a dict) so the argument is hashable for caching.

    Returns:
        A ``(glossary_terms, embeddings)`` pair: the list of terms and the
        tensor produced by ``model.encode`` for that list, in the same order.
    """
    glossary_terms = list(dict(glossary_items))
    embeddings = model.encode(glossary_terms, convert_to_tensor=True)
    return glossary_terms, embeddings
def retry_translate_text(text: str, max_retries=3) -> str:
    """Translate ``text`` to Canadian French, retrying on failure.

    Args:
        text: Source text sent to the chat model as the human message.
        max_retries: Maximum number of attempts. With ``0`` (or a negative
            value) no request is made.

    Returns:
        The stripped model response, or the fixed string
        ``"Translation failed. Please try again later."`` when every attempt
        raised.
    """
    for attempt in range(max_retries):
        try:
            messages = [
                SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and context."),
                HumanMessage(content=text)
            ]
            # `.invoke` is the supported entry point; calling the chat model
            # object directly is deprecated in LangChain.
            response = translator.invoke(messages)
            return response.content.strip()
        except Exception as e:
            print(f"Error in translation (attempt {attempt+1}): {e}")
            # Only wait when another attempt will follow; sleeping after the
            # final failure would just delay the error by two seconds.
            if attempt + 1 < max_retries:
                time.sleep(2)
    return "Translation failed. Please try again later."
def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
    """Apply glossary replacements to ``text`` based on semantic similarity.

    The text is split into sentences (spaCy when ``use_spacy`` is set, NLTK
    otherwise). All sentences are embedded in one batch and compared with the
    glossary term embeddings. For each sentence, the single best-matching
    term is replaced (whole-word, case-insensitive) by its glossary value,
    provided the cosine similarity reaches the required threshold.

    Args:
        text: Text to post-process.
        glossary: Mapping of term -> replacement.
        threshold: Base similarity threshold. Sentences longer than 10 words
            require ``threshold + 0.05``; shorter ones ``threshold - 0.05``
            (clamped to ``[0, 1]``). At ``0.8`` this gives 0.85 / 0.75.

    Returns:
        The processed sentences, stripped and joined with single spaces.
        ``text`` is returned unchanged when the glossary is empty or the text
        is blank.

    NOTE(review): in this file the glossary keys come from the English column
    while the text passed in is the French translation - confirm the
    whole-word regex is expected to find those keys in the text.
    """
    # Nothing to match against (encoding an empty glossary would make the
    # max-reduction below fail) or nothing to process.
    if not glossary or not text.strip():
        return text

    glossary_items = tuple(sorted(glossary.items()))
    glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)

    if use_spacy:
        raw_sentences = [sent.text for sent in nlp(text).sents]
    else:
        raw_sentences = nltk.tokenize.sent_tokenize(text)
    sentences = [s.strip() for s in raw_sentences if s.strip()]
    if not sentences:
        return ""

    # Longer sentences must match more strongly than short ones.
    long_threshold = min(1.0, threshold + 0.05)
    short_threshold = max(0.0, threshold - 0.05)

    # One batched encode instead of one model call per sentence.
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    # cos_scores[i][j] is the similarity of sentence i and glossary term j.
    cos_scores = util.pytorch_cos_sim(sentence_embeddings, glossary_embeddings)
    best_scores, best_indices = cos_scores.max(dim=1)

    updated_sentences = []
    for sentence, score, term_idx in zip(sentences, best_scores.tolist(), best_indices.tolist()):
        required = long_threshold if len(sentence.split()) > 10 else short_threshold
        if score >= required:
            term = glossary_terms[term_idx]
            replacement = glossary[term]
            pattern = r'\b' + re.escape(term) + r'\b'
            # A callable replacement inserts the text literally; a plain
            # string would have backslashes parsed as escapes/group refs.
            sentence = re.sub(pattern, lambda _match: replacement, sentence, flags=re.IGNORECASE)
        updated_sentences.append(sentence.strip())
    return " ".join(updated_sentences)
def validate_translation(original_text, final_text):
    """Ask the chat model whether the final translation keeps the original meaning.

    Args:
        original_text: The source text that was translated.
        final_text: The translation to check against the source.

    Returns:
        The model's free-text answer, stripped of surrounding whitespace.
        Exceptions raised by the model call propagate to the caller.
    """
    messages = [
        SystemMessage(content="You are an AI proofreader. Compare the original and final translation. Does the final translation retain the original meaning?"),
        HumanMessage(content=f"Original Text: {original_text}\nFinal Translation: {final_text}\n")
    ]
    # `.invoke` replaces the deprecated direct call on the chat model object.
    response = translator.invoke(messages)
    return response.content.strip()
def grammar_correction(text: str) -> str:
    """Ask the chat model to fix grammar issues in the translated text.

    Args:
        text: Text to correct.

    Returns:
        The model's corrected text, stripped of surrounding whitespace. Blank
        input is returned unchanged without calling the model. Exceptions
        raised by the model call propagate to the caller.
    """
    # Nothing to correct: skip the API round-trip.
    if not text.strip():
        return text
    messages = [
        SystemMessage(content="You are a French grammar expert. Correct any grammatical mistakes in the following text."),
        HumanMessage(content=text)
    ]
    # `.invoke` replaces the deprecated direct call on the chat model object.
    response = translator.invoke(messages)
    return response.content.strip()
# Streamlit UI
# Pipeline on "Translate": load glossary -> translate -> enforce glossary ->
# grammar correction -> meaning validation, then render the results.

# Sentinel returned by retry_translate_text when every attempt failed; it
# must not be treated as a real translation.
_TRANSLATION_FAILED = "Translation failed. Please try again later."

st.title("Optimized AI-Powered English to Canadian French Translator")
st.write("This version includes retries, batch processing, glossary tuning, and grammar correction.")
input_text = st.text_area("Enter text to translate:")
glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.8)

if st.button("Translate"):
    if not input_text.strip():
        st.error("Please enter text to translate.")
    elif glossary_file is None:
        st.error("Glossary file is required.")
    else:
        # UI boundary: show a readable message for a malformed workbook
        # (unreadable file, missing columns) instead of a raw traceback.
        try:
            glossary = load_glossary_from_excel(glossary_file)
        except Exception as exc:
            st.error(f"Could not read the glossary file: {exc}")
            st.stop()

        translated_text = retry_translate_text(input_text)
        if translated_text == _TRANSLATION_FAILED:
            # Do not push the failure message through glossary enforcement,
            # grammar correction and validation as if it were a translation.
            st.error(translated_text)
            st.stop()

        glossary_enforced_text = enforce_glossary(translated_text, glossary, threshold)
        corrected_text = grammar_correction(glossary_enforced_text)
        validation_result = validate_translation(input_text, corrected_text)

        st.subheader("Final Translated Text:")
        st.write(corrected_text)
        st.subheader("Validation Check:")
        st.write(validation_result)