Spaces:
Sleeping
Sleeping
File size: 5,985 Bytes
b7c9c63 57ec4e3 b7c9c63 57ec4e3 b7c9c63 57ec4e3 b7c9c63 57ec4e3 b7c9c63 57ec4e3 b7c9c63 57ec4e3 b7c9c63 57ec4e3 b7c9c63 57ec4e3 b7c9c63 57ec4e3 b7c9c63 57ec4e3 b7c9c63 57ec4e3 b7c9c63 57ec4e3 b7c9c63 57ec4e3 b7c9c63 57ec4e3 b7c9c63 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import os
import re
import openai
import streamlit as st
import pandas as pd
import torch
import nltk
import time
from concurrent.futures import ThreadPoolExecutor
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from sentence_transformers import SentenceTransformer, util
# Load NLP libraries
# spaCy is the preferred backend; if importing it or loading its English
# model fails for any reason, warn in the UI and fetch NLTK's "punkt" data.
use_spacy = False
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
except Exception:
    st.warning("SpaCy model not found, falling back to NLTK for tokenization.")
    nltk.download("punkt")
else:
    # Only reached when both the import and the model load succeeded.
    use_spacy = True
# Load AI models
translator = ChatOpenAI(model="gpt-3.5-turbo")


@st.cache_resource
def _load_sentence_model():
    """Build the sentence-embedding model once and reuse it across reruns.

    Streamlit re-executes the whole script on each widget interaction, so an
    uncached module-level constructor call would rebuild the model every time.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')


model = _load_sentence_model()
@st.cache_data
def load_glossary_from_excel(glossary_file_bytes) -> dict:
    """Load an English -> Canadian French glossary from an Excel sheet.

    The sheet is read with ``pandas.read_excel`` and must provide the columns
    ``English`` and ``CanadianFrench``. Rows with a missing cell in either
    column, or whose English term is blank, are skipped. English terms are
    stripped and lower-cased, and lemmatized when spaCy is in use.

    Args:
        glossary_file_bytes: Anything ``pandas.read_excel`` accepts as input.

    Returns:
        Dict mapping the normalized English term to the stripped French term,
        ordered from the longest key to the shortest.
    """
    df = pd.read_excel(glossary_file_bytes)
    glossary = {}
    for _, row in df.iterrows():
        english_cell = row['English']
        french_cell = row['CanadianFrench']
        if pd.isnull(english_cell) or pd.isnull(french_cell):
            continue
        # Excel cells can be numeric or date typed; coerce to str so that
        # .strip() cannot raise AttributeError.
        english_term = str(english_cell).strip().lower()
        french_term = str(french_cell).strip()
        if use_spacy:
            english_term = " ".join(token.lemma_ for token in nlp(english_term))
        if not english_term:
            # An empty key would turn into the pattern r'\b\b' during glossary
            # enforcement, which matches at every word boundary.
            continue
        glossary[english_term] = french_term
    return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))
@st.cache_data
def compute_glossary_embeddings_cached(glossary_items: tuple):
    """Encode the glossary terms with the sentence model (cached by Streamlit).

    Args:
        glossary_items: ``(term, translation)`` pairs.

    Returns:
        A ``(terms, embeddings)`` pair: the distinct terms in first-seen
        order, and the result of encoding them with ``convert_to_tensor=True``.
    """
    # Going through dict() collapses duplicate terms while keeping their order.
    unique_terms = list(dict(glossary_items))
    return unique_terms, model.encode(unique_terms, convert_to_tensor=True)
def retry_translate_text(text: str, max_retries=3) -> str:
    """Translate *text* to Canadian French, retrying when the call fails.

    Args:
        text: Source text sent to the chat model as the human message.
        max_retries: Maximum number of attempts.

    Returns:
        The model's reply with surrounding whitespace stripped, or the fixed
        string "Translation failed. Please try again later." when every
        attempt raised (or when ``max_retries`` is not positive).
    """
    for attempt in range(max_retries):
        try:
            messages = [
                SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and context."),
                HumanMessage(content=text),
            ]
            response = translator(messages)
            return response.content.strip()
        except Exception as e:
            print(f"Error in translation (attempt {attempt+1}): {e}")
            # Pause only when another attempt follows; sleeping after the
            # last failure would just delay the failure message.
            if attempt + 1 < max_retries:
                time.sleep(2)
    return "Translation failed. Please try again later."
def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
    """Apply glossary replacements to *text*, one sentence at a time.

    Each sentence is embedded and compared (cosine similarity) with the
    embeddings of all glossary keys. When the best score reaches the cutoff,
    whole-word, case-insensitive occurrences of that single best-matching key
    are replaced by its glossary value.

    Args:
        text: Text to process.
        glossary: Mapping of term -> replacement.
        threshold: Base similarity cutoff. Sentences with more than 10
            whitespace-separated words use ``threshold + 0.05``; shorter ones
            use ``threshold - 0.05`` (both clamped to [0, 1]). With 0.8 this
            yields 0.85 and 0.75.

    Returns:
        The processed sentences, each stripped, joined with single spaces.
        *text* is returned unchanged when *glossary* is empty.
    """
    if not glossary:
        # Nothing to enforce; also avoids taking a max over an empty score
        # matrix further down.
        return text

    glossary_items = tuple(sorted(glossary.items()))
    glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)
    if use_spacy:
        sentences = [sent.text for sent in nlp(text).sents]
    else:
        sentences = nltk.tokenize.sent_tokenize(text)

    length_adjustment = 0.05  # how far sentence length shifts the cutoff

    def process_sentence(sentence):
        """Processes a single sentence with glossary enforcement."""
        if not sentence.strip():
            return sentence
        # Stricter cutoff for long sentences, more lenient for short ones,
        # both derived from the caller-supplied threshold.
        if len(sentence.split()) > 10:
            dynamic_threshold = min(1.0, threshold + length_adjustment)
        else:
            dynamic_threshold = max(0.0, threshold - length_adjustment)
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
        max_score, max_idx = torch.max(cos_scores, dim=1)
        if max_score.item() >= dynamic_threshold:
            term = glossary_terms[int(max_idx.item())]
            replacement = glossary[term]
            # NOTE(review): the pattern is built from the glossary key, so a
            # substitution only happens when that key literally occurs in the
            # sentence - confirm callers pass text in the keys' language.
            pattern = r'\b' + re.escape(term) + r'\b'
            # A callable replacement is inserted literally; a plain string
            # would have its backslashes interpreted as regex escapes.
            sentence = re.sub(pattern, lambda _match: replacement, sentence, flags=re.IGNORECASE)
        return sentence.strip()

    # Process sentences in parallel for speed
    with ThreadPoolExecutor() as executor:
        updated_sentences = list(executor.map(process_sentence, sentences))
    return " ".join(updated_sentences)
def validate_translation(original_text, final_text):
    """Ask the chat model whether the translation kept the original meaning.

    Sends a proofreader instruction plus both texts to the model and returns
    its reply with surrounding whitespace removed.
    """
    reviewer_instruction = SystemMessage(
        content="You are an AI proofreader. Compare the original and final translation. Does the final translation retain the original meaning?"
    )
    texts_to_compare = HumanMessage(
        content=f"Original Text: {original_text}\nFinal Translation: {final_text}\n"
    )
    verdict = translator([reviewer_instruction, texts_to_compare])
    return verdict.content.strip()
def grammar_correction(text: str) -> str:
    """Ask the chat model to correct grammatical mistakes in *text*.

    Returns the model's reply with surrounding whitespace removed.
    """
    expert_instruction = SystemMessage(
        content="You are a French grammar expert. Correct any grammatical mistakes in the following text."
    )
    draft = HumanMessage(content=text)
    reply = translator([expert_instruction, draft])
    return reply.content.strip()
# Streamlit UI
st.title("Optimized AI-Powered English to Canadian French Translator")
st.write("This version includes retries, batch processing, glossary tuning, and grammar correction.")

# Inputs: source text, glossary spreadsheet, and similarity threshold.
input_text = st.text_area("Enter text to translate:")
glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.8)

if st.button("Translate"):
    if input_text.strip() and glossary_file is not None:
        # Pipeline: translate -> enforce glossary -> fix grammar -> validate.
        glossary = load_glossary_from_excel(glossary_file)
        translated_text = retry_translate_text(input_text)
        glossary_enforced_text = enforce_glossary(translated_text, glossary, threshold)
        corrected_text = grammar_correction(glossary_enforced_text)
        validation_result = validate_translation(input_text, corrected_text)

        st.subheader("Final Translated Text:")
        st.write(corrected_text)
        st.subheader("Validation Check:")
        st.write(validation_result)
    elif not input_text.strip():
        # Missing text takes precedence over a missing glossary.
        st.error("Please enter text to translate.")
    else:
        st.error("Glossary file is required.")
|