Spaces:
Sleeping
Sleeping
File size: 5,927 Bytes
b7c9c63 57ec4e3 fbf0833 57ec4e3 b7c9c63 6361829 b7c9c63 fbf0833 b7c9c63 fbf0833 b7c9c63 57ec4e3 b7c9c63 fbf0833 b7c9c63 fd72a6e 57ec4e3 fd72a6e 57ec4e3 fbf0833 57ec4e3 b7c9c63 fd72a6e b7c9c63 fbf0833 fd72a6e 57ec4e3 b7c9c63 57ec4e3 b7c9c63 fd72a6e b7c9c63 57ec4e3 b7c9c63 fd72a6e b7c9c63 fd72a6e b7c9c63 fd72a6e b7c9c63 fd72a6e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
import os
import re
import subprocess
import sys
import time
from concurrent.futures import ThreadPoolExecutor

import nltk
import openai
import pandas as pd
import streamlit as st
import torch
from langchain.schema import SystemMessage, HumanMessage
from langchain_openai import ChatOpenAI
from sentence_transformers import SentenceTransformer, util
# Ensure necessary NLP models are available
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    print("Downloading NLTK punkt tokenizer...")
    nltk.download("punkt")

# Imported outside the try block so the guarded region only covers the model
# load; a missing spaCy package still surfaces as a plain ImportError.
import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading SpaCy model...")
    # Run the download with the interpreter that is executing this script
    # (a bare "python" on PATH may belong to a different environment), and
    # fail loudly if the download itself does not succeed.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

# Load AI models
translator = ChatOpenAI(model="gpt-3.5-turbo")
model = SentenceTransformer('all-MiniLM-L6-v2')
@st.cache_data
def load_glossary_from_excel(glossary_file_bytes) -> dict:
    """Load an English -> Canadian French glossary from an Excel file.

    Reads the ``English`` and ``CanadianFrench`` columns, lower-cases each
    English term and, when the module-level spaCy pipeline ``nlp`` is
    truthy, replaces it with the space-joined lemmas of its tokens.

    Args:
        glossary_file_bytes: Any input accepted by ``pandas.read_excel``
            (the UI passes the uploaded file object).

    Returns:
        A dict mapping the normalized English term to the stripped French
        term, ordered from the longest key to the shortest.

    Raises:
        KeyError: If the ``English`` or ``CanadianFrench`` column is missing.
    """
    df = pd.read_excel(glossary_file_bytes)
    glossary = {}
    for english_cell, french_cell in zip(df['English'], df['CanadianFrench']):
        if pd.isnull(english_cell) or pd.isnull(french_cell):
            continue
        # Cells can hold numbers or dates; coerce to text instead of
        # crashing on a missing ``.strip`` attribute.
        english_term = str(english_cell).strip().lower()
        french_term = str(french_cell).strip()
        # A blank term on either side is unusable: an empty English key
        # would later turn into a regex that matches at every word boundary.
        if not english_term or not french_term:
            continue
        if nlp:
            lemmatized_term = " ".join(token.lemma_ for token in nlp(english_term))
        else:
            lemmatized_term = english_term
        glossary[lemmatized_term] = french_term
    # Longest keys first, so callers that iterate in order try multi-word
    # terms before the shorter terms they may contain.
    return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))
@st.cache_data
def compute_glossary_embeddings_cached(glossary_items: tuple):
    """Encode the glossary keys and return ``(terms, embeddings)``.

    ``glossary_items`` is a tuple of ``(term, translation)`` pairs. The terms
    are returned as a list in the same order as the rows of the embedding
    tensor produced by the module-level ``model``.
    """
    # Going through dict() collapses duplicate keys while keeping the order
    # in which each key first appears.
    terms = list(dict(glossary_items))
    term_embeddings = model.encode(terms, convert_to_tensor=True)
    return terms, term_embeddings
def enforce_glossary_pre_translation(text: str, glossary: dict) -> str:
    """Upper-case every glossary term found in the English text.

    Each key of ``glossary`` is matched as a whole word, case-insensitively,
    and replaced by its upper-cased form so the term stands out in the text
    that is sent for translation.

    Args:
        text: English source text.
        glossary: Mapping whose keys are the English terms to emphasize;
            the values are not used here.

    Returns:
        The text with every matched term upper-cased.
    """
    for eng_term in glossary:
        if not eng_term.strip():
            # A blank key carries no term to emphasize.
            continue
        pattern = r'\b' + re.escape(eng_term) + r'\b'
        emphasized = eng_term.upper()
        # Use a callable replacement so the term is inserted literally;
        # a plain string would be parsed as a template and a backslash in
        # the term would raise re.error.
        text = re.sub(
            pattern,
            lambda _match, _repl=emphasized: _repl,
            text,
            flags=re.IGNORECASE,
        )
    return text
def retry_translate_text(text: str, max_retries=3) -> str:
    """Translate ``text`` to Canadian French, retrying on failure.

    Sends a fixed system prompt plus ``text`` to the module-level
    ``translator`` chat model. Any exception raised by the call is printed
    and the call is retried after a 2-second pause.

    Args:
        text: The text to translate.
        max_retries: Maximum number of attempts.

    Returns:
        The stripped content of the model response, or the fixed string
        ``"Translation failed. Please try again later."`` when every attempt
        failed (or ``max_retries`` is not positive).
    """
    # The prompt does not change between attempts, so build it once.
    messages = [
        SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and respecting these specific terms."),
        HumanMessage(content=text),
    ]
    for attempt in range(max_retries):
        try:
            response = translator.invoke(messages)
            return response.content.strip()
        except Exception as e:  # API boundary: any failure triggers a retry
            print(f"Error in translation (attempt {attempt+1}): {e}")
            # Only wait when another attempt will follow; sleeping after the
            # final failure just delays the error result.
            if attempt + 1 < max_retries:
                time.sleep(2)
    return "Translation failed. Please try again later."
def enforce_glossary_post_translation(text: str, glossary: dict) -> str:
    """Replace English glossary terms left in the translation.

    Each key of ``glossary`` is matched as a whole word, case-insensitively
    (so both the upper-cased marker and any other casing are caught), and
    replaced by its glossary value.

    Args:
        text: Translated text that may still contain English terms.
        glossary: Mapping of English term -> required French term, applied
            in the mapping's iteration order.

    Returns:
        The text with every matched English term replaced.
    """
    for eng_term, fr_term in glossary.items():
        if not eng_term.strip():
            # A blank key would compile to a pattern that matches at every
            # word boundary and scatter the replacement through the text.
            continue
        pattern = r'\b' + re.escape(eng_term.upper()) + r'\b'
        # Use a callable replacement so the French term is inserted
        # literally; a plain string would be parsed as a template, so a
        # backslash or "\1" in the term would crash or corrupt the output.
        text = re.sub(
            pattern,
            lambda _match, _repl=fr_term: _repl,
            text,
            flags=re.IGNORECASE,
        )
    return text
def enforce_glossary_with_semantics(text: str, glossary: dict, threshold: float) -> str:
    """Apply glossary replacements to sentences that resemble a glossary term.

    The text is split into sentences (with spaCy when ``nlp`` is truthy,
    otherwise with NLTK). Every non-blank sentence is embedded with the
    module-level ``model`` and compared to the glossary-term embeddings by
    cosine similarity. When a sentence's best score is at least
    ``threshold``, whole-word, case-insensitive occurrences of that
    best-matching term are replaced by its glossary value.

    Args:
        text: Text to post-process.
        glossary: Mapping of term -> replacement.
        threshold: Minimum cosine similarity for a replacement to be applied.

    Returns:
        The processed sentences joined with single spaces. When ``glossary``
        is empty the text is returned unchanged.

    NOTE(review): the visible caller passes already-translated text while the
    glossary keys are English, so the regex may rarely match -- confirm the
    intended input language.
    """
    if not glossary:
        # Nothing to compare against; also avoids encoding an empty list.
        return text

    glossary_items = tuple(sorted(glossary.items()))
    glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)

    if nlp:
        sentences = [sent.text for sent in nlp(text).sents]
    else:
        sentences = nltk.tokenize.sent_tokenize(text)

    # Blank sentences are passed through untouched and are not embedded.
    candidate_positions = [
        position for position, sentence in enumerate(sentences) if sentence.strip()
    ]
    if not candidate_positions:
        return " ".join(sentences)

    # Encode all candidate sentences in one batch instead of issuing one
    # model call per sentence from a thread pool.
    sentence_embeddings = model.encode(
        [sentences[position] for position in candidate_positions],
        convert_to_tensor=True,
    )
    cos_scores = util.pytorch_cos_sim(sentence_embeddings, glossary_embeddings)
    best_scores, best_indices = torch.max(cos_scores, dim=1)

    for position, score, term_index in zip(
        candidate_positions, best_scores.tolist(), best_indices.tolist()
    ):
        sentence = sentences[position]
        if score >= threshold:
            term = glossary_terms[term_index]
            replacement = glossary[term]
            pattern = r'\b' + re.escape(term) + r'\b'
            # Callable replacement: insert the glossary value literally
            # rather than as a regex template.
            sentence = re.sub(
                pattern,
                lambda _match, _repl=replacement: _repl,
                sentence,
                flags=re.IGNORECASE,
            )
        sentences[position] = sentence.strip()

    return " ".join(sentences)
# Streamlit UI
st.title("AI-Powered English to Canadian French Translator")
st.write("This version ensures glossary priority, improves enforcement, and validates meaning.")

input_text = st.text_area("Enter text to translate:")
glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.85)

# Sentinel string returned by retry_translate_text() when every attempt failed.
_TRANSLATION_FAILED_MESSAGE = "Translation failed. Please try again later."

if st.button("Translate"):
    if not input_text.strip():
        st.error("Please enter text to translate.")
    elif glossary_file is None:
        st.error("Glossary file is required.")
    else:
        glossary = None
        try:
            glossary = load_glossary_from_excel(glossary_file)
        except Exception as exc:  # UI boundary: report instead of a traceback
            st.error(
                f"Could not load the glossary file ({exc!r}). It must be an "
                "Excel file with 'English' and 'CanadianFrench' columns."
            )
        else:
            if not glossary:
                st.error("The glossary file contains no usable term pairs.")

        if glossary:
            # Step 1: Enforce Glossary Before Translation
            pre_translated_text = enforce_glossary_pre_translation(input_text, glossary)

            # Step 2: Translate Text with OpenAI
            translated_text = retry_translate_text(pre_translated_text)

            if translated_text == _TRANSLATION_FAILED_MESSAGE:
                # Do not post-process or present the failure notice as if it
                # were a translation.
                st.error(translated_text)
            else:
                # Step 3: Enforce Glossary After Translation
                post_translated_text = enforce_glossary_post_translation(translated_text, glossary)

                # Step 4: Apply Semantic Matching to Catch Any Missed Glossary Terms
                glossary_enforced_text = enforce_glossary_with_semantics(post_translated_text, glossary, threshold)

                st.subheader("Final Translated Text:")
                st.write(glossary_enforced_text)
|