Spaces:

garyd1
/

text_translator

Sleeping

App Files Files Community

text_translator / app.py

garyd1

Update app.py

8447d74 verified 4 months ago

raw

history blame

4.06 kB

	import os
	import re
	import openai
	import streamlit as st
	import pandas as pd
	import torch
	import nltk
	import time
	import subprocess
	from concurrent.futures import ThreadPoolExecutor

	from langchain_openai import ChatOpenAI
	from langchain.schema import SystemMessage, HumanMessage
	from sentence_transformers import SentenceTransformer, util

	# Ensure necessary NLP models are available.
	# `sys` is imported here so the download below can target the interpreter
	# that is actually running this app.
	import sys

	try:
	    nltk.data.find("tokenizers/punkt")
	except LookupError:
	    print("Downloading NLTK punkt tokenizer...")
	    nltk.download("punkt")

	try:
	    import spacy
	    nlp = spacy.load("en_core_web_sm")
	except OSError:
	    # The OSError branch is taken when the model cannot be loaded; fetch it
	    # once and retry the load.
	    print("Downloading SpaCy model...")
	    # Use sys.executable rather than a bare "python": the latter may resolve
	    # to a different interpreter/virtualenv, in which case the model would be
	    # installed somewhere this process cannot import it from.
	    # check=True surfaces a failed download immediately instead of letting it
	    # show up as a second, less informative OSError on the retry below.
	    subprocess.run(
	        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
	        check=True,
	    )
	    nlp = spacy.load("en_core_web_sm")

	# Load AI models.
	# Streamlit re-executes this script on every widget interaction, so the
	# clients are built behind st.cache_resource and reused across reruns
	# instead of being reconstructed each time.
	@st.cache_resource
	def _load_translator():
	    """Build the chat model used for translation (created once per process)."""
	    return ChatOpenAI(model="gpt-3.5-turbo")


	@st.cache_resource
	def _load_sentence_model():
	    """Build the sentence-embedding model (created once per process)."""
	    return SentenceTransformer('all-MiniLM-L6-v2')


	# Module-level names kept unchanged; the functions below reference them.
	translator = _load_translator()
	model = _load_sentence_model()

	@st.cache_data
	def load_glossary_from_excel(glossary_file_bytes) -> dict:
	    """Load an English -> Canadian French glossary from an Excel file.

	    Args:
	        glossary_file_bytes: Anything ``pandas.read_excel`` accepts (the UI
	            passes the uploaded file object). The sheet must contain the
	            columns ``English`` and ``CanadianFrench``.

	    Returns:
	        A dict mapping each English term (stripped, lower-cased) to its
	        Canadian French equivalent (stripped). Rows where either cell is
	        missing or blank are skipped.

	    Raises:
	        KeyError: If either required column is absent from the sheet.
	    """
	    df = pd.read_excel(glossary_file_bytes)
	    glossary = {}

	    for english, french in zip(df['English'], df['CanadianFrench']):
	        if pd.isnull(english) or pd.isnull(french):
	            continue

	        # Excel cells may come back as numbers or other non-string values;
	        # coerce to str so .strip() cannot raise AttributeError.
	        term = str(english).strip().lower()
	        translation = str(french).strip()

	        # A blank term would become an empty-string key, and replacing ""
	        # in a sentence inserts the replacement between every character.
	        if term and translation:
	            glossary[term] = translation

	    return glossary

	def retry_translate_text(text: str, glossary: dict, max_retries=3) -> str:
	    """Translate ``text`` to Canadian French, retrying on failure.

	    The glossary is rendered as ``english → french`` lines and placed in the
	    system message so the model is instructed to apply it.

	    Args:
	        text: Source text to translate; sent as the human message.
	        glossary: Mapping of English terms to Canadian French terms.
	        max_retries: Maximum number of attempts before giving up.

	    Returns:
	        The stripped model response, or the fixed message
	        ``"Translation failed. Please try again later."`` if every attempt
	        raised.
	    """
	    glossary_prompt = "\n".join(f"{eng} → {fr}" for eng, fr in glossary.items())

	    messages = [
	        SystemMessage(content=f"Translate the following text to Canadian French while ensuring strict glossary replacements.\n\nGlossary:\n{glossary_prompt}"),
	        HumanMessage(content=text),
	    ]

	    for attempt in range(max_retries):
	        try:
	            # .invoke() is the supported entry point; calling the model
	            # object directly is deprecated in LangChain.
	            response = translator.invoke(messages)
	            return response.content.strip()
	        except Exception as e:
	            # Best-effort boundary: any failure is logged and retried.
	            print(f"Error in translation (attempt {attempt+1}): {e}")
	            # Only pause when another attempt will follow; sleeping after
	            # the last failure just delays the error path.
	            if attempt + 1 < max_retries:
	                time.sleep(2)

	    return "Translation failed. Please try again later."

	def enforce_glossary_with_semantics(text: str, glossary: dict, threshold: float) -> str:
	    """Replace glossary terms in sentences that are semantically close to them.

	    Each sentence of ``text`` is embedded and compared (cosine similarity)
	    with the embedding of every glossary key. When the best-scoring key
	    reaches ``threshold``, occurrences of that key in the sentence are
	    replaced by its glossary value.

	    Args:
	        text: Text to post-process.
	        glossary: Mapping of term -> replacement.
	        threshold: Minimum cosine similarity for a replacement to be applied.

	    Returns:
	        The processed sentences, each stripped, joined by single spaces.
	        ``text`` is returned unchanged when the glossary has no usable terms.

	    NOTE(review): the UI passes the already-translated text here while the
	    glossary keys are English terms; confirm that matching English keys
	    against French sentences is the intended behavior.
	    """
	    # Empty keys are useless for matching and would make the replacement
	    # fire between every character, so drop them up front.
	    glossary_terms = [term for term in glossary if term]
	    if not glossary_terms:
	        # Nothing to enforce; also avoids taking a max over an empty axis.
	        return text

	    glossary_embeddings = model.encode(glossary_terms, convert_to_tensor=True)

	    sentences = nltk.tokenize.sent_tokenize(text) if not nlp else [sent.text for sent in nlp(text).sents]
	    if not sentences:
	        return ""

	    # Encode all sentences in one batch rather than one call per thread.
	    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
	    cos_scores = util.pytorch_cos_sim(sentence_embeddings, glossary_embeddings)
	    best_scores, best_indices = cos_scores.max(dim=1)

	    updated_sentences = []
	    for sentence, score, term_idx in zip(sentences, best_scores.tolist(), best_indices.tolist()):
	        if score >= threshold:
	            term = glossary_terms[term_idx]
	            replacement = glossary[term]
	            # Glossary keys are stored lower-cased, so match regardless of
	            # case. A callable replacement keeps backslashes in the
	            # glossary value from being read as regex escapes.
	            sentence = re.sub(
	                re.escape(term),
	                lambda _match: replacement,
	                sentence,
	                flags=re.IGNORECASE,
	            )
	        updated_sentences.append(sentence.strip())

	    return " ".join(updated_sentences)

	# Streamlit UI
	st.title("AI-Powered English to Canadian French Translator")
	st.write("This version guarantees glossary enforcement.")

	input_text = st.text_area("Enter text to translate:")
	glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
	threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.75)

	# Message returned by retry_translate_text when every attempt fails; it is
	# the only failure signal that function exposes.
	_TRANSLATION_FAILED = "Translation failed. Please try again later."


	def _translate_and_display(text, uploaded_glossary, similarity_threshold):
	    """Validate the inputs, run both translation steps and render the result."""
	    if not text.strip():
	        st.error("Please enter text to translate.")
	        return
	    if uploaded_glossary is None:
	        st.error("Glossary file is required.")
	        return

	    # UI boundary: report an unreadable or malformed glossary to the user
	    # instead of letting the exception surface as a raw traceback.
	    try:
	        glossary = load_glossary_from_excel(uploaded_glossary)
	    except Exception as exc:
	        st.error(f"Could not read the glossary file: {exc}")
	        return

	    if not glossary:
	        st.error("The glossary file contains no usable English/CanadianFrench entries.")
	        return

	    # Step 1: Translate Text with GPT (Forcing Glossary)
	    translated_text = retry_translate_text(text, glossary)
	    if translated_text == _TRANSLATION_FAILED:
	        # Do not post-process or present the failure message as a translation.
	        st.error(translated_text)
	        return

	    # Step 2: Apply Semantic Matching to Guarantee Glossary
	    glossary_enforced_text = enforce_glossary_with_semantics(
	        translated_text, glossary, similarity_threshold
	    )

	    st.subheader("Final Translated Text:")
	    st.write(glossary_enforced_text)


	if st.button("Translate"):
	    _translate_and_display(input_text, glossary_file, threshold)