garyd1 committed on
Commit
8447d74
·
verified ·
1 Parent(s): fd72a6e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -56
app.py CHANGED
@@ -34,69 +34,43 @@ model = SentenceTransformer('all-MiniLM-L6-v2')
34
 
35
  @st.cache_data
36
  def load_glossary_from_excel(glossary_file_bytes) -> dict:
37
- """Load glossary from an Excel file, apply lemmatization, and sort by length."""
38
  df = pd.read_excel(glossary_file_bytes)
39
  glossary = {}
40
 
41
  for _, row in df.iterrows():
42
  if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench']):
43
- english_term = row['English'].strip().lower()
44
- french_term = row['CanadianFrench'].strip()
45
- doc = nlp(english_term) if nlp else english_term.split()
46
- lemmatized_term = " ".join([token.lemma_ for token in doc]) if nlp else english_term
47
- glossary[lemmatized_term] = french_term
48
 
49
- return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))
 
 
 
 
 
 
 
 
 
50
 
51
@st.cache_data
def compute_glossary_embeddings_cached(glossary_items: tuple):
    """Encode every glossary key with the module-level ``model``.

    Args:
        glossary_items: ``(english_term, french_term)`` pairs. A tuple is
            used rather than a dict, presumably so the argument is hashable
            for ``st.cache_data`` -- confirm against the caller.

    Returns:
        A ``(terms, embeddings)`` pair: the list of distinct English terms in
        first-seen order, and the result of ``model.encode`` on that list
        with ``convert_to_tensor=True``.
    """
    # Going through dict() collapses duplicate keys while keeping order.
    terms = list(dict(glossary_items))
    return terms, model.encode(terms, convert_to_tensor=True)
58
-
59
def enforce_glossary_pre_translation(text: str, glossary: dict) -> str:
    """Upper-case every glossary term found in the English source text.

    Each key of ``glossary`` is matched case-insensitively as a whole term
    and replaced by its upper-cased form, which marks the term for the later
    translation and post-processing steps.

    Args:
        text: English text about to be translated.
        glossary: Mapping whose keys are English terms; the values are not
            used here. Terms are applied in the mapping's iteration order.

    Returns:
        ``text`` with every whole-term occurrence of a glossary key
        upper-cased.
    """
    for eng_term in glossary:
        if not eng_term:
            # An empty key has nothing to emphasize.
            continue

        # Lookarounds instead of \b: \b needs a word character on one side,
        # so terms such as "c++" or ".net" could never match with it. For
        # terms that start and end with word characters the two forms are
        # equivalent.
        pattern = r'(?<!\w)' + re.escape(eng_term) + r'(?!\w)'

        # A callable replacement is inserted literally; a plain string would
        # be parsed as a template and choke on backslashes in the term.
        text = re.sub(
            pattern,
            lambda _match, emphasized=eng_term.upper(): emphasized,
            text,
            flags=re.IGNORECASE,
        )
    return text
65
-
66
def retry_translate_text(text: str, max_retries=3) -> str:
    """Translate ``text`` to Canadian French, retrying when the call fails.

    Builds a system + human message pair and passes it to the module-level
    ``translator`` callable (defined outside this block). Any exception
    raised during an attempt is printed and the attempt is retried.

    Args:
        text: The text to translate.
        max_retries: Maximum number of attempts.

    Returns:
        The stripped ``content`` of the translator's response, or the fixed
        message ``"Translation failed. Please try again later."`` when every
        attempt fails (or ``max_retries`` is 0).
    """
    for attempt in range(max_retries):
        try:
            messages = [
                SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and respecting these specific terms."),
                HumanMessage(content=text)
            ]
            response = translator(messages)
            return response.content.strip()
        except Exception as e:
            # Deliberately broad: this is a best-effort boundary around the
            # remote call, and the caller receives a fallback string instead.
            print(f"Error in translation (attempt {attempt+1}): {e}")
            # Pause only when another attempt follows; sleeping after the
            # final failure just delays the fallback message.
            if attempt + 1 < max_retries:
                time.sleep(2)

    return "Translation failed. Please try again later."
80
 
81
def enforce_glossary_post_translation(text: str, glossary: dict) -> str:
    """Replace surviving English glossary terms with their French values.

    For each ``english -> french`` pair, every case-insensitive whole-term
    occurrence of the upper-cased English term in ``text`` is replaced by the
    French term, inserted literally.

    Args:
        text: Translated text that may still contain English glossary terms.
        glossary: Mapping of English term to French replacement. Pairs are
            applied in the mapping's iteration order.

    Returns:
        ``text`` with the glossary replacements applied.
    """
    for eng_term, fr_term in glossary.items():
        if not eng_term:
            # An empty key would match at every position and scatter the
            # replacement through the whole text.
            continue

        # Lookarounds instead of \b so that terms beginning or ending with a
        # non-word character (e.g. "c++") can still match as whole terms.
        pattern = r'(?<!\w)' + re.escape(eng_term.upper()) + r'(?!\w)'

        # Use a callable so the French text is inserted verbatim; as a plain
        # string, backslashes and group references would be interpreted.
        text = re.sub(
            pattern,
            lambda _match, literal=fr_term: literal,
            text,
            flags=re.IGNORECASE,
        )
    return text
87
 
88
  def enforce_glossary_with_semantics(text: str, glossary: dict, threshold: float) -> str:
89
- """Applies glossary replacements based on semantic similarity."""
90
- glossary_items = tuple(sorted(glossary.items()))
91
- glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)
92
 
93
  sentences = nltk.tokenize.sent_tokenize(text) if not nlp else [sent.text for sent in nlp(text).sents]
94
 
95
  def process_sentence(sentence):
96
- """Processes a single sentence with glossary enforcement."""
97
- if not sentence.strip():
98
- return sentence
99
-
100
  sentence_embedding = model.encode(sentence, convert_to_tensor=True)
101
  cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
102
  max_score, max_idx = torch.max(cos_scores, dim=1)
@@ -104,8 +78,7 @@ def enforce_glossary_with_semantics(text: str, glossary: dict, threshold: float)
104
  if max_score.item() >= threshold:
105
  term = glossary_terms[max_idx]
106
  replacement = glossary[term]
107
- pattern = r'\b' + re.escape(term) + r'\b'
108
- sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
109
 
110
  return sentence.strip()
111
 
@@ -116,11 +89,11 @@ def enforce_glossary_with_semantics(text: str, glossary: dict, threshold: float)
116
 
117
  # Streamlit UI
118
  st.title("AI-Powered English to Canadian French Translator")
119
- st.write("This version ensures glossary priority, improves enforcement, and validates meaning.")
120
 
121
  input_text = st.text_area("Enter text to translate:")
122
  glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
123
- threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.85)
124
 
125
  if st.button("Translate"):
126
  if not input_text.strip():
@@ -130,17 +103,11 @@ if st.button("Translate"):
130
  else:
131
  glossary = load_glossary_from_excel(glossary_file)
132
 
133
- # Step 1: Enforce Glossary Before Translation
134
- pre_translated_text = enforce_glossary_pre_translation(input_text, glossary)
135
-
136
- # Step 2: Translate Text with OpenAI
137
- translated_text = retry_translate_text(pre_translated_text)
138
-
139
- # Step 3: Enforce Glossary After Translation
140
- post_translated_text = enforce_glossary_post_translation(translated_text, glossary)
141
 
142
- # Step 4: Apply Semantic Matching to Catch Any Missed Glossary Terms
143
- glossary_enforced_text = enforce_glossary_with_semantics(post_translated_text, glossary, threshold)
144
 
145
  st.subheader("Final Translated Text:")
146
  st.write(glossary_enforced_text)
 
34
 
35
  @st.cache_data
36
  def load_glossary_from_excel(glossary_file_bytes) -> dict:
37
+ """Load glossary from an Excel file."""
38
  df = pd.read_excel(glossary_file_bytes)
39
  glossary = {}
40
 
41
  for _, row in df.iterrows():
42
  if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench']):
43
+ glossary[row['English'].strip().lower()] = row['CanadianFrench'].strip()
 
 
 
 
44
 
45
+ return glossary
46
+
47
def retry_translate_text(text: str, glossary: dict, max_retries=3) -> str:
    """Translate ``text`` to Canadian French with the glossary in the prompt.

    The glossary is rendered as one ``english → french`` line per entry and
    embedded in the system message. The message pair is then passed to the
    module-level ``translator`` callable (defined outside this block); a
    failed call is printed and retried.

    Args:
        text: The text to translate.
        glossary: Mapping of English term to required French rendering.
        max_retries: Maximum number of attempts.

    Returns:
        The stripped ``content`` of the translator's response, or the fixed
        message ``"Translation failed. Please try again later."`` when every
        attempt fails.
    """
    glossary_prompt = "\n".join(f"{eng} → {fr}" for eng, fr in glossary.items())

    # The messages do not change between attempts, so build them once.
    messages = [
        SystemMessage(content=f"Translate the following text to Canadian French while ensuring strict glossary replacements.\n\nGlossary:\n{glossary_prompt}"),
        HumanMessage(content=text)
    ]

    for attempt in range(max_retries):
        try:
            response = translator(messages)
            return response.content.strip()
        except Exception as e:
            # Deliberately broad: this is a best-effort boundary around the
            # remote call, and the caller receives a fallback string instead.
            print(f"Error in translation (attempt {attempt+1}): {e}")
            # Pause only when another attempt follows; sleeping after the
            # final failure just delays the fallback message.
            if attempt + 1 < max_retries:
                time.sleep(2)

    return "Translation failed. Please try again later."
 
 
 
 
 
65
 
66
  def enforce_glossary_with_semantics(text: str, glossary: dict, threshold: float) -> str:
67
+ """Uses embeddings to enforce glossary replacement intelligently."""
68
+ glossary_terms = list(glossary.keys())
69
+ glossary_embeddings = model.encode(glossary_terms, convert_to_tensor=True)
70
 
71
  sentences = nltk.tokenize.sent_tokenize(text) if not nlp else [sent.text for sent in nlp(text).sents]
72
 
73
  def process_sentence(sentence):
 
 
 
 
74
  sentence_embedding = model.encode(sentence, convert_to_tensor=True)
75
  cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
76
  max_score, max_idx = torch.max(cos_scores, dim=1)
 
78
  if max_score.item() >= threshold:
79
  term = glossary_terms[max_idx]
80
  replacement = glossary[term]
81
+ sentence = sentence.replace(term, replacement)
 
82
 
83
  return sentence.strip()
84
 
 
89
 
90
  # Streamlit UI
91
  st.title("AI-Powered English to Canadian French Translator")
92
+ st.write("This version guarantees glossary enforcement.")
93
 
94
  input_text = st.text_area("Enter text to translate:")
95
  glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
96
+ threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.75)
97
 
98
  if st.button("Translate"):
99
  if not input_text.strip():
 
103
  else:
104
  glossary = load_glossary_from_excel(glossary_file)
105
 
106
+ # Step 1: Translate Text with GPT (Forcing Glossary)
107
+ translated_text = retry_translate_text(input_text, glossary)
 
 
 
 
 
 
108
 
109
+ # Step 2: Apply Semantic Matching to Guarantee Glossary
110
+ glossary_enforced_text = enforce_glossary_with_semantics(translated_text, glossary, threshold)
111
 
112
  st.subheader("Final Translated Text:")
113
  st.write(glossary_enforced_text)