garyd1 committed on
Commit
fd72a6e
·
verified ·
1 Parent(s): 1c97f10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -38
app.py CHANGED
@@ -56,12 +56,19 @@ def compute_glossary_embeddings_cached(glossary_items: tuple):
56
  embeddings = model.encode(glossary_terms, convert_to_tensor=True)
57
  return glossary_terms, embeddings
58
 
 
 
 
 
 
 
 
59
  def retry_translate_text(text: str, max_retries=3) -> str:
60
  """Retries translation in case of API failure."""
61
  for attempt in range(max_retries):
62
  try:
63
  messages = [
64
- SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and context."),
65
  HumanMessage(content=text)
66
  ]
67
  response = translator(messages)
@@ -71,27 +78,30 @@ def retry_translate_text(text: str, max_retries=3) -> str:
71
  time.sleep(2)
72
  return "Translation failed. Please try again later."
73
 
74
- def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
75
- """Applies glossary replacements based on semantic similarity with batch processing."""
 
 
 
 
 
 
 
76
  glossary_items = tuple(sorted(glossary.items()))
77
  glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)
78
 
79
  sentences = nltk.tokenize.sent_tokenize(text) if not nlp else [sent.text for sent in nlp(text).sents]
80
-
81
  def process_sentence(sentence):
82
  """Processes a single sentence with glossary enforcement."""
83
  if not sentence.strip():
84
  return sentence
85
 
86
- # Dynamic threshold adjustment
87
- sentence_length = len(sentence.split())
88
- dynamic_threshold = 0.85 if sentence_length > 10 else 0.75 # Adjust threshold based on sentence length
89
-
90
  sentence_embedding = model.encode(sentence, convert_to_tensor=True)
91
  cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
92
  max_score, max_idx = torch.max(cos_scores, dim=1)
93
 
94
- if max_score.item() >= dynamic_threshold:
95
  term = glossary_terms[max_idx]
96
  replacement = glossary[term]
97
  pattern = r'\b' + re.escape(term) + r'\b'
@@ -104,31 +114,13 @@ def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
104
 
105
  return " ".join(updated_sentences)
106
 
107
- def validate_translation(original_text, final_text):
108
- """Uses GPT to check if the final translation retains the original meaning."""
109
- messages = [
110
- SystemMessage(content="You are an AI proofreader. Compare the original and final translation. Does the final translation retain the original meaning?"),
111
- HumanMessage(content=f"Original Text: {original_text}\nFinal Translation: {final_text}\n")
112
- ]
113
- response = translator(messages)
114
- return response.content.strip()
115
-
116
- def grammar_correction(text: str) -> str:
117
- """Uses GPT to fix grammar issues in the final translated text."""
118
- messages = [
119
- SystemMessage(content="You are a French grammar expert. Correct any grammatical mistakes in the following text."),
120
- HumanMessage(content=text)
121
- ]
122
- response = translator(messages)
123
- return response.content.strip()
124
-
125
  # Streamlit UI
126
- st.title("Optimized AI-Powered English to Canadian French Translator")
127
- st.write("This version includes retries, batch processing, glossary tuning, and grammar correction.")
128
 
129
  input_text = st.text_area("Enter text to translate:")
130
  glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
131
- threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.8)
132
 
133
  if st.button("Translate"):
134
  if not input_text.strip():
@@ -137,13 +129,18 @@ if st.button("Translate"):
137
  st.error("Glossary file is required.")
138
  else:
139
  glossary = load_glossary_from_excel(glossary_file)
140
- translated_text = retry_translate_text(input_text)
141
- glossary_enforced_text = enforce_glossary(translated_text, glossary, threshold)
142
- corrected_text = grammar_correction(glossary_enforced_text)
143
- validation_result = validate_translation(input_text, corrected_text)
144
 
145
- st.subheader("Final Translated Text:")
146
- st.write(corrected_text)
 
 
 
 
 
 
147
 
148
- st.subheader("Validation Check:")
149
- st.write(validation_result)
 
 
 
 
56
  embeddings = model.encode(glossary_terms, convert_to_tensor=True)
57
  return glossary_terms, embeddings
58
 
59
def enforce_glossary_pre_translation(text: str, glossary: dict) -> str:
    """Upper-case every glossary term found in the English text.

    Each key of ``glossary`` is searched for case-insensitively as a whole
    term and rewritten as ``key.upper()`` so that it stands out in the text
    handed to the translator. The values of ``glossary`` are not used here.

    Args:
        text: English source text.
        glossary: Mapping whose keys are the English terms to emphasise.

    Returns:
        ``text`` with every whole-term occurrence of a glossary key replaced
        by its upper-cased form. Keys that are not non-blank strings are
        ignored.
    """
    for eng_term in glossary:
        # Blank or non-string keys cannot be turned into a meaningful
        # pattern, so skip them instead of raising inside re.escape().
        # NOTE(review): such keys presumably come from empty spreadsheet
        # cells - confirm against the glossary loader.
        if not isinstance(eng_term, str) or not eng_term.strip():
            continue

        # Lookarounds instead of \b: \b only exists next to a word character,
        # so terms that start or end with punctuation ("c++", "e.g.") would
        # never match with \b anchors.
        pattern = r'(?<!\w)' + re.escape(eng_term) + r'(?!\w)'
        emphasized = eng_term.upper()

        # A callable replacement is inserted literally; a plain string would
        # be parsed as a template and break on backslashes in the term.
        text = re.sub(
            pattern,
            lambda _match, replacement=emphasized: replacement,
            text,
            flags=re.IGNORECASE,
        )
    return text
65
+
66
  def retry_translate_text(text: str, max_retries=3) -> str:
67
  """Retries translation in case of API failure."""
68
  for attempt in range(max_retries):
69
  try:
70
  messages = [
71
+ SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and respecting these specific terms."),
72
  HumanMessage(content=text)
73
  ]
74
  response = translator(messages)
 
78
  time.sleep(2)
79
  return "Translation failed. Please try again later."
80
 
81
def enforce_glossary_post_translation(text: str, glossary: dict) -> str:
    """Replace English glossary terms left in translated text with their French terms.

    Every glossary key is upper-cased and searched for case-insensitively as
    a whole term; each hit is replaced by the corresponding glossary value.
    All terms are applied in one pass over ``text``.

    Args:
        text: Translated text that may still contain English glossary terms.
        glossary: Mapping of English term -> French term.

    Returns:
        ``text`` with each whole-term occurrence of a glossary key replaced
        by its value. Entries whose key is not a non-blank string, or whose
        value is not a string, are ignored.
    """
    # NOTE(review): non-string keys/values presumably come from empty
    # spreadsheet cells - confirm against the glossary loader.
    entries = [
        (eng_term.upper(), fr_term)
        for eng_term, fr_term in glossary.items()
        if isinstance(eng_term, str) and eng_term.strip() and isinstance(fr_term, str)
    ]
    if not entries:
        return text

    # Longest term first so that "MACHINE LEARNING" wins over "MACHINE".
    # The sort is stable, so equal-length terms keep glossary order.
    entries.sort(key=lambda entry: len(entry[0]), reverse=True)

    # One capturing group per term: match.lastindex then identifies which
    # term matched without comparing the matched text back to the keys.
    alternation = "|".join("(" + re.escape(marker) + ")" for marker, _ in entries)

    # Lookarounds instead of \b so terms that start or end with punctuation
    # ("c++", "e.g.") are still matched as whole terms.
    pattern = re.compile(r"(?<!\w)(?:" + alternation + r")(?!\w)", flags=re.IGNORECASE)
    replacements = [fr_term for _, fr_term in entries]

    # A single pass means text already replaced is never rescanned, so a
    # French term that contains another English term is left intact. The
    # callable inserts the French term literally; a plain string would be
    # parsed as a template and break on backslashes.
    return pattern.sub(lambda match: replacements[match.lastindex - 1], text)
87
+
88
+ def enforce_glossary_with_semantics(text: str, glossary: dict, threshold: float) -> str:
89
+ """Applies glossary replacements based on semantic similarity."""
90
  glossary_items = tuple(sorted(glossary.items()))
91
  glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)
92
 
93
  sentences = nltk.tokenize.sent_tokenize(text) if not nlp else [sent.text for sent in nlp(text).sents]
94
+
95
  def process_sentence(sentence):
96
  """Processes a single sentence with glossary enforcement."""
97
  if not sentence.strip():
98
  return sentence
99
 
 
 
 
 
100
  sentence_embedding = model.encode(sentence, convert_to_tensor=True)
101
  cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
102
  max_score, max_idx = torch.max(cos_scores, dim=1)
103
 
104
+ if max_score.item() >= threshold:
105
  term = glossary_terms[max_idx]
106
  replacement = glossary[term]
107
  pattern = r'\b' + re.escape(term) + r'\b'
 
114
 
115
  return " ".join(updated_sentences)
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  # Streamlit UI
118
+ st.title("AI-Powered English to Canadian French Translator")
119
+ st.write("This version ensures glossary priority, improves enforcement, and validates meaning.")
120
 
121
  input_text = st.text_area("Enter text to translate:")
122
  glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
123
+ threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.85)
124
 
125
  if st.button("Translate"):
126
  if not input_text.strip():
 
129
  st.error("Glossary file is required.")
130
  else:
131
  glossary = load_glossary_from_excel(glossary_file)
 
 
 
 
132
 
133
+ # Step 1: Enforce Glossary Before Translation
134
+ pre_translated_text = enforce_glossary_pre_translation(input_text, glossary)
135
+
136
+ # Step 2: Translate Text with OpenAI
137
+ translated_text = retry_translate_text(pre_translated_text)
138
+
139
+ # Step 3: Enforce Glossary After Translation
140
+ post_translated_text = enforce_glossary_post_translation(translated_text, glossary)
141
 
142
+ # Step 4: Apply Semantic Matching to Catch Any Missed Glossary Terms
143
+ glossary_enforced_text = enforce_glossary_with_semantics(post_translated_text, glossary, threshold)
144
+
145
+ st.subheader("Final Translated Text:")
146
+ st.write(glossary_enforced_text)