Spaces:

garyd1
/

text_translator

Sleeping

App Files Files Community

garyd1 committed on Feb 26

Commit

fd72a6e

verified ·

1 Parent(s): 1c97f10

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -38

app.py CHANGED Viewed

@@ -56,12 +56,19 @@ def compute_glossary_embeddings_cached(glossary_items: tuple):
     embeddings = model.encode(glossary_terms, convert_to_tensor=True)
     return glossary_terms, embeddings
 def retry_translate_text(text: str, max_retries=3) -> str:
     """Retries translation in case of API failure."""
     for attempt in range(max_retries):
         try:
             messages = [
-                SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and context."),
                 HumanMessage(content=text)
             ]
             response = translator(messages)
@@ -71,27 +78,30 @@ def retry_translate_text(text: str, max_retries=3) -> str:
             time.sleep(2)
     return "Translation failed. Please try again later."
-def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
-    """Applies glossary replacements based on semantic similarity with batch processing."""
     glossary_items = tuple(sorted(glossary.items()))
     glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)
     sentences = nltk.tokenize.sent_tokenize(text) if not nlp else [sent.text for sent in nlp(text).sents]
     def process_sentence(sentence):
         """Processes a single sentence with glossary enforcement."""
         if not sentence.strip():
             return sentence
-        # Dynamic threshold adjustment
-        sentence_length = len(sentence.split())
-        dynamic_threshold = 0.85 if sentence_length > 10 else 0.75  # Adjust threshold based on sentence length
         sentence_embedding = model.encode(sentence, convert_to_tensor=True)
         cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
         max_score, max_idx = torch.max(cos_scores, dim=1)
-        if max_score.item() >= dynamic_threshold:
             term = glossary_terms[max_idx]
             replacement = glossary[term]
             pattern = r'\b' + re.escape(term) + r'\b'
@@ -104,31 +114,13 @@ def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
     return " ".join(updated_sentences)
-def validate_translation(original_text, final_text):
-    """Uses GPT to check if the final translation retains the original meaning."""
-    messages = [
-        SystemMessage(content="You are an AI proofreader. Compare the original and final translation. Does the final translation retain the original meaning?"),
-        HumanMessage(content=f"Original Text: {original_text}\nFinal Translation: {final_text}\n")
-    ]
-    response = translator(messages)
-    return response.content.strip()
-def grammar_correction(text: str) -> str:
-    """Uses GPT to fix grammar issues in the final translated text."""
-    messages = [
-        SystemMessage(content="You are a French grammar expert. Correct any grammatical mistakes in the following text."),
-        HumanMessage(content=text)
-    ]
-    response = translator(messages)
-    return response.content.strip()
 # Streamlit UI
-st.title("Optimized AI-Powered English to Canadian French Translator")
-st.write("This version includes retries, batch processing, glossary tuning, and grammar correction.")
 input_text = st.text_area("Enter text to translate:")
 glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
-threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.8)
 if st.button("Translate"):
     if not input_text.strip():
@@ -137,13 +129,18 @@ if st.button("Translate"):
         st.error("Glossary file is required.")
     else:
         glossary = load_glossary_from_excel(glossary_file)
-        translated_text = retry_translate_text(input_text)
-        glossary_enforced_text = enforce_glossary(translated_text, glossary, threshold)
-        corrected_text = grammar_correction(glossary_enforced_text)
-        validation_result = validate_translation(input_text, corrected_text)
-        st.subheader("Final Translated Text:")
-        st.write(corrected_text)
-        st.subheader("Validation Check:")
-        st.write(validation_result)

     embeddings = model.encode(glossary_terms, convert_to_tensor=True)
     return glossary_terms, embeddings
def enforce_glossary_pre_translation(text: str, glossary: dict) -> str:
    """Upper-case every glossary term found in the English text.

    Each key of ``glossary`` is searched for as a whole word
    (case-insensitively) and every occurrence is replaced with the
    upper-cased key, so the term stands out in the text handed to the
    translator.

    Args:
        text: English source text.
        glossary: Mapping whose keys are the English terms to emphasize.
            The values are not used here.

    Returns:
        ``text`` with every whole-word occurrence of a glossary key
        upper-cased.
    """
    for eng_term in glossary:
        # Non-string or blank keys cannot be matched against text; skipping
        # them avoids a TypeError from re.escape on such entries.
        if not isinstance(eng_term, str) or not eng_term.strip():
            continue
        pattern = r'\b' + re.escape(eng_term) + r'\b'
        emphasized = eng_term.upper()
        # Use a callable replacement so backslashes in the term are inserted
        # literally instead of being parsed as regex template escapes.
        text = re.sub(
            pattern,
            lambda _match, _repl=emphasized: _repl,
            text,
            flags=re.IGNORECASE,
        )
    return text
 def retry_translate_text(text: str, max_retries=3) -> str:
     """Retries translation in case of API failure."""
     for attempt in range(max_retries):
         try:
             messages = [
+                SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and respecting these specific terms."),
                 HumanMessage(content=text)
             ]
             response = translator(messages)
             time.sleep(2)
     return "Translation failed. Please try again later."
def enforce_glossary_post_translation(text: str, glossary: dict) -> str:
    """Replace English glossary terms left in the translation with their French terms.

    Each key of ``glossary`` is upper-cased and searched for as a whole word
    (case-insensitively) in ``text``; every occurrence is replaced with the
    corresponding value.

    Args:
        text: Translated text that may still contain English glossary terms.
        glossary: Mapping of English term -> French term.

    Returns:
        ``text`` with every whole-word occurrence of a glossary key replaced
        by its mapped value.
    """
    # Only string pairs can be matched against and spliced into text.
    pairs = [
        (eng_term, fr_term)
        for eng_term, fr_term in glossary.items()
        if isinstance(eng_term, str) and isinstance(fr_term, str) and eng_term.strip()
    ]
    # Apply longer terms first so a short term ("bank") cannot consume part of
    # a longer one ("bank account") before the longer one is tried.
    pairs.sort(key=lambda pair: len(pair[0]), reverse=True)

    for eng_term, fr_term in pairs:
        # Keep .upper(): upper-casing can change the spelling of a term
        # (e.g. "ß" -> "SS"), and the pattern must match that upper-cased form.
        pattern = r'\b' + re.escape(eng_term.upper()) + r'\b'
        # Use a callable replacement so backslashes in the French term are
        # inserted literally rather than parsed as template escapes.
        text = re.sub(
            pattern,
            lambda _match, _repl=fr_term: _repl,
            text,
            flags=re.IGNORECASE,
        )
    return text
+def enforce_glossary_with_semantics(text: str, glossary: dict, threshold: float) -> str:
+    """Applies glossary replacements based on semantic similarity."""
     glossary_items = tuple(sorted(glossary.items()))
     glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)
     sentences = nltk.tokenize.sent_tokenize(text) if not nlp else [sent.text for sent in nlp(text).sents]
     def process_sentence(sentence):
         """Processes a single sentence with glossary enforcement."""
         if not sentence.strip():
             return sentence
         sentence_embedding = model.encode(sentence, convert_to_tensor=True)
         cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
         max_score, max_idx = torch.max(cos_scores, dim=1)
+        if max_score.item() >= threshold:
             term = glossary_terms[max_idx]
             replacement = glossary[term]
             pattern = r'\b' + re.escape(term) + r'\b'
     return " ".join(updated_sentences)
 # Streamlit UI
+st.title("AI-Powered English to Canadian French Translator")
+st.write("This version ensures glossary priority, improves enforcement, and validates meaning.")
 input_text = st.text_area("Enter text to translate:")
 glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
+threshold = st.slider("Semantic Matching Threshold", 0.5, 1.0, 0.85)
 if st.button("Translate"):
     if not input_text.strip():
         st.error("Glossary file is required.")
     else:
         glossary = load_glossary_from_excel(glossary_file)
+        # Step 1: Enforce Glossary Before Translation
+        pre_translated_text = enforce_glossary_pre_translation(input_text, glossary)
+        # Step 2: Translate Text with OpenAI
+        translated_text = retry_translate_text(pre_translated_text)
+        # Step 3: Enforce Glossary After Translation
+        post_translated_text = enforce_glossary_post_translation(translated_text, glossary)
+        # Step 4: Apply Semantic Matching to Catch Any Missed Glossary Terms
+        glossary_enforced_text = enforce_glossary_with_semantics(post_translated_text, glossary, threshold)
+        st.subheader("Final Translated Text:")
+        st.write(glossary_enforced_text)