garyd1 committed · verified · commit 57ec4e3 · 1 parent: 8953790

Update app.py

Files changed (1):
  app.py  +49 -22
app.py CHANGED
@@ -5,12 +5,14 @@ import streamlit as st
 import pandas as pd
 import torch
 import nltk
+import time
+from concurrent.futures import ThreadPoolExecutor
 
 from langchain.chat_models import ChatOpenAI
 from langchain.schema import SystemMessage, HumanMessage
 from sentence_transformers import SentenceTransformer, util
 
-# Try to load spaCy for advanced NLP processing
+# Load NLP libraries
 try:
     import spacy
     nlp = spacy.load("en_core_web_sm")
@@ -26,7 +28,7 @@ model = SentenceTransformer('all-MiniLM-L6-v2')
 
 @st.cache_data
 def load_glossary_from_excel(glossary_file_bytes) -> dict:
-    """Load glossary from an Excel file, applying lemmatization and sorting by length."""
+    """Load glossary from an Excel file, apply lemmatization, and sort by length."""
     df = pd.read_excel(glossary_file_bytes)
     glossary = {}
 
@@ -48,37 +50,52 @@ def compute_glossary_embeddings_cached(glossary_items: tuple):
     embeddings = model.encode(glossary_terms, convert_to_tensor=True)
     return glossary_terms, embeddings
 
-def translate_text(text: str) -> str:
-    """Uses OpenAI's GPT to translate text to Canadian French."""
-    messages = [
-        SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and context."),
-        HumanMessage(content=text)
-    ]
-    response = translator(messages)
-    return response.content.strip()
+def retry_translate_text(text: str, max_retries=3) -> str:
+    """Retries translation in case of API failure."""
+    for attempt in range(max_retries):
+        try:
+            messages = [
+                SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and context."),
+                HumanMessage(content=text)
+            ]
+            response = translator(messages)
+            return response.content.strip()
+        except Exception as e:
+            print(f"Error in translation (attempt {attempt+1}): {e}")
+            time.sleep(2)  # Wait before retrying
+    return "Translation failed. Please try again later."
 
 def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
-    """Applies glossary replacements based on semantic similarity."""
+    """Applies glossary replacements based on semantic similarity with batch processing."""
     glossary_items = tuple(sorted(glossary.items()))
     glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)
 
     sentences = nltk.tokenize.sent_tokenize(text) if not use_spacy else [sent.text for sent in nlp(text).sents]
-    updated_sentences = []
-
-    for sentence in sentences:
+
+    def process_sentence(sentence):
+        """Processes a single sentence with glossary enforcement."""
         if not sentence.strip():
-            continue
+            return sentence
+
+        # Dynamic threshold adjustment
+        sentence_length = len(sentence.split())
+        dynamic_threshold = 0.85 if sentence_length > 10 else 0.75  # Adjust threshold based on sentence length
+
         sentence_embedding = model.encode(sentence, convert_to_tensor=True)
         cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
         max_score, max_idx = torch.max(cos_scores, dim=1)
 
-        if max_score.item() >= threshold:
+        if max_score.item() >= dynamic_threshold:
             term = glossary_terms[max_idx]
             replacement = glossary[term]
             pattern = r'\b' + re.escape(term) + r'\b'
             sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)
 
-        updated_sentences.append(sentence.strip())
+        return sentence.strip()
+
+    # Process sentences in parallel for speed
+    with ThreadPoolExecutor() as executor:
+        updated_sentences = list(executor.map(process_sentence, sentences))
 
     return " ".join(updated_sentences)
 
@@ -91,9 +108,18 @@ def validate_translation(original_text, final_text):
     response = translator(messages)
     return response.content.strip()
 
+def grammar_correction(text: str) -> str:
+    """Uses GPT to fix grammar issues in the final translated text."""
+    messages = [
+        SystemMessage(content="You are a French grammar expert. Correct any grammatical mistakes in the following text."),
+        HumanMessage(content=text)
+    ]
+    response = translator(messages)
+    return response.content.strip()
+
 # Streamlit UI
-st.title("AI-Powered English to Canadian French Translator")
-st.write("This app uses AI agents for translation, glossary enforcement, and meaning validation.")
+st.title("Optimized AI-Powered English to Canadian French Translator")
+st.write("This version includes retries, batch processing, glossary tuning, and grammar correction.")
 
 input_text = st.text_area("Enter text to translate:")
 glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
@@ -106,12 +132,13 @@ if st.button("Translate"):
         st.error("Glossary file is required.")
     else:
         glossary = load_glossary_from_excel(glossary_file)
-        translated_text = translate_text(input_text)
+        translated_text = retry_translate_text(input_text)
        glossary_enforced_text = enforce_glossary(translated_text, glossary, threshold)
-        validation_result = validate_translation(input_text, glossary_enforced_text)
+        corrected_text = grammar_correction(glossary_enforced_text)
+        validation_result = validate_translation(input_text, corrected_text)
 
         st.subheader("Final Translated Text:")
-        st.write(glossary_enforced_text)
+        st.write(corrected_text)
 
         st.subheader("Validation Check:")
         st.write(validation_result)
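The reworked enforce_glossary scores each sentence against every glossary term embedding and only rewrites the sentence when the best cosine score clears a length-dependent threshold (0.85 for sentences longer than 10 words, 0.75 otherwise). Below is a minimal standalone sketch of that matching step outside the Streamlit app; the toy glossary and sentence are invented for illustration, and it assumes sentence-transformers and torch are installed:

import re
import torch
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")

# Hypothetical glossary: matched term -> replacement to enforce
glossary = {"invoice": "facture", "shipping address": "adresse d'expédition"}
terms = list(glossary.keys())
term_embeddings = model.encode(terms, convert_to_tensor=True)

sentence = "Please double-check the invoice before it goes out."
sentence_embedding = model.encode(sentence, convert_to_tensor=True)

# Cosine similarity of this sentence against every glossary term (shape: 1 x len(terms))
cos_scores = util.pytorch_cos_sim(sentence_embedding, term_embeddings)
max_score, max_idx = torch.max(cos_scores, dim=1)

# Length-based threshold, as in the commit: stricter for longer sentences
dynamic_threshold = 0.85 if len(sentence.split()) > 10 else 0.75

if max_score.item() >= dynamic_threshold:
    term = terms[int(max_idx)]
    sentence = re.sub(r"\b" + re.escape(term) + r"\b", glossary[term], sentence, flags=re.IGNORECASE)

print(round(max_score.item(), 3), sentence)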
 
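One detail the parallel version of enforce_glossary relies on: ThreadPoolExecutor.map yields results in the same order as its input, so " ".join(updated_sentences) reassembles the sentences in their original order even when they finish at different times. A small illustration of that guarantee, with a toy workload standing in for the real embedding and glossary lookup:

import time
from concurrent.futures import ThreadPoolExecutor

def process_sentence(sentence: str) -> str:
    # Stand-in for the real per-sentence work (embedding + glossary replacement)
    time.sleep(0.05 * (len(sentence) % 3))  # uneven, arbitrary delays
    return sentence.upper()

sentences = ["First sentence.", "Second one.", "Third."]

with ThreadPoolExecutor() as executor:
    # map() preserves input order regardless of which thread finishes first
    updated_sentences = list(executor.map(process_sentence, sentences))

print(" ".join(updated_sentences))
# FIRST SENTENCE. SECOND ONE. THIRD.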