garyd1 committed on
Commit
fbf0833
·
verified ·
1 Parent(s): 3589128

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -13
app.py CHANGED
@@ -6,23 +6,27 @@ import pandas as pd
6
  import torch
7
  import nltk
8
  import time
 
9
  from concurrent.futures import ThreadPoolExecutor
10
 
11
-
12
  from langchain_openai import ChatOpenAI
13
-
14
  from langchain.schema import SystemMessage, HumanMessage
15
  from sentence_transformers import SentenceTransformer, util
16
 
17
- # Load NLP libraries
 
 
 
 
 
 
18
  try:
19
  import spacy
20
  nlp = spacy.load("en_core_web_sm")
21
- use_spacy = True
22
- except Exception:
23
- st.warning("SpaCy model not found, falling back to NLTK for tokenization.")
24
- nltk.download("punkt")
25
- use_spacy = False
26
 
27
  # Load AI models
28
  translator = ChatOpenAI(model="gpt-3.5-turbo")
@@ -38,8 +42,8 @@ def load_glossary_from_excel(glossary_file_bytes) -> dict:
38
  if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench']):
39
  english_term = row['English'].strip().lower()
40
  french_term = row['CanadianFrench'].strip()
41
- doc = nlp(english_term) if use_spacy else english_term.split()
42
- lemmatized_term = " ".join([token.lemma_ for token in doc]) if use_spacy else english_term
43
  glossary[lemmatized_term] = french_term
44
 
45
  return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))
@@ -64,7 +68,7 @@ def retry_translate_text(text: str, max_retries=3) -> str:
64
  return response.content.strip()
65
  except Exception as e:
66
  print(f"Error in translation (attempt {attempt+1}): {e}")
67
- time.sleep(2) # Wait before retrying
68
  return "Translation failed. Please try again later."
69
 
70
  def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
@@ -72,7 +76,7 @@ def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
72
  glossary_items = tuple(sorted(glossary.items()))
73
  glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)
74
 
75
- sentences = nltk.tokenize.sent_tokenize(text) if not use_spacy else [sent.text for sent in nlp(text).sents]
76
 
77
  def process_sentence(sentence):
78
  """Processes a single sentence with glossary enforcement."""
@@ -95,7 +99,6 @@ def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
95
 
96
  return sentence.strip()
97
 
98
- # Process sentences in parallel for speed
99
  with ThreadPoolExecutor() as executor:
100
  updated_sentences = list(executor.map(process_sentence, sentences))
101
 
 
6
  import torch
7
  import nltk
8
  import time
9
+ import subprocess
10
  from concurrent.futures import ThreadPoolExecutor
11
 
 
12
  from langchain_openai import ChatOpenAI
 
13
  from langchain.schema import SystemMessage, HumanMessage
14
  from sentence_transformers import SentenceTransformer, util
15
 
16
+ # Ensure necessary NLP models are available
17
+ try:
18
+ nltk.data.find("tokenizers/punkt")
19
+ except LookupError:
20
+ print("Downloading NLTK punkt tokenizer...")
21
+ nltk.download("punkt")
22
+
23
  try:
24
  import spacy
25
  nlp = spacy.load("en_core_web_sm")
26
+ except OSError:
27
+ print("Downloading SpaCy model...")
28
+ subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
29
+ nlp = spacy.load("en_core_web_sm")
 
30
 
31
  # Load AI models
32
  translator = ChatOpenAI(model="gpt-3.5-turbo")
 
42
  if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench']):
43
  english_term = row['English'].strip().lower()
44
  french_term = row['CanadianFrench'].strip()
45
+ doc = nlp(english_term) if nlp else english_term.split()
46
+ lemmatized_term = " ".join([token.lemma_ for token in doc]) if nlp else english_term
47
  glossary[lemmatized_term] = french_term
48
 
49
  return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))
 
68
  return response.content.strip()
69
  except Exception as e:
70
  print(f"Error in translation (attempt {attempt+1}): {e}")
71
+ time.sleep(2)
72
  return "Translation failed. Please try again later."
73
 
74
  def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
 
76
  glossary_items = tuple(sorted(glossary.items()))
77
  glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)
78
 
79
+ sentences = nltk.tokenize.sent_tokenize(text) if not nlp else [sent.text for sent in nlp(text).sents]
80
 
81
  def process_sentence(sentence):
82
  """Processes a single sentence with glossary enforcement."""
 
99
 
100
  return sentence.strip()
101
 
 
102
  with ThreadPoolExecutor() as executor:
103
  updated_sentences = list(executor.map(process_sentence, sentences))
104