Update app.py
app.py CHANGED
@@ -6,23 +6,27 @@ import pandas as pd
 import torch
 import nltk
 import time
+import subprocess
 from concurrent.futures import ThreadPoolExecutor
 
-…
 from langchain_openai import ChatOpenAI
-…
 from langchain.schema import SystemMessage, HumanMessage
 from sentence_transformers import SentenceTransformer, util
 
-# …
+# Ensure necessary NLP models are available
+try:
+    nltk.data.find("tokenizers/punkt")
+except LookupError:
+    print("Downloading NLTK punkt tokenizer...")
+    nltk.download("punkt")
+
 try:
     import spacy
     nlp = spacy.load("en_core_web_sm")
-…
-…
-…
-…
-    use_spacy = False
+except OSError:
+    print("Downloading SpaCy model...")
+    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
+    nlp = spacy.load("en_core_web_sm")
 
 # Load AI models
 translator = ChatOpenAI(model="gpt-3.5-turbo")
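Note: the download fallback shells out to whatever `python` is on PATH, which may not be the interpreter running the Space. A minimal sketch of an alternative using `sys.executable` (an assumption on my part, not part of this commit):

import sys
import subprocess

import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Use the exact interpreter running this script, so the model
    # installs into the same environment; check=True surfaces failures.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")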
@@ -38,8 +42,8 @@ def load_glossary_from_excel(glossary_file_bytes) -> dict:
         if pd.notnull(row['English']) and pd.notnull(row['CanadianFrench']):
             english_term = row['English'].strip().lower()
             french_term = row['CanadianFrench'].strip()
-            doc = nlp(english_term) if …
-            lemmatized_term = " ".join([token.lemma_ for token in doc]) if …
+            doc = nlp(english_term) if nlp else english_term.split()
+            lemmatized_term = " ".join([token.lemma_ for token in doc]) if nlp else english_term
             glossary[lemmatized_term] = french_term
 
     return dict(sorted(glossary.items(), key=lambda item: len(item[0]), reverse=True))
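Note: lemmatizing the glossary keys means inflected variants of a term resolve to one entry. A self-contained sketch of the effect (the glossary entry here is hypothetical):

import spacy

nlp = spacy.load("en_core_web_sm")

def lemma_key(term: str) -> str:
    # Mirrors the loader: lowercase the term, then join token lemmas.
    return " ".join(token.lemma_ for token in nlp(term.strip().lower()))

glossary = {lemma_key("safety shoes"): "chaussures de sécurité"}  # hypothetical entry

# The singular form hits the same key, since "shoes" lemmatizes to "shoe".
print(lemma_key("safety shoe") in glossary)  # True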
@@ -64,7 +68,7 @@ def retry_translate_text(text: str, max_retries=3) -> str:
             return response.content.strip()
         except Exception as e:
             print(f"Error in translation (attempt {attempt+1}): {e}")
-        time.sleep(2)
+            time.sleep(2)
 
     return "Translation failed. Please try again later."
 
 def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
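Note: the commit keeps a fixed two-second pause between attempts. Exponential backoff is a common alternative; a sketch under that assumption (not what this commit does):

import time

def retry_with_backoff(fn, max_retries: int = 3, base_delay: float = 2.0):
    """Call fn(), sleeping 2s, 4s, 8s, ... between failed attempts."""
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception as e:
            print(f"Error in translation (attempt {attempt+1}): {e}")
            time.sleep(base_delay * (2 ** attempt))
    return "Translation failed. Please try again later."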
@@ -72,7 +76,7 @@ def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
     glossary_items = tuple(sorted(glossary.items()))
     glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)
 
-    sentences = nltk.tokenize.sent_tokenize(text) if not …
+    sentences = nltk.tokenize.sent_tokenize(text) if not nlp else [sent.text for sent in nlp(text).sents]
 
     def process_sentence(sentence):
         """Processes a single sentence with glossary enforcement."""
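Note: sentence splitting now prefers spaCy's boundaries when the model loaded, with NLTK's Punkt tokenizer as the fallback. Both splitters side by side, assuming punkt and en_core_web_sm are both installed:

import nltk
import spacy

text = "Dr. Smith arrived. He brought the safety shoes."

# NLTK: pre-trained Punkt model, no parsing required.
print(nltk.tokenize.sent_tokenize(text))

# spaCy: boundaries come from the pipeline (parser/senter).
nlp = spacy.load("en_core_web_sm")
print([sent.text for sent in nlp(text).sents])

Both handle the "Dr." abbreviation and return two sentences.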
@@ -95,7 +99,6 @@ def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
 
         return sentence.strip()
 
-    # Process sentences in parallel for speed
     with ThreadPoolExecutor() as executor:
         updated_sentences = list(executor.map(process_sentence, sentences))
 
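Note: `executor.map` returns results in input order regardless of which thread finishes first, so the translated sentences can be re-joined directly. A self-contained illustration with placeholder per-sentence work:

from concurrent.futures import ThreadPoolExecutor

sentences = ["first sentence.", "second sentence.", "third sentence."]

def process_sentence(sentence: str) -> str:
    # Stand-in for the real glossary-enforcement step.
    return sentence.capitalize()

with ThreadPoolExecutor() as executor:
    # map() preserves input order in its results.
    updated = list(executor.map(process_sentence, sentences))

print(" ".join(updated))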