Update pipeline.py
Browse files- pipeline.py +1 -6
pipeline.py
CHANGED
@@ -23,6 +23,7 @@ import nltk
|
|
23 |
nltk.download('punkt')
|
24 |
nltk.download('stopwords')
|
25 |
nltk.download('wordnet')
|
|
|
26 |
import html
|
27 |
|
28 |
from config import config_file
|
@@ -297,21 +298,15 @@ class Branch_Classifier(object):
|
|
297 |
def format_data(self,data:list)-> list:
|
298 |
try:
|
299 |
X = text_cleaning.text_cleaning(data, steam=False, lemma=True,single_input=True)[0]
|
300 |
-
print("\n1: ",X)
|
301 |
### Add Bigrams and keep only the good ones(pre-selected)
|
302 |
X_bigrmas = self.phrase_bigrams[X]
|
303 |
-
print("\n2: ",X_bigrmas)
|
304 |
|
305 |
data_clean = self.clean_bigram(X_bigrmas)
|
306 |
-
print("\n3: ",data_clean)
|
307 |
X_bigrams_clean = ' '.join(map(str, data_clean))
|
308 |
-
print("\n4: ",X_bigrams_clean)
|
309 |
pre_processed = self.vectorizer.transform([X_bigrams_clean]).toarray(),X_bigrams_clean
|
310 |
-
print("\n5: ",pre_processed)
|
311 |
|
312 |
except Exception as e:
|
313 |
logging.exception("Error occurred while formatting and cleaning data" +" Info: " + str(e))
|
314 |
-
raise e
|
315 |
exit()
|
316 |
|
317 |
return pre_processed
|
|
|
23 |
nltk.download('punkt')
|
24 |
nltk.download('stopwords')
|
25 |
nltk.download('wordnet')
|
26 |
+
nltk.download('omw-1.4')
|
27 |
import html
|
28 |
|
29 |
from config import config_file
|
|
|
298 |
def format_data(self, data: list) -> tuple:
    """Clean the input and convert it into the vectorizer's feature array.

    Steps, in order:
      1. ``text_cleaning.text_cleaning(data, steam=False, lemma=True,
         single_input=True)`` is called and the first element of its
         result is kept.
      2. That element is looked up in ``self.phrase_bigrams`` to add bigrams.
      3. ``self.clean_bigram`` keeps only the pre-selected bigrams.
      4. The surviving items are stringified and joined with single spaces.
      5. ``self.vectorizer.transform`` is applied to the joined string and
         the result is converted with ``.toarray()``.

    Args:
        data: Raw input, handed unchanged to ``text_cleaning.text_cleaning``.

    Returns:
        A 2-tuple ``(features, cleaned_text)``: the output of
        ``self.vectorizer.transform([cleaned_text]).toarray()`` and the
        space-joined string it was computed from.

    Raises:
        SystemExit: If any step fails. The error is logged together with
            its traceback, then the process is asked to terminate with
            exit status 1 and the original exception attached as the cause.
    """
    try:
        cleaned = text_cleaning.text_cleaning(
            data, steam=False, lemma=True, single_input=True
        )[0]

        # Add bigrams and keep only the good (pre-selected) ones.
        with_bigrams = self.phrase_bigrams[cleaned]
        kept_items = self.clean_bigram(with_bigrams)

        cleaned_text = ' '.join(map(str, kept_items))
        features = self.vectorizer.transform([cleaned_text]).toarray()
    except Exception as e:
        logging.exception(
            "Error occurred while formatting and cleaning data Info: %s", e
        )
        # The bare `exit()` helper is injected by the `site` module (absent
        # under `python -S` / frozen builds) and exits with status 0, which
        # makes a failed run look successful. Raise SystemExit directly with
        # a non-zero status instead.
        raise SystemExit(1) from e

    return features, cleaned_text
|