Create NLP_sentiment_model

NLP_sentiment_model ADDED (+140 -0)
# utilities
import re
import pickle
import numpy as np
import pandas as pd
# plotting
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# nltk
from nltk.stem import WordNetLemmatizer
# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

# Run in a shell or notebook cell, not as Python source: pip install datasets
# Note: the original call below is both broken (load_dataset() expects a dataset
# name or a builder, not a bare CSV filename) and redundant, since the same CSV
# is re-read with pandas just below; it is kept commented out for reference.
# from datasets import load_dataset
# dataset = load_dataset("training.1600000.processed.noemoticon.csv")
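# A working equivalent with the datasets library would go through the "csv"
# builder, e.g. (a sketch; assumes the CSV sits in the working directory and
# takes the same column names and encoding used for pandas below):
# from datasets import load_dataset
# dataset = load_dataset("csv",
#                        data_files="training.1600000.processed.noemoticon.csv",
#                        column_names=["sentiment", "ids", "date", "flag", "user", "text"],
#                        encoding="ISO-8859-1")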

DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
dataset = pd.read_csv('training.1600000.processed.noemoticon.csv',
                      encoding=DATASET_ENCODING, names=DATASET_COLUMNS)

# Removing the unnecessary columns.
dataset = dataset[['sentiment', 'text']]
# Replacing label 4 with 1, so that 0 = negative and 1 = positive.
dataset['sentiment'] = dataset['sentiment'].replace(4, 1)
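
# Sanity check (illustrative): after the replace the labels should be just 0/1;
# for the Sentiment140 training CSV the two classes are balanced at 800000 each.
# dataset['sentiment'].value_counts()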

# Plotting the class distribution of the dataset.
ax = dataset.groupby('sentiment').count().plot(kind='bar', title='Distribution of data',
                                               legend=False)
ax.set_xticklabels(['Negative', 'Positive'], rotation=0)

# Storing data in lists.
text, sentiment = list(dataset['text']), list(dataset['sentiment'])
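
# WordCloud, seaborn, numpy and pickle are imported above but never used in
# this file. As one illustrative use of the WordCloud import, a word cloud of
# the negative tweets could be drawn like this (a sketch, not part of the
# original pipeline):
neg_words = " ".join(t for t, s in zip(text, sentiment) if s == 0)
wc = WordCloud(max_words=1000, width=1600, height=800).generate(neg_words)
plt.figure(figsize=(20, 10))
plt.imshow(wc)
plt.axis('off')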

# Defining a dictionary mapping emoticons to their meanings.
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad',
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked', ':-$': 'confused', ':\\': 'annoyed',
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink',
          ';-)': 'wink', 'O:-)': 'angel', 'O*-)': 'angel', '(:-D': 'gossip', '=^.^=': 'cat'}

## Defining a list containing common English stopwords.
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
                'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before',
                'being', 'below', 'between', 'both', 'by', 'can', 'd', 'did', 'do',
                'does', 'doing', 'down', 'during', 'each', 'few', 'for', 'from',
                'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
                'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
                'into', 'is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
                'me', 'more', 'most', 'my', 'myself', 'now', 'o', 'of', 'on', 'once',
                'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'own', 're',
                's', 'same', 'she', "shes", 'should', "shouldve", 'so', 'some', 'such',
                't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
                'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
                'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was',
                'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom',
                'why', 'will', 'with', 'won', 'y', 'you', "youd", "youll", "youre",
                "youve", 'your', 'yours', 'yourself', 'yourselves']

def preprocess(textdata):
    processedText = []

    # Create the Lemmatizer.
    wordLemm = WordNetLemmatizer()

    # Defining regex patterns.
    urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern = r'@[^\s]+'
    alphaPattern = r"[^a-zA-Z0-9]"
    sequencePattern = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"

    for tweet in textdata:
        tweet = tweet.lower()

        # Replace all URLs with 'URL'.
        tweet = re.sub(urlPattern, ' URL', tweet)
        # Replace all emoticons with 'EMOJI<meaning>'.
        for emoji in emojis.keys():
            tweet = tweet.replace(emoji, "EMOJI" + emojis[emoji])
        # Replace @USERNAME with 'USER'.
        tweet = re.sub(userPattern, ' USER', tweet)
        # Replace all non-alphanumeric characters with a space.
        tweet = re.sub(alphaPattern, " ", tweet)
        # Replace 3 or more consecutive letters by 2 letters.
        tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

        tweetwords = ''
        for word in tweet.split():
            # Stopword filtering is disabled; uncomment to enable it.
            # if word not in stopwordlist:
            if len(word) > 1:
                # Lemmatizing the word.
                word = wordLemm.lemmatize(word)
                tweetwords += (word + ' ')

        processedText.append(tweetwords)

    return processedText

import nltk
# Download just the WordNet data the lemmatizer needs, instead of opening the
# interactive downloader with a bare nltk.download(). On newer NLTK versions
# 'omw-1.4' may also be required.
nltk.download('wordnet')
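
# Quick illustrative check of preprocess() on a made-up tweet (the handle and
# URL are hypothetical); expected output:
# ['USER loving this EMOJIsmile check URL ']
print(preprocess(["@alice loving this!!! :) check http://example.com"]))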

import time
t = time.time()
processedtext = preprocess(text)
print('Text Preprocessing complete.')
print(f'Time Taken: {round(time.time()-t)} seconds')

X_train, X_test, y_train, y_test = train_test_split(processedtext, sentiment,
                                                    test_size=0.05, random_state=0)
print('Data Split done.')
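
# TfidfVectorizer, BernoulliNB, LinearSVC, LogisticRegression and the metrics
# imports above are never used in this file. A minimal sketch of the usual next
# steps, assuming the standard scikit-learn workflow (the ngram_range,
# max_features and max_iter values are illustrative, not tuned):
vectoriser = TfidfVectorizer(ngram_range=(1, 2), max_features=500000)
vectoriser.fit(X_train)
X_train_vec = vectoriser.transform(X_train)
X_test_vec = vectoriser.transform(X_test)

for model in (BernoulliNB(), LinearSVC(), LogisticRegression(max_iter=1000)):
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    print(model.__class__.__name__)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))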