vibha-mah committed
Commit 172e877 · 1 Parent(s): 71a4891

Create NLP_sentiment_model

Files changed (1)
  1. NLP_sentiment_model +140 -0
NLP_sentiment_model ADDED
@@ -0,0 +1,140 @@
+ # utilities
+ import re
+ import pickle
+ import numpy as np
+ import pandas as pd
+ # plotting
+ import seaborn as sns
+ from wordcloud import WordCloud
+ import matplotlib.pyplot as plt
+ # nltk
+ from nltk.stem import WordNetLemmatizer
+ # sklearn
+ from sklearn.svm import LinearSVC
+ from sklearn.naive_bayes import BernoulliNB
+ from sklearn.linear_model import LogisticRegression
+
+ from sklearn.model_selection import train_test_split
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics import confusion_matrix, classification_report
+
+ # Optional: the same CSV can also be loaded with the Hugging Face datasets library
+ # (install it first with `pip install datasets` in a shell or notebook cell):
+ # from datasets import load_dataset
+ # dataset = load_dataset("csv", data_files="training.1600000.processed.noemoticon.csv")
+
+ DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]
+ DATASET_ENCODING = "ISO-8859-1"
+ dataset = pd.read_csv('training.1600000.processed.noemoticon.csv',
+                       encoding=DATASET_ENCODING, names=DATASET_COLUMNS)
+
+ # Keeping only the columns needed for modelling.
+ dataset = dataset[['sentiment', 'text']]
+ # Remapping the positive label from 4 to 1 to ease understanding.
+ dataset['sentiment'] = dataset['sentiment'].replace(4, 1)
+
+ # Plotting the class distribution of the dataset.
+ ax = dataset.groupby('sentiment').count().plot(kind='bar', title='Distribution of data',
+                                                legend=False)
+ ax.set_xticklabels(['Negative', 'Positive'], rotation=0)
+
+ # Storing data in lists.
+ text, sentiment = list(dataset['text']), list(dataset['sentiment'])
+
+ # Defining dictionary containing all emojis with their meanings.
+ emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad',
+           ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
+           ':-@': 'shocked', ':@': 'shocked', ':-$': 'confused', ':\\': 'annoyed',
+           ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
+           '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
+           '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink',
+           ';-)': 'wink', 'O:-)': 'angel', 'O*-)': 'angel', '(:-D': 'gossip', '=^.^=': 'cat'}
+
+ ## Defining a list containing common English stopwords.
+ stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
+                 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before',
+                 'being', 'below', 'between', 'both', 'by', 'can', 'd', 'did', 'do',
+                 'does', 'doing', 'down', 'during', 'each', 'few', 'for', 'from',
+                 'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
+                 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
+                 'into', 'is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
+                 'me', 'more', 'most', 'my', 'myself', 'now', 'o', 'of', 'on', 'once',
+                 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'own', 're',
+                 's', 'same', 'she', "shes", 'should', "shouldve", 'so', 'some', 'such',
+                 't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
+                 'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
+                 'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was',
+                 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom',
+                 'why', 'will', 'with', 'won', 'y', 'you', "youd", "youll", "youre",
+                 "youve", 'your', 'yours', 'yourself', 'yourselves']
+
+ def preprocess(textdata):
+     processedText = []
+
+     # Create the WordNet lemmatizer.
+     wordLemm = WordNetLemmatizer()
+
+     # Defining regex patterns.
+     urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
+     userPattern = r'@[^\s]+'
+     alphaPattern = "[^a-zA-Z0-9]"
+     sequencePattern = r"(.)\1\1+"
+     seqReplacePattern = r"\1\1"
+
+     for tweet in textdata:
+         # Replace emoticons before lowercasing so keys such as ':P' and ':-D' still match.
+         for emoji in emojis.keys():
+             tweet = tweet.replace(emoji, "EMOJI" + emojis[emoji])
+         tweet = tweet.lower()
+
+         # Replace all URLs with 'URL'.
+         tweet = re.sub(urlPattern, ' URL', tweet)
+         # Replace @USERNAME with 'USER'.
+         tweet = re.sub(userPattern, ' USER', tweet)
+         # Replace all non-alphanumeric characters with spaces.
+         tweet = re.sub(alphaPattern, " ", tweet)
+         # Replace 3 or more consecutive letters by 2 letters.
+         tweet = re.sub(sequencePattern, seqReplacePattern, tweet)
+
+         tweetwords = ''
+         for word in tweet.split():
+             # Checking if the word is a stopword (disabled in the original script).
+             # if word not in stopwordlist:
+             if len(word) > 1:
+                 # Lemmatizing the word.
+                 word = wordLemm.lemmatize(word)
+                 tweetwords += (word + ' ')
+
+         processedText.append(tweetwords)
+
+     return processedText
+
+ import nltk
+ # Download the WordNet data needed by WordNetLemmatizer (non-interactive).
+ nltk.download('wordnet')
+ nltk.download('omw-1.4')
+
+ import time
+ t = time.time()
+ processedtext = preprocess(text)
+ print('Text Preprocessing complete.')
+ print(f'Time Taken: {round(time.time()-t)} seconds')
+
+ X_train, X_test, y_train, y_test = train_test_split(processedtext, sentiment,
+                                                      test_size=0.05, random_state=0)
+ print('Data Split done.')
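
The committed file ends after the train/test split, although it already imports TfidfVectorizer, LinearSVC, BernoulliNB, LogisticRegression, and the evaluation helpers (pickle, seaborn, and WordCloud are likewise imported but unused in this portion). The sketch below is a minimal, hypothetical continuation using only those imports; the vectoriser settings (ngram_range, max_features) and the max_iter value are illustrative assumptions, not part of this commit.

# Hypothetical continuation (not part of this commit): vectorize the splits and
# train/evaluate the classifiers that the script already imports.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Fit the TF-IDF vectoriser on the training split only, then transform both splits.
vectoriser = TfidfVectorizer(ngram_range=(1, 2), max_features=500000)  # settings are assumptions
X_train_vec = vectoriser.fit_transform(X_train)
X_test_vec = vectoriser.transform(X_test)

# Train and evaluate each imported classifier on the held-out 5% test split.
for model in (BernoulliNB(), LinearSVC(), LogisticRegression(max_iter=1000)):
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    print(type(model).__name__)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

Fitting the vectoriser on X_train only keeps test-set vocabulary statistics out of training, so the reported metrics reflect generalisation rather than leakage.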