Create NLP_sentiment_model

NLP_sentiment_model ADDED (+140 -0)
# utilities
import re
import pickle
import numpy as np
import pandas as pd
# plotting
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# nltk
from nltk.stem import WordNetLemmatizer
# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

# Run in a shell or notebook cell, not as Python source: pip install datasets
# Note: the original call below is both broken (load_dataset() expects a dataset
# name or a builder, not a bare CSV filename) and redundant, since the same CSV
# is re-read with pandas just below; it is kept commented out for reference.
# from datasets import load_dataset
# dataset = load_dataset("training.1600000.processed.noemoticon.csv")
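# A working equivalent with the datasets library would go through the "csv"
# builder, e.g. (a sketch; assumes the CSV sits in the working directory and
# takes the same column names and encoding used for pandas below):
# from datasets import load_dataset
# dataset = load_dataset("csv",
#                        data_files="training.1600000.processed.noemoticon.csv",
#                        column_names=["sentiment", "ids", "date", "flag", "user", "text"],
#                        encoding="ISO-8859-1")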

DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
dataset = pd.read_csv('training.1600000.processed.noemoticon.csv',
                      encoding=DATASET_ENCODING, names=DATASET_COLUMNS)

# Removing the unnecessary columns.
dataset = dataset[['sentiment', 'text']]
# Replacing label 4 with 1, so that 0 = negative and 1 = positive.
dataset['sentiment'] = dataset['sentiment'].replace(4, 1)
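
# Sanity check (illustrative): after the replace the labels should be just 0/1;
# for the Sentiment140 training CSV the two classes are balanced at 800000 each.
# dataset['sentiment'].value_counts()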

# Plotting the class distribution of the dataset.
ax = dataset.groupby('sentiment').count().plot(kind='bar', title='Distribution of data',
                                               legend=False)
ax.set_xticklabels(['Negative', 'Positive'], rotation=0)

# Storing data in lists.
text, sentiment = list(dataset['text']), list(dataset['sentiment'])
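
# WordCloud, seaborn, numpy and pickle are imported above but never used in
# this file. As one illustrative use of the WordCloud import, a word cloud of
# the negative tweets could be drawn like this (a sketch, not part of the
# original pipeline):
neg_words = " ".join(t for t, s in zip(text, sentiment) if s == 0)
wc = WordCloud(max_words=1000, width=1600, height=800).generate(neg_words)
plt.figure(figsize=(20, 10))
plt.imshow(wc)
plt.axis('off')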

# Defining a dictionary mapping emoticons to their meanings.
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad',
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked', ':-$': 'confused', ':\\': 'annoyed',
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink',
          ';-)': 'wink', 'O:-)': 'angel', 'O*-)': 'angel', '(:-D': 'gossip', '=^.^=': 'cat'}

## Defining a list containing common English stopwords.
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
                'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before',
                'being', 'below', 'between', 'both', 'by', 'can', 'd', 'did', 'do',
                'does', 'doing', 'down', 'during', 'each', 'few', 'for', 'from',
                'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
                'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
                'into', 'is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
                'me', 'more', 'most', 'my', 'myself', 'now', 'o', 'of', 'on', 'once',
                'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'own', 're',
                's', 'same', 'she', "shes", 'should', "shouldve", 'so', 'some', 'such',
                't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
                'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
                'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was',
                'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom',
                'why', 'will', 'with', 'won', 'y', 'you', "youd", "youll", "youre",
                "youve", 'your', 'yours', 'yourself', 'yourselves']

def preprocess(textdata):
    processedText = []

    # Create the Lemmatizer.
    wordLemm = WordNetLemmatizer()

    # Defining regex patterns.
    urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern = r'@[^\s]+'
    alphaPattern = r"[^a-zA-Z0-9]"
    sequencePattern = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"

    for tweet in textdata:
        tweet = tweet.lower()

        # Replace all URLs with 'URL'.
        tweet = re.sub(urlPattern, ' URL', tweet)
        # Replace all emoticons with 'EMOJI<meaning>'.
        for emoji in emojis.keys():
            tweet = tweet.replace(emoji, "EMOJI" + emojis[emoji])
        # Replace @USERNAME with 'USER'.
        tweet = re.sub(userPattern, ' USER', tweet)
        # Replace all non-alphanumeric characters with a space.
        tweet = re.sub(alphaPattern, " ", tweet)
        # Replace 3 or more consecutive letters by 2 letters.
        tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

        tweetwords = ''
        for word in tweet.split():
            # Stopword filtering is disabled; uncomment to enable it.
            # if word not in stopwordlist:
            if len(word) > 1:
                # Lemmatizing the word.
                word = wordLemm.lemmatize(word)
                tweetwords += (word + ' ')

        processedText.append(tweetwords)

    return processedText

import nltk
# Download just the WordNet data the lemmatizer needs, instead of opening the
# interactive downloader with a bare nltk.download(). On newer NLTK versions
# 'omw-1.4' may also be required.
nltk.download('wordnet')
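
# Quick illustrative check of preprocess() on a made-up tweet (the handle and
# URL are hypothetical); expected output:
# ['USER loving this EMOJIsmile check URL ']
print(preprocess(["@alice loving this!!! :) check http://example.com"]))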

import time
t = time.time()
processedtext = preprocess(text)
print('Text Preprocessing complete.')
print(f'Time Taken: {round(time.time()-t)} seconds')

X_train, X_test, y_train, y_test = train_test_split(processedtext, sentiment,
                                                    test_size=0.05, random_state=0)
print('Data Split done.')
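
# TfidfVectorizer, BernoulliNB, LinearSVC, LogisticRegression and the metrics
# imports above are never used in this file. A minimal sketch of the usual next
# steps, assuming the standard scikit-learn workflow (the ngram_range,
# max_features and max_iter values are illustrative, not tuned):
vectoriser = TfidfVectorizer(ngram_range=(1, 2), max_features=500000)
vectoriser.fit(X_train)
X_train_vec = vectoriser.transform(X_train)
X_test_vec = vectoriser.transform(X_test)

for model in (BernoulliNB(), LinearSVC(), LogisticRegression(max_iter=1000)):
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    print(model.__class__.__name__)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))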