prasanth345 committed
Commit 2f7cb3f · verified · 1 Parent(s): 2ac8202

Upload 4 files

Files changed (4):
  1. app.py +69 -0
  2. model.pkl +3 -0
  3. vectorizer.pkl +3 -0
  4. ytcomments.py +349 -0
app.py ADDED
@@ -0,0 +1,69 @@
+ from flask import Flask, render_template, request
+ import pickle
+ import string
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.stem import PorterStemmer
+
+ nltk.download('stopwords')
+
+ app = Flask(__name__)
+
+ # Load the trained model and vectorizer
+ model = pickle.load(open('model.pkl', 'rb'))
+ vectorizer = pickle.load(open('vectorizer.pkl', 'rb'))
+
+ @app.route('/')
+ def home():
+     return render_template('index.html')
+
+ @app.route('/new-url', methods=['POST'])
+ def new_url_predict():  # same pipeline as /predict, exposed at a second URL
+     comment = request.form['comment']
+     processed_comment = preprocess_comment(comment)
+     features = vectorizer.transform([processed_comment])
+     prediction = model.predict(features)[0]
+     sentiment = get_sentiment_label(prediction)
+     return render_template('result.html', comment=comment, sentiment=sentiment)
+
+ @app.route('/predict', methods=['POST'])
+ def predict():
+     comment = request.form['comment']
+     processed_comment = preprocess_comment(comment)
+     features = vectorizer.transform([processed_comment])
+     prediction = model.predict(features)[0]
+     sentiment = get_sentiment_label(prediction)
+     return render_template('result.html', comment=comment, sentiment=sentiment)
+
+ def preprocess_comment(comment):
+     # Lowercase, strip punctuation, drop stopwords, then stem
+     comment = comment.lower()
+     comment = comment.translate(str.maketrans('', '', string.punctuation))
+     comment = remove_stopwords(comment)
+     comment = stem_words(comment)
+     return comment
+
+ def remove_stopwords(comment):
+     stopwords_english = set(stopwords.words('english'))
+     comment_tokens = comment.split()
+     comment = ' '.join([word for word in comment_tokens if word not in stopwords_english])
+     return comment
+
+ def stem_words(comment):
+     stemmer = PorterStemmer()
+     comment_tokens = comment.split()
+     comment = ' '.join([stemmer.stem(word) for word in comment_tokens])
+     return comment
+
+ def get_sentiment_label(prediction):
+     if prediction == 0:
+         return 'negative'
+     elif prediction == 1:
+         return 'neutral'
+     elif prediction == 2:
+         return 'positive'
+     else:
+         return 'unknown'
+
+
+ if __name__ == '__main__':
+     app.run(debug=True, port=5001)
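
A quick way to exercise the endpoint once the app is up (a minimal sketch, assuming the server was started locally with "python app.py" on port 5001 as configured above, and that templates/index.html and templates/result.html exist; the sample comment is hypothetical):

import requests

# POST a form-encoded comment, matching request.form['comment'] in app.py
resp = requests.post('http://127.0.0.1:5001/predict',
                     data={'comment': 'this video was really helpful'})
print(resp.status_code)  # 200 if result.html rendered
print(resp.text[:200])   # rendered HTML containing the predicted sentiment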
model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b3e12d9cad8afa3c46acae3adba5beeb1f3609c10e24e749c9b47fe93d44364a
+ size 72758
vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1ff764be18861f5abfab97327eff36f8b377d6813eff0f133e0894f4c2fea802
+ size 404491
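
Both .pkl files are stored with Git LFS, so the diff shows only the pointer files (spec version, sha256 oid, byte size) rather than the binary pickles. After cloning, the actual artifacts can be fetched with the standard LFS command (assuming git-lfs is installed and initialized):

git lfs pull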
ytcomments.py ADDED
@@ -0,0 +1,349 @@
+ # -*- coding: utf-8 -*-
+ """ytcomments.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1IAkt_1sG94cjURWKBvghkoK2KlZiYzX9
+ """
+
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from sklearn.model_selection import train_test_split
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import accuracy_score
+ from sklearn.preprocessing import LabelEncoder
+
+ df = pd.read_csv("/content/comments.csv")
+
+ df.head()
+
+ df = df.iloc[:, [2, 4]]  # keep only the Comment and Sentiment columns
+
+ df.head(2)
+
+ df.info()
+
+ df.isnull().sum()
+
+ df.dropna(axis=0, how='any', inplace=True)
+
+ df.columns
+
+ df['Sentiment'] = df['Sentiment'].astype('int')
+
+ #######################################
+ # basic preprocessing
+ #######################################
+
+ df['Comment'] = df['Comment'].str.lower()
+
+ import string
+ string.punctuation
+
+ exclude = string.punctuation
+ def remove_punc(text):
+     for char in exclude:
+         text = text.replace(char, '')
+     return text
+
+ df['Comment'] = df['Comment'].apply(remove_punc)
+
+ import nltk
+ nltk.download('stopwords')
+
+ from nltk.corpus import stopwords
+ stop_words = set(stopwords.words('english'))  # build the set once; per-word lookup is O(1)
+
+ def remove_stopwords(text):
+     # fixed: skip stopwords instead of appending '' (which left double spaces),
+     # and avoid rebuilding the stopword list on every word
+     new_text = []
+     for word in text.split():
+         if word not in stop_words:
+             new_text.append(word)
+     return " ".join(new_text)
+
+ df['Comment'] = df['Comment'].apply(remove_stopwords)
+
+ from nltk.stem.porter import PorterStemmer
+ ps = PorterStemmer()
+
+ def stem_words(text):
+     return " ".join([ps.stem(word) for word in text.split()])
+
+ df['Comment'] = df['Comment'].apply(stem_words)
+
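+ # Sanity check of the cleaning pipeline on a hypothetical sample:
+ #   remove_stopwords(remove_punc("This video is GREAT!!!".lower()))
+ #   -> 'video great'  (stem_words then leaves both tokens unchanged)
+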
+ ####################################################
+ # EDA
+ ####################################################
+
+ # sort_index() aligns the slice order with the 0/1/2 labels
+ plt.pie(df['Sentiment'].value_counts().sort_index(), labels=['negative', 'neutral', 'positive'], autopct="%0.2f")
+ plt.show()
+
+ import nltk
+
+ nltk.download('punkt')
+
+ df['total_characters'] = df['Comment'].apply(len)
+
+ df.head(2)
+
+ df['total_words'] = df['Comment'].apply(lambda x: len(nltk.word_tokenize(x)))
+
+ df.head(2)
+
+ df['total_sentences'] = df['Comment'].apply(lambda x: len(nltk.sent_tokenize(x)))
+
+ df.head(2)
+
+ df[['total_characters', 'total_sentences', 'total_words']].describe()
+
+ mask0 = df['Sentiment'] == 0
+ mask1 = df['Sentiment'] == 1
+ mask2 = df['Sentiment'] == 2
+
+ df[mask0][['total_sentences', 'total_words', 'total_characters']].describe()
+
+ df[mask1][['total_sentences', 'total_words', 'total_characters']].describe()
+
+ df[mask2][['total_sentences', 'total_words', 'total_characters']].describe()
+
+ plt.figure(figsize=(12, 6))
+ sns.histplot(df[df['Sentiment'] == 0]['total_characters'], color='green')
+ sns.histplot(df[df['Sentiment'] == 1]['total_characters'], color='red')
+ sns.histplot(df[df['Sentiment'] == 2]['total_characters'], color='pink')
+
+ plt.figure(figsize=(10, 4))
+ sns.histplot(df[df['Sentiment'] == 0]['total_words'], color='green')
+ sns.histplot(df[df['Sentiment'] == 1]['total_words'], color='red')
+ sns.histplot(df[df['Sentiment'] == 2]['total_words'], color='pink')
+
+ sns.pairplot(df, hue='Sentiment')
+
+ sns.heatmap(df.corr(numeric_only=True), annot=True)  # numeric_only skips the text column
+
+ from wordcloud import WordCloud
+ # generate() mutates and returns the same WordCloud instance, so each class
+ # needs its own object; otherwise all three variables alias the last cloud.
+ wc_params = dict(width=500, height=500, min_font_size=10, background_color='white')
+
+ negative_wc = WordCloud(**wc_params).generate(df[df['Sentiment'] == 0]['Comment'].str.cat(sep=" "))
+
+ neutral_wc = WordCloud(**wc_params).generate(df[df['Sentiment'] == 1]['Comment'].str.cat(sep=" "))
+
+ positive_wc = WordCloud(**wc_params).generate(df[df['Sentiment'] == 2]['Comment'].str.cat(sep=" "))
+
+ plt.figure(figsize=(6, 6))
+ plt.imshow(negative_wc)
+
+ plt.figure(figsize=(6, 6))
+ plt.imshow(neutral_wc)
+
+ plt.figure(figsize=(6, 6))
+ plt.imshow(positive_wc)
+
+ negative_corpus = []
+ for msg in df[df['Sentiment'] == 0]['Comment'].tolist():
+     for word in msg.split():
+         negative_corpus.append(word)
+
+ neutral_corpus = []
+ for msg in df[df['Sentiment'] == 1]['Comment'].tolist():
+     for word in msg.split():
+         neutral_corpus.append(word)
+
+ positive_corpus = []
+ for msg in df[df['Sentiment'] == 2]['Comment'].tolist():  # fixed: was == 1, which duplicated the neutral corpus
+     for word in msg.split():
+         positive_corpus.append(word)
+
+ print(len(negative_corpus))
+ print(len(neutral_corpus))
+ print(len(positive_corpus))
+
+ from collections import Counter
+
+ pd.DataFrame(Counter(negative_corpus).most_common(30))
+
+ ##############################################
+ # bag of words
+ ##############################################
+
+ cv = CountVectorizer(lowercase=True, stop_words='english', max_features=3000)
+ tfidf = TfidfVectorizer(max_features=3000)
+
+ features_cv = cv.fit_transform(df['Comment']).toarray()
+ features_tfidf = tfidf.fit_transform(df['Comment']).toarray()
+
+ type(features_cv)
+
+ features_cv
+
+ dict1 = pd.DataFrame(features_cv)
+ dict2 = pd.DataFrame(features_tfidf)
+
+ dict1.shape
+
+ df.columns
+
+ x_cv = dict1.iloc[:, :]
+ y_cv = df[['Sentiment']]
+
+ y_cv.columns
+
+ np.unique(y_cv)
+
+ x_tfidf = dict2.iloc[:, :]  # fixed: was dict1, which reused the count features
+ y_tfidf = df['Sentiment']   # fixed: df.iloc[:, -1] now points at total_sentences, not Sentiment
+
+ print(x_cv.shape)
+ print(x_tfidf.shape)
+
+ print(y_cv.shape)
+ print(y_tfidf.shape)
+
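+ # Hypothetical mini-corpus contrasting the two feature spaces:
+ toy = ["good video", "bad video"]
+ print(CountVectorizer().fit_transform(toy).toarray())  # raw term counts
+ print(TfidfVectorizer().fit_transform(toy).toarray())  # counts reweighted by inverse document frequency
+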
+ x_cv_train, x_cv_test, y_cv_train, y_cv_test = train_test_split(x_cv, y_cv, test_size=0.2)
+
+ y_cv_test.shape
+
+ x_tfidf_train, x_tfidf_test, y_tfidf_train, y_tfidf_test = train_test_split(x_tfidf, y_tfidf, test_size=0.2)
+
+ print(x_cv_train.shape)
+ print(y_cv_test.shape)
+ print(x_tfidf_train.shape)
+ print(y_tfidf_test.shape)
+
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.svm import SVC
+ from sklearn.naive_bayes import MultinomialNB
+ from sklearn.tree import DecisionTreeClassifier
+ from sklearn.neighbors import KNeighborsClassifier
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.ensemble import AdaBoostClassifier
+ from sklearn.ensemble import BaggingClassifier
+ from sklearn.ensemble import ExtraTreesClassifier
+ from sklearn.ensemble import GradientBoostingClassifier
+ from xgboost import XGBClassifier
+ from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
+ from sklearn.metrics import accuracy_score, precision_score
+
+ svc = SVC(kernel='sigmoid', gamma=1.0)
+ knc = KNeighborsClassifier()
+ mnb = MultinomialNB()
+ dtc = DecisionTreeClassifier(max_depth=5)
+ lrc = LogisticRegression(solver='liblinear', penalty='l1')
+ rfc = RandomForestClassifier(n_estimators=50, random_state=2)
+ abc = AdaBoostClassifier(n_estimators=50, random_state=2)
+ bc = BaggingClassifier(n_estimators=50, random_state=2)
+ etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
+ gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
+ xgb = XGBClassifier(n_estimators=50, random_state=2)
+
+ clfs = {
+     'SVC': svc,
+     'KN': knc,
+     'NB': mnb,
+     'DT': dtc,
+     'LR': lrc,
+     'RF': rfc,
+     'AdaBoost': abc,
+     'BgC': bc,
+     'ETC': etc,
+     'GBDT': gbdt,
+     'xgb': xgb
+ }
+
+ def train_classifier(clf, X_train, y_train, X_test, y_test):
+     # fixed: use the arguments instead of the global cv splits,
+     # so the same helper also works for the TF-IDF features
+     clf.fit(X_train, y_train)
+     y_pred = clf.predict(X_test)
+     accuracy = accuracy_score(y_test, y_pred)
+     precision = precision_score(y_test, y_pred, average='micro')
+
+     return accuracy, precision
+
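+ # Note: with single-label multiclass targets, micro-averaged precision equals
+ # accuracy, so the two metrics printed below should coincide.
+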
+ #######################################################
+ #********************CountVectorizer*****************#
+ #######################################################
+
+ np.unique(y_cv_test)
+
+ accuracy_scores = []
+ precision_scores = []
+
+ for name, clf in clfs.items():
+
+     current_accuracy, current_precision = train_classifier(clf, x_cv_train, y_cv_train, x_cv_test, y_cv_test)
+
+     print("For ", name)
+     print("Accuracy - ", current_accuracy)
+     print("Precision - ", current_precision)
+
+     accuracy_scores.append(current_accuracy)
+     precision_scores.append(current_precision)
+
+ performance_df = pd.DataFrame({'Algorithm': clfs.keys(), 'Accuracy': accuracy_scores, 'Precision': precision_scores}).sort_values('Precision', ascending=False)
+
+ performance_df
+
+ from sklearn.ensemble import VotingClassifier
+
+ svc = SVC(kernel='sigmoid', gamma=1.0, probability=True)
+ mnb = MultinomialNB()
+ etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
+
+ voting = VotingClassifier(estimators=[('svm', svc), ('nb', mnb), ('et', etc)], voting='soft')
+
+ voting.fit(x_cv_train, y_cv_train)
+
+ y_pred = voting.predict(x_cv_test)
+
+ print("Accuracy", accuracy_score(y_cv_test, y_pred))
+ print("Precision", precision_score(y_cv_test, y_pred, average='micro'))
+
+ import pickle
+
+ # Persist the fitted CountVectorizer and the logistic regression model;
+ # these are the two artifacts app.py loads at startup.
+ pickle.dump(cv, open('vectorizer.pkl', 'wb'))
+
+ pickle.dump(lrc, open('model.pkl', 'wb'))
+
+ gnb = GaussianNB()
+ mnb = MultinomialNB()
+ bnb = BernoulliNB()
+
+ gnb.fit(x_cv_train, y_cv_train)
+
+ pred_gnb = gnb.predict(x_cv_test)
+
+ print(accuracy_score(y_cv_test, pred_gnb))
+ print(precision_score(y_cv_test, pred_gnb, average='micro'))
+
+ mnb.fit(x_cv_train, y_cv_train)
+
+ pred_mnb = mnb.predict(x_cv_test)
+
+ print(accuracy_score(y_cv_test, pred_mnb))
+ print(precision_score(y_cv_test, pred_mnb, average='micro'))
+
+ bnb.fit(x_cv_train, y_cv_train)
+
+ pred_bnb = bnb.predict(x_cv_test)
+
+ print(accuracy_score(y_cv_test, pred_bnb))
+ print(precision_score(y_cv_test, pred_bnb, average='micro'))
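
Note on what actually ships: the script benchmarks eleven classifiers plus a soft-voting ensemble, but the artifacts pickled above are the CountVectorizer (cv) and the logistic regression model (lrc), and those are exactly what app.py loads. A minimal sketch to sanity-check the saved artifacts outside Flask (assuming both .pkl files are in the working directory; the sample comment is hypothetical):

import pickle

vectorizer = pickle.load(open('vectorizer.pkl', 'rb'))
model = pickle.load(open('model.pkl', 'rb'))

sample = "this video was really helpful"  # hypothetical raw input
features = vectorizer.transform([sample])  # app.py additionally lowercases, strips, and stems first
print(model.predict(features)[0])          # 0 = negative, 1 = neutral, 2 = positive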