jaredjoss committed
Commit 42c5746 · 1 Parent(s): 015caa4

Update app.py

Files changed (1): app.py +152 -0
app.py CHANGED
@@ -1,6 +1,158 @@
  import gradio as gr
+ from pytube import extract
+ import os
+ import pickle  # was missing; needed by getSentenceTrain() below
+ import pandas as pd
+ import string
+ import re
+ import nltk
+ import nltk.sentiment.util
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ from textblob import TextBlob
+ import numpy as np
+ import math
+ from sklearn.metrics import accuracy_score  # was missing; used in vote()
+ # Tokenizer/pad_sequences were missing; used in quantizeEffectiveness(),
+ # assuming a TensorFlow/Keras installation
+ from tensorflow.keras.preprocessing.text import Tokenizer
+ from tensorflow.keras.preprocessing.sequence import pad_sequences

+ sw = stopwords.words('english')
+ lemmatizer = WordNetLemmatizer()
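Note for anyone running this Space: `stopwords.words('english')`, `nltk.word_tokenize` and `WordNetLemmatizer` all rely on NLTK corpora that are not bundled with the package, so the module-level setup above fails on a fresh environment until they are fetched once. A minimal setup sketch using the standard `nltk.download` calls:

```python
import nltk

# One-time fetch of the corpora this app touches:
# stopwords for the filter list, punkt for word_tokenize,
# wordnet (+ omw-1.4 word forms) for WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
```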
+
+
+ ## get YouTube ID
+ def getID(url):
+     print("Getting YouTube ID...")
+     return extract.video_id(url)
+
+ ## function to clean comments
+ def clean_text(text):
+     # remove symbols, emojis, URLs and markup
+     text = text.lower()
+     text = re.sub('@', '', text)
+     text = re.sub(r'\[.*?\]', '', text)
+     text = re.sub(r'https?://\S+|www\.\S+', '', text)
+     text = re.sub(r'<.*?>+', '', text)
+     text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
+     text = re.sub('\n', '', text)
+     text = re.sub(r'\w*\d\w*', '', text)
+     text = re.sub(r"[^a-zA-Z ]+", "", text)
+
+     # tokenize the data
+     text = nltk.word_tokenize(text)
+
+     # lemmatize (nouns first, then verbs)
+     text = [lemmatizer.lemmatize(t) for t in text]
+     text = [lemmatizer.lemmatize(t, 'v') for t in text]
+
+     # mark negation
+     tokens_neg_marked = nltk.sentiment.util.mark_negation(text)
+
+     # remove stopwords
+     text = [t for t in tokens_neg_marked
+             if t.replace("_NEG", "").isalnum() and
+             t.replace("_NEG", "") not in sw]
+
+     return text
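To make the pipeline concrete, here is a hypothetical input and roughly what it comes back as (exact tokens depend on the installed NLTK data):

```python
tokens = clean_text("This video ISN'T bad at all! https://example.com")
print(tokens)
# roughly: ['video', 'isnt', 'bad_NEG']
# The URL and punctuation are stripped, "isn't" collapses to "isnt"
# (so it slips past the stopword list), mark_negation tags the words
# after the negation, and "this"/"at"/"all" fall to the stopword filter.
```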
+
+ ## download comments
+ def downloadComments(videoID):
+     print("Downloading Comments...")
+     os.system("youtube-comment-downloader --youtubeid=" + videoID + " --output Comments/" + videoID + ".json")
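downloadComments shells out to the youtube-comment-downloader CLI and writes into Comments/, a directory os.system will not create; the call also fails silently if the tool is missing. A slightly more defensive sketch, assuming the CLI is installed (`pip install youtube-comment-downloader`) and reusing the --youtubeid/--output flags from the code above:

```python
import os
import subprocess

def download_comments_checked(videoID):
    # Hypothetical variant of downloadComments(): same CLI, but it
    # creates the output directory and raises if the download fails.
    os.makedirs("Comments", exist_ok=True)
    subprocess.run(
        ["youtube-comment-downloader",
         "--youtubeid", videoID,
         "--output", f"Comments/{videoID}.json"],
        check=True,
    )
```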
+
+
+ def getSentenceTrain():
+     # load the pickled training sentences
+     # (note: this relative path has to exist wherever the app runs)
+     sentences_train_f = open('../Deep learning/pickles/sentences_train.pickle', "rb")
+     sentences_train = pickle.load(sentences_train_f)
+     sentences_train_f.close()
+     return sentences_train
+
+ ## voting function
+ # NOTE: y_test, the y_pred_* arrays, the *_train classifiers, CNN_accuracy
+ # and model are not defined in this file; they have to be loaded alongside
+ # the trained models before vote() can run.
+ def vote(test_point, _test):
+     print("Voting on video effectiveness...\n")
+     pos_weighting = []
+     result = ''
+     confidence = 0
+     algos_score = 0
+
+     algorithms = [
+         {'name': 'Random Forest', 'accuracy': accuracy_score(y_test, y_pred_RandFor)*100, 'trained': randFor_train},
+         {'name': 'SGD', 'accuracy': accuracy_score(y_test, y_pred_SGD)*100, 'trained': SGD_train},
+         {'name': 'XGBoost', 'accuracy': accuracy_score(y_test, y_pred_XGB)*100, 'trained': XGB_train},  # was y_pred_SGD (copy-paste slip)
+         {'name': 'Logistic Regression', 'accuracy': accuracy_score(y_test, y_pred_logreg)*100, 'trained': logreg_train},
+         {'name': 'CNN', 'accuracy': CNN_accuracy*100, 'trained': model}
+     ]
+
+     # each model votes with a weight equal to its test accuracy
+     for algo in algorithms:
+         weight = algo['accuracy']
+         algos_score += weight
+         if algo['name'] == "CNN":
+             pred = algo['trained'].predict(_test)
+             if pred[0][0] > 0.5:
+                 pos_weighting.append(weight)
+             print("CNN voted for: effective" if pred[0][0] > 0.5 else "CNN voted for: ineffective")
+         else:
+             pred = algo['trained'].predict(test_point)
+             if pred == 'pos':
+                 pos_weighting.append(weight)
+             print(algo['name'] + " voted for: effective" if pred == 'pos' else algo['name'] + " voted for: ineffective")
+
+     # share of the total weight that voted "effective"
+     pos_result = sum(pos_weighting)/algos_score
+     if pos_result < 0.5:
+         result = 'ineffective'
+         confidence = 1 - pos_result
+     else:
+         result = 'effective'
+         confidence = pos_result
+
+     print("\nThis video is \033[1m" + result + "\033[0m with a confidence of \033[1m" + str(round(confidence*100, 2)) + "% \033[0m")
+
+ def quantizeEffectiveness(url):
+     # 1. Get the YouTube ID
+     videoID = getID(url)
+
+     # 2. Download comments
+     downloadComments(videoID)
+
+     # 3. Clean comments
+     print("Cleaning Comments...")
+     df = pd.read_json('Comments/' + videoID + '.json', lines=True)
+     df['text'] = df['text'].apply(lambda x: clean_text(x))
+
+     # flatten the cleaned comments into one token list
+     all_words = []
+     for i in range(len(df)):
+         all_words = all_words + df['text'][i]
+
+     df_csv = pd.DataFrame(all_words)
+     df_csv.to_csv('Processed Comments/' + videoID + '_all_words.csv', index=False)
+
+     # 4. Create test dataframe
+     test = pd.DataFrame([[videoID]], columns=['VideoID'])
+
+     # 5. Get documents (pre-processed comments)
+     test_documents = []
+     comment = pd.read_csv("Processed Comments/" + videoID + "_all_words.csv")
+     test_documents.append(list(comment["0"]))
+     test['cleaned'] = test_documents
+     test['cleaned_string'] = [' '.join(map(str, l)) for l in test['cleaned']]
+
+     # 6. Get ML test point
+     test_point = test.cleaned_string
+     test_sentence = test['cleaned_string'].values
+
+     # 7. Get trained sentences
+     sentences_train = getSentenceTrain()
+
+     # 8. Tokenize the data
+     print("Tokenizing the data...")
+     tokenizer = Tokenizer(num_words=5000)
+     tokenizer.fit_on_texts(sentences_train)
+
+     # 9. Get DL test point
+     _test = pad_sequences(tokenizer.texts_to_sequences(test_sentence), padding='post', maxlen=100)
+
+     # 10. Vote on video effectiveness
+     vote(test_point, _test)
+
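Steps 8-9 turn the cleaned comment string into the fixed-length integer matrix the CNN expects. A toy illustration of those Keras calls (tiny made-up corpus, not the app's pickled training sentences):

```python
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tok = Tokenizer(num_words=5000)
tok.fit_on_texts(["video bad_NEG", "great video content"])  # toy corpus
seqs = tok.texts_to_sequences(["great video"])
print(pad_sequences(seqs, padding='post', maxlen=100).shape)  # (1, 100)
```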
  def greet(name):
+     # vote = quantizeEffectiveness("https://www.youtube.com/watch?v=DhhVr5iLF-c")
      return "Hello " + name + "!!"

  iface = gr.Interface(fn=greet, inputs="text", outputs="text")
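As committed, the interface still serves the greet stub, and the quantizeEffectiveness call stays commented out. If the intent is to expose the classifier through Gradio, one possible wiring, assuming quantizeEffectiveness (and vote) were changed to return the verdict string instead of only printing it:

```python
def classify(url):
    # Hypothetical wrapper: relies on quantizeEffectiveness() returning
    # its "effective"/"ineffective" verdict rather than printing it.
    return quantizeEffectiveness(url)

iface = gr.Interface(fn=classify, inputs="text", outputs="text")
iface.launch()
```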