Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -1,6 +1,158 @@
 import gradio as gr
+from pytube import extract
+import os
+import pandas as pd
+import string
+import re
+import nltk
+import nltk.sentiment.util
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from textblob import TextBlob
+import numpy as np
+import math
+# imports missing from this commit but required below; the sklearn/keras
+# sources are assumed, since the training code is not in this repo
+import pickle
+from sklearn.metrics import accuracy_score
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
 
+# NLTK corpora must be present at runtime; on a fresh container,
+# stopwords.words(), word_tokenize() and the lemmatizer raise LookupError
+# without these downloads (a likely cause of this Space's runtime error)
+nltk.download('stopwords')
+nltk.download('punkt')
+nltk.download('wordnet')
+sw = stopwords.words('english')
+lemmatizer = WordNetLemmatizer()
+
+
+## get YouTube ID
+def getID(url):
+    print("Getting YouTube ID...")
+    return extract.video_id(url)
+
+## function to clean comments
+def clean_text(text):
+    # remove symbols, emojis, markup, and URLs
+    # (raw strings avoid Python's invalid-escape-sequence warnings)
+    text = text.lower()
+    text = re.sub(r'@', '', text)
+    text = re.sub(r'\[.*?\]', '', text)
+    text = re.sub(r'https?://\S+|www\.\S+', '', text)
+    text = re.sub(r'<.*?>+', '', text)
+    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
+    text = re.sub(r'\n', '', text)
+    text = re.sub(r'\w*\d\w*', '', text)
+    text = re.sub(r"[^a-zA-Z ]+", "", text)
+
+    # tokenize the data
+    text = nltk.word_tokenize(text)
+
+    # lemmatize as nouns (the default), then as verbs
+    text = [lemmatizer.lemmatize(t) for t in text]
+    text = [lemmatizer.lemmatize(t, 'v') for t in text]
+
+    # mark negation (appends _NEG to tokens following a negator)
+    tokens_neg_marked = nltk.sentiment.util.mark_negation(text)
+
+    # remove stopwords
+    text = [t for t in tokens_neg_marked
+            if t.replace("_NEG", "").isalnum() and
+            t.replace("_NEG", "") not in sw]
+
+    return text
+
+## download comments
+def downloadComments(videoID):
+    print("Downloading Comments...")
+    # shells out to the youtube-comment-downloader CLI
+    os.system("youtube-comment-downloader --youtubeid=" + videoID + " --output Comments/" + videoID + ".json")
+
+
+def getSentenceTrain():
+    # open sentences_train file
+    sentences_train_f = open('../Deep learning/pickles/sentences_train.pickle', "rb")
+    sentences_train = pickle.load(sentences_train_f)
+    sentences_train_f.close()
+    return sentences_train
+
+## voting function
+def vote(test_point, _test):
+    print("Voting on video effectiveness...\n")
+    pos_weighting = []
+    result = ''
+    confidence = 0
+    algos_score = 0
+
+    # y_test, the y_pred_* vectors, the *_train classifiers, model and
+    # CNN_accuracy are assumed to be loaded elsewhere (e.g. unpickled like
+    # sentences_train); they are undefined in this file as committed
+    algorithms = [
+        {'name': 'Random Forest', 'accuracy': accuracy_score(y_test, y_pred_RandFor)*100, 'trained': randFor_train},
+        {'name': 'SGD', 'accuracy': accuracy_score(y_test, y_pred_SGD)*100, 'trained': SGD_train},
+        {'name': 'XGBoost', 'accuracy': accuracy_score(y_test, y_pred_XGB)*100, 'trained': XGB_train},  # was y_pred_SGD, presumably a copy-paste slip
+        {'name': 'Logistic Regression', 'accuracy': accuracy_score(y_test, y_pred_logreg)*100, 'trained': logreg_train},
+        {'name': 'CNN', 'accuracy': CNN_accuracy*100, 'trained': model}
+    ]
+
+    # each algorithm votes with a weight equal to its test accuracy
+    for algo in algorithms:
+        weight = algo['accuracy']
+        algos_score += weight
+        if algo['name'] == "CNN":
+            pred = algo['trained'].predict(_test)
+            if pred[0][0] > 0.5:
+                pos_weighting.append(weight)
+            print("CNN voted for: effective" if pred[0][0] > 0.5 else "CNN voted for: ineffective")
+        else:
+            pred = algo['trained'].predict(test_point)
+            if pred == 'pos':
+                pos_weighting.append(weight)
+            print(algo['name'] + " voted for: effective" if pred == 'pos' else algo['name'] + " voted for: ineffective")
+
+    # share of the total accuracy weight that voted "effective"
+    pos_result = sum(pos_weighting)/algos_score
+    if pos_result < 0.5:
+        result = 'ineffective'
+        confidence = 1 - pos_result
+    else:
+        result = 'effective'
+        confidence = pos_result
+
+    print("\nThis video is \033[1m" + result + "\033[0m with a confidence of \033[1m" + str(round(confidence*100, 2)) + "% \033[0m")
+
+def quantizeEffectiveness(url):
+    # 1. Get YouTube ID
+    videoID = getID(url)
+
+    # 2. Download comments
+    downloadComments(videoID)
+
+    # 3. Clean comments
+    print("Cleaning Comments...")
+    df = pd.read_json('Comments/' + videoID + '.json', lines=True)
+    df['text'] = df['text'].apply(lambda x: clean_text(x))
+
+    all_words = []
+    for i in range(len(df)):
+        all_words = all_words + df['text'][i]
+
+    df_csv = pd.DataFrame(all_words)
+    df_csv.to_csv('Processed Comments/' + videoID + '_all_words.csv', index=False)
+
+    # 4. Create test dataframe
+    test = pd.DataFrame([[videoID]], columns=['VideoID'])
+
+    # 5. Get documents (pre-processed comments)
+    test_documents = []
+    comment = pd.read_csv("Processed Comments/" + videoID + "_all_words.csv")
+    test_documents.append(list(comment["0"]))
+    test['cleaned'] = test_documents
+    test['cleaned_string'] = [' '.join(map(str, l)) for l in test['cleaned']]
+
+    # 6. Get ML test point
+    test_point = test.cleaned_string
+    test_sentence = test['cleaned_string'].values
+
+    # 7. Get trained sentences
+    sentences_train = getSentenceTrain()
+
+    # 8. Tokenize the data
+    print("Tokenizing the data...")
+    tokenizer = Tokenizer(num_words=5000)
+    tokenizer.fit_on_texts(sentences_train)
+
+    # 9. Get DL test point
+    _test = pad_sequences(tokenizer.texts_to_sequences(test_sentence), padding='post', maxlen=100)
+
+    # 10. Vote on video effectiveness
+    vote(test_point, _test)
+
|
154 |
def greet(name):
|
155 |
+
# vote = quantizeEffectiveness("https://www.youtube.com/watch?v=DhhVr5iLF-c")
|
156 |
return "Hello " + name + "!!"
|
157 |
|
158 |
iface = gr.Interface(fn=greet, inputs="text", outputs="text")
|
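
As committed, `greet` still ignores the pipeline (the `quantizeEffectiveness` call is commented out) and `vote()` prints its verdict rather than returning it, so the Space serves only the hello-world demo. Below is a minimal sketch of how the verdict could be surfaced in the UI, assuming `vote()` were refactored to return `(result, confidence)` and `quantizeEffectiveness()` to pass that value through; the `rate_video` wrapper is hypothetical and not part of this commit.

import gradio as gr

# Hypothetical wrapper: assumes vote() returns (result, confidence)
# and quantizeEffectiveness() returns vote(test_point, _test)
def rate_video(url):
    result, confidence = quantizeEffectiveness(url)
    return ("This video is " + result + " with a confidence of "
            + str(round(confidence * 100, 2)) + "%")

iface = gr.Interface(fn=rate_video, inputs="text", outputs="text")
iface.launch()

Returning the string puts the verdict in the web UI instead of the container log, and ending with `iface.launch()` matches the stock Gradio Space template. The Space would also need pytube, nltk, textblob, pandas, scikit-learn, tensorflow, and youtube-comment-downloader listed in requirements.txt.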