vasevooo committed on
Commit
93b8631
·
1 Parent(s): 7479f89

Upload imdb.py

Browse files
Files changed (1) hide show
  1. pages/imdb.py +178 -0
pages/imdb.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+
5
+ import matplotlib.pyplot as plt
6
+ import streamlit as st
7
+ import re
8
+ import string
9
+ from collections import Counter
10
+
11
+ from gensim.models import Word2Vec
12
+ from string import punctuation
13
+ import transformers
14
+ import warnings
15
+ warnings.filterwarnings('ignore')
16
+
17
+ from sklearn.model_selection import train_test_split
18
+ import time
19
+
20
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
21
+ from sklearn.linear_model import LogisticRegression
22
+ import pickle
23
+ import torch
24
+ from torch.utils.data import DataLoader, TensorDataset
25
+ import torch.nn as nn
26
+ import torchutils as tu
27
+ from torchmetrics.classification import BinaryAccuracy
28
+ from data.rnn_preprocessing import (
29
+ data_preprocessing,
30
+ preprocess_single_string
31
+ )
32
+
33
def main():
    """Streamlit page: compare three sentiment classifiers on an IMDB review.

    Rebuilds the LSTM vocabulary from ``data/imdb.csv``, loads three models
    (DistilBERT fine-tuned on SST-2, a bidirectional LSTM over pre-trained
    Word2Vec embeddings, and TF-IDF + logistic regression), reads a review
    from a text input, and reports each model's prediction and latency.
    """
    device = 'cpu'

    # --- corpus & vocabulary for the LSTM / TF-IDF branches ----------------
    df = pd.read_csv('data/imdb.csv')
    df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
    reviews = df['review'].tolist()
    preprocessed = [data_preprocessing(review) for review in reviews]

    wv = Word2Vec.load('models/word2vec32.model')

    words_list = [word for review in preprocessed for word in review.lower().split()]
    # NOTE(review): the original looped over words_list calling ''.join(...) to
    # strip punctuation but discarded the result, so it had no effect; removed.
    unique_words = set(words_list)

    # word -> index; index 0 is reserved for the padding token.
    vocab_to_int = {word: idx + 1 for idx, word in enumerate(sorted(unique_words))}

    VOCAB_SIZE = len(vocab_to_int) + 1  # +1 for the padding token
    EMBEDDING_DIM = 32
    HIDDEN_DIM = 64
    SEQ_LEN = 32

    # Embedding matrix from the pre-trained Word2Vec vectors; words missing
    # from the Word2Vec vocabulary keep an all-zero row.
    embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
    for word, i in vocab_to_int.items():
        try:
            embedding_matrix[i] = wv.wv[word]
        except KeyError:
            pass

    embedding_layer32 = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))

    class LSTMClassifierBi32(nn.Module):
        """Bidirectional LSTM over pre-trained embeddings with an MLP head."""

        def __init__(self, embedding_dim: int, hidden_size: int = 32) -> None:
            super().__init__()
            self.embedding_dim = embedding_dim
            self.hidden_size = hidden_size
            self.embedding = embedding_layer32
            self.lstm = nn.LSTM(
                input_size=self.embedding_dim,
                hidden_size=self.hidden_size,
                batch_first=True,
                bidirectional=True,
            )
            # hidden_size * 2 because the LSTM is bidirectional
            # (forward and backward hidden states are concatenated).
            self.clf = nn.Sequential(
                nn.Linear(self.hidden_size * 2, 128),
                nn.Dropout(),
                nn.Sigmoid(),
                nn.Linear(128, 64),
                nn.Dropout(),
                nn.Sigmoid(),
                nn.Linear(64, 1),
            )

        def forward(self, x):
            embeddings = self.embedding(x)
            out, (_, _) = self.lstm(embeddings)
            # Classify from the output at the last time step.
            return self.clf(out[:, -1, :])

    model = LSTMClassifierBi32(embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_DIM)
    # map_location keeps the load working on CPU-only hosts even if the
    # checkpoint was saved from a GPU.
    model.load_state_dict(torch.load('models/ltsm_bi1.pt', map_location=device))
    model.eval()

    def predict_sentence(text: str, model: nn.Module) -> str:
        """Return 'positive'/'negative' for *text* using the LSTM model."""
        tokens = preprocess_single_string(text, seq_len=SEQ_LEN, vocab_to_int=vocab_to_int)
        with torch.no_grad():
            result = model.to(device)(tokens.unsqueeze(0)).sigmoid().round().item()
        return 'negative' if result == 0.0 else 'positive'

    # --- TF-IDF + logistic regression branch -------------------------------
    tfid_vectorizer = TfidfVectorizer(max_df=0.5, min_df=5)
    # Fit on the full corpus so the vocabulary matches the pickled model.
    # (The original also transformed the whole corpus into an unused X_tfidf
    # matrix; that wasted work is removed.)
    vect = tfid_vectorizer.fit(preprocessed)

    # NOTE(review): unpickling a model file — only safe because it ships with
    # the app; never unpickle untrusted data. The 'with' block fixes the
    # original's leaked file handle.
    with open('models/modeltfidf.sav', 'rb') as f:
        tfidf_model = pickle.load(f)

    def predicttf(text: str) -> str:
        """Return 'positive'/'negative' for *text* via TF-IDF + logistic regression."""
        result = tfidf_model.predict(vect.transform([text]))
        return 'negative' if result == [0] else 'positive'

    review = st.text_input('Enter review')
    if not review:
        # Nothing entered yet — skip the (expensive) model downloads/inference.
        return

    # --- BERT branch -------------------------------------------------------
    start1 = time.time()
    automodel = transformers.AutoModelForSequenceClassification.from_pretrained(
        'distilbert-base-uncased-finetuned-sst-2-english'
    )
    autotoken = transformers.AutoTokenizer.from_pretrained(
        'distilbert-base-uncased-finetuned-sst-2-english'
    )
    # Fix: without truncation=True the max_length argument was silently
    # ignored, and reviews longer than DistilBERT's 512-token limit would
    # crash the forward pass.
    input_tokens = autotoken(
        review,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    )
    outputs = automodel(**input_tokens)
    st.write('Sentiment Predictions')
    st.write(f'\nBERT: {[automodel.config.id2label[i.item()] for i in outputs.logits.argmax(-1)]}')
    end1 = time.time()
    st.write(f'{(end1 - start1):.2f} sec')

    start2 = time.time()
    # Fix: label typo 'LTSM' -> 'LSTM'.
    st.write(f'LSTM: {predict_sentence(review, model)}')
    end2 = time.time()
    st.write(f'{(end2 - start2):.2f} sec')

    start4 = time.time()
    st.write(f'tfidf+log: {predicttf(review)}')
    end4 = time.time()
    st.write(f'{(end4 - start4):.2f} sec')
177
# Script entry point: build the page only when executed directly, not on import.
if __name__ == '__main__':
    main()