vasevooo committed on
Commit dd68f2f · 1 Parent(s): e84abd6

Update pages/imdb.py

Files changed (1)
  1. pages/imdb.py +80 -138
pages/imdb.py CHANGED
@@ -1,157 +1,99 @@
- import os
- import numpy as np
  import pandas as pd
-
- import matplotlib.pyplot as plt
  import streamlit as st
- import re
- import string
- from collections import Counter
-
- from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
-
- from gensim.models import Word2Vec
- from string import punctuation
  import transformers
- import warnings
- warnings.filterwarnings('ignore')
-
- from sklearn.model_selection import train_test_split
  import time
-
- from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
- from sklearn.linear_model import LogisticRegression
  import pickle
- import torch
- from torch.utils.data import DataLoader, TensorDataset
  import torch.nn as nn
- import torchutils as tu
- from torchmetrics.classification import BinaryAccuracy
  from data.rnn_preprocessing import (
-     data_preprocessing,
-     preprocess_single_string
- )

  def main():
-     device = 'cpu'
      df = pd.read_csv('data/imdb.csv')
      df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
      reviews = df['review'].tolist()
      preprocessed = [data_preprocessing(review) for review in reviews]

-     wv = Word2Vec.load('models/word2vec32.model')
-
-     words_list = [word for review in preprocessed for word in review.lower().split()]
-     for i in words_list:
-         ''.join([j for j in i if j not in punctuation])
-
-     # build a set of unique words
-     unique_words = set(words_list)
-
-     # word -> index
-     vocab_to_int = {word: idx+1 for idx, word in enumerate(sorted(unique_words))}
-
-     word_seq = [i.split() for i in preprocessed]
-     VOCAB_SIZE = len(vocab_to_int) + 1  # add 1 for the padding token
-     EMBEDDING_DIM = 32
-     HIDDEN_DIM = 64
-     SEQ_LEN = 32
-
-     embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
-
-     for word, i in vocab_to_int.items():
-         try:
-             embedding_vector = wv.wv[word]
-             embedding_matrix[i] = embedding_vector
-         except KeyError:
-             pass
-
-     embedding_layer32 = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
-
-
-     class LSTMClassifierBi32(nn.Module):
-         def __init__(self, embedding_dim: int, hidden_size:int = 32) -> None:
-             super().__init__()
-
-             self.embedding_dim = embedding_dim
-             self.hidden_size = hidden_size
-             self.embedding = embedding_layer32
-             self.lstm = nn.LSTM(
-                 input_size=self.embedding_dim,
-                 hidden_size=self.hidden_size,
-                 batch_first=True,
-                 bidirectional=True
-             )
-             self.clf = nn.Sequential(nn.Linear(self.hidden_size*2, 128),
-                                      nn.Dropout(),
-                                      nn.Sigmoid(),
-                                      nn.Linear(128, 64),
-                                      nn.Dropout(),
-                                      nn.Sigmoid(),
-                                      nn.Linear(64, 1)
-                                      )
-
-         def forward(self, x):
-             embeddings = self.embedding(x)
-             out, (_, _) = self.lstm(embeddings)
-             out = self.clf(out[:, -1, :])
-             return out
-
-     model = LSTMClassifierBi32(embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_DIM)
-     model.load_state_dict(torch.load('models/ltsm_bi1.pt'))
-     model.eval()
-
-     def predict_sentence(text:str, model: nn.Module):
-         result = model.to(device)(preprocess_single_string(text, seq_len=SEQ_LEN, vocab_to_int=vocab_to_int).unsqueeze(0)).sigmoid().round().item()
-         return 'negative' if result == 0.0 else 'positive'
-
-     #Bag Tfidf
-     # bagvectorizer = CountVectorizer(max_df=0.5,
-     #                                 min_df=5,
-     #                                 stop_words="english",)
-     # bvect = bagvectorizer.fit(preprocessed)
-     # X_bag = bvect.transform(preprocessed)
-
-     tfid_vectorizer = TfidfVectorizer(
-         max_df=0.5,
-         min_df=5)
      vect = tfid_vectorizer.fit(preprocessed)
      X_tfidf = vect.transform(preprocessed)
-
-     tfidf_model = pickle.load(open('models/modeltfidf.sav', 'rb'))
-     # bag_model = pickle.load(open('models/modelbag.sav', 'rb'))
-     # def predictbag(text):
-     #     result = bag_model.predict(vect.transform([text]))
-     #     return 'negative' if result == [0] else 'positive'
-
-     def predicttf(text):
-         result = tfidf_model.predict(vect.transform([text]))
-         return 'negative' if result == [0] else 'positive'
-
-
-
-
-
-
-
-
-
-
      review = st.text_input('Enter review')

      start1 = time.time()
-
-     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
-     config = AutoConfig.from_pretrained('distilbert-base-uncased', num_labels=2)
-
-     automodel = AutoModelForSequenceClassification.from_config(config)
      autotoken = transformers.AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')

      input_tokens = autotoken(
-         review,
-         return_tensors='pt',
-         padding=True,
          max_length=10
      )
      outputs = automodel(**input_tokens)
      st.write('Sentiment Predictions')
      st.write(f'\nBERT: {[automodel.config.id2label[i.item()] for i in outputs.logits.argmax(-1)]}')
@@ -159,20 +101,20 @@ def main():
      st.write(f'{(end1 - start1):.2f} sec')
      start2 = time.time()

-     st.write(f'LTSM: {predict_sentence(review, model)}')
      end2 = time.time()
      st.write(f'{(end2 - start2):.2f} sec')
-     # start3 = time.time()
-     # st.write(f'bag+log: {predictbag(review)}')
-     # end3 = time.time()
-     # st.write(f'{(end3 - start3):.2f} sec')
      start4 = time.time()
-     st.write(f'tfidf+log: {predicttf(review)}')
      end4 = time.time()
      st.write(f'{(end4 - start4):.2f} sec')


-

  if __name__ == '__main__':
-     main()
 
  import pandas as pd
  import streamlit as st
+ import torch
  import transformers
  import time
  import pickle
+ import numpy as np
+ from gensim.models import Word2Vec
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.linear_model import LogisticRegression
  import torch.nn as nn
  from data.rnn_preprocessing import (
+     data_preprocessing,
+     preprocess_single_string
+ )
+
+ # Load Word2Vec model
+ wv = Word2Vec.load('models/word2vec32.model')
+ # Reserve row 0 for the padding token so the embedding rows line up with the
+ # 1-based indices in vocab_to_int (index 0 is the padding index, as in the
+ # original embedding matrix).
+ embedding_matrix = np.vstack([np.zeros((1, wv.wv.vector_size)), wv.wv.vectors])
+ vocab_to_int = {word: idx + 1 for idx, word in enumerate(wv.wv.index_to_key)}
+
+ # Load TF-IDF model
+ tfidf_model = pickle.load(open('models/modeltfidf.sav', 'rb'))
+
+ # Load LSTM model
+ embedding_layer32 = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
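+ # Embedding.from_pretrained freezes the weights by default, which is fine here:
+ # the layer is only ever used for inference.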
+ VOCAB_SIZE, EMBEDDING_DIM = embedding_matrix.shape
+ HIDDEN_DIM = 64
+ SEQ_LEN = 32
+
+
+ class LSTMClassifierBi32(nn.Module):
+     def __init__(self, embedding_dim: int, hidden_size: int = 32) -> None:
+         super().__init__()
+
+         self.embedding_dim = embedding_dim
+         self.hidden_size = hidden_size
+         self.embedding = embedding_layer32
+         self.lstm = nn.LSTM(
+             input_size=self.embedding_dim,
+             hidden_size=self.hidden_size,
+             batch_first=True,
+             bidirectional=True
+         )
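+         # The LSTM is bidirectional, so each time step exposes the forward and
+         # backward hidden states concatenated: hidden_size * 2 features.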
+         self.clf = nn.Sequential(
+             nn.Linear(self.hidden_size * 2, 128),
+             nn.Dropout(),
+             nn.Sigmoid(),
+             nn.Linear(128, 64),
+             nn.Dropout(),
+             nn.Sigmoid(),
+             nn.Linear(64, 1)
+         )
+
+     def forward(self, x):
+         embeddings = self.embedding(x)
+         out, (_, _) = self.lstm(embeddings)
+         out = self.clf(out[:, -1, :])
+         return out
+
+
+ model = LSTMClassifierBi32(embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_DIM)
+ model.load_state_dict(torch.load('models/ltsm_bi1.pt'))
+ model.eval()
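+ # eval() switches off the Dropout layers so repeated predictions are deterministic.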
+
+
+ def predict_sentence(text: str, model: nn.Module):
+     tokens = preprocess_single_string(text, seq_len=SEQ_LEN, vocab_to_int=vocab_to_int)
+     result = model(tokens.unsqueeze(0)).sigmoid().round().item()
+     return 'negative' if result == 0.0 else 'positive'
+

  def main():
      df = pd.read_csv('data/imdb.csv')
      df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
      reviews = df['review'].tolist()
      preprocessed = [data_preprocessing(review) for review in reviews]

+     tfid_vectorizer = TfidfVectorizer(max_df=0.5, min_df=5)
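+     # NB: the vectorizer is re-fitted over the whole corpus on every Streamlit
+     # rerun; its vocabulary has to match the one the pickled model was trained on.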
      vect = tfid_vectorizer.fit(preprocessed)
      X_tfidf = vect.transform(preprocessed)
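+     # X_tfidf is not used further; the fit step is what matters, since it fixes
+     # the vocabulary that predicttf() relies on.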
+
      review = st.text_input('Enter review')

      start1 = time.time()
+
      autotoken = transformers.AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')

      input_tokens = autotoken(
+         review,
+         return_tensors='pt',
+         padding=True,
          max_length=10
      )
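+     # Note: max_length only takes effect together with truncation=True; with
+     # padding=True alone the tokenizer keeps the full review.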
+
+     # Load the fine-tuned SST-2 checkpoint: building the model from_config
+     # would leave the classification head randomly initialised on every run.
+     automodel = transformers.AutoModelForSequenceClassification.from_pretrained(
+         'distilbert-base-uncased-finetuned-sst-2-english'
+     )
      outputs = automodel(**input_tokens)
      st.write('Sentiment Predictions')
      st.write(f'\nBERT: {[automodel.config.id2label[i.item()] for i in outputs.logits.argmax(-1)]}')
 
      st.write(f'{(end1 - start1):.2f} sec')
      start2 = time.time()

+     st.write(f'LSTM: {predict_sentence(review, model)}')
      end2 = time.time()
      st.write(f'{(end2 - start2):.2f} sec')
+
      start4 = time.time()
+     st.write(f'TF-IDF+Logistic Regression: {predicttf(review, vect)}')
      end4 = time.time()
      st.write(f'{(end4 - start4):.2f} sec')


+ def predicttf(text, vect):
+     # vect (the fitted TfidfVectorizer) is passed in explicitly because it is
+     # local to main() and would otherwise raise a NameError here.
+     result = tfidf_model.predict(vect.transform([text]))
+     return 'negative' if result == [0] else 'positive'
+

  if __name__ == '__main__':
+     main()