DevBM committed on
Commit
569edb3
·
verified ·
1 Parent(s): f713c7c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -117
app.py CHANGED
@@ -1,118 +1,118 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import numpy as np
4
- import matplotlib.pyplot as plt
5
- from sklearn.feature_extraction.text import TfidfVectorizer
6
- from sklearn.model_selection import train_test_split
7
- from sklearn.svm import SVC
8
- from sklearn.linear_model import LogisticRegression
9
- from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
10
- from sklearn.metrics import accuracy_score, classification_report
11
- from transformers import BertTokenizer, BertForSequenceClassification
12
- import torch
13
-
14
- @st.cache_data
15
- def load_data():
16
- return pd.read_csv('IMDB Dataset.csv')
17
-
18
- if 'models' not in st.session_state:
19
- st.session_state.models = {}
20
- if 'reports' not in st.session_state:
21
- st.session_state.reports = {}
22
- if 'accuracy' not in st.session_state:
23
- st.session_state.accuracy = {}
24
-
25
- df = load_data()
26
-
27
- df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
28
-
29
- X = df['review']
30
- y = df['sentiment']
31
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
32
-
33
- if not st.session_state.models:
34
- vectorizer = TfidfVectorizer()
35
- X_train_tfidf = vectorizer.fit_transform(X_train)
36
-
37
- # models
38
- models = {
39
- "SVM": SVC(kernel='linear'),
40
- "Logistic Regression": LogisticRegression(max_iter=1000),
41
- "Random Forest": RandomForestClassifier(n_estimators=10),
42
- "Gradient Boosting": GradientBoostingClassifier()
43
- }
44
-
45
- for name, model in models.items():
46
- model.fit(X_train_tfidf, y_train)
47
- st.session_state.models[name] = model
48
- X_test_tfidf = vectorizer.transform(X_test)
49
- y_pred = model.predict(X_test_tfidf)
50
- st.session_state.accuracy[name] = accuracy_score(y_test, y_pred)
51
- report = classification_report(y_test, y_pred, output_dict=True)
52
- st.session_state.reports[name] = pd.DataFrame(report).transpose()
53
-
54
- st.session_state.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
55
- st.session_state.bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
56
-
57
- train_encodings = st.session_state.bert_tokenizer(list(X_train), truncation=True, padding=True, return_tensors='pt')
58
- train_labels = torch.tensor(y_train.values)
59
-
60
- train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
61
-
62
- training_args = torch.optim.AdamW(st.session_state.bert_model.parameters(), lr=1e-5)
63
- st.session_state.bert_model.train()
64
-
65
- for epoch in range(1):
66
- for batch in train_dataset:
67
- inputs = batch[0], batch[1]
68
- labels = batch[2]
69
- outputs = st.session_state.bert_model(*inputs, labels=labels)
70
- loss = outputs.loss
71
- loss.backward()
72
- training_args.step()
73
- training_args.zero_grad()
74
-
75
- st.session_state.bert_model.eval()
76
- test_encodings = st.session_state.bert_tokenizer(list(X_test), truncation=True, padding=True, return_tensors='pt')
77
- with torch.no_grad():
78
- outputs = st.session_state.bert_model(test_encodings['input_ids'], test_encodings['attention_mask'])
79
- predictions = torch.argmax(outputs.logits, dim=1).numpy()
80
- st.session_state.accuracy["BERT"] = accuracy_score(y_test, predictions)
81
- report = classification_report(y_test, predictions, output_dict=True)
82
- st.session_state.reports["BERT"] = pd.DataFrame(report).transpose()
83
-
84
- if st.session_state.accuracy:
85
-
86
- plt.figure(figsize=(10, 5))
87
- plt.bar(st.session_state.accuracy.keys(), st.session_state.accuracy.values(), color=['blue', 'orange', 'green','red', 'purple'])
88
- plt.ylabel('Accuracy')
89
- plt.title('Model Accuracy Comparison')
90
- st.pyplot(plt)
91
-
92
- for name, report_df in st.session_state.reports.items():
93
- st.header(f"{name}",divider='orange')
94
- st.dataframe(report_df)
95
-
96
- st.header("Manual Tryouts")
97
- user_input = st.text_area("Review", "")
98
-
99
- if st.button("Predict"):
100
- if user_input:
101
- user_input_tfidf = vectorizer.transform([user_input])
102
-
103
- predictions = {}
104
- for name, model in st.session_state.models.items():
105
- prediction = model.predict(user_input_tfidf)
106
- predictions[name] = "Positive" if prediction[0] == 1 else "Negative"
107
-
108
- inputs = st.session_state.bert_tokenizer(user_input, return_tensors='pt', truncation=True, padding=True)
109
- with torch.no_grad():
110
- output = st.session_state.bert_model(inputs['input_ids'], inputs['attention_mask'])
111
- bert_prediction = torch.argmax(output.logits, dim=1).item()
112
- predictions["BERT"] = "Positive" if bert_prediction == 1 else "Negative"
113
-
114
- st.write("Predicted Sentiment:")
115
- for name in predictions:
116
- st.write(f"{name}: **{predictions[name]}**")
117
- else:
118
  st.write("Please enter a review.")
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ from sklearn.feature_extraction.text import TfidfVectorizer
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.svm import SVC
8
+ from sklearn.linear_model import LogisticRegression
9
+ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
10
+ from sklearn.metrics import accuracy_score, classification_report
11
+ from transformers import BertTokenizer, BertForSequenceClassification
12
+ import torch
13
+
14
+ @st.cache_data
15
+ def load_data():
16
+ return pd.read_csv('IMDB Dataset.csv')
17
+
18
+ if 'models' not in st.session_state:
19
+ st.session_state.models = {}
20
+ if 'reports' not in st.session_state:
21
+ st.session_state.reports = {}
22
+ if 'accuracy' not in st.session_state:
23
+ st.session_state.accuracy = {}
24
+
25
+ df = load_data()
26
+
27
+ df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
28
+
29
+ X = df['review']
30
+ y = df['sentiment']
31
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
32
+
33
+ if not st.session_state.models:
34
+ vectorizer = TfidfVectorizer()
35
+ X_train_tfidf = vectorizer.fit_transform(X_train)
36
+
37
+ # models
38
+ models = {
39
+ # "SVM": SVC(kernel='linear'),
40
+ "Logistic Regression": LogisticRegression(max_iter=1000),
41
+ # "Random Forest": RandomForestClassifier(n_estimators=10),
42
+ # "Gradient Boosting": GradientBoostingClassifier()
43
+ }
44
+
45
+ for name, model in models.items():
46
+ model.fit(X_train_tfidf, y_train)
47
+ st.session_state.models[name] = model
48
+ X_test_tfidf = vectorizer.transform(X_test)
49
+ y_pred = model.predict(X_test_tfidf)
50
+ st.session_state.accuracy[name] = accuracy_score(y_test, y_pred)
51
+ report = classification_report(y_test, y_pred, output_dict=True)
52
+ st.session_state.reports[name] = pd.DataFrame(report).transpose()
53
+
54
+ # st.session_state.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
55
+ # st.session_state.bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
56
+
57
+ # train_encodings = st.session_state.bert_tokenizer(list(X_train), truncation=True, padding=True, return_tensors='pt')
58
+ # train_labels = torch.tensor(y_train.values)
59
+
60
+ # train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
61
+
62
+ # training_args = torch.optim.AdamW(st.session_state.bert_model.parameters(), lr=1e-5)
63
+ # st.session_state.bert_model.train()
64
+
65
+ # for epoch in range(1):
66
+ # for batch in train_dataset:
67
+ # inputs = batch[0], batch[1]
68
+ # labels = batch[2]
69
+ # outputs = st.session_state.bert_model(*inputs, labels=labels)
70
+ # loss = outputs.loss
71
+ # loss.backward()
72
+ # training_args.step()
73
+ # training_args.zero_grad()
74
+
75
+ # st.session_state.bert_model.eval()
76
+ # test_encodings = st.session_state.bert_tokenizer(list(X_test), truncation=True, padding=True, return_tensors='pt')
77
+ # with torch.no_grad():
78
+ # outputs = st.session_state.bert_model(test_encodings['input_ids'], test_encodings['attention_mask'])
79
+ # predictions = torch.argmax(outputs.logits, dim=1).numpy()
80
+ # st.session_state.accuracy["BERT"] = accuracy_score(y_test, predictions)
81
+ # report = classification_report(y_test, predictions, output_dict=True)
82
+ # st.session_state.reports["BERT"] = pd.DataFrame(report).transpose()
83
+
84
+ if st.session_state.accuracy:
85
+
86
+ plt.figure(figsize=(10, 5))
87
+ plt.bar(st.session_state.accuracy.keys(), st.session_state.accuracy.values(), color=['blue', 'orange', 'green','red', 'purple'])
88
+ plt.ylabel('Accuracy')
89
+ plt.title('Model Accuracy Comparison')
90
+ st.pyplot(plt)
91
+
92
+ for name, report_df in st.session_state.reports.items():
93
+ st.header(f"{name}",divider='orange')
94
+ st.dataframe(report_df)
95
+
96
+ st.header("Manual Tryouts")
97
+ user_input = st.text_area("Review", "")
98
+
99
+ if st.button("Predict"):
100
+ if user_input:
101
+ user_input_tfidf = vectorizer.transform([user_input])
102
+
103
+ predictions = {}
104
+ for name, model in st.session_state.models.items():
105
+ prediction = model.predict(user_input_tfidf)
106
+ predictions[name] = "Positive" if prediction[0] == 1 else "Negative"
107
+
108
+ # inputs = st.session_state.bert_tokenizer(user_input, return_tensors='pt', truncation=True, padding=True)
109
+ # with torch.no_grad():
110
+ # output = st.session_state.bert_model(inputs['input_ids'], inputs['attention_mask'])
111
+ # bert_prediction = torch.argmax(output.logits, dim=1).item()
112
+ # predictions["BERT"] = "Positive" if bert_prediction == 1 else "Negative"
113
+
114
+ st.write("Predicted Sentiment:")
115
+ for name in predictions:
116
+ st.write(f"{name}: **{predictions[name]}**")
117
+ else:
118
  st.write("Please enter a review.")