File size: 4,785 Bytes
571bf3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pickle
import matplotlib.pyplot as plt

st.title(":blue[IMDB Dataset of 50k reviews]")


@st.cache_data
def load_data():
    """Read the IMDB reviews CSV that sits next to the app.

    The function is decorated with ``st.cache_data``, so Streamlit memoises
    the parsed result instead of re-reading the file on each call.

    Returns:
        The DataFrame produced by parsing ``IMDB Dataset.csv``.
    """
    dataset = pd.read_csv('IMDB Dataset.csv')
    return dataset
# Seed each session-state slot only when it is absent, so values stored by an
# earlier run of this script in the same session are left untouched.
# Factories (not shared literals) give every session its own fresh container.
_session_defaults = (
    ('models', dict),            # fitted classifiers, keyed by display name
    ('vectorizer', lambda: None),  # fitted TF-IDF vectoriser, None until trained
    ('accuracy', dict),          # accuracy score per model name
    ('report', dict),            # classification report per model name
)
for _slot, _make_default in _session_defaults:
    if _slot not in st.session_state:
        st.session_state[_slot] = _make_default()

# Dataset
st.header("Dataset")
df = load_data()
with st.expander("Show Data"):
    # Rendered before the label mapping below, so the expander shows the
    # original text labels rather than the 0/1 encoding.
    st.write(df)

# Encode the text labels as integers: positive -> 1, negative -> 0.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
X = df['review']
y = df['sentiment']
# A fixed random_state keeps the 80/20 split identical on every rerun, which
# the cached models rely on when they are evaluated against X_test / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=41
)

# NOTE: no TF-IDF fitting is done here. The vectoriser that is actually used
# for evaluation and prediction lives in st.session_state and is fitted once
# by the training step; fitting a second, unused vectoriser on the full
# training set on every rerun only added latency.


# Train once per session: an empty `models` dict in session state means no
# run in this session has finished training yet.
if not st.session_state.models:
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    # The test matrix is identical for every model, so build it once instead
    # of re-transforming X_test inside the loop.
    X_test_tfidf = vectorizer.transform(X_test)

    # models
    models = {
        # "SVM": SVC(kernel='linear'),
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Naive Bayes": MultinomialNB(),
    }

    # Collect results locally first; session state is only updated after
    # every model has been fitted and scored.
    accuracy = {}
    report = {}
    for name, model in models.items():
        model.fit(X_train_tfidf, y_train)
        y_pred = model.predict(X_test_tfidf)
        accuracy[name] = accuracy_score(y_test, y_pred)
        report[name] = classification_report(y_test, y_pred)

    # Publish everything together, with `models` last: it is the guard for
    # this block, so an interrupted run leaves it empty and training is
    # retried on the next rerun instead of being skipped half-done.
    st.session_state.vectorizer = vectorizer
    st.session_state.accuracy = accuracy
    st.session_state.report = report
    st.session_state.models = models

# Show the evaluation results once they exist in session state.
if st.session_state.accuracy:
    model_names = list(st.session_state.accuracy)
    scores = list(st.session_state.accuracy.values())

    # Draw on an explicit Figure/Axes rather than pyplot's implicit global
    # figure, and hand that Figure object to Streamlit.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(model_names, scores, color=['blue', 'orange', 'green'])
    ax.set_ylabel('Accuracy')
    ax.set_title('Model Accuracy Comparison')
    st.pyplot(fig)
    # Close the figure after rendering; otherwise each rerun of the script
    # would leave another open figure behind in pyplot's registry.
    plt.close(fig)

    for name, report in st.session_state.report.items():
        st.write(f"### Classification Report for {name}:")
        # The stored report is the plain-text string returned by
        # classification_report, not tabular data, so render it as
        # fixed-width text to keep its column alignment.
        st.text(report)

st.header("Manual Tryouts",divider='orange')
# Free-text review typed by the user.
user_input = st.text_area("Enter your Review", "")

if st.button("Predict"):
    # Strip first so that input consisting only of spaces or newlines is
    # treated as empty instead of being sent to the models.
    review_text = user_input.strip()
    if review_text:
        # Vectorise with the same fitted TF-IDF vocabulary the models were
        # trained on (kept in session state).
        user_input_tfidf = st.session_state.vectorizer.transform([review_text])

        # One prediction per stored model; label 1 maps to "Positive",
        # anything else to "Negative".
        predictions = {
            name: "Positive" if model.predict(user_input_tfidf)[0] == 1 else "Negative"
            for name, model in st.session_state.models.items()
        }

        # Display predictions for each model
        st.write("Predicted Sentiment:")
        for name, label in predictions.items():
            st.write(f"{name}: **{label}**")
    else:
        st.write("Please enter a review.")
# # Logistic Regression
# st.header('Logistic Regression',divider='orange')
# model = LogisticRegression()  
# model.fit(X_train_tfidf, y_train)  

# y_pred = model.predict(X_test_tfidf)  

# print("Accuracy:", accuracy_score(y_test, y_pred))  
# print(classification_report(y_test, y_pred))

# filename = 'linear_regression_model.pkl'
# with open(filename, 'wb') as model_file:
#     pickle.dump(model, model_file)

# st.write("Accuracy:", accuracy_score(y_test, y_pred))  
# st.markdown(body=classification_report(y_test, y_pred),unsafe_allow_html=True)  

# # Naive Bayes
# st.header("Naive Bayes",divider='orange')
# model_nb = MultinomialNB()  
# model_nb.fit(X_train_tfidf, y_train)  

# # Evaluate the model  
# y_pred = model_nb.predict(X_test_tfidf)  
# st.write("Accuracy:", accuracy_score(y_test, y_pred))  
# st.markdown(body=classification_report(y_test, y_pred),unsafe_allow_html=True)  

# # SVM
# st.header("Support Vector Machine")
# st.caption("Kernel type is linear.")
# model = SVC(kernel='linear')  # You can also try 'rbf', 'poly', etc.  
# model.fit(X_train_tfidf, y_train)  

# y_pred = model.predict(X_test_tfidf)  
# st.write("Accuracy:", accuracy_score(y_test, y_pred))  
# st.markdown(body=classification_report(y_test, y_pred),unsafe_allow_html=True)