import streamlit as st
import pandas as pd
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import os

# Fetch the NLTK corpora used by the text preprocessing step: the English
# stop-word list and the WordNet data used by the lemmatizer.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Page heading and one-line introduction.
page_title = "🎤 Lyric Artist Classifier"
page_intro = "Ever wondered who might have written a set of lyrics? This app predicts the artist based on lyrical patterns!"
st.title(page_title)
st.write(page_intro)

# Load datasets
@st.cache_data
def load_data():
    """Read the training and test song tables from CSV.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames read from
        ``training_songs.csv`` and ``test_songs.csv`` respectively.
    """
    csv_paths = ("training_songs.csv", "test_songs.csv")
    # The generator is consumed in order, so the training file is read first.
    training_frame, test_frame = (pd.read_csv(path) for path in csv_paths)
    return training_frame, test_frame

train_df, test_df = load_data()

# Both frames are indexed by 'Lyrics' and 'Artist' later in the script
# (preprocessing, training, evaluation), so check each one up front and stop
# with a readable message instead of failing later on a KeyError.
REQUIRED_COLUMNS = ('Lyrics', 'Artist')
for dataset_label, dataset in (("Training", train_df), ("Test", test_df)):
    if any(column not in dataset.columns for column in REQUIRED_COLUMNS):
        st.error(f"{dataset_label} dataset must contain 'Lyrics' and 'Artist' columns.")
        st.stop()

# Text preprocessing
# Matches every character that is neither a lowercase ASCII letter nor whitespace.
_NON_LETTER_PATTERN = re.compile(r'[^a-z\s]')
# Filled on first use so the stop-word list is read once, not once per lyric.
_TEXT_TOOLS = {}

def _get_text_tools():
    """Return the shared ``(lemmatizer, stop_words)`` pair, building it on first use."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['lemmatizer'] = WordNetLemmatizer()
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_TOOLS['lemmatizer'], _TEXT_TOOLS['stop_words']

def preprocess_text(text):
    """Normalize raw lyrics into a space-separated string of lemmatized tokens.

    The text is lower-cased, every character other than ``a-z`` and whitespace
    is removed, English stop words are dropped, and each remaining word is
    lemmatized.

    Args:
        text: The raw lyrics. Anything that is not a ``str`` (for example the
            NaN pandas uses for an empty CSV cell, or ``None``) is treated as
            having no lyrics.

    Returns:
        The cleaned text, or ``''`` when nothing usable remains.
    """
    # Missing values are not strings and have no .lower(); treat them as empty
    # rather than crashing the whole column transformation.
    if not isinstance(text, str):
        return ''
    words = _NON_LETTER_PATTERN.sub('', text.lower()).split()
    if not words:
        return ''
    lemmatizer, stop_words = _get_text_tools()
    return ' '.join(lemmatizer.lemmatize(word) for word in words if word not in stop_words)

@st.cache_data
def _preprocess_lyrics(lyrics):
    """Apply ``preprocess_text`` to every entry of a lyrics Series.

    Streamlit re-executes this script on every widget interaction; caching the
    result keeps the per-song cleaning from being repeated on each rerun while
    the input Series is unchanged.

    Args:
        lyrics: A pandas Series of raw lyric strings.

    Returns:
        A Series with the same index holding the cleaned lyrics.
    """
    return lyrics.apply(preprocess_text)

train_df['Lyrics'] = _preprocess_lyrics(train_df['Lyrics'])
test_df['Lyrics'] = _preprocess_lyrics(test_df['Lyrics'])

# Train model
@st.cache_resource
def train_model():
    """Fit the TF-IDF vectorizer and the artist classifier on the training lyrics.

    Lyrics are vectorized with TF-IDF (1- to 3-grams, at most 10,000
    features), the classes are oversampled with SMOTE, and an ``SGDClassifier``
    with log loss is tuned over ``alpha`` by 3-fold grid search. The fitted
    pair is also written to ``model.pkl``.

    Returns:
        A ``(vectorizer, best_model)`` tuple: the fitted ``TfidfVectorizer``
        and the best estimator found by the grid search.
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 3))
    X_train = vectorizer.fit_transform(train_df['Lyrics'])
    y_train = train_df['Artist']

    # SMOTE synthesizes samples from each point's k nearest same-class
    # neighbours, so k must be smaller than the smallest class. The default
    # k=5 raises for any artist with five or fewer songs; shrink k to fit, and
    # skip resampling entirely when some artist has a single song (no
    # neighbour exists to interpolate with).
    smallest_class_size = int(y_train.value_counts().min())
    if smallest_class_size > 1:
        smote = SMOTE(k_neighbors=min(5, smallest_class_size - 1))
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    else:
        X_train_resampled, y_train_resampled = X_train, y_train

    model = SGDClassifier(loss='log_loss', max_iter=1000)
    param_grid = {'alpha': [0.0001, 0.001, 0.01]}
    grid_search = GridSearchCV(model, param_grid, cv=3)
    grid_search.fit(X_train_resampled, y_train_resampled)
    best_model = grid_search.best_estimator_

    # Persist the pair so later runs can load it instead of retraining.
    joblib.dump((vectorizer, best_model), "model.pkl")
    return vectorizer, best_model

# Reuse the persisted (vectorizer, model) pair when one exists on disk;
# otherwise train from scratch.
# NOTE(review): joblib.load unpickles the file, so model.pkl must only ever
# come from a trusted source.
model_path = "model.pkl"
vectorizer, model = joblib.load(model_path) if os.path.exists(model_path) else train_model()

# Score the model on the test set; the results are displayed in the tabs.
X_test = vectorizer.transform(test_df['Lyrics'])
y_test = test_df['Artist']
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Tabs
tab_labels = ["Home", "Prediction", "Dataset", "Visualizations", "Model Performance"]
tabs = st.tabs(tab_labels)

# Home tab: short introduction plus the overall test accuracy.
with tabs[0]:
    st.header("Welcome to Lyric Artist Classifier!")
    for intro_line in (
        "This AI-powered app predicts the artist of a song based on its lyrics.",
        "The model has been trained on a dataset of various artists and uses text analysis techniques to make predictions.",
    ):
        st.write(intro_line)
    st.subheader("Model Performance")
    accuracy_text = f"Current Model Accuracy: **{accuracy:.2f}**"
    st.write(accuracy_text)
    st.write("While the model performs well, predictions might be less accurate for artists with fewer songs in the dataset.")

# Prediction tab: rank the most probable artists for user-supplied lyrics.
with tabs[1]:
    st.header("Predict the Artist!")
    lyrics_input = st.text_area("Enter Lyrics:", height=200)

    def predict_artist(lyrics):
        """Return up to three ``(artist, probability)`` pairs, most probable first."""
        cleaned = preprocess_text(lyrics)
        features = vectorizer.transform([cleaned])
        probabilities = model.predict_proba(features)[0]
        scored = list(zip(model.classes_, probabilities))
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:3]

    predict_clicked = st.button("Predict Artist")
    if predict_clicked and not lyrics_input.strip():
        # Whitespace-only input gives the model nothing to work with.
        st.warning("Please enter some lyrics!")
    elif predict_clicked:
        ranked = predict_artist(lyrics_input)
        st.success("Top Predictions:")
        for artist, prob in ranked:
            st.write(f"{artist}: {prob:.2f}")

# Dataset tab: browse the training table (lyrics appear in preprocessed form).
with tabs[2]:
    st.header("Sample Training Data")
    displayed_columns = ['Artist', 'Song', 'Lyrics']
    training_view = train_df[displayed_columns]
    st.dataframe(training_view, height=400)

# Visualizations tab: artist frequency bar chart and a lyrics word cloud.
with tabs[3]:
    st.header("Visualizations")

    st.subheader("Artist Distribution")
    fig, ax = plt.subplots(figsize=(8, 6))
    # Limit the chart to the 20 artists with the most training songs.
    top_artists = train_df['Artist'].value_counts().nlargest(20)
    sns.barplot(x=top_artists.values, y=top_artists.index, palette='coolwarm', ax=ax)
    ax.set_xlabel("Number of Songs")
    ax.set_ylabel("Artist")
    st.pyplot(fig)
    # pyplot keeps every figure it creates registered until it is closed; the
    # script reruns on each interaction, so close the figure once rendered to
    # avoid accumulating open figures.
    plt.close(fig)

    st.subheader("Word Cloud")
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(train_df['Lyrics']))
    fig, ax = plt.subplots()
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    st.pyplot(fig)
    plt.close(fig)

# Model Performance tab: per-class metrics for the test-set predictions.
with tabs[4]:
    st.header("Model Performance")
    st.subheader("Classification Report")
    report = classification_report(y_test, y_pred, output_dict=True)
    # Transpose so each label/average is a row and each metric a column.
    report_table = pd.DataFrame(report).T
    st.dataframe(report_table, height=400)