import streamlit as st
import numpy as np
import joblib
import re
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
# Maximum sequence length (must match the value used during training)
max_sequence_length = 186
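# A sketch (assumption; not from the actual training script) of how this
# constant is typically derived at training time:
#   train_sequences = tokenizer.texts_to_sequences(train_texts)
#   max_sequence_length = max(len(seq) for seq in train_sequences)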
# Bangla bad-word list used for masking
bad_words = ["মাদারচুদ", "বাইন চুদ", "জুকার", "ফালতু", "শালা", "লেংটা", "বাটপার", "নাস্তিকের বাচ্চা", "শুয়ার", "কুত্তা", "পুটকি", "নগ্নতায়", "সমকামি", "চুদছে", "চুদতে", "চুদা", "আবাল চোদা", "শুয়োরের বাচ্চা", "কুত্তার বাচ্চা", "হারামির বাচ্চা", "হারামজাদা", "শালার পো", "চুতমারানি", "চুদির ভাই", "হাউয়ার নাতি", "খানকি", "মারা", "হোগা", "খানকির পোলা", "চোদা", "মিয়া খলিফা", "জনি সিন্স", "মাগির পোলা", "মাগি", "মাগী", "পর্ণ", "গরুচোদা", "হিজরার", "হিজরা"]
# Load the model
loaded_model = load_model('cyberbullying_model.h5')
# Load the tokenizer and label encoder
loaded_tokenizer = joblib.load('tokenizer.pkl')
loaded_label_encoder = joblib.load('label_encoder.pkl')
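# The three artifacts above are assumed to have been saved at training time
# roughly like this (a sketch; the real training script may differ):
#   model.save('cyberbullying_model.h5')             # Keras HDF5 format
#   joblib.dump(tokenizer, 'tokenizer.pkl')
#   joblib.dump(label_encoder, 'label_encoder.pkl')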
# Mask any bad words in the text and return the masked text along with
# the list of bad words that were found (uses the word list, not the model)
def filter_bad_words(text):
    filtered_words = []
    # Check longer entries first so a short word (e.g. "খানকি") does not
    # partially mask a longer phrase that contains it ("খানকির পোলা")
    for bad_word in sorted(bad_words, key=len, reverse=True):
        # Case-insensitive pattern for the bad word
        pattern = re.compile(re.escape(bad_word), re.IGNORECASE)
        # Replace each occurrence with asterisks of the same length
        text, num_replacements = pattern.subn('*' * len(bad_word), text)
        if num_replacements > 0:
            filtered_words.append(bad_word)
    return text, filtered_words
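# Usage sketch (hypothetical input; assumes "ফালতু" is in bad_words):
#   masked, found = filter_bad_words("তুই একটা ফালতু")
#   # masked == "তুই একটা *****", found == ["ফালতু"]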
# Streamlit UI
st.title("Cyberbullying Detection App (Bangla)")
# Input text for prediction
input_text = st.text_area("Enter Text:")
if st.button("Predict"):
    if input_text:
        # Mask bad words using the word list
        filtered_text, filtered_bad_words = filter_bad_words(input_text)
        # Tokenize and pad the masked input text
        input_sequence = loaded_tokenizer.texts_to_sequences([filtered_text])
        input_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length)
        # Run the loaded model on the padded sequence
        predicted_probabilities = loaded_model.predict(input_sequence)
        # Take the class with the highest probability as the prediction
        predicted_class = np.argmax(predicted_probabilities)
        # Decode the class index back to the original label
        predicted_label = loaded_label_encoder.classes_[predicted_class]
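        # Illustrative only (hypothetical numbers): for a 3-class label
        # encoder, predict() returns something like [[0.05, 0.85, 0.10]],
        # argmax yields 1, and classes_[1] is the predicted label string.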
        st.subheader("Prediction Result:")
        if predicted_label == "not bully":
            st.write("Prediction: Not Cyberbullying")
            st.write("No bad words found.")
        else:
            st.write("Prediction: Cyberbullying")
            st.write(f"Cyberbullying Type: {predicted_label}")
            if filtered_bad_words:
                st.write(f"Bad Words: {', '.join(filtered_bad_words)}")
            else:
                st.write("No bad words found.")
        if filtered_bad_words:
            # filter_bad_words() already replaced each bad word with
            # asterisks, so the masked text can be shown directly
            st.write("Filtered Text:")
            st.write(filtered_text)
        else:
            st.write("Original Text:")
            st.write(input_text)
st.header("Sample Texts")
# Bullying example ("I thought you were a boy!!! Now I see it's a hijra?")
st.write("ভেবেছিলাম তুই একটা ছেলে!!! এখন দেখি এটা একটা হিজরা?")
# Benign example ("Every drama has already been watched a few times")
st.write("প্রতিটি নাটক কয়েকবার করে দেখা হয়ে গেছে")