import streamlit as st
import numpy as np
import joblib
import re
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
# Maximum sequence length (must match the value used during training)
max_sequence_length = 186
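# A sketch (assumption; not from the actual training script) of how this
# constant is typically derived at training time:
#   train_sequences = tokenizer.texts_to_sequences(train_texts)
#   max_sequence_length = max(len(seq) for seq in train_sequences)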
# Bangla bad-word list used for masking
bad_words = ["মাদারচুদ", "বাইন চুদ", "জুকার", "ফালতু", "শালা", "লেংটা", "বাটপার", "নাস্তিকের বাচ্চা", "শুয়ার", "কুত্তা", "পুটকি", "নগ্নতায়", "সমকামি", "চুদছে", "চুদতে", "চুদা", "আবাল চোদা", "শুয়োরের বাচ্চা", "কুত্তার বাচ্চা", "হারামির বাচ্চা", "হারামজাদা", "শালার পো", "চুতমারানি", "চুদির ভাই", "হাউয়ার নাতি", "খানকি", "মারা", "হোগা", "খানকির পোলা", "চোদা", "মিয়া খলিফা", "জনি সিন্স", "মাগির পোলা", "মাগি", "মাগী", "পর্ণ", "গরুচোদা", "হিজরার", "হিজরা"]
# Load the model
loaded_model = load_model('cyberbullying_model.h5')
# Load the tokenizer and label encoder
loaded_tokenizer = joblib.load('tokenizer.pkl')
loaded_label_encoder = joblib.load('label_encoder.pkl')
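# The three artifacts above are assumed to have been saved at training time
# roughly like this (a sketch; the real training script may differ):
#   model.save('cyberbullying_model.h5')             # Keras HDF5 format
#   joblib.dump(tokenizer, 'tokenizer.pkl')
#   joblib.dump(label_encoder, 'label_encoder.pkl')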
# Mask any bad words in the text and return the masked text along with
# the list of bad words that were found (uses the word list, not the model)
def filter_bad_words(text):
    filtered_words = []
    # Check longer entries first so a short word (e.g. "খানকি") does not
    # partially mask a longer phrase that contains it ("খানকির পোলা")
    for bad_word in sorted(bad_words, key=len, reverse=True):
        # Case-insensitive pattern for the bad word
        pattern = re.compile(re.escape(bad_word), re.IGNORECASE)
        # Replace each occurrence with asterisks of the same length
        text, num_replacements = pattern.subn('*' * len(bad_word), text)
        if num_replacements > 0:
            filtered_words.append(bad_word)
    return text, filtered_words
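# Usage sketch (hypothetical input; assumes "ফালতু" is in bad_words):
#   masked, found = filter_bad_words("তুই একটা ফালতু")
#   # masked == "তুই একটা *****", found == ["ফালতু"]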
# Streamlit UI
st.title("Cyberbullying Detection App (Bangla)")
# Input text for prediction
input_text = st.text_area("Enter Text:")
if st.button("Predict"):
    if input_text:
        # Mask bad words using the word list
        filtered_text, filtered_bad_words = filter_bad_words(input_text)
        # Tokenize and pad the masked input text
        input_sequence = loaded_tokenizer.texts_to_sequences([filtered_text])
        input_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length)
        # Run the loaded model on the padded sequence
        predicted_probabilities = loaded_model.predict(input_sequence)
        # Take the class with the highest probability as the prediction
        predicted_class = np.argmax(predicted_probabilities)
        # Decode the class index back to the original label
        predicted_label = loaded_label_encoder.classes_[predicted_class]
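        # Illustrative only (hypothetical numbers): for a 3-class label
        # encoder, predict() returns something like [[0.05, 0.85, 0.10]],
        # argmax yields 1, and classes_[1] is the predicted label string.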
        st.subheader("Prediction Result:")
        if predicted_label == "not bully":
            st.write("Prediction: Not Cyberbullying")
            st.write("No bad words found.")
        else:
            st.write("Prediction: Cyberbullying")
            st.write(f"Cyberbullying Type: {predicted_label}")
            if filtered_bad_words:
                st.write(f"Bad Words: {', '.join(filtered_bad_words)}")
            else:
                st.write("No bad words found.")
        if filtered_bad_words:
            # filter_bad_words() already replaced each bad word with
            # asterisks, so the masked text can be shown directly
            st.write("Filtered Text:")
            st.write(filtered_text)
        else:
            st.write("Original Text:")
            st.write(input_text)
st.header("Sample Texts")
# Bullying example ("I thought you were a boy!!! Now I see it's a hijra?")
st.write("ভেবেছিলাম তুই একটা ছেলে!!! এখন দেখি এটা একটা হিজরা?")
# Benign example ("Every drama has already been watched a few times")
st.write("প্রতিটি নাটক কয়েকবার করে দেখা হয়ে গেছে")