import re

import joblib
import numpy as np
import streamlit as st
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

# Maximum sequence length -- must match the value used during training.
max_sequence_length = 186

# Bangla profanities to mask in the input text (duplicates removed).
bad_words = [
    "মাদারচুদ", "বাইন চুদ", "জুকার", "ফালতু", "শালা", "লেংটা", "বাটপার",
    "নাস্তিকের বাচ্চা", "শুয়ার", "কুত্তা", "পুটকি", "নগ্নতায়", "সমকামি",
    "চুদছে", "চুদতে", "চুদা", "আবাল চোদা", "শুয়োরের বাচ্চা", "কুত্তার বাচ্চা",
    "হারামির বাচ্চা", "হারামজাদা", "শালার পো", "চুতমারানি", "চুদির ভাই",
    "হাউয়ার নাতি", "খানকি", "মারা", "হোগা", "খানকির পোলা", "চোদা",
    "মিয়া খলিফা", "জনি সিন্স", "মাগির পোলা", "মাগি", "মাগী", "পর্ণ",
    "গরুচোদা", "হিজরার", "হিজরা",
]

# Load the trained model and the fitted tokenizer / label encoder.
loaded_model = load_model('cyberbullying_model.h5')
loaded_tokenizer = joblib.load('tokenizer.pkl')
loaded_label_encoder = joblib.load('label_encoder.pkl')


def filter_bad_words(text):
    """Mask every bad word in `text` with asterisks of the same length.

    Returns the masked text and the list of bad words that were found.
    (This is a plain regex filter; the model itself is not involved.)
    """
    filtered_words = []
    for bad_word in bad_words:
        pattern = re.compile(re.escape(bad_word), re.IGNORECASE)
        text, num_replacements = pattern.subn('*' * len(bad_word), text)
        if num_replacements > 0:
            filtered_words.append(bad_word)
    return text, filtered_words


# Streamlit UI
st.title("Cyberbullying Detection App (Bangla)")

# Input text for prediction
input_text = st.text_area("Enter Text:")

if st.button("Predict"):
    if not input_text:
        st.warning("Please enter some text first.")
    else:
        # Mask bad words before feeding the text to the model.
        filtered_text, filtered_bad_words = filter_bad_words(input_text)

        # Tokenize and pad the filtered input text.
        input_sequence = loaded_tokenizer.texts_to_sequences([filtered_text])
        input_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length)

        # Predict class probabilities and take the most likely class.
        predicted_probabilities = loaded_model.predict(input_sequence)
        predicted_class = np.argmax(predicted_probabilities)

        # Map the class index back to its original label.
        predicted_label = loaded_label_encoder.classes_[predicted_class]

        st.subheader("Prediction Result:")
        if predicted_label == "not bully":
            st.write("Prediction: Not Cyberbullying")
        else:
            st.write("Prediction: Cyberbullying")
            st.write(f"Cyberbullying Type: {predicted_label}")

        if filtered_bad_words:
            st.write(f"Bad Words: {', '.join(filtered_bad_words)}")
        else:
            st.write("No bad words found.")

        # `filter_bad_words` already replaced every match with asterisks,
        # so `filtered_text` can be shown directly; no second
        # substitution pass is needed. Plain st.write also avoids
        # rendering raw user-supplied HTML.
        if filtered_bad_words:
            st.write("Filtered Text:")
            st.write(filtered_text)
        else:
            st.write("Original Text:")
            st.write(input_text)

st.header("Sample Texts")
st.write("ভেবেছিলাম তুই একটা ছেলে!!! এখন দেখি এটা একটা হিজরা?")
st.write("প্রতিটি নাটক কয়েকবার করে দেখা হয়ে গেছে")
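
# ---------------------------------------------------------------------------
# Reference: how the artifacts loaded above (cyberbullying_model.h5,
# tokenizer.pkl, label_encoder.pkl) are assumed to have been produced at
# training time. This is a minimal sketch, not the actual training script;
# it is kept commented out so it does not execute with the app, and
# `train_texts` / `train_labels` / `model` are hypothetical placeholders
# for the real training data and the fitted Keras model.
# ---------------------------------------------------------------------------
#
# from keras.preprocessing.text import Tokenizer
# from sklearn.preprocessing import LabelEncoder
#
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(train_texts)          # build the vocabulary
#
# label_encoder = LabelEncoder()
# encoded_labels = label_encoder.fit_transform(train_labels)
#
# # ... build and train the Keras model on sequences padded to
# # max_sequence_length (186), then persist all three artifacts ...
#
# model.save('cyberbullying_model.h5')         # loaded above via load_model
# joblib.dump(tokenizer, 'tokenizer.pkl')      # loaded above via joblib.load
# joblib.dump(label_encoder, 'label_encoder.pkl')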