import re

import joblib
import numpy as np
import streamlit as st
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
# Maximum sequence length (same as what was used during training)
max_sequence_length = 186

# Bangla bad words to mask in the input text
bad_words = [
    "মাদারচুদ", "বাইন চুদ", "জুকার", "ফালতু", "শালা", "লেংটা", "বাটপার",
    "নাস্তিকের বাচ্চা", "শুয়ার", "কুত্তা", "পুটকি", "নগ্নতায়", "সমকামি",
    "চুদছে", "চুদতে", "চুদা", "আবাল চোদা", "শুয়োরের বাচ্চা", "কুত্তার বাচ্চা",
    "হারামির বাচ্চা", "হারামজাদা", "শালার পো", "চুতমারানি", "চুদির ভাই",
    "হাউয়ার নাতি", "খানকি", "মারা", "হোগা", "খানকির পোলা", "চোদা",
    "মিয়া খলিফা", "জনি সিন্স", "মাগির পোলা", "মাগি", "মাগী", "পর্ণ",
    "গরুচোদা", "হিজরার", "হিজরা",
]
# Load the trained model
loaded_model = load_model('cyberbullying_model.h5')

# Load the tokenizer and label encoder
loaded_tokenizer = joblib.load('tokenizer.pkl')
loaded_label_encoder = joblib.load('label_encoder.pkl')
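# Note: these artifacts are expected in the app's working directory and must
# come from the same training run, so that the tokenizer's word index, the
# label order, and max_sequence_length all match what the model was trained on.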
# Mask bad words in the text and return the filtered text plus the words found
def filter_bad_words_with_model(text):
    filtered_words = []
    for bad_word in bad_words:
        # Case-insensitive pattern for the bad word (escaped, so it matches literally)
        pattern = re.compile(re.escape(bad_word), re.IGNORECASE)
        # Replace every occurrence with asterisks of the same length
        text, num_replacements = pattern.subn('*' * len(bad_word), text)
        if num_replacements > 0:
            filtered_words.append(bad_word)
    return text, filtered_words
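# Example of the masking behavior (illustrative input, not taken from the app):
#   filter_bad_words_with_model("এই লোকটা একটা ফালতু")
#   -> ("এই লোকটা একটা *****", ["ফালতু"])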
# Streamlit UI
st.title("Cyberbullying Detection App (Bangla)")

# Input text for prediction
input_text = st.text_area("Enter Text:")
if st.button("Predict"):
    if input_text:
        # Mask bad words in the input text before feeding it to the model
        filtered_text, filtered_bad_words = filter_bad_words_with_model(input_text)

        # Tokenize and pad the filtered input text
        input_sequence = loaded_tokenizer.texts_to_sequences([filtered_text])
        input_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length)

        # Predict class probabilities and take the most likely class
        predicted_probabilities = loaded_model.predict(input_sequence)
        predicted_class = np.argmax(predicted_probabilities)

        # Decode the predicted class back to its original label
        predicted_label = loaded_label_encoder.classes_[predicted_class]
        st.subheader("Prediction Result:")
        if predicted_label == "not bully":
            st.write("Prediction: Not Cyberbullying")
            st.write("No bad words found.")
        else:
            st.write("Prediction: Cyberbullying")
            st.write(f"Cyberbullying Type: {predicted_label}")
            if filtered_bad_words:
                st.write(f"Bad Words: {', '.join(filtered_bad_words)}")
            else:
                st.write("<span style='color:cyan;'>No bad words found.</span>", unsafe_allow_html=True)
        if filtered_bad_words:
            st.write("Filtered Text:")
            # filtered_text already has the bad words replaced with asterisks
            st.write(f"<span style='color:red; font-weight:bold'>{filtered_text}</span>", unsafe_allow_html=True)
        else:
            st.write("Original Text:")
            st.write(input_text)
st.header("Sample Texts")
st.write("ভেবেছিলাম তুই একটা ছেলে!!! এখন দেখি এটা একটা" + "<span style='color:red; font-weight:bold'> হিজরা</span>?", unsafe_allow_html=True)
st.write("প্রতিটি নাটক কয়েকবার করে দেখা হয়ে গেছে")