Spaces:

jaifar
/

Smart-Detection-System-of-AI-Generated-Text-Models

Runtime error

Smart-Detection-System-of-AI-Generated-Text-Models / app.py

jaifar530

Update app.py

3b7bbcd unverified almost 2 years ago

6.56 kB

	import streamlit as st
	import zipfile
	import os
	import requests
	from keras.models import load_model
	from keras.preprocessing.text import Tokenizer
	from keras.preprocessing.sequence import pad_sequences
	from sklearn.preprocessing import LabelEncoder
	import pickle
	import numpy as np


	# Custom headers for the HTTP request
	headers = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
	}



	# Check if the model folder exists
	zip_file_path = "my_authorship_model_zip.zip"
	if not os.path.exists('my_authorship_model'):
	try:
	# Download the model
	model_url = 'https://jaifar.net/ADS/my_authorship_model_zip.zip'
	r = requests.get(model_url, headers=headers)
	r.raise_for_status()

	# Debugging: Check if download is successful by examining content length
	st.write(f"Downloaded model size: {len(r.content)} bytes")

	# Save the downloaded content
	with open(zip_file_path, "wb") as f:
	f.write(r.content)

	# Debugging: Verify that the zip file exists
	if os.path.exists(zip_file_path):
	st.write("Zip file exists")

	# Extract the model using zipfile
	with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
	zip_ref.extractall('my_authorship_model')

	# Debugging: Check if the folder is successfully created
	if os.path.exists('my_authorship_model'):
	st.write("Model folder successfully extracted using zipfile")
	# Debugging: List the directory contents after extraction
	st.write("Listing directory contents:")
	st.write(os.listdir('.'))
	else:
	st.write("Model folder was not extracted successfully using zipfile")
	exit(1)

	else:
	st.write("Zip file does not exist")
	exit(1)
	except Exception as e:
	st.write(f"Failed to download or extract the model: {e}")
	exit(1)
	else:
	st.write("System Ready !!")


	# Download the required files
	file_urls = {
	'tokenizer.pkl': 'https://jaifar.net/ADS/tokenizer.pkl',
	'label_encoder.pkl': 'https://jaifar.net/ADS/label_encoder.pkl'
	}

	for filename, url in file_urls.items():
	try:
	r = requests.get(url, headers=headers)
	r.raise_for_status()
	with open(filename, 'wb') as f:
	f.write(r.content)
	except Exception as e:
	st.write(f"Failed to download {filename}: {e}")
	exit(1)

	# Load the saved model
	loaded_model = load_model("my_authorship_model")

	# Load the saved tokenizer and label encoder
	with open('tokenizer.pkl', 'rb') as handle:
	tokenizer = pickle.load(handle)

	with open('label_encoder.pkl', 'rb') as handle:
	label_encoder = pickle.load(handle)

	max_length = 300 # As defined in the training code

	# Function to predict author for new text
	def predict_author(new_text, model, tokenizer, label_encoder):
	sequence = tokenizer.texts_to_sequences([new_text])
	padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
	prediction = model.predict(padded_sequence)

	predicted_label = label_encoder.inverse_transform([prediction.argmax()])[0]
	probabilities = prediction[0]
	author_probabilities = {}
	for idx, prob in enumerate(probabilities):
	author = label_encoder.inverse_transform([idx])[0]
	author_probabilities[author] = prob

	return predicted_label, author_probabilities

	new_text = st.text_area("Input your text here")

	# Creates a button named 'Press me'
	press_me_button = st.button("Writer or Robot?")

	if press_me_button:
	predicted_author, author_probabilities = predict_author(new_text, loaded_model, tokenizer, label_encoder)
	sorted_probabilities = sorted(author_probabilities.items(), key=lambda x: x[1], reverse=True)

	st.write(f"The text is most likely written by: {predicted_author}")
	st.write("Probabilities for each author are (sorted):")
	for author, prob in sorted_probabilities:
	st.write(f"{author}: {prob * 100:.2f}%")

	import streamlit as st

	st.title("Smart Authorship Detection System of AI-Generated Text Models")

	# Slogan under the title
	st.subheader("Uncover the Invisible Ink: Who's the Author?")

	# Using expander to make FAQ sections
	st.subheader("Frequently Asked Questions (FAQ)")

	# Small Description
	with st.expander("What is this project about?"):
	st.write("""
	This project is part of an MSc in Data Analytics at the University of Portsmouth.
	Developed by Jaifar Al Shizawi, it aims to identify whether a text is written by a human or a specific Large Language Model (LLM) like ChatGPT-3, ChatGPT-4, Google Bard, or HuggingChat.
	For inquiries, contact [[email protected]](mailto:[email protected]).
	Supervised by Dr. Mohamed Bader.
	""")

	# System Details
	with st.expander("How does the system work?"):
	st.write("""
	The system is trained using a CNN model on a dataset of 140,546 paragraphs, varying in length from 10 to 500 words.
	It achieves an accuracy of 0.9964 with a validation loss of 0.094.
	""")

	# Data Storage Information
	with st.expander("Does the system store my data?"):
	st.write("No, the system does not collect or store any user input data.")

	# Use-case Limitation
	with st.expander("Can I use this as evidence?"):
	st.write("""
	No, this system is a Proof of Concept (POC) and should not be used as evidence against students or similar entities.
	""")

	# Background and Context
	with st.expander("Background and Context"):
	st.write("""
	The proliferation of AI and Large Language Models (LLMs) like ChatGPT and Google Bard has raised questions about authorship. This project aims to analyze and predict the unique features of these models compared to human-written text.
	""")

	# Problem Statement
	with st.expander("Problem Statement"):
	st.write("""
	Most AI authorship detection systems fail to identify the specific LLM behind a text. This research aims to fill this gap, offering detailed analysis on various LLMs and human writing.
	""")

	# Aim and Objectives
	with st.expander("Aim and Objectives"):
	st.write("""
	The project aims to help staff at the University of Portsmouth distinguish between student-written artifacts and those generated by LLMs. It focuses on text feature extraction, model testing, and implementing a user-friendly dashboard among other objectives.
	""")