sepp81 committed on
Commit
c01a3fa
·
verified ·
1 Parent(s): 1857248

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -0
app.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app: train and serve a human-vs-LLM text classifier.

Workflow shown to the user:

1. Upload a RoFT CSV file.
2. Build a ``text`` column (``prompt_body`` + ``gen_body``) and a binary
   ``label`` column (``true_boundary_index == 9`` -> 1, otherwise 0).
3. Fit a TF-IDF vectorizer and a random forest, report hold-out accuracy and
   persist both artifacts with joblib.
4. Classify free text typed by the user.

Streamlit re-executes this script from the top on every widget interaction,
so the trained artifacts are kept in ``st.session_state`` and only rebuilt
when the uploaded file's content changes.
"""
import hashlib
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Columns the preprocessing step reads from the uploaded CSV.
REQUIRED_COLUMNS = ("prompt_body", "gen_body", "true_boundary_index")
# On-disk locations of the persisted artifacts (same names as before).
MODEL_PATH = "text_classifier.pkl"
VECTORIZER_PATH = "vectorizer.pkl"
# st.session_state key under which the trained artifacts are cached.
_STATE_KEY = "trained_artifacts"


def _train_classifier(texts, labels):
    """Fit TF-IDF + random forest and return ``(model, vectorizer, accuracy)``.

    ``accuracy`` is measured on a 20% hold-out split (``random_state=42``).
    The TF-IDF matrix is left sparse: both ``train_test_split`` and
    ``RandomForestClassifier`` accept sparse input, which avoids allocating a
    dense ``n_rows x 5000`` array.

    Raises:
        ValueError: propagated from scikit-learn, e.g. when there are too few
            rows to split or the texts produce an empty vocabulary.
    """
    vectorizer = TfidfVectorizer(max_features=5000)
    features = vectorizer.fit_transform(texts)

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    accuracy = accuracy_score(y_test, model.predict(X_test))
    return model, vectorizer, accuracy


# Title and description of the app
st.title("Human vs LLM-Generated Text Differentiator")
st.write("This app predicts whether a given text is human-written or generated by a language model (LLM).")

# Step 1: Upload dataset
st.header("Step 1: Upload the RoFT Dataset")
uploaded_file = st.file_uploader("Upload your roft.csv file", type="csv")

if uploaded_file is not None:
    # Fingerprint the raw bytes so a different file (even with the same name)
    # triggers retraining, while a plain rerun does not.
    dataset_key = hashlib.sha256(uploaded_file.getvalue()).hexdigest()

    try:
        data = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as err:
        st.error(f"Could not read the uploaded CSV file: {err}")
        st.stop()
    st.write("Dataset Loaded Successfully!")

    # Display the first few rows of the dataset
    st.subheader("Sample of the Dataset:")
    st.dataframe(data.head())

    # Fail with a readable message instead of a KeyError traceback.
    missing_columns = [col for col in REQUIRED_COLUMNS if col not in data.columns]
    if missing_columns:
        st.error(
            "The uploaded file is missing required column(s): "
            + ", ".join(missing_columns)
        )
        st.stop()
    if data.empty:
        st.error("The uploaded file contains no rows to train on.")
        st.stop()

    # Step 2: Preprocessing
    st.header("Step 2: Preprocess the Data")

    # Combine prompt_body and gen_body to form the complete text. The cast to
    # str keeps the concatenation valid if pandas inferred a non-string dtype.
    data['text'] = (
        data['prompt_body'].fillna('').astype(str)
        + ' '
        + data['gen_body'].fillna('').astype(str)
    )
    # 1 = Human, 0 = LLM (rows whose boundary index is 9 are labelled human).
    data['label'] = (data['true_boundary_index'] == 9).astype(int)

    st.write("Data Preprocessing Complete!")

    # Show distribution of labels
    st.subheader("Label Distribution:")
    st.bar_chart(data['label'].value_counts())

    # Step 3: Feature extraction and training
    st.header("Step 3: Train the Model")
    st.write("Extracting features using TF-IDF and training a Random Forest classifier.")

    cached = st.session_state[_STATE_KEY] if _STATE_KEY in st.session_state else None
    if cached is None or cached["key"] != dataset_key:
        try:
            model, vectorizer, accuracy = _train_classifier(data['text'], data['label'])
        except ValueError as err:
            st.error(f"Training failed: {err}")
            st.stop()

        # Persist the artifacts so they survive beyond this session.
        joblib.dump(model, MODEL_PATH)
        joblib.dump(vectorizer, VECTORIZER_PATH)

        st.session_state[_STATE_KEY] = {
            "key": dataset_key,
            "model": model,
            "vectorizer": vectorizer,
            "accuracy": accuracy,
        }

    accuracy = st.session_state[_STATE_KEY]["accuracy"]
    st.write(f"Model Accuracy: {accuracy * 100:.2f}%")
    st.success("Model Trained and Saved Successfully!")

# Step 4: User input for prediction
st.header("Step 4: Predict Human vs LLM-Generated Text")

if _STATE_KEY in st.session_state:
    # Use the in-memory artifacts; no need to round-trip through pickle.
    model = st.session_state[_STATE_KEY]["model"]
    vectorizer = st.session_state[_STATE_KEY]["vectorizer"]
elif Path(MODEL_PATH).is_file() and Path(VECTORIZER_PATH).is_file():
    # NOTE(review): joblib.load unpickles arbitrary objects; these files are
    # expected to be the ones written by this app. Do not point these paths at
    # untrusted files.
    model = joblib.load(MODEL_PATH)
    vectorizer = joblib.load(VECTORIZER_PATH)
else:
    st.info("Upload a dataset above to train a model before predicting.")
    st.stop()

# Input text from the user
user_input = st.text_area("Enter the text you want to classify:")

if st.button("Predict"):
    if user_input.strip():
        # Vectorize the input text (kept sparse, as during training)
        input_vector = vectorizer.transform([user_input])

        # One predict_proba call yields both the predicted class and its
        # probability; predict() would recompute the same probabilities.
        probabilities = model.predict_proba(input_vector)[0]
        best_index = int(np.argmax(probabilities))
        prediction = model.classes_[best_index]
        confidence = probabilities[best_index] * 100

        if prediction == 1:
            st.success(f"The text is likely **Human-Written** with a confidence of {confidence:.2f}%.")
        else:
            st.warning(f"The text is likely **LLM-Generated** with a confidence of {confidence:.2f}%.")
    else:
        st.error("Please enter some text for prediction.")