sepp81 committed on
Commit
ddf19db
·
verified ·
1 Parent(s): b1fc271

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -88
app.py CHANGED
@@ -1,90 +1,75 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import numpy as np
4
- from sklearn.feature_extraction.text import TfidfVectorizer
5
- from sklearn.model_selection import train_test_split
6
- from sklearn.ensemble import RandomForestClassifier
7
- from sklearn.metrics import accuracy_score
8
- import joblib
9
-
10
- # Title and Description of the App
11
- st.title("Human vs LLM-Generated Text Differentiator")
12
- st.write("This app predicts whether a given text is human-written or generated by a language model (LLM).")
13
-
14
- # Step 1: Upload Dataset
15
- st.header("Step 1: Upload the RoFT Dataset")
16
- uploaded_file = st.file_uploader("Upload your roft.csv file", type="csv")
17
-
18
- if uploaded_file is not None:
19
- # Load dataset
20
- data = pd.read_csv(uploaded_file)
21
- st.write("Dataset Loaded Successfully!")
22
-
23
- # Display the first few rows of the dataset
24
- st.subheader("Sample of the Dataset:")
25
- st.dataframe(data.head())
26
-
27
- # Preprocessing the data
28
- st.header("Step 2: Preprocess the Data")
29
-
30
- # Combine prompt_body and gen_body to form the complete text
31
- data['text'] = data['prompt_body'].fillna('') + ' ' + data['gen_body'].fillna('')
32
- data['label'] = data['true_boundary_index'].apply(lambda x: 1 if x == 9 else 0) # 1 = Human, 0 = LLM
33
-
34
- st.write("Data Preprocessing Complete!")
35
-
36
- # Show distribution of labels
37
- st.subheader("Label Distribution:")
38
- st.bar_chart(data['label'].value_counts())
39
-
40
- # Feature Extraction
41
- st.header("Step 3: Train the Model")
42
- st.write("Extracting features using TF-IDF and training a Random Forest classifier.")
43
-
44
- # TF-IDF Vectorization
45
- vectorizer = TfidfVectorizer(max_features=5000)
46
- X = vectorizer.fit_transform(data['text']).toarray()
47
- y = data['label']
48
-
49
- # Train-Test Split
50
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
51
-
52
- # Train a Random Forest Classifier
53
- model = RandomForestClassifier(n_estimators=100, random_state=42)
54
- model.fit(X_train, y_train)
55
-
56
- # Evaluate the model
57
- y_pred = model.predict(X_test)
58
- accuracy = accuracy_score(y_test, y_pred)
59
- st.write(f"Model Accuracy: {accuracy * 100:.2f}%")
60
-
61
- # Save the model and vectorizer
62
- joblib.dump(model, 'text_classifier.pkl')
63
- joblib.dump(vectorizer, 'vectorizer.pkl')
64
- st.success("Model Trained and Saved Successfully!")
65
-
66
- # Step 4: User Input for Prediction
67
- st.header("Step 4: Predict Human vs LLM-Generated Text")
68
-
69
- # Load the trained model and vectorizer
70
- model = joblib.load('text_classifier.pkl')
71
- vectorizer = joblib.load('vectorizer.pkl')
72
-
73
- # Input text from the user
74
- user_input = st.text_area("Enter the text you want to classify:")
75
-
76
- if st.button("Predict"):
77
- if user_input.strip():
78
- # Vectorize the input text
79
- input_vector = vectorizer.transform([user_input]).toarray()
80
-
81
- # Predict and show the result
82
- prediction = model.predict(input_vector)
83
- confidence = model.predict_proba(input_vector).max() * 100
84
-
85
- if prediction[0] == 1:
86
- st.success(f"The text is likely **Human-Written** with a confidence of {confidence:.2f}%.")
87
- else:
88
- st.warning(f"The text is likely **LLM-Generated** with a confidence of {confidence:.2f}%.")
89
- else:
90
- st.error("Please enter some text for prediction.")
 
1
  import streamlit as st
2
  import pandas as pd
3
+ import random
4
+
5
# Load the quiz dataset; 'roft.csv' is resolved relative to the working
# directory the app is launched from.
df = pd.read_csv('roft.csv')

# Seed the per-session game state. Each key is only written when it is not
# already present, so values survive Streamlit's script reruns.
for _state_key, _initial_value in (
    ('score', 0),
    ('index', 0),
    ('game_over', False),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
15
+
16
+ # Function to get next text from the dataset
17
def get_next_text():
    """Return the current quiz item as ``(text, model, true_boundary_index)``.

    Reads the row of ``df`` at position ``st.session_state.index`` and joins
    its ``prompt_body`` and ``gen_body`` columns with a ``" _SEP_ "`` marker.

    Returns:
        A 3-tuple ``(full_text, model, true_boundary_index)``. When the game
        is over, or the index has run past the last row of ``df``, every
        element is ``None`` so that callers can always unpack three values
        and simply test ``text`` for truthiness.
    """
    no_item = (None, None, None)

    # Returning a bare None here would make the caller's three-way tuple
    # unpacking raise TypeError, so hand back an all-None triple instead.
    if st.session_state.game_over:
        return no_item

    position = st.session_state.index
    # Guard against an index left past the end of the dataset.
    if position >= len(df):
        return no_item

    row = df.iloc[position]

    # Empty CSV cells are read by pandas as NaN (a float); treat them as
    # empty strings so the concatenation below cannot raise TypeError.
    prompt_text = "" if pd.isna(row['prompt_body']) else str(row['prompt_body'])
    gen_text = "" if pd.isna(row['gen_body']) else str(row['gen_body'])

    # Prompt and generated continuation, separated by an explicit marker.
    full_text = prompt_text + " _SEP_ " + gen_text
    return full_text, row['model'], row['true_boundary_index']
31
+
32
+ # Function to update the game state
33
def update_game_state(user_answer, correct_answer):
    """Apply the outcome of one submitted answer to the session state.

    A wrong answer ends the game immediately. A right answer adds 5 points,
    advances to the next row, and ends the game with a success message once
    the index reaches the end of ``df``.
    """
    state = st.session_state

    # Wrong answer: stop the game and report it; nothing else changes.
    if user_answer != correct_answer:
        state.game_over = True
        st.error("Game Over! Incorrect Answer.")
        return

    # Right answer: reward and move on to the next item.
    state.score += 5
    state.index += 1

    # No rows left means the player has finished the whole quiz.
    if state.index >= len(df):
        state.game_over = True
        st.success("Congratulations! You've completed the quiz.")
44
+
45
# Display the score
st.sidebar.text(f"Score: {st.session_state.score}")

# Fetch the current item. get_next_text() may return a falsy value when there
# is nothing left to show; fall back to an all-None triple so the unpacking
# below cannot raise TypeError and the game-over branch stays reachable.
text, model_used, true_boundary_index = get_next_text() or (None, None, None)

if text:
    # Display the text
    st.write(f"Model used: {model_used}")
    st.write(f"Text: {text}")

    # User input (radio buttons for classification)
    user_answer = st.radio("Classify the text as:", ["human", "machine"])

    # When user submits answer
    if st.button("Submit"):
        # NOTE(review): the label rule treats boundary index 0 as "human".
        # The previous version of this app treated index 9 as human-written;
        # confirm which convention the dataset actually uses.
        correct_answer = "human" if true_boundary_index == 0 else "machine"
        update_game_state(user_answer, correct_answer)
else:
    # Game over message
    st.write("Game Over!")
    st.write(f"Your final score is: {st.session_state.score}")

# Option to restart the game
if st.session_state.game_over:
    if st.button("Restart Game"):
        st.session_state.score = 0
        st.session_state.index = 0
        st.session_state.game_over = False
        # st.experimental_rerun is deprecated in favour of st.rerun; prefer
        # the new name and only fall back on older Streamlit releases.
        rerun = getattr(st, "rerun", None) or st.experimental_rerun
        rerun()