Spaces:

jaynopponep
/

CTIIC-Plagiarism-Detector

Sleeping

App Files Community

jaynopponep committed on Apr 16, 2024

Commit

2e23211

1 Parent(s): 154a557

Adding new scikit based code!

Browse files

Files changed (2) hide show

app.py +3 -3
model.py +11 -19

app.py CHANGED Viewed

@@ -1,10 +1,10 @@
 from flask import Flask, render_template, request, jsonify
-import model  # Import your model module
 app = Flask(__name__)
 # Load data and train the model globally
-df = model.load_data()
 X_train, X_test, y_train, y_test = model.split_data(df)
 pipeline = model.create_pipeline(X_train, y_train)
@@ -18,4 +18,4 @@ def home():
     return render_template('home.html')
 if __name__ == '__main__':
-    app.run(debug=True)

 from flask import Flask, render_template, request, jsonify
+import model
 app = Flask(__name__)
 # Load data and train the model globally: these statements sit at module
 # level, so they run once when this file is imported or executed, and the
 # fitted pipeline is kept in a module-level name.
 # NOTE(review): 'path_to_AI_Human.csv' looks like a placeholder -- confirm
 # the real dataset location before relying on this.
+df = model.load_data('path_to_AI_Human.csv')  # Make sure this path is correct
 # NOTE(review): X_test and y_test are not used in the lines visible here.
 X_train, X_test, y_train, y_test = model.split_data(df)
 pipeline = model.create_pipeline(X_train, y_train)
     return render_template('home.html')
 # Start the Flask server only when this file is executed as a script.
 # NOTE(review): debug=True turns on Flask's debug mode -- confirm this is
 # not what a publicly reachable deployment runs with.
 if __name__ == '__main__':
+    app.run(debug=True)

model.py CHANGED Viewed

@@ -1,46 +1,38 @@
 import pandas as pd
 from sklearn.model_selection import train_test_split
-from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.pipeline import Pipeline
 from sklearn.naive_bayes import MultinomialNB
-# Function to remove unwanted tags from the text
 def remove_tags(text):
     tags = ['\n', '\'']
     for tag in tags:
         text = text.replace(tag, '')
     return text
-# Assuming the data is loaded into a DataFrame 'df' at some point
-def load_data():
-    # Dummy loading mechanism, replace with actual data loading
-    df = pd.read_csv("AI_Human.csv")
     df['text'] = df['text'].apply(remove_tags)
     return df
 def split_data(df):
     y = df['generated']
     X = df['text']
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
     return X_train, X_test, y_train, y_test
-# Build and train the pipeline
 def create_pipeline(X_train, y_train):
     pipeline = Pipeline([
-        ('count_vectorizer', CountVectorizer()),  # Step 1: Convert text to count vectors
-        ('tfidf_transformer', TfidfTransformer()),  # Step 2: Transform count vectors to TF-IDF
-        ('classifier', MultinomialNB())  # Step 3: Train a classifier, here using Naive Bayes
     ])
     pipeline.fit(X_train, y_train)
     return pipeline
-# Function to predict new inputs using the trained pipeline
 def predict_text(text, pipeline):
-    return pipeline.predict([text])[0]  # Return the classification result
-# Main routine to train the model if this file is executed directly (for testing)
-#if __name__ == "__main__":
- #   df = load_data()
- #   X_train, X_test, y_train, y_test = split_data(df)
- #   pipeline = create_pipeline(X_train, y_train)
- #   print(f"Model trained. Test accuracy: {pipeline.score(X_test, y_test)}")

 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.pipeline import Pipeline
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.naive_bayes import MultinomialNB
+from sklearn.metrics import accuracy_score, classification_report
 def remove_tags(text):
     """Return ``text`` with every newline and apostrophe character deleted.

     Nothing is inserted in place of a removed character, so words that were
     separated only by a newline end up joined together.
     """
     # A single pass with a deletion table; the third ``maketrans`` argument
     # lists the characters to drop.
     return text.translate(str.maketrans('', '', "\n'"))
+def load_data(filepath):
+    df = pd.read_csv(filepath)
     df['text'] = df['text'].apply(remove_tags)
     return df
 def split_data(df, *, test_size=0.2, random_state=42):
     """Split the dataset into training and test portions.

     Args:
         df: Table with a ``text`` column (model input) and a ``generated``
             column (target label).
         test_size: Forwarded to ``train_test_split``. Defaults to 0.2, the
             value this function used to hard-code.
         random_state: Seed forwarded to ``train_test_split``. Defaults to
             42, the value this function used to hard-code.

     Returns:
         The tuple ``(X_train, X_test, y_train, y_test)``.

     Raises:
         KeyError: If ``df`` lacks the ``text`` or ``generated`` column.
     """
     X = df['text']
     y = df['generated']
     X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=test_size, random_state=random_state
     )
     return X_train, X_test, y_train, y_test
 def create_pipeline(X_train, y_train):
     """Build the text-classification pipeline and fit it to the training data.

     The steps run in this order: ``CountVectorizer``, then
     ``TfidfTransformer``, then a ``MultinomialNB`` classifier.

     Returns the fitted ``Pipeline``.
     """
     steps = [
         ('count_vectorizer', CountVectorizer()),
         ('tfidf_transformer', TfidfTransformer()),
         ('classifier', MultinomialNB()),
     ]
     text_clf = Pipeline(steps)
     text_clf.fit(X_train, y_train)
     return text_clf
 def predict_text(text, pipeline):
     """Classify one piece of text and return a readable label.

     The text is cleaned with ``remove_tags`` and passed to
     ``pipeline.predict`` as a one-element list. Returns "AI-generated" when
     the predicted label is truthy, otherwise "Human-written".
     """
     cleaned = remove_tags(text)
     label = pipeline.predict([cleaned])[0]
     if label:
         return "AI-generated"
     return "Human-written"