Spaces:

jaynopponep
/

CTIIC-Plagiarism-Detector

Sleeping

App Files Files Community

jaynopponep committed on Apr 16, 2024

Commit

d96116f

1 Parent(s): 7cc941c

Deploying new flask, new sklearn based modeling

Browse files

Files changed (3) hide show

.idea/.name +1 -1
app.py +6 -9
model.py +42 -11

.idea/.name CHANGED Viewed

	@@ -1 +1 @@
1	- train.py


1	+ app.py

app.py CHANGED Viewed

@@ -1,24 +1,21 @@
 from flask import Flask, render_template, request, jsonify
 import model  # Import your model module
-from transformers import BertTokenizer
 app = Flask(__name__)
-# Load the model and tokenizer here
-loaded_model = model.get_model()
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
 @app.route('/', methods=['GET', 'POST'])
 def home():
     if request.method == 'POST':
         data = request.json
         user_input = data['text']
-        # Use your model to classify the text
-        prediction = model.predict(loaded_model, user_input, tokenizer)
         return jsonify({'classification': prediction})
     return render_template('home.html')
 if __name__ == '__main__':
-    app.run()

 from flask import Flask, render_template, request, jsonify
 import model  # Import your model module
 app = Flask(__name__)
+# Load data and train the model globally
+df = model.load_data()
+X_train, X_test, y_train, y_test = model.split_data(df)
+pipeline = model.create_pipeline(X_train, y_train)
 @app.route('/', methods=['GET', 'POST'])
 def home():
     if request.method == 'POST':
         data = request.json
         user_input = data['text']
+        prediction = model.predict_text(user_input, pipeline)
         return jsonify({'classification': prediction})
     return render_template('home.html')
 if __name__ == '__main__':
+    app.run(debug=True)

model.py CHANGED Viewed

@@ -1,15 +1,46 @@
-import torch
-from transformers import BertTokenizer, BertForSequenceClassification
-def get_model():
-    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
-    return model
-# Predicting Function
-def predict(model, text, tokenizer):
-    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
-    outputs = model(**inputs)
-    predictions = torch.argmax(outputs.logits, dim=-1)
-    return "AI-generated" if predictions.item() == 1 else "Human-written"

+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+from sklearn.pipeline import Pipeline
+from sklearn.naive_bayes import MultinomialNB
+# Function to remove unwanted tags from the text
+def remove_tags(text):
+    tags = ['\n', '\'']
+    for tag in tags:
+        text = text.replace(tag, '')
+    return text
+# Assuming the data is loaded into a DataFrame 'df' at some point
+def load_data():
+    # Dummy loading mechanism, replace with actual data loading
+    df = pd.read_csv('path_to_your_dataset.csv')
+    df['text'] = df['text'].apply(remove_tags)
+    return df
+def split_data(df):
+    y = df['generated']
+    X = df['text']
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
+    return X_train, X_test, y_train, y_test
+# Build and train the pipeline
+def create_pipeline(X_train, y_train):
+    pipeline = Pipeline([
+        ('count_vectorizer', CountVectorizer()),  # Step 1: Convert text to count vectors
+        ('tfidf_transformer', TfidfTransformer()),  # Step 2: Transform count vectors to TF-IDF
+        ('classifier', MultinomialNB())  # Step 3: Train a classifier, here using Naive Bayes
+    ])
+    pipeline.fit(X_train, y_train)
+    return pipeline
+# Function to predict new inputs using the trained pipeline
+def predict_text(text, pipeline):
+    return pipeline.predict([text])[0]  # Return the classification result
+# Main routine to train the model if this file is executed directly (for testing)
+if __name__ == "__main__":
+    df = load_data()
+    X_train, X_test, y_train, y_test = split_data(df)
+    pipeline = create_pipeline(X_train, y_train)
+    print(f"Model trained. Test accuracy: {pipeline.score(X_test, y_test)}")