jaynopponep committed
Commit c7b4b03 · Parent: 1f29cc4

Bringing back to original

Files changed (3)
  1. app.py +10 -12
  2. model.py +11 -39
  3. train.py +62 -13
app.py CHANGED
@@ -1,24 +1,22 @@
 from flask import Flask, render_template, request, jsonify
 import model  # Import your model module
+from transformers import BertTokenizer

 app = Flask(__name__)

-# Load the model and tokenizer
-loaded_model, tokenizer = model.get_model_and_tokenizer()
+# Load the model and tokenizer here
+loaded_model = model.get_model()
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

-@app.route('/', methods=['GET', 'POST'])
-def home():
-    if request.method == 'POST':
-        data = request.get_json(force=True)  # Safely extract JSON and handle parsing errors
-        user_input = data.get('text')
-        if user_input is None:
-            return jsonify({'error': 'No text provided'}), 400
-
+@app.route('/', methods=['GET', 'POST'])
+def home():
+    if request.method == 'POST':
+        data = request.json
+        user_input = data['text']
         # Use your model to classify the text
-        prediction = model.predict(loaded_model, tokenizer, user_input)
+        prediction = model.predict(loaded_model, user_input, tokenizer)
         return jsonify({'classification': prediction})
-
     return render_template('home.html')

 if __name__ == '__main__':
-    app.run(debug=True)  # Turn off debug in production
+    app.run()
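A quick way to exercise the reverted route: a minimal client sketch, not part of the commit, assuming the Flask dev server is running locally on its default port 5000.

import requests

# Hypothetical smoke test for the classify route (illustrative only)
resp = requests.post("http://127.0.0.1:5000/", json={"text": "Sample passage to classify."})
print(resp.json())  # e.g. {'classification': 'Human-written'}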
model.py CHANGED
@@ -1,44 +1,16 @@
-# model.py
-from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
-from sklearn.pipeline import Pipeline
-from sklearn.naive_bayes import MultinomialNB
+import torch
+from transformers import BertTokenizer, BertForSequenceClassification

-def train_model(x_train, y_train):
-    pipeline = Pipeline([
-        ('count_vectorizer', CountVectorizer()),
-        ('tfidf_transformer', TfidfTransformer()),
-        ('naive_bayes', MultinomialNB())
-    ])
-    pipeline.fit(x_train, y_train)
-    return pipeline
+def get_model():
+    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
+    return model

-def predict(model, text):
-    return model.predict([text])[0]
-
-# app.py
-from flask import Flask, request, jsonify
-import pandas as pd
-from model import train_model, predict
-
-app = Flask(__name__)
-
-# Assume your data is already loaded and preprocessed into x_train, y_train
-# For demonstration, let's assume these are loaded elsewhere and imported here
-# x_train, y_train would typically be lists or a Pandas Series
-
-# Load and train the model
-# This should be replaced by actual data loading and splitting
-x_train = ["your training data goes here"]
-y_train = ["your labels go here"]
-model = train_model(x_train, y_train)
-
-@app.route('/', methods=['POST'])
-def home():
-    data = request.get_json()
-    user_input = data['text']
-    prediction = predict(model, user_input)
-    result = "AI-generated" if prediction == 1 else "Human-written"
-    return jsonify({'classification': result})
-
-if __name__ == '__main__':
-    app.run(debug=True)
+# Predicting Function
+def predict(model, text, tokenizer):
+    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
+    outputs = model(**inputs)
+    predictions = torch.argmax(outputs.logits, dim=-1)
+    return "AI-generated" if predictions.item() == 1 else "Human-written"
train.py CHANGED
@@ -1,20 +1,69 @@
+from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
+import numpy as np
 import pandas as pd
+import json
+from datasets import Dataset
 from sklearn.model_selection import train_test_split
-import model
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support

-# Load and preprocess data
-data = pd.read_csv('data.csv')
-data['text'] = data['text'].apply(preprocess_text)  # Assuming preprocess_text is a function you've defined
+# Load dataset
+df = pd.read_csv("AI_Human.csv")
+train_df, eval_df = train_test_split(df, test_size=0.2)

-# Split the data
-x_train, x_test, y_train, y_test = train_test_split(data['text'], data['generated'], test_size=0.2, random_state=42)
+# Tokenizer
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

-# Get the model pipeline
-pipeline = model.create_pipeline()
+def tokenize_function(examples):
+    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

-# Train the model
-trained_model = model.train_model(pipeline, x_train, y_train)
+# Convert DataFrames to Datasets and apply tokenization
+train_dataset = Dataset.from_pandas(train_df)
+eval_dataset = Dataset.from_pandas(eval_df)
+
+train_dataset = train_dataset.map(tokenize_function, batched=True)
+train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+eval_dataset = eval_dataset.map(tokenize_function, batched=True)
+eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+# Model
+model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
+
+# Training Arguments
+training_args = TrainingArguments(
+    output_dir="./results",
+    num_train_epochs=3,
+    per_device_train_batch_size=8,
+    per_device_eval_batch_size=8,
+    warmup_steps=500,
+    weight_decay=0.01,
+    logging_dir='./logs',
+    evaluation_strategy="steps",
+    save_steps=500,
+    logging_steps=100,
+)
+
+def compute_metrics(pred):
+    labels = pred.label_ids
+    preds = np.argmax(pred.predictions, axis=-1)
+    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
+    acc = accuracy_score(labels, preds)
+    return {
+        'accuracy': acc,
+        'f1': f1,
+        'precision': precision,
+        'recall': recall
+    }

-# Optionally save the model
-import joblib
-joblib.dump(trained_model, 'text_classifier.pkl')
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+    compute_metrics=compute_metrics
+)
+
+trainer.train()
+model.save_pretrained("./trained_model")
+tokenizer.save_pretrained("./trained_model")
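After trainer.train() completes, the fine-tuned weights and tokenizer are written to ./trained_model. One caveat: Trainer expects a column named labels, and if the CSV stores its target as generated (as the replaced train.py's data['generated'] suggests), a rename such as df = df.rename(columns={'generated': 'labels'}) would be needed before set_format. Below is a hedged sketch, not part of the commit, of loading the saved checkpoint for inference; note that the committed get_model() in model.py reloads the base checkpoint rather than this directory.

import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the checkpoint saved by train.py (assumes training has already run)
model = BertForSequenceClassification.from_pretrained("./trained_model")
tokenizer = BertTokenizer.from_pretrained("./trained_model")
model.eval()  # disable dropout for inference

with torch.no_grad():
    inputs = tokenizer("Example text", padding=True, truncation=True, return_tensors="pt")
    pred = torch.argmax(model(**inputs).logits, dim=-1).item()
print("AI-generated" if pred == 1 else "Human-written")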