jaynopponep committed
Commit c7b4b03 · Parent: 1f29cc4

Bringing back to original

Files changed (3)
  1. app.py +10 -12
  2. model.py +11 -39
  3. train.py +62 -13
app.py CHANGED
@@ -1,24 +1,22 @@
 from flask import Flask, render_template, request, jsonify
 import model  # Import your model module
+from transformers import BertTokenizer

 app = Flask(__name__)

-# Load the model and tokenizer
-loaded_model, tokenizer = model.get_model_and_tokenizer()
+# Load the model and tokenizer here
+loaded_model = model.get_model()
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

-@app.route('/', methods=['GET', 'POST'])
-def home():
-    if request.method == 'POST':
-        data = request.get_json(force=True)  # Safely extract JSON and handle parsing errors
-        user_input = data.get('text')
-        if user_input is None:
-            return jsonify({'error': 'No text provided'}), 400
-
+@app.route('/', methods=['GET', 'POST'])
+def home():
+    if request.method == 'POST':
+        data = request.json
+        user_input = data['text']
         # Use your model to classify the text
-        prediction = model.predict(loaded_model, tokenizer, user_input)
+        prediction = model.predict(loaded_model, user_input, tokenizer)
         return jsonify({'classification': prediction})
-
     return render_template('home.html')

 if __name__ == '__main__':
-    app.run(debug=True)  # Turn off debug in production
+    app.run()
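A quick way to exercise the reverted route: a minimal client sketch, not part of the commit, assuming the Flask dev server is running locally on its default port 5000.

import requests

# Hypothetical smoke test for the classify route (illustrative only)
resp = requests.post("http://127.0.0.1:5000/", json={"text": "Sample passage to classify."})
print(resp.json())  # e.g. {'classification': 'Human-written'}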
model.py CHANGED
@@ -1,44 +1,16 @@
-# model.py
-from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
-from sklearn.pipeline import Pipeline
-from sklearn.naive_bayes import MultinomialNB
+import torch
+from transformers import BertTokenizer, BertForSequenceClassification

-def train_model(x_train, y_train):
-    pipeline = Pipeline([
-        ('count_vectorizer', CountVectorizer()),
-        ('tfidf_transformer', TfidfTransformer()),
-        ('naive_bayes', MultinomialNB())
-    ])
-    pipeline.fit(x_train, y_train)
-    return pipeline
+def get_model():
+    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
+    return model

-def predict(model, text):
-    return model.predict([text])[0]
-
-# app.py
-from flask import Flask, request, jsonify
-import pandas as pd
-from model import train_model, predict
-
-app = Flask(__name__)
-
-# Assume your data is already loaded and preprocessed into x_train, y_train
-# For demonstration, let's assume these are loaded elsewhere and imported here
-# x_train, y_train would typically be lists or a Pandas Series
-
-# Load and train the model
-# This should be replaced by actual data loading and splitting
-x_train = ["your training data goes here"]
-y_train = ["your labels go here"]
-model = train_model(x_train, y_train)
-
-@app.route('/', methods=['POST'])
-def home():
-    data = request.get_json()
-    user_input = data['text']
-    prediction = predict(model, user_input)
-    result = "AI-generated" if prediction == 1 else "Human-written"
-    return jsonify({'classification': result})
-
-if __name__ == '__main__':
-    app.run(debug=True)
+# Predicting Function
+def predict(model, text, tokenizer):
+    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
+    outputs = model(**inputs)
+    predictions = torch.argmax(outputs.logits, dim=-1)
+    return "AI-generated" if predictions.item() == 1 else "Human-written"
train.py CHANGED
@@ -1,20 +1,69 @@
+from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
+import numpy as np
 import pandas as pd
+import json
+from datasets import Dataset
 from sklearn.model_selection import train_test_split
-import model
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support

-# Load and preprocess data
-data = pd.read_csv('data.csv')
-data['text'] = data['text'].apply(preprocess_text)  # Assuming preprocess_text is a function you've defined
+# Load dataset
+df = pd.read_csv("AI_Human.csv")
+train_df, eval_df = train_test_split(df, test_size=0.2)

-# Split the data
-x_train, x_test, y_train, y_test = train_test_split(data['text'], data['generated'], test_size=0.2, random_state=42)
+# Tokenizer
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

-# Get the model pipeline
-pipeline = model.create_pipeline()
+def tokenize_function(examples):
+    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

-# Train the model
-trained_model = model.train_model(pipeline, x_train, y_train)
+# Convert DataFrames to Datasets and apply tokenization
+train_dataset = Dataset.from_pandas(train_df)
+eval_dataset = Dataset.from_pandas(eval_df)
+
+train_dataset = train_dataset.map(tokenize_function, batched=True)
+train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+eval_dataset = eval_dataset.map(tokenize_function, batched=True)
+eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+# Model
+model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
+
+# Training Arguments
+training_args = TrainingArguments(
+    output_dir="./results",
+    num_train_epochs=3,
+    per_device_train_batch_size=8,
+    per_device_eval_batch_size=8,
+    warmup_steps=500,
+    weight_decay=0.01,
+    logging_dir='./logs',
+    evaluation_strategy="steps",
+    save_steps=500,
+    logging_steps=100,
+)
+
+def compute_metrics(pred):
+    labels = pred.label_ids
+    preds = np.argmax(pred.predictions, axis=-1)
+    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
+    acc = accuracy_score(labels, preds)
+    return {
+        'accuracy': acc,
+        'f1': f1,
+        'precision': precision,
+        'recall': recall
+    }

-# Optionally save the model
-import joblib
-joblib.dump(trained_model, 'text_classifier.pkl')
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+    compute_metrics=compute_metrics
+)
+
+trainer.train()
+model.save_pretrained("./trained_model")
+tokenizer.save_pretrained("./trained_model")
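After trainer.train() completes, the fine-tuned weights and tokenizer are written to ./trained_model. One caveat: Trainer expects a column named labels, and if the CSV stores its target as generated (as the replaced train.py's data['generated'] suggests), a rename such as df = df.rename(columns={'generated': 'labels'}) would be needed before set_format. Below is a hedged sketch, not part of the commit, of loading the saved checkpoint for inference; note that the committed get_model() in model.py reloads the base checkpoint rather than this directory.

import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the checkpoint saved by train.py (assumes training has already run)
model = BertForSequenceClassification.from_pretrained("./trained_model")
tokenizer = BertTokenizer.from_pretrained("./trained_model")
model.eval()  # disable dropout for inference

with torch.no_grad():
    inputs = tokenizer("Example text", padding=True, truncation=True, return_tensors="pt")
    pred = torch.argmax(model(**inputs).logits, dim=-1).item()
print("AI-generated" if pred == 1 else "Human-written")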