jaynopponep committed on
Commit
d96116f
·
1 Parent(s): 7cc941c

Deploying new flask, new sklearn based modeling

Browse files
Files changed (3) hide show
  1. .idea/.name +1 -1
  2. app.py +6 -9
  3. model.py +42 -11
.idea/.name CHANGED
@@ -1 +1 @@
1
- train.py
 
1
+ app.py
app.py CHANGED
@@ -1,24 +1,21 @@
1
from flask import Flask, render_template, request, jsonify
import model  # Import your model module
from transformers import BertTokenizer

app = Flask(__name__)

# Load the model and tokenizer here, once at import time, so every request
# reuses the same objects.
loaded_model = model.get_model()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


@app.route('/', methods=['GET', 'POST'])
def home():
    """Serve the home page (GET) or classify submitted text (POST).

    A POST must carry a JSON object with a string ``text`` field. The reply is
    ``{"classification": <label>}`` on success, or a 400 response with an
    ``error`` message when the body is not the expected JSON.
    """
    if request.method == 'POST':
        # silent=True yields None instead of raising on a missing or malformed
        # JSON body, so the bad-request case is handled in one place below.
        data = request.get_json(silent=True)
        if not isinstance(data, dict) or not isinstance(data.get('text'), str):
            return jsonify({'error': "Request body must be JSON with a string 'text' field."}), 400
        user_input = data['text']
        # Use your model to classify the text
        prediction = model.predict(loaded_model, user_input, tokenizer)
        return jsonify({'classification': prediction})
    return render_template('home.html')


if __name__ == '__main__':
    app.run()
 
1
import os

from flask import Flask, render_template, request, jsonify
import model  # Import your model module

app = Flask(__name__)

# Load data and train the model globally, once at import time, so every
# request reuses the same fitted pipeline.
df = model.load_data()
X_train, X_test, y_train, y_test = model.split_data(df)
pipeline = model.create_pipeline(X_train, y_train)


@app.route('/', methods=['GET', 'POST'])
def home():
    """Serve the home page (GET) or classify submitted text (POST).

    A POST must carry a JSON object with a string ``text`` field. The reply is
    ``{"classification": <label>}`` on success, or a 400 response with an
    ``error`` message when the body is not the expected JSON.
    """
    if request.method == 'POST':
        # silent=True yields None instead of raising on a missing or malformed
        # JSON body, so the bad-request case is handled in one place below.
        data = request.get_json(silent=True)
        if not isinstance(data, dict) or not isinstance(data.get('text'), str):
            return jsonify({'error': "Request body must be JSON with a string 'text' field."}), 400
        prediction = model.predict_text(data['text'], pipeline)
        # A scikit-learn prediction may come back as a NumPy scalar, which
        # jsonify cannot serialize; unwrap it to the native Python value.
        if hasattr(prediction, 'item'):
            prediction = prediction.item()
        return jsonify({'classification': prediction})
    return render_template('home.html')


if __name__ == '__main__':
    # Debug mode enables the interactive debugger, which can execute arbitrary
    # code; keep it off unless explicitly requested via FLASK_DEBUG=1.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1')
model.py CHANGED
@@ -1,15 +1,46 @@
1
- import torch
2
- from transformers import BertTokenizer, BertForSequenceClassification
 
 
 
3
 
 
 
 
 
 
 
4
 
5
def get_model():
    """Return a BERT sequence classifier configured with two labels.

    The weights come from the 'bert-base-uncased' checkpoint.

    NOTE(review): presumably the two-label classification head is not part of
    that base checkpoint and starts untrained -- confirm whether a fine-tuned
    checkpoint should be loaded instead.
    """
    return BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
 
 
 
8
 
 
 
 
 
 
9
 
10
# Predicting Function
def predict(model, text, tokenizer):
    """Classify ``text`` as AI-generated or human-written.

    Args:
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``logits`` tensor.
        text: The input string to classify.
        tokenizer: Callable that converts ``text`` into a mapping of tensors.

    Returns:
        "AI-generated" when the highest-scoring class index is 1, otherwise
        "Human-written".
    """
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    # Inference only: disable autograd so no computation graph is built and
    # kept alive for every request.
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
    return "AI-generated" if predictions.item() == 1 else "Human-written"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
4
+ from sklearn.pipeline import Pipeline
5
+ from sklearn.naive_bayes import MultinomialNB
6
 
7
# Function to remove unwanted tags from the text
def remove_tags(text):
    """Return ``text`` with every newline and single-quote character removed."""
    # Mapping a character to None in a translation table deletes it, so both
    # characters are stripped in one pass.
    deletions = str.maketrans({'\n': None, '\'': None})
    return text.translate(deletions)
13
 
14
# Load the dataset into a DataFrame and clean its text column
def load_data(path='path_to_your_dataset.csv'):
    """Read the dataset CSV and clean its ``text`` column.

    Args:
        path: Location of the CSV file. The default is the placeholder path
            this function used to hard-code; pass the real dataset path.

    Returns:
        A DataFrame in which rows with a missing ``text`` value are dropped
        and every remaining ``text`` value has been passed through
        ``remove_tags``.

    Raises:
        KeyError: If the CSV has no ``text`` column.
    """
    df = pd.read_csv(path)
    # Empty cells are read as float NaN, which has no .replace() and would
    # crash remove_tags; drop those rows and coerce the rest to str.
    df = df.dropna(subset=['text']).copy()
    df['text'] = df['text'].astype(str).apply(remove_tags)
    return df
20
 
21
def split_data(df):
    """Split ``df`` into training and test portions.

    The ``text`` column supplies the features and the ``generated`` column the
    labels. 30% of the rows go to the test set, and the shuffle is seeded with
    42 so the split is reproducible.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    labels = df['generated']
    features = df['text']
    return tuple(train_test_split(features, labels, test_size=0.3, random_state=42))
26
 
27
# Build and train the pipeline
def create_pipeline(X_train, y_train):
    """Build the text-classification pipeline and fit it on the training data.

    The stages run in order: token counts, TF-IDF weighting, then a
    multinomial Naive Bayes classifier.

    Returns:
        The fitted ``Pipeline``.
    """
    stages = [
        ('count_vectorizer', CountVectorizer()),      # text -> count vectors
        ('tfidf_transformer', TfidfTransformer()),    # counts -> TF-IDF
        ('classifier', MultinomialNB()),              # Naive Bayes classifier
    ]
    # fit() returns the pipeline itself, so it can be returned directly.
    return Pipeline(stages).fit(X_train, y_train)
36
+
37
# Function to predict new inputs using the trained pipeline
def predict_text(text, pipeline):
    """Classify a single text with a fitted pipeline.

    Args:
        text: The string to classify.
        pipeline: A fitted object whose ``predict`` accepts a list of texts
            and returns one label per text.

    Returns:
        The predicted label for ``text`` as a native Python value, so callers
        can JSON-serialize it directly.
    """
    label = pipeline.predict([text])[0]
    # Array-based predictors return NumPy scalars (e.g. numpy.int64), which
    # the json module rejects; .item() unwraps them to plain Python values.
    return label.item() if hasattr(label, 'item') else label
40
+
41
# Main routine to train the model if this file is executed directly (for testing)
if __name__ == "__main__":
    # Load, split and fit, then report accuracy on the held-out test split.
    df = load_data()
    X_train, X_test, y_train, y_test = split_data(df)
    pipeline = create_pipeline(X_train, y_train)
    accuracy = pipeline.score(X_test, y_test)
    print(f"Model trained. Test accuracy: {accuracy}")