jaynopponep's picture
Deploying new flask, new sklearn based modeling
d96116f
raw
history blame
1.79 kB
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
# Function to strip newline and apostrophe characters from the text
def remove_tags(text):
    """Return *text* with all newline and apostrophe characters removed.

    Args:
        text: The string to clean.

    Returns:
        A new string equal to ``text`` minus every newline and every
        apostrophe; all other characters are kept in order.
    """
    # Both unwanted "tags" are single characters, so a translation table
    # that maps each one to None deletes them in a single pass.
    deletions = str.maketrans({'\n': None, '\'': None})
    return text.translate(deletions)
# Load the dataset from a CSV file into a DataFrame and clean its 'text' column
def load_data(path='path_to_your_dataset.csv'):
    """Load the dataset from a CSV file and clean its ``text`` column.

    Args:
        path: Location of the CSV file, handed straight to ``pd.read_csv``.
            Defaults to the placeholder path that used to be hard-coded, so
            existing ``load_data()`` calls behave exactly as before.

    Returns:
        The loaded DataFrame, with ``remove_tags`` applied to every value
        of its ``text`` column.

    Raises:
        KeyError: If the CSV has no ``text`` column.
    """
    df = pd.read_csv(path)
    # NOTE(review): remove_tags calls a string method on each value, so a
    # non-string cell (e.g. NaN from an empty field) would raise here --
    # confirm the dataset has no missing text.
    df['text'] = df['text'].apply(remove_tags)
    return df
def split_data(df, *, test_size=0.3, random_state=42):
    """Split the dataset into training and test portions.

    Args:
        df: DataFrame with a ``text`` column (model input) and a
            ``generated`` column (target label).
        test_size: Forwarded to ``train_test_split``. Defaults to ``0.3``,
            the value that was previously hard-coded.
        random_state: Forwarded to ``train_test_split`` as the shuffling
            seed. Defaults to ``42``, the value that was previously
            hard-coded, so the default split is unchanged.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        KeyError: If ``df`` lacks the ``text`` or ``generated`` column.
    """
    labels = df['generated']
    features = df['text']
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test
# Build and train the pipeline
def create_pipeline(X_train, y_train):
    """Assemble the text-classification pipeline and fit it.

    The pipeline chains three steps: a ``CountVectorizer`` (text to token
    counts), a ``TfidfTransformer`` (counts to TF-IDF weights) and a
    ``MultinomialNB`` classifier.

    Args:
        X_train: Training texts.
        y_train: Training labels matching ``X_train``.

    Returns:
        The ``Pipeline`` after it has been fitted on the training data.
    """
    steps = [
        ('count_vectorizer', CountVectorizer()),
        ('tfidf_transformer', TfidfTransformer()),
        ('classifier', MultinomialNB()),
    ]
    model = Pipeline(steps)
    model.fit(X_train, y_train)
    return model
# Function to predict new inputs using the trained pipeline
def predict_text(text, pipeline):
    """Classify a single text with an already-fitted pipeline.

    Args:
        text: The raw input string to classify.
        pipeline: A fitted object exposing ``predict`` over a list of texts
            (for example the result of ``create_pipeline``).

    Returns:
        The predicted label for ``text`` (the first and only element of the
        prediction made on a one-item batch).
    """
    # load_data() runs the training text through remove_tags, so the same
    # cleaning is applied here; otherwise the vectorizer would tokenise
    # inference input differently from the text it was fitted on.
    # remove_tags is idempotent, so pre-cleaned input is unaffected.
    cleaned = remove_tags(text)
    # predict() works on batches: wrap the text in a list, unwrap the result.
    return pipeline.predict([cleaned])[0]
# Main routine to train the model if this file is executed directly (for testing)
if __name__ == "__main__":
    # Smoke test when run as a script: load and split the dataset, fit the
    # pipeline on the training part, then report accuracy on the held-out part.
    X_train, X_test, y_train, y_test = split_data(load_data())
    pipeline = create_pipeline(X_train, y_train)
    print(f"Model trained. Test accuracy: {pipeline.score(X_test, y_test)}")