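# NOTE: the lines below are page-header text captured from the hosting site
# together with this file; they are kept as comments so the module parses.
#   jaynopponep's picture
#   Commit message: "Adding new scikit based code!"
#   Commit: 2e23211
#   Page links: raw | history | blame
#   File size: 1.2 kB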
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
def remove_tags(text):
    """Return *text* with every newline and single-quote character removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* in which all ``'\\n'`` and ``"'"`` characters have
        been deleted; all other characters are left untouched.
    """
    # Strip newlines first, then apostrophes -- same order as the tag list
    # this function has always used.
    without_newlines = text.replace('\n', '')
    return without_newlines.replace('\'', '')
def load_data(filepath):
    """Read a CSV file and clean its ``'text'`` column.

    Args:
        filepath: Location of the CSV file, passed straight to
            ``pd.read_csv``. The file must contain a ``'text'`` column.

    Returns:
        The loaded DataFrame, with each entry of the ``'text'`` column
        replaced by the result of ``remove_tags`` on that entry.
    """
    frame = pd.read_csv(filepath)
    # Clean every entry of the text column element-wise.
    cleaned_text = frame['text'].apply(remove_tags)
    frame['text'] = cleaned_text
    return frame
def split_data(df, *, test_size=0.2, random_state=42):
    """Split the dataset into training and test portions.

    Args:
        df: DataFrame providing a ``'text'`` column (used as the features)
            and a ``'generated'`` column (used as the labels).
        test_size: Forwarded to ``train_test_split``. Defaults to ``0.2``,
            the value that used to be hard-coded here.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to ``42``, the previously hard-coded seed.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features = df['text']
    labels = df['generated']
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test
def create_pipeline(X_train, y_train):
    """Build the text-classification pipeline and fit it on the training data.

    The pipeline chains three named steps, in order:
    ``'count_vectorizer'`` (``CountVectorizer``),
    ``'tfidf_transformer'`` (``TfidfTransformer``) and
    ``'classifier'`` (``MultinomialNB``).

    Args:
        X_train: Training texts.
        y_train: Training labels aligned with *X_train*.

    Returns:
        The fitted ``Pipeline`` instance.
    """
    steps = [
        ('count_vectorizer', CountVectorizer()),
        ('tfidf_transformer', TfidfTransformer()),
        ('classifier', MultinomialNB()),
    ]
    model = Pipeline(steps)
    model.fit(X_train, y_train)
    return model
def predict_text(text, pipeline):
    """Classify a single piece of text with a fitted pipeline.

    Args:
        text: The raw text to classify; it is cleaned with ``remove_tags``
            before prediction.
        pipeline: An object exposing ``predict`` that accepts a list of
            strings (e.g. the result of ``create_pipeline``).

    Returns:
        ``"AI-generated"`` when the predicted label is truthy, otherwise
        ``"Human-written"``.
    """
    cleaned = remove_tags(text)
    # predict() works on batches, so wrap the single sample in a list and
    # take the first (only) result.
    labels = pipeline.predict([cleaned])
    # NOTE(review): relies on label truthiness -- presumably the labels are
    # 0/1; confirm against the training data.
    if labels[0]:
        return "AI-generated"
    return "Human-written"