|
import pandas as pd |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.pipeline import Pipeline |
|
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer |
|
from sklearn.naive_bayes import MultinomialNB |
|
from sklearn.metrics import accuracy_score, classification_report |
|
|
|
|
|
# Translation table that deletes every character remove_tags strips:
# the newline and the single quote.
_TAG_DELETIONS = str.maketrans('', '', "\n'")


def remove_tags(text):
    """Return *text* with newline and single-quote characters removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without any ``'\\n'`` or ``"'"`` characters.
        All other characters are left untouched.
    """
    # One C-level pass instead of one .replace() call per character.
    return text.translate(_TAG_DELETIONS)
|
|
|
|
|
def load_data(filepath):
    """Read a CSV file and clean its ``text`` column.

    Args:
        filepath: Path (or anything ``pandas.read_csv`` accepts) of the CSV
            file. The file must contain a ``text`` column.

    Returns:
        The loaded DataFrame, with every ``text`` value passed through
        ``remove_tags``. Missing ``text`` values become empty strings.

    Raises:
        KeyError: If the file has no ``text`` column.
    """
    df = pd.read_csv(filepath)

    def _clean(value):
        # read_csv parses empty cells as NaN (a float) and may infer a numeric
        # dtype; remove_tags needs a str, so normalise before cleaning.
        if pd.isna(value):
            return ''
        return remove_tags(str(value))

    df['text'] = df['text'].apply(_clean)
    return df
|
|
|
|
|
def split_data(df, *, test_size=0.2, random_state=42):
    """Split a DataFrame into train and test features and labels.

    Args:
        df: DataFrame with a ``text`` column (features) and a ``generated``
            column (labels).
        test_size: Passed to ``train_test_split``; defaults to 0.2.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible; defaults to 42.

    Returns:
        A tuple ``(x_train, x_test, y_train, y_test)``.

    Raises:
        KeyError: If ``df`` lacks the ``text`` or ``generated`` column.
    """
    features = df['text']
    labels = df['generated']
    x_train, x_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    return x_train, x_test, y_train, y_test
|
|
|
|
|
def create_pipeline(x_train, y_train):
    """Build the text-classification pipeline and fit it on the training data.

    The pipeline chains three steps: ``CountVectorizer``, then
    ``TfidfTransformer``, then a ``MultinomialNB`` classifier.

    Args:
        x_train: Training documents, passed to ``Pipeline.fit`` as X.
        y_train: Training labels, passed to ``Pipeline.fit`` as y.

    Returns:
        The fitted ``Pipeline`` instance.
    """
    # Step names are part of the pipeline's public surface (e.g. for
    # named_steps lookups), so they are kept exactly as they were.
    steps = [
        ('count_vectorizer', CountVectorizer()),
        ('tfidf_transformer', TfidfTransformer()),
        ('classifier', MultinomialNB()),
    ]
    model = Pipeline(steps)
    model.fit(x_train, y_train)
    return model
|
|
|
|
|
def predict_text(text, pipeline):
    """Classify a single piece of text with a fitted pipeline.

    Args:
        text: The raw string to classify; it is cleaned with ``remove_tags``
            before prediction.
        pipeline: An object whose ``predict`` method accepts a list of
            strings and returns an indexable sequence of labels.

    Returns:
        ``"AI-generated"`` if the predicted label is truthy, otherwise
        ``"Human-written"``.
    """
    cleaned = remove_tags(text)
    # predict() works on a batch, so wrap the single document in a list and
    # take the first (only) result.
    label = pipeline.predict([cleaned])[0]
    # NOTE(review): this relies on the label's truthiness; presumably the
    # labels are 0/1 -- confirm, since non-empty string labels are always truthy.
    if label:
        return "AI-generated"
    return "Human-written"
|
|
|
|