import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Characters removed from every text, both at training and at prediction time.
_TAGS = ('\n', '\'')

# Deletion table built once so each text is cleaned in a single pass.
_STRIP_TABLE = str.maketrans('', '', ''.join(_TAGS))


def remove_tags(text: str) -> str:
    """Return *text* with every newline and single-quote character removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without ``'\\n'`` and ``"'"`` characters.

    Raises:
        AttributeError: If ``text`` is not a string.
    """
    return text.translate(_STRIP_TABLE)


def load_data(filepath) -> pd.DataFrame:
    """Read a CSV file and clean its ``text`` column with ``remove_tags``.

    Missing cells in ``text`` (which pandas loads as NaN) are replaced by an
    empty string, and any other non-string cell is converted with ``str``
    before cleaning, so a sparse or partly numeric column no longer aborts
    the load.

    Args:
        filepath: Anything accepted by ``pandas.read_csv``.

    Returns:
        The loaded DataFrame with the ``text`` column cleaned.

    Raises:
        KeyError: If the file has no ``text`` column.
    """
    df = pd.read_csv(filepath)
    df['text'] = [
        '' if pd.isna(value) else remove_tags(str(value))
        for value in df['text']
    ]
    return df


def split_data(df: pd.DataFrame, *, test_size=0.2, random_state=42) -> tuple:
    """Split the ``text`` / ``generated`` columns into train and test sets.

    Args:
        df: DataFrame holding a ``text`` column (features) and a
            ``generated`` column (labels).
        test_size: Forwarded to ``train_test_split``; defaults to the
            previously hard-coded 0.2.
        random_state: Forwarded to ``train_test_split``; defaults to the
            previously hard-coded 42 so the default split is reproducible.

    Returns:
        The tuple ``(x_train, x_test, y_train, y_test)``.
    """
    y = df['generated']
    x = df['text']
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test


def create_pipeline(x_train, y_train) -> Pipeline:
    """Build and fit the text-classification pipeline.

    The pipeline chains ``CountVectorizer``, ``TfidfTransformer`` and
    ``MultinomialNB`` under the step names ``count_vectorizer``,
    ``tfidf_transformer`` and ``classifier``.

    Args:
        x_train: Training texts.
        y_train: Training labels.

    Returns:
        The pipeline after ``fit(x_train, y_train)``.
    """
    pipeline = Pipeline([
        ('count_vectorizer', CountVectorizer()),
        ('tfidf_transformer', TfidfTransformer()),
        ('classifier', MultinomialNB()),
    ])
    pipeline.fit(x_train, y_train)
    return pipeline


def predict_text(text: str, pipeline) -> str:
    """Classify a single text with a fitted pipeline.

    The text is cleaned with ``remove_tags`` first, mirroring the cleaning
    that ``load_data`` applies to the training data.

    Args:
        text: The raw text to classify.
        pipeline: A fitted object exposing ``predict``, such as the one
            returned by ``create_pipeline``.

    Returns:
        ``"AI-generated"`` when the predicted label is truthy, otherwise
        ``"Human-written"``.
    """
    processed_text = remove_tags(text)
    prediction = pipeline.predict([processed_text])[0]
    # NOTE(review): relies on label truthiness - presumably `generated` holds
    # 0/1 or booleans; string labels such as "0" would always be truthy.
    return "AI-generated" if prediction else "Human-written"