File size: 1,202 Bytes
d96116f
 
 
2e23211
d96116f
2e23211
ddb4530
d96116f
 
 
 
 
ddb4530
2e23211
 
d96116f
 
ddb4530
d96116f
 
 
2e23211
d96116f
ddb4530
d96116f
 
2e23211
 
 
d96116f
 
 
 
 
2e23211
 
 
d96116f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

def remove_tags(text):
    """Return ``text`` with every line-feed and apostrophe character deleted.

    Despite the name, no HTML/XML markup is touched. The two characters
    are removed outright rather than replaced by a space, so words that
    were separated only by a line break end up joined together.
    """
    # Same removal order as the original tag list: line feeds, then apostrophes.
    return text.replace('\n', '').replace('\'', '')

def load_data(filepath):
    """Read the CSV at ``filepath`` and clean its ``text`` column.

    Each value of the ``text`` column is passed through ``remove_tags``;
    every other column is returned exactly as ``pd.read_csv`` produced it.

    Returns the resulting DataFrame.
    """
    frame = pd.read_csv(filepath)
    cleaned_text = frame['text'].map(remove_tags)
    frame['text'] = cleaned_text
    return frame

def split_data(df):
    """Split ``df`` into training and test portions.

    The ``text`` column supplies the features and the ``generated`` column
    the labels. 20% of the rows are held out for testing, and the shuffle
    is seeded with ``random_state=42`` so the split is reproducible.

    Returns a tuple ``(X_train, X_test, y_train, y_test)``.
    """
    labels = df['generated']
    features = df['text']
    parts = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
    )
    train_features, test_features, train_labels, test_labels = parts
    return train_features, test_features, train_labels, test_labels

def create_pipeline(X_train, y_train):
    """Build the classification pipeline and fit it on the training data.

    The pipeline chains three steps, all with default settings:
    ``CountVectorizer`` -> ``TfidfTransformer`` -> ``MultinomialNB``.
    Note that the returned pipeline is already fitted on
    ``X_train`` / ``y_train``, not merely constructed.
    """
    steps = [
        ('count_vectorizer', CountVectorizer()),
        ('tfidf_transformer', TfidfTransformer()),
        ('classifier', MultinomialNB()),
    ]
    model = Pipeline(steps)
    model.fit(X_train, y_train)
    return model

def predict_text(text, pipeline):
    """Classify a single piece of text with a fitted ``pipeline``.

    ``text`` is first cleaned with ``remove_tags`` (the same cleaning
    ``load_data`` applies to the ``text`` column), then passed to
    ``pipeline.predict`` as a one-element list.

    Returns ``"AI-generated"`` when the predicted label is truthy and
    ``"Human-written"`` otherwise.
    """
    cleaned = remove_tags(text)
    label = pipeline.predict([cleaned])[0]
    if label:
        return "AI-generated"
    return "Human-written"