# Author: pmkhanh7890
# First version of the demo evaluation script.
import csv
import time
import pandas as pd
from chatgpt_detector_roberta import (
check_human,
detect_ai_content,
)
from search_text import detect_by_relative_search
# Classification labels shared by the SOTA-model and search-engine predictors.
HUMAN = "HUMAN"
MACHINE = "MACHINE"
def read_csv_column(file_path, column_name, data_size=100):
    """Read a CSV file and return up to ``data_size`` values from one column.

    Args:
        file_path: Path to the CSV file.
        column_name: Name of the column to extract data from.
        data_size: Maximum number of values to return (default 100).

    Returns:
        A list with at most ``data_size`` values from the column, or an
        empty list if the file or the column does not exist.
    """
    # Keep each try body minimal so we only catch the error we expect
    # from that specific operation.
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return []
    try:
        column_data = df[column_name].tolist()
    except KeyError:
        print(f"Error: Column '{column_name}' not found in the CSV file.")
        return []
    return column_data[:data_size]
def evaluation(texts, skip_through=82, output_file="eva_bbc_test.csv", delay_seconds=1):
    """Classify each text with the SOTA model and the search-engine detector.

    Each processed row is appended to ``output_file`` immediately, so a
    crashed run can be resumed by setting ``skip_through`` to the last
    index already written.

    Args:
        texts: Iterable of texts to classify.
        skip_through: Skip all indices <= this value (resume support).
            The default of 82 preserves the original script's resume point.
        output_file: CSV file that results are appended to.
        delay_seconds: Pause between items to stay under the search
            engine's per-minute query limit.

    Returns:
        A list of ``(index, SOTA_prediction, SOTA_confidence,
        search_engine_prediction)`` tuples for the rows processed this run.
    """
    results = []
    for index, text in enumerate(texts):
        # Resume support: skip rows already processed by a previous run.
        if index <= skip_through:
            print(f"index = {index}")
            continue

        # Classify with the SOTA RoBERTa-based model.
        SOTA_prediction, SOTA_confidence = detect_ai_content(text)

        # Classify via relative search; fall back to UNKNOWN when the
        # text cannot be matched against search results.
        is_paraphrased, _, data = detect_by_relative_search(text)
        if not is_paraphrased:
            search_engine_prediction = "UNKNOWN"
        elif check_human(data):
            search_engine_prediction = HUMAN
        else:
            search_engine_prediction = MACHINE

        print(
            f"RESULTS:\t{SOTA_prediction}\t{search_engine_prediction}"
        )
        row = (index, SOTA_prediction, SOTA_confidence, search_engine_prediction)
        results.append(row)

        # Append each row immediately so partial progress survives a crash.
        with open(output_file, "a", newline="") as csvfile:
            csv.writer(csvfile).writerow(row)

        time.sleep(delay_seconds)  # avoid the search engine's rate limit
    return results
def extract_machine_data(file_path, output_path="machine_data.csv"):
    """Extract machine-generated rows from a dataset and save them to CSV.

    Args:
        file_path: CSV containing a ``src`` column identifying each row's origin.
        output_path: Destination CSV file (default ``machine_data.csv``).
    """
    df = pd.read_csv(file_path)
    # NOTE(review): "trubo" looks like a typo for "turbo", but the filter must
    # match the label actually present in the dataset — confirm before fixing.
    machine_data = df[df["src"] == "xsum_machine_topical_gpt-3.5-trubo"]
    machine_data.to_csv(output_path, index=False)
def extract_human_data(file_path, output_path="machine_data.csv"):
    """Extract human-written rows from a dataset and save them to CSV.

    Args:
        file_path: CSV containing a ``src`` column identifying each row's origin.
        output_path: Destination CSV file. NOTE(review): the original default
            ``machine_data.csv`` looks like a copy-paste slip (it overwrites
            the machine-data output); kept for backward compatibility —
            callers should pass e.g. ``human_data.csv`` explicitly.
    """
    df = pd.read_csv(file_path)
    human_data = df[df["src"] == "xsum_human"]
    human_data.to_csv(output_path, index=False)
if __name__ == "__main__":
    # extract_machine_data('data/test_data/test.csv')

    # Evaluate the BBC test set; to run the MAGE set instead, swap in
    # the commented path/column pair below.
    csv_path = "data/test_data/test_100_bbc.csv"
    text_column = "content"
    # csv_path = "data/test_data/test_100_MAGE.csv"
    # text_column = "text"

    evaluation(
        read_csv_column(
            file_path=csv_path,
            column_name=text_column,
            data_size=100,
        )
    )