# Evaluation script: runs the SOTA RoBERTa detector and the search-engine
# pipeline over a test CSV and logs per-sample predictions.
import csv | |
import time | |
import pandas as pd | |
from chatgpt_detector_roberta import ( | |
check_human, | |
detect_ai_content, | |
) | |
from search_text import detect_by_relative_search | |
# Label constants shared by both classification pipelines.
HUMAN = "HUMAN"
MACHINE = "MACHINE"
def read_csv_column(file_path, column_name, data_size=100):
    """
    Read a CSV file and return up to ``data_size`` values from one column.

    Args:
        file_path: Path to the CSV file.
        column_name: Name of the column to extract data from.
        data_size: Maximum number of rows to return (default 100).

    Returns:
        A list with at most ``data_size`` entries from the column, or an
        empty list if the file or the column is missing.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return []
    try:
        # Slice after tolist() so the return type is always a plain list.
        return df[column_name].tolist()[:data_size]
    except KeyError:
        print(f"Error: Column '{column_name}' not found in the CSV file.")
        return []
def evaluation(texts, start_index=83, output_file="eva_bbc_test.csv"):
    """
    Classify each text with the SOTA model and the search-engine pipeline,
    appending one result row per processed sample to ``output_file``.

    Args:
        texts: Iterable of text samples to classify.
        start_index: Index of the first sample to process; earlier samples
            are skipped. Default 83 preserves the original hard-coded
            resume point (``index <= 82`` was skipped).
        output_file: CSV file that result rows are appended to.

    Returns:
        List of ``(index, SOTA_prediction, SOTA_confidence,
        search_engine_prediction)`` tuples for the processed samples.
    """
    results = []
    for index, text in enumerate(texts):
        # Resume support: skip samples already processed in a previous run.
        if index < start_index:
            print(f"index = {index}")
            continue

        # Classify with the SOTA RoBERTa-based detector.
        SOTA_prediction, SOTA_confidence = detect_ai_content(text)

        # Classify via search engine: look for a near-duplicate source and,
        # if one is found, decide whether that source reads as human.
        is_paraphrased, _, data = detect_by_relative_search(text)
        if not is_paraphrased:
            search_engine_prediction = "UNKNOWN"
        elif check_human(data):
            search_engine_prediction = HUMAN
        else:
            search_engine_prediction = MACHINE

        print(
            f"RESULTS:\t{SOTA_prediction}\t{search_engine_prediction}"
        )
        results.append(
            (index, SOTA_prediction, SOTA_confidence, search_engine_prediction)
        )
        # Append immediately so progress survives crashes/interrupts.
        with open(output_file, "a", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(
                [index, SOTA_prediction, SOTA_confidence, search_engine_prediction]
            )
        time.sleep(1)  # avoid the search API's queries-per-minute limit
    return results
def extract_machine_data(file_path, output_path="machine_data.csv"):
    """
    Extract the machine-generated rows from a MAGE-style CSV.

    Args:
        file_path: Input CSV; must contain a ``src`` column.
        output_path: Destination CSV for the filtered rows
            (default ``machine_data.csv``).
    """
    df = pd.read_csv(file_path)
    # NOTE: "trubo" reproduces a typo present in the dataset's src labels;
    # do not "fix" the spelling or the filter matches nothing.
    machine_data = df[df["src"] == "xsum_machine_topical_gpt-3.5-trubo"]
    machine_data.to_csv(output_path, index=False)
def extract_human_data(file_path, output_path="human_data.csv"):
    """
    Extract the human-written rows from a MAGE-style CSV.

    Fixes a copy-paste bug: the original wrote the human rows to
    ``machine_data.csv``, silently clobbering extract_machine_data's output.

    Args:
        file_path: Input CSV; must contain a ``src`` column.
        output_path: Destination CSV for the filtered rows
            (default ``human_data.csv``).
    """
    df = pd.read_csv(file_path)
    human_data = df[df["src"] == "xsum_human"]
    human_data.to_csv(output_path, index=False)
if __name__ == "__main__":
    # extract_machine_data('data/test_data/test.csv')
    # Dataset selection: BBC is active; uncomment the MAGE pair below to
    # evaluate on the MAGE test split instead.
    file_path = "data/test_data/test_100_bbc.csv"
    column_name = "content"
    # file_path = "data/test_data/test_100_MAGE.csv"
    # column_name = "text"

    contents = read_csv_column(
        file_path=file_path,
        column_name=column_name,
        data_size=100,
    )
    evaluation(contents)