Spaces:
Running
Running
File size: 3,955 Bytes
22e1b62 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import csv
import time
import pandas as pd
from chatgpt_detector_roberta import (
check_human,
detect_ai_content,
)
from search_text import detect_by_relative_search
HUMAN = "HUMAN"
MACHINE = "MACHINE"
def read_csv_column(file_path, column_name, data_size=100):
    """
    Read a CSV file and return the first `data_size` values of one column.

    Args:
        file_path: Path to the CSV file.
        column_name: Name of the column to extract data from.
        data_size: Maximum number of values to return (default 100).

    Returns:
        A list with at most `data_size` values from the column, or an
        empty list if the file or the column does not exist (an error
        message is printed in either case).
    """
    try:
        df = pd.read_csv(file_path)
        column_data = df[column_name].tolist()
        return column_data[:data_size]
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return []
    except KeyError:
        print(f"Error: Column '{column_name}' not found in the CSV file.")
        return []
def evaluation(texts, skip_until=82, output_file="eva_bbc_test.csv"):
    """
    Classify each text with the SOTA model and the search-engine pipeline,
    appending one result row per text to `output_file`.

    Args:
        texts: Iterable of input documents to classify.
        skip_until: Entries with index <= this value are skipped. This is
            resume support for interrupted runs; the default 82 preserves
            the previously hard-coded resume point.
        output_file: CSV file that result rows are appended to. Use e.g.
            "eva_MAGE_test.csv" when evaluating the MAGE split.

    Returns:
        A list of (index, SOTA_prediction, SOTA_confidence,
        search_engine_prediction) tuples for the processed texts.
    """
    results = []
    # Open the output file once; each row is flushed immediately so an
    # interrupted run loses no completed work (the original reopened the
    # file per row for the same reason, which this preserves more cheaply).
    with open(output_file, "a", newline="") as csvfile:
        writer = csv.writer(csvfile)
        for index, text in enumerate(texts):
            if index <= skip_until:
                print(f"index = {index}")
                continue

            # Classify by SOTA model.
            SOTA_prediction, SOTA_confidence = detect_ai_content(text)

            # Classify by search engine; UNKNOWN when no paraphrase
            # source could be located.
            is_paraphrased, _, data = detect_by_relative_search(text)
            if not is_paraphrased:
                search_engine_prediction = "UNKNOWN"
            elif check_human(data):
                search_engine_prediction = HUMAN
            else:
                search_engine_prediction = MACHINE

            print(
                f"RESULTS:\t{SOTA_prediction}\t{search_engine_prediction}"
            )
            row = (index, SOTA_prediction, SOTA_confidence, search_engine_prediction)
            results.append(row)
            writer.writerow(row)
            csvfile.flush()

            time.sleep(1)  # avoid 100? queries per minute limit
    return results
def extract_machine_data(file_path, output_path="machine_data.csv"):
    """
    Extract machine-generated rows from a MAGE-style CSV and save them.

    Rows are kept where `src` equals "xsum_machine_topical_gpt-3.5-trubo".
    NOTE: the "trubo" misspelling is intentional — it matches the label
    actually present in the source dataset, so do not "fix" it here.

    Args:
        file_path: Source CSV path; must contain a `src` column.
        output_path: Destination CSV (default "machine_data.csv",
            preserving the original hard-coded name).
    """
    df = pd.read_csv(file_path)
    machine_data = df[df["src"] == "xsum_machine_topical_gpt-3.5-trubo"]
    # write to file
    machine_data.to_csv(output_path, index=False)
def extract_human_data(file_path, output_path="human_data.csv"):
    """
    Extract human-written rows ("xsum_human") from a MAGE-style CSV.

    Bug fix: the original wrote its output to "machine_data.csv", silently
    overwriting the file produced by extract_machine_data; the default
    destination is now "human_data.csv".

    Args:
        file_path: Source CSV path; must contain a `src` column.
        output_path: Destination CSV (default "human_data.csv").
    """
    df = pd.read_csv(file_path)
    human_data = df[df["src"] == "xsum_human"]
    # write to file
    human_data.to_csv(output_path, index=False)
if __name__ == "__main__":
    # Evaluate the detectors on the BBC test split. To run on the MAGE
    # split instead, switch the constants to
    # "data/test_data/test_100_MAGE.csv" and column "text".
    # (One-off helper: extract_machine_data('data/test_data/test.csv'))
    DATA_PATH = "data/test_data/test_100_bbc.csv"
    TEXT_COLUMN = "content"

    texts = read_csv_column(
        file_path=DATA_PATH,
        column_name=TEXT_COLUMN,
        data_size=100,
    )
    evaluation(texts)