import csv
import time

import pandas as pd
from chatgpt_detector_roberta import (
    check_human,
    detect_ai_content,
)
from search_text import detect_by_relative_search

HUMAN = "HUMAN"
MACHINE = "MACHINE"


def read_csv_column(file_path, column_name, data_size=100):
    """
    Read a CSV file and extract up to ``data_size`` values from one column.

    Args:
        file_path: Path to the CSV file.
        column_name: Name of the column to extract data from.
        data_size: Maximum number of rows to return (default 100).

    Returns:
        A list containing at most ``data_size`` values from the specified
        column, or an empty list if the file or column is missing.
    """
    try:
        df = pd.read_csv(file_path)
        # .tolist() converts the pandas Series to plain Python values.
        return df[column_name].tolist()[:data_size]
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return []
    except KeyError:
        print(f"Error: Column '{column_name}' not found in the CSV file.")
        return []


def evaluation(texts, skip_before=83, output_path="eva_bbc_test.csv"):
    """
    Classify each text with the SOTA model and a search-engine heuristic,
    appending one result row per text to ``output_path``.

    Args:
        texts: Iterable of text strings to classify.
        skip_before: Index of the first text to actually process; earlier
            indices are printed and skipped. Default 83 preserves the
            original resume-from-crash behavior (it skipped index <= 82).
        output_path: CSV file that results are appended to (default matches
            the original hard-coded "eva_bbc_test.csv"; use
            "eva_MAGE_test.csv" for the MAGE run).

    Returns:
        List of (index, SOTA_prediction, SOTA_confidence,
        search_engine_prediction) tuples for the processed texts.
    """
    results = []
    for index, text in enumerate(texts):
        # Resume support: skip already-processed rows.
        if index < skip_before:
            print(f"index = {index}")
            continue

        # Classify by SOTA model (RoBERTa-based detector).
        SOTA_prediction, SOTA_confidence = detect_ai_content(text)

        # Classify by search engine: if a near-duplicate source is found,
        # decide HUMAN/MACHINE from it; otherwise the verdict is UNKNOWN.
        is_paraphrased, _, data = detect_by_relative_search(text)
        if not is_paraphrased:
            search_engine_prediction = "UNKNOWN"
        elif check_human(data):
            search_engine_prediction = HUMAN
        else:
            search_engine_prediction = MACHINE

        print(
            f"RESULTS:\t{SOTA_prediction}\t{search_engine_prediction}"
        )
        row = (index, SOTA_prediction, SOTA_confidence, search_engine_prediction)
        results.append(row)

        # Append immediately so progress survives crashes/interruptions.
        with open(output_path, "a", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(list(row))

        time.sleep(1)  # avoid 100? queries per minute limit

    return results


def extract_machine_data(
    file_path,
    src_label="xsum_machine_topical_gpt-3.5-trubo",
    output_path="machine_data.csv",
):
    """
    Extract machine-generated rows from a dataset CSV and write them out.

    Args:
        file_path: Path to the source CSV; must contain a "src" column.
        src_label: Value of the "src" column identifying machine rows.
            Default preserves the original hard-coded label (including its
            "trubo" spelling, which matches the upstream data).
        output_path: Destination CSV (default "machine_data.csv").
    """
    df = pd.read_csv(file_path)
    machine_data = df[df["src"] == src_label]

    # Write the filtered rows without the pandas index column.
    machine_data.to_csv(output_path, index=False)
    
def extract_human_data(
    file_path,
    src_label="xsum_human",
    output_path="human_data.csv",
):
    """
    Extract human-written rows from a dataset CSV and write them out.

    Args:
        file_path: Path to the source CSV; must contain a "src" column.
        src_label: Value of the "src" column identifying human rows.
        output_path: Destination CSV. BUGFIX: the original wrote to
            "machine_data.csv" (copy-paste from extract_machine_data),
            clobbering the machine split; human rows now go to
            "human_data.csv" by default.
    """
    df = pd.read_csv(file_path)
    human_data = df[df["src"] == src_label]

    # Write the filtered rows without the pandas index column.
    human_data.to_csv(output_path, index=False)


if __name__ == "__main__":
    # extract_machine_data('data/test_data/test.csv')

    # BBC
    file_path = "data/test_data/test_100_bbc.csv"
    column_name = "content"

    # MAGE
    # file_path = "data/test_data/test_100_MAGE.csv"
    # column_name = "text"

    contents = read_csv_column(
        file_path=file_path,
        column_name=column_name,
        data_size=100,
    )
    evaluation(contents)