# Spaces: pmkhanh7890 / news_verification (Running)
# File size: 7,241 Bytes
# 22e1b62

import pandas as pd
import re
import csv
from collections import Counter
from difflib import Differ
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' corpus when this module is imported;
# remove_stop_words() reads it through stopwords.words('english').
nltk.download('stopwords')


def remove_stop_words(word_list):
    """
    Filters a list of single words down to the meaningful ones.

    Each word is first stripped of every character that is not an ASCII
    letter or digit. The stripped word is kept only when it is longer than
    one character, is not made up purely of digits, and its lower-cased form
    is not an NLTK English stop word.

    Args:
        word_list: A list of single words.

    Returns:
        A new list holding the stripped words that passed every check, in
        their original order.
    """
    english_stop_words = set(stopwords.words('english'))
    non_alphanumeric = r'[^a-zA-Z0-9]'

    def is_meaningful(token):
        # Single characters and pure numbers carry no lexical information.
        if len(token) <= 1 or token.isdigit():
            return False
        return token.lower() not in english_stop_words

    stripped_tokens = (re.sub(non_alphanumeric, '', raw) for raw in word_list)
    return [token for token in stripped_tokens if is_meaningful(token)]


def _write_count_rows(path, rows):
    """Writes a 'Word,Count' header followed by one line per (word, count) pair."""
    import os  # local import: only this helper needs it

    # open() does not create missing parent folders, so create them first.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Word', 'Count'])
        writer.writerows([word, count] for word, count in rows)


def write_word_counts_to_csv(data):
    """Writes the added and removed word counts to CSV files.

    Two files are (over)written, each with a 'Word,Count' header:
        - data/results/[res]added_word_counts.csv
        - data/results/[res]removed_word_counts.csv
    The 'data/results' folder is created when it does not exist yet.

    Args:
        data: A dictionary with the keys 'added_word_counts' and
            'removed_word_counts', each mapping to an iterable of
            (word, count) pairs.

    Raises:
        KeyError: If one of the two required keys is missing from data.
    """
    # NOTE: data['unchanged_words'] is currently not exported.
    _write_count_rows('data/results/[res]added_word_counts.csv',
                      data['added_word_counts'])
    _write_count_rows('data/results/[res]removed_word_counts.csv',
                      data['removed_word_counts'])


def preprocess_text(text):
    """
    Lower-cases a string and splits it into word tokens.

    Whitespace and the characters ; ! ? | - always separate tokens. A dot or
    comma separates tokens too, unless it sits directly between two digits,
    so numbers such as "1,000" or "3.14" stay in one piece. Empty tokens
    (e.g. caused by leading or trailing punctuation) are dropped.

    Args:
        text: The string to preprocess.

    Returns:
        A list of lower-cased tokens; an empty list if text holds no tokens.
    """

    # Lower case
    text = text.lower()

    # The digit lookarounds are attached to '.' and ',' only. Wrapping the
    # whole delimiter run in them would also stop a plain space next to a
    # number from splitting (leaving "in 2020 we" as a single token).
    delimiters = r"(?:[\s;!?|\-]|(?<!\d)[.,]|[.,](?!\d))+"
    return [token for token in re.split(delimiters, text) if token]


def compare_strings_from_csv(csv_file):
    """
    Compares the 'human' and 'ChatGPT' columns of a CSV file row by row and
    aggregates the differences over all rows.

    Rows in which either of the two cells is not a string (for example an
    empty cell, which pandas loads as NaN) are skipped.

    Args:
        csv_file: Path to the CSV file.

    Returns:
        A dictionary with three keys:
        - "unchanged_words": list of (phrase, word_count) tuples, sorted by
          word_count in descending order.
        - "added_word_counts": list of (substring, occurrences) tuples for
          the added substrings, most frequent first.
        - "removed_word_counts": list of (substring, occurrences) tuples for
          the removed substrings, most frequent first.
        Returns None if the file is missing, empty, or cannot be parsed.
    """

    try:
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File '{csv_file}' not found.")
        return None
    except pd.errors.EmptyDataError:
        print(f"Error: CSV file '{csv_file}' is empty.")
        return None
    except pd.errors.ParserError:
        print(f"Error: Could not parse CSV file '{csv_file}'.")
        return None

    removed_word_counts = Counter()
    added_word_counts = Counter()
    unchanged_phrases_total = []

    for _, row in df.iterrows():
        human_text = row['human']
        gpt_text = row['ChatGPT']
        # Missing cells arrive as float NaN and cannot be tokenised.
        if not (isinstance(human_text, str) and isinstance(gpt_text, str)):
            continue
        removed_words, added_words, unchanged_phrases = compare_strings(human_text, gpt_text)
        removed_word_counts.update(removed_words)
        added_word_counts.update(added_words)
        unchanged_phrases_total.extend(unchanged_phrases)

    # Most frequent first; ties keep the order of first appearance.
    sorted_added_words = sorted(added_word_counts.items(), key=lambda x: x[1], reverse=True)
    sorted_removed_words = sorted(removed_word_counts.items(), key=lambda x: x[1], reverse=True)

    # Sort phrases by their number of words, longest first.
    unchanged_phrases_total.sort(key=lambda x: x[1], reverse=True)

    return {
        "unchanged_words": unchanged_phrases_total,
        "added_word_counts": sorted_added_words,
        "removed_word_counts": sorted_removed_words,
    }


def _ngrams_with_tag(entries, tag, n_gram):
    """Returns the space-joined tokens of every window of n_gram consecutive entries carrying tag."""
    found = []
    for start in range(len(entries) - n_gram + 1):
        window = entries[start:start + n_gram]
        if all(entry_tag == tag for entry_tag, _ in window):
            found.append(" ".join(token for _, token in window))
    return found


def _unchanged_runs(entries, min_words):
    """Returns (phrase, word_count) for every run of at least min_words consecutive unchanged entries."""
    runs = []
    current = []

    def flush():
        if len(current) >= min_words:
            # split/join collapses any whitespace left inside the tokens.
            phrase = " ".join(" ".join(current).split())
            runs.append((phrase, len(current)))
        current.clear()

    for tag, token in entries:
        if tag == " ":
            current.append(token)
        else:
            flush()
    flush()  # a run that reaches the end of the diff must be emitted too
    return runs


def compare_strings(a, b, n_gram=3):
    """
    Compares two strings word by word and returns lists of removed, added,
    and unchanged substrings.

    Args:
        a: The first string.
        b: The second string.
        n_gram: Number of consecutive words per removed/added substring.
            With n_gram == 1 single words are returned after stop-word
            filtering via remove_stop_words(); with larger values every
            window of n_gram consecutive removed (or added) words is
            returned as one space-joined substring.

    Returns:
        A tuple containing three lists: (removed, added, unchanged).
        - removed: List of substrings removed from a.
        - added: List of substrings added to b.
        - unchanged: List of (phrase, word_count) tuples for common
          substrings of at least 4 consecutive words.

    Raises:
        ValueError: If n_gram is smaller than 1.
    """
    if n_gram < 1:
        raise ValueError("n_gram must be a positive integer")

    # Pre-process the strings; empty tokens would count as words in the diff.
    a_tokens = [token for token in preprocess_text(a) if token]
    b_tokens = [token for token in preprocess_text(b) if token]

    # Every Differ line is a two-character code ("- ", "+ ", "  " or "? ")
    # followed by the token. "? " lines are intraline hints, not tokens, so
    # they are dropped; the code is cut off by position rather than with
    # str.replace(), which would also damage tokens such as "c++".
    entries = [
        (line[0], line[2:])
        for line in Differ().compare(a_tokens, b_tokens)
        if not line.startswith("?")
    ]

    if n_gram == 1:
        # Single words: report each removed/added word once, without stop words.
        removed_ngrams = remove_stop_words(
            [token for tag, token in entries if tag == "-"])
        added_ngrams = remove_stop_words(
            [token for tag, token in entries if tag == "+"])
    else:
        removed_ngrams = _ngrams_with_tag(entries, "-", n_gram)
        added_ngrams = _ngrams_with_tag(entries, "+", n_gram)

    unchanged_phrases = _unchanged_runs(entries, 4)

    return removed_ngrams, added_ngrams, unchanged_phrases


if __name__ == "__main__":
    res = compare_strings_from_csv("data/ChatGPT_Nous_Hermes_2_Yi_34B_openchat_3_5_1210_with_best_similarity.csv")
    # compare_strings_from_csv() returns None when the input file cannot be
    # read; skip the export instead of failing on a None subscript.
    if res is not None:
        write_word_counts_to_csv(res)