Spaces:
Running
Running
import pandas as pd | |
import re | |
import csv | |
from collections import Counter | |
from difflib import Differ | |
import nltk | |
from nltk.corpus import stopwords | |
nltk.download('stopwords') | |
def remove_stop_words(word_list):
    """Filter stop words, pure numbers and one-character tokens out of a list.

    Every word is stripped of all characters that are not ASCII letters or
    digits. The stripped word is kept only when it is longer than one
    character, does not consist solely of digits, and is not an English stop
    word (compared case-insensitively).

    Args:
        word_list: An iterable of single words.

    Returns:
        A new list holding the stripped form of every word that passed the
        filters, in the original order and with the original casing.
    """
    stop_words = set(stopwords.words('english'))  # English stop words from NLTK
    non_alnum = re.compile(r'[^a-zA-Z0-9]')
    edge_non_alnum = re.compile(r'^[^a-zA-Z0-9]+|[^a-zA-Z0-9]+$')

    cleaned_words = []
    for word in word_list:
        # The stop-word list spells contractions with their apostrophe
        # ("don't", "isn't"). Test the word with only its surrounding
        # punctuation trimmed first; once the apostrophe is stripped
        # ("dont") the word would no longer match the list.
        trimmed = edge_non_alnum.sub('', word).replace('\u2019', "'")
        if trimmed.lower() in stop_words:
            continue

        # Remove punctuation and special characters everywhere in the word.
        cleaned = non_alnum.sub('', word)

        # Drop single characters and pure numbers.
        if len(cleaned) <= 1 or cleaned.isdigit():
            continue
        if cleaned.lower() in stop_words:
            continue
        cleaned_words.append(cleaned)
    return cleaned_words
def write_word_counts_to_csv(data, output_dir='data/results'):
    """Write the added and removed word counts to two CSV files.

    The files ``[res]added_word_counts.csv`` and
    ``[res]removed_word_counts.csv`` are created inside ``output_dir``; each
    has a ``Word,Count`` header followed by one row per entry. The
    ``unchanged_words`` entry of ``data`` is intentionally not written.

    Args:
        data: A dictionary with the keys ``'added_word_counts'`` and
            ``'removed_word_counts'``, each an iterable of ``(word, count)``
            pairs. ``None`` is accepted and results in no files being
            written.
        output_dir: Directory that receives the two files. It is created
            when it does not exist yet.

    Raises:
        KeyError: If ``data`` lacks one of the two required keys. The check
            happens before any file is opened, so no partial output is left.
    """
    import os  # local import: not part of this module's import header

    if data is None:
        # Nothing to write, e.g. the comparison step could not read its input.
        print("Error: No word count data to write.")
        return

    outputs = [
        ('[res]added_word_counts.csv', data['added_word_counts']),
        ('[res]removed_word_counts.csv', data['removed_word_counts']),
    ]

    os.makedirs(output_dir, exist_ok=True)
    for filename, word_counts in outputs:
        path = os.path.join(output_dir, filename)
        with open(path, 'w', encoding='utf-8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Word', 'Count'])
            for word, count in word_counts:
                writer.writerow([word, count])
# Delimiters are the space character and the punctuation marks . , ; ! ? | -
# A punctuation mark that sits directly between two digits ("1,000", "3.14",
# "10-20") belongs to a number and must not split it. The digit test is
# applied per character, so a space next to a number still separates words.
_TOKEN_DELIMITERS = re.compile(r"(?:(?<!\d)[.,;!?|-]|[.,;!?|-](?!\d)| )+")


def preprocess_text(text):
    """Lower-case a string and split it into word tokens.

    The text is split on runs of spaces and the punctuation marks
    ``. , ; ! ? | -``. Punctuation enclosed by digits on both sides is kept,
    so numbers such as ``1,000.50`` stay in one piece. Empty tokens, which
    arise when the text starts or ends with a delimiter, are discarded.

    Args:
        text: The string to preprocess.

    Returns:
        A list of lower-cased tokens in their original order; an empty list
        when the text contains no tokens.
    """
    return [token for token in _TOKEN_DELIMITERS.split(text.lower()) if token]
def compare_strings_from_csv(csv_file, human_col='human', gpt_col='ChatGPT'):
    """Compare two text columns of a CSV file row by row and aggregate the diffs.

    Each row's ``human_col`` text is compared with its ``gpt_col`` text using
    ``compare_strings``. The removed and added substrings of all rows are
    counted, and the unchanged phrases of all rows are collected.

    Args:
        csv_file: Path to the CSV file.
        human_col: Name of the column holding the first (original) text.
        gpt_col: Name of the column holding the second (rewritten) text.

    Returns:
        A dictionary with three entries, or ``None`` when the file cannot be
        read, cannot be parsed, or lacks one of the two columns:

        - ``"unchanged_words"``: list of ``(phrase, word_count)`` tuples,
          longest phrases first.
        - ``"added_word_counts"``: list of ``(substring, count)`` tuples,
          most frequent first.
        - ``"removed_word_counts"``: list of ``(substring, count)`` tuples,
          most frequent first.
    """
    try:
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File '{csv_file}' not found.")
        return None
    except (pd.errors.ParserError, pd.errors.EmptyDataError):
        print(f"Error: Could not parse CSV file '{csv_file}'.")
        return None

    missing_columns = [col for col in (human_col, gpt_col) if col not in df.columns]
    if missing_columns:
        print(f"Error: Column(s) {missing_columns} not found in '{csv_file}'.")
        return None

    removed_word_counts = Counter()
    added_word_counts = Counter()
    unchanged_phrases_total = []
    for human_text, gpt_text in zip(df[human_col], df[gpt_col]):
        # Empty CSV cells are read as NaN (a float) and cannot be tokenised,
        # so rows without text on both sides are skipped.
        if not isinstance(human_text, str) or not isinstance(gpt_text, str):
            continue
        removed_words, added_words, unchanged_phrases = compare_strings(human_text, gpt_text)
        removed_word_counts.update(removed_words)
        added_word_counts.update(added_words)
        unchanged_phrases_total.extend(unchanged_phrases)

    # Sort phrases by their number of words, longest first.
    unchanged_phrases_total.sort(key=lambda phrase: phrase[1], reverse=True)

    return {
        "unchanged_words": unchanged_phrases_total,
        # most_common() orders by descending count and keeps first-seen order
        # among equal counts.
        "added_word_counts": added_word_counts.most_common(),
        "removed_word_counts": removed_word_counts.most_common(),
    }
def _changed_ngrams(diff_entries, marker, n_gram):
    """Return each window of n_gram consecutive entries tagged with marker, joined by spaces."""
    ngrams = []
    for start in range(len(diff_entries) - n_gram + 1):
        window = diff_entries[start:start + n_gram]
        if all(entry.startswith(marker) for entry in window):
            # Every entry is "<marker> <token>"; slice the two-character tag off.
            ngrams.append(" ".join(entry[2:] for entry in window))
    return ngrams


def _unchanged_phrases(diff_entries, min_words):
    """Return (phrase, word_count) for each unchanged run of at least min_words tokens."""
    phrases = []
    run = []
    for entry in diff_entries:
        if entry.startswith(("+", "-")):
            if len(run) >= min_words:
                phrases.append((" ".join(run), len(run)))
            run = []
        else:
            run.append(entry[2:])
    # A run that reaches the end of the diff has no change after it to
    # trigger the flush above.
    if len(run) >= min_words:
        phrases.append((" ".join(run), len(run)))
    return phrases


def compare_strings(a, b, n_gram=3, min_unchanged=4):
    """Compare two strings word by word and report what changed and what stayed.

    Both strings are tokenised with ``preprocess_text`` and compared with
    ``difflib.Differ``.

    Args:
        a: The first string.
        b: The second string.
        n_gram: Number of consecutive changed words that form one reported
            substring. With ``n_gram=1`` single words are reported after
            being filtered through ``remove_stop_words``.
        min_unchanged: Minimum number of consecutive unchanged words for a
            run to be reported as an unchanged phrase.

    Returns:
        A tuple containing three lists: (removed, added, unchanged).
        - removed: Substrings of ``n_gram`` consecutive words removed from a.
        - added: Substrings of ``n_gram`` consecutive words added in b.
        - unchanged: ``(phrase, word_count)`` tuples for each common run of
          at least ``min_unchanged`` consecutive words.

    Raises:
        ValueError: If ``n_gram`` is smaller than 1.
    """
    if n_gram < 1:
        raise ValueError("n_gram must be at least 1")

    # Pre-process the strings; empty tokens carry no information for the diff.
    a_tokens = [token for token in preprocess_text(a) if token]
    b_tokens = [token for token in preprocess_text(b) if token]

    # Differ tags every token with "- ", "+ " or "  ". For similar-looking
    # tokens it also emits "? " hint lines, which are not words and would
    # otherwise be counted as unchanged text and interrupt runs of changes.
    diff_list = [
        entry for entry in Differ().compare(a_tokens, b_tokens)
        if not entry.startswith("? ")
    ]

    if n_gram == 1:
        # Single words: report them once, with stop words filtered out.
        removed_ngrams = remove_stop_words(
            [entry[2:] for entry in diff_list if entry.startswith("-")]
        )
        added_ngrams = remove_stop_words(
            [entry[2:] for entry in diff_list if entry.startswith("+")]
        )
    else:
        removed_ngrams = _changed_ngrams(diff_list, "-", n_gram)
        added_ngrams = _changed_ngrams(diff_list, "+", n_gram)

    unchanged_phrases = _unchanged_phrases(diff_list, min_unchanged)
    return removed_ngrams, added_ngrams, unchanged_phrases
if __name__ == "__main__":
    res = compare_strings_from_csv("data/ChatGPT_Nous_Hermes_2_Yi_34B_openchat_3_5_1210_with_best_similarity.csv")
    # compare_strings_from_csv returns None (after printing the reason) when
    # the input file cannot be read; there is nothing to write in that case.
    if res is not None:
        write_word_counts_to_csv(res)
    # Manual check of the stop-word filter:
    # remove_stop_words(["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"])