import pandas as pd
import re
import csv
from collections import Counter
from difflib import Differ
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')


def remove_stop_words(word_list):
    """
    Removes stop words from a list of single words.

    Args:
        word_list: A list of single words.

    Returns:
        A new list containing only the words that are not stop words.
    """
    stop_words = set(stopwords.words('english'))  # Get English stop words

    # Define characters to remove: anything that is not a letter or digit
    chars_to_remove = r'[^a-zA-Z0-9]'

    cleaned_words = []
    for word in word_list:
        # Remove punctuation and special characters
        word = re.sub(chars_to_remove, '', word)

        # Skip single letters and pure digits
        if len(word) > 1 and not word.isdigit():
            # Keep the word only if it is not a stop word
            if word.lower() not in stop_words:
                cleaned_words.append(word)

    return cleaned_words


def write_word_counts_to_csv(data):
    """Writes the added and removed word counts to CSV files.

    Args:
        data: A dictionary with 'added_word_counts' and 'removed_word_counts'
            entries, each an iterable of (word, count) pairs.
    """
    with open('data/results/[res]added_word_counts.csv', 'w', encoding='utf-8', newline='') as csvfile:
        fieldnames = ['Word', 'Count']
        writer = csv.writer(csvfile)
        writer.writerow(fieldnames)
        for word, count in data['added_word_counts']:
            writer.writerow([word, count])

    with open('data/results/[res]removed_word_counts.csv', 'w', encoding='utf-8', newline='') as csvfile:
        fieldnames = ['Word', 'Count']
        writer = csv.writer(csvfile)
        writer.writerow(fieldnames)
        for word, count in data['removed_word_counts']:
            writer.writerow([word, count])

    # with open('data/results/[res]unchanged_words.csv', 'w', encoding='utf-8', newline='') as csvfile:
    #     fieldnames = ['Count', 'Phrase']
    #     writer = csv.writer(csvfile)
    #     writer.writerow(fieldnames)  # Write the header
    #     for phrase, count in data['unchanged_words']:
    #         writer.writerow([count, phrase])


def preprocess_text(text):
    """
    Preprocesses a string by lower-casing it and removing punctuation and
    redundant whitespace, while keeping commas and dots inside numbers.

    Args:
        text: The string to preprocess.

    Returns:
        The preprocessed string.
    """
    # Lower case
    text = text.lower()

    # Split text into words while keeping commas and dots within numbers
    # (e.g. "3.14" and "1,000" stay intact). The exact pattern is an
    # assumption: split on whitespace and on commas/dots not between digits.
    delimiters = r"\s+|(?<!\d)[.,]|[.,](?!\d)"
    words = [word for word in re.split(delimiters, text) if word]

    return " ".join(words)


def compare_strings_from_csv(filename, col_a='text_a', col_b='text_b'):
    """
    Compares two text columns of a CSV file row by row with difflib.Differ
    and collects the removed words, the added words, and the unchanged
    phrases of at least four consecutive words.

    NOTE: this implementation is a sketch; the column names `col_a` and
    `col_b` are assumed placeholders and have to match the input CSV.

    Args:
        filename: Path to the input CSV file.
        col_a: Name of the column holding the reference text.
        col_b: Name of the column holding the text it is compared against.

    Returns:
        A dictionary with 'removed_word_counts', 'added_word_counts' and
        'unchanged_words' entries, as expected by write_word_counts_to_csv.
    """
    df = pd.read_csv(filename)
    differ = Differ()

    removed_words = []
    added_words = []
    unchanged_phrases = []

    for _, row in df.iterrows():
        words_a = preprocess_text(str(row[col_a])).split()
        words_b = preprocess_text(str(row[col_b])).split()

        diff = list(differ.compare(words_a, words_b))

        # '- ' marks words only in the first text (removed), '+ ' marks words
        # only in the second text (added); strip the two-character prefix.
        removed_words.extend(token[2:] for token in diff if token.startswith('- '))
        added_words.extend(token[2:] for token in diff if token.startswith('+ '))

        # Collect runs of unchanged words ('  ' prefix); a run is flushed as a
        # phrase when it is interrupted, provided it spans at least four words.
        substring = ""
        count = 0
        for token in diff:
            word = token[2:]
            if not token.startswith('  '):
                if count >= 4:
                    unchanged_phrase = " ".join(substring.split())
                    unchanged_phrases.append((unchanged_phrase, count))
                substring = ""
                count = 0
                continue
            substring += " " + word
            count += 1

    removed_ngrams = Counter(remove_stop_words(removed_words)).most_common()
    added_ngrams = Counter(remove_stop_words(added_words)).most_common()

    # Package the results the way write_word_counts_to_csv expects them.
    return {
        'removed_word_counts': removed_ngrams,
        'added_word_counts': added_ngrams,
        'unchanged_words': unchanged_phrases,
    }


if __name__ == "__main__":
    res = compare_strings_from_csv("data/ChatGPT_Nous_Hermes_2_Yi_34B_openchat_3_5_1210_with_best_similarity.csv")
    write_word_counts_to_csv(res)
    # remove_stop_words(["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"])
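# The comparison relies on the token format produced by difflib.Differ: each
# yielded element is prefixed with '- ' (only in the first sequence, i.e.
# removed), '+ ' (only in the second sequence, i.e. added), '  ' (unchanged)
# or '? ' (hint line). A minimal illustration with made-up input:
#
#     list(Differ().compare("the quick fox".split(), "the slow fox".split()))
#     # ['  the', '- quick', '+ slow', '  fox']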