"""Build keyword lookups from scraped course and program CSV files.

Two mappings are produced:

* word -> list of course identifiers whose description contains the word
  (see ``build_word_course_dict``), and
* course identifier -> list of programs that list the course
  (see ``build_course_program_dict``).
"""

import collections
import re
import string

import pandas as pd
from nltk.corpus import stopwords

import course_scraper

DESCRIPTION = "Description"
COURSE_PREFIX = "Course Prefix"

# Catalog boilerplate words that should not be indexed.
words_to_remove = [
    "lectures", "per", "two", "and/or", "``", "''", "laboratory", "course",
    "courses", "work", "students", "units", "total", "selected", "may",
    "major", "'s", "quarter", "and/or", "report", "undergraduate", "format",
    "laboratory", "limited", "topics", "fulfills", "including", "topic",
    "catalogs", "list", "earlier", "overview", "impact", "required", "open",
    "study", "class", "grading", "credit/no", "individual", "kine", "new",
    "within", "offered", "laboratories", "include", "use", "using", "used",
    "basic", "student", "current", "related", "practice", "online",
    "examination", "formal", "quality", "one", "time", "must", "maximum",
    "hours", "effects",
]

ge_areas = [
    "a", "b", "c", "d", "e", "f", "area", "areas", "uscp", "upper-division",
]

year = ["2017-19", "2019-20"]

# Additional generic words to drop.
# FIX: the original had `"etc" "graduates"` with no comma, which Python
# silently concatenates into the single entry "etcgraduates".
stopwords_to_remove = [
    "ge", "credit", "class", "topics", "course", "following", "student",
    "units", "section", "study", "k", "unit", "week", "used", "division",
    "catalogs", "graduate", "selected", "courses", "may", "majors", "format",
    "emphasis", "area", "hours", "emphasized", "non", "based", "application",
    "applications", "classroom", "introduction", "students", "crosslisted",
    "focus", "methods", "completion", "required", "implementation", "u",
    "better", "part", "fields", "completed", "taken", "well", "grade",
    "present", "basic", "etc", "graduates", "variety", "context", "presented",
    "instruction", "quarter", "projects", "meet", "fulfills", "enroll",
    "enrollment", "requirement", "studies", "surveys", "planning",
    "discussion", "assessment", "role", "field", "preparation", "principles",
    "evaluation", "techniques", "selection", "practices", "concepts",
    "faculty", "theories", "issues", "paid", "usually", "quarters",
    "independent", "fundamentals", "project", "senior",
]

# Patterns are compiled once because they are applied to every CSV row.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9]+')
_DIGITS_RE = re.compile(r'\d+')
_LEADING_CONJUNCTION_RE = re.compile('^(and|&)')


def generate_ge_prefixes():
    """Return the codes "a1".."a7", "b1".."b7", ..., "f1".."f7" in order."""
    letters = ["a", "b", "c", "d", "e", "f"]
    return [f"{letter}{num}" for letter in letters for num in range(1, 8)]


ge_prefixes = generate_ge_prefixes()

# Every token that remove_stopwords() discards.  string.punctuation is a
# str, so update() adds each punctuation character individually.
# NOTE: entries containing punctuation or digits (e.g. "and/or", "2017-19",
# "a1") can only match text that has NOT been run through preprocess(),
# because preprocess() strips those characters.
filter_set = set(stopwords.words('english'))
filter_set.update(string.punctuation, words_to_remove, stopwords_to_remove,
                  ge_areas, year, ge_prefixes)


def preprocess(text):
    """Normalize raw text to lowercase letters separated by spaces.

    Runs of non-alphanumeric characters become a single space, digits are
    deleted in place, and the result is lowercased and stripped.

    Args:
        text: Any value; it is converted with ``str()``.  ``None`` and float
            NaN (a missing CSV cell) yield ``""`` instead of the literal
            words "none" / "nan".

    Returns:
        The normalized string (may contain repeated interior spaces).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # FIX: the original class was `1-9`, which treated "0" as a separator
    # while every other digit was deleted without inserting a space.
    spaced = _NON_ALNUM_RE.sub(' ', str(text))
    return _DIGITS_RE.sub('', spaced).lower().strip()


def clean_text(text):
    """Tidy a program name.

    Non-breaking spaces are replaced with regular spaces, surrounding
    whitespace is removed, and a leading "and" or "&" is dropped.

    NOTE(review): the pattern has no word boundary, so a name that merely
    starts with the letters "and" also loses them - confirm that is
    acceptable for the input data.
    """
    # FIX: strip before matching so leading whitespace (often a converted
    # non-breaking space) no longer hides the leading "and"/"&" from the
    # anchored pattern.
    normalized = text.replace('\xa0', " ").strip()
    return _LEADING_CONJUNCTION_RE.sub('', normalized).strip()


def remove_stopwords(text):
    """Lowercase *text* and drop every token found in ``filter_set``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    lowered = (word.lower() for word in text.split())
    return " ".join(word for word in lowered if word not in filter_set)


def build_word_course_dict(csv_path=None):
    """Map each description word to the courses whose description uses it.

    Args:
        csv_path: CSV to read; defaults to ``course_scraper.FILE_NAME``.
            The file must have the ``DESCRIPTION`` and ``COURSE_PREFIX``
            columns.

    Returns:
        A ``defaultdict(list)`` of word -> course identifiers.  A course is
        appended once per occurrence of the word in its description.
    """
    if csv_path is None:
        csv_path = course_scraper.FILE_NAME
    df = pd.read_csv(csv_path)
    descriptions = df[DESCRIPTION].map(preprocess).map(remove_stopwords)

    word_course_dict = collections.defaultdict(list)
    for description, prefix in zip(descriptions, df[COURSE_PREFIX]):
        prefixes = format_course_prefixes(prefix)
        # FIX: split() instead of split(' ') so an empty description yields
        # no words rather than a spurious '' key.
        for word in description.split():
            word_course_dict[word].extend(prefixes)
    return word_course_dict


def format_course_prefixes(prefix_str: str):
    """Expand a course listing into a list of "DEPT-NUMBER" identifiers.

    Handles three shapes:

    * single course: ``"CSC 101"`` -> ``["CSC-101"]``
    * crosslisted, same number: ``"HIST/HNRS 335"`` ->
      ``["HIST-335", "HNRS-335"]``
    * crosslisted, different numbers: ``"HNRS 304/ISLA 303"`` ->
      ``["HNRS-304", "ISLA-303"]``
    """
    # One course listed.
    if "/" not in prefix_str:
        return [prefix_str.replace(" ", "-")]

    # Multiple courses.
    tokens = re.split('/| ', prefix_str)
    course_number_count = sum(1 for token in tokens if token.isdigit())

    if course_number_count == 1:
        # Different departments sharing the trailing number.
        *departments, course_num = tokens
        return [f'{dept}-{course_num}' for dept in departments]

    # Alternating DEPT NUMBER pairs; an unpaired trailing token is ignored.
    return [f'{dept}-{course_num}'
            for dept, course_num in zip(tokens[0::2], tokens[1::2])]


def build_course_program_dict(csv_path="program_courses.csv"):
    """Map each course identifier to the programs that list it.

    Args:
        csv_path: CSV to read; must have "Program" and ``COURSE_PREFIX``
            columns.

    Returns:
        A ``defaultdict(list)`` of course identifier -> program names.
    """
    df = pd.read_csv(csv_path)
    programs = df["Program"].map(clean_text)

    program_course_dict = collections.defaultdict(list)
    for program, course_prefix in zip(programs, df[COURSE_PREFIX]):
        # Handle multiple prefixes, e.g. "CPE/CSC 123".
        for prefix in format_course_prefixes(course_prefix):
            program_course_dict[prefix].append(program)
    return program_course_dict


if __name__ == "__main__":
    d = build_word_course_dict()
    program_course_dict = build_course_program_dict()
    print(d)