Spaces:
Runtime error
Runtime error
import pandas as pd | |
from nltk.tokenize import sent_tokenize | |
from build_data_dict import build_course_program_dict | |
# Destination CSV written by the script entry point.
OUTPUT_FILE = "course_sentences.csv"

# Phrases that disqualify a description sentence, one per line of the .txt file.
# The file is opened in a `with` block so the handle is closed deterministically.
with open('stopwords/invalid_description_phrases.txt') as phrases_file:
    invalid_phrases = [
        # Sentences are lowercased before matching, so a phrase containing
        # capitals could never match; normalise the phrases the same way.
        phrase.lower()
        for phrase in (line.rstrip('\n') for line in phrases_file)
        # A blank line would yield "" (a substring of every sentence) and
        # silently reject every sentence, so blank entries are dropped.
        if phrase.strip()
    ]
def is_valid_sentence(sentence):
    """Return True if *sentence* is non-blank and contains no invalid phrase.

    The sentence is lowercased once and every entry of the module-level
    ``invalid_phrases`` list is searched for in it as a plain substring.

    Args:
        sentence: Candidate sentence text.

    Returns:
        False for an empty or whitespace-only sentence, or when any entry of
        ``invalid_phrases`` occurs in the lowercased sentence; True otherwise.
    """
    # Whitespace-only text would become an empty row once the caller strips it.
    if not sentence.strip():
        return False
    lowered = sentence.lower()  # hoisted: computed once, not once per phrase
    return all(phrase not in lowered for phrase in invalid_phrases)
def _collect_sentence_rows(courses_df, course_program_dict):
    """Build ``[sentence, course, program]`` rows for single-program courses.

    Args:
        courses_df: DataFrame with at least the ``Course Prefix`` and
            ``Description`` columns.
        course_program_dict: Mapping of course identifier to a sized iterable
            of programs, as returned by ``build_course_program_dict()``.

    Returns:
        A list of ``[sentence, course, program]`` lists, one per valid
        sentence of each retained course description.
    """
    rows = []
    for course, programs in course_program_dict.items():
        # Only capture unique courses: a course listed under more than one
        # program is skipped entirely. (The original also carried a
        # "use the department as the program" branch for this case, but it
        # sat after this guard and could never run, so it was removed.)
        if len(programs) > 1:
            continue

        course_row = courses_df.loc[courses_df['Course Prefix'] == course]
        if course_row.empty:
            continue

        course_description = course_row["Description"].values[0]
        # A missing description is loaded by pandas as NaN (a float), which
        # the tokenizer cannot process; treat it like a course with no row.
        if not isinstance(course_description, str):
            continue

        sentences = [
            sentence.strip()
            for sentence in sent_tokenize(course_description)
            if is_valid_sentence(sentence)
        ]
        for program in programs:
            rows.extend([sentence, course, program] for sentence in sentences)
    return rows


def main():
    """Read ``courses.csv``, extract valid sentences and write OUTPUT_FILE."""
    courses_df = pd.read_csv("courses.csv")
    course_program_dict = build_course_program_dict()
    rows = _collect_sentence_rows(courses_df, course_program_dict)
    output_df = pd.DataFrame(rows, columns=["sentence", "course", "program"])
    output_df.to_csv(OUTPUT_FILE, index=False)


if __name__ == "__main__":
    main()