major-matcher / preprocessing /format_input_data.py
waidhoferj's picture
first commit
aadb779
import pandas as pd
from nltk.tokenize import sent_tokenize
from build_data_dict import build_course_program_dict
# Destination CSV for the generated (sentence, course, program) rows.
OUTPUT_FILE = "course_sentences.csv"

# Phrases that disqualify a description sentence, one per line of the file.
# The file is read inside a `with` block so the handle is closed as soon as
# loading finishes. Blank lines are skipped: an empty phrase is a substring of
# every sentence and would cause every sentence to be rejected.
# NOTE(review): phrases are matched against lower-cased sentences, so entries
# in the file presumably need to be lower-case themselves - confirm.
with open('stopwords/invalid_description_phrases.txt') as phrases_file:
    invalid_phrases = [
        line.rstrip('\n') for line in phrases_file if line.strip()
    ]
def is_valid_sentence(sentence: str) -> bool:
    """Return True when *sentence* is non-empty and has no invalid phrase.

    The sentence is lower-cased and every entry of the module-level
    ``invalid_phrases`` list is searched for inside it as a plain substring.

    Args:
        sentence: Sentence text to check.

    Returns:
        False for the empty string, or when any invalid phrase occurs in the
        lower-cased sentence; True otherwise.
    """
    if sentence == "":
        return False
    # Lower-case once up front instead of once per phrase.
    lowered = sentence.lower()
    return not any(phrase in lowered for phrase in invalid_phrases)
if __name__ == "__main__":
    # Emit one output row per valid description sentence of every course that
    # belongs to at most one program, then write the rows to OUTPUT_FILE.
    courses_df = pd.read_csv("courses.csv")
    course_program_dict = build_course_program_dict()

    rows = []
    for course, programs in course_program_dict.items():
        # Only capture unique courses: a course shared by several programs is
        # skipped. The former "use the department as the program" branch sat
        # behind this same condition and could never run, so it was removed.
        if len(programs) > 1:
            continue

        course_row = courses_df.loc[courses_df['Course Prefix'] == course]
        if course_row.empty:
            # No row in courses.csv matches this course prefix.
            continue

        course_description = course_row["Description"].values[0]
        if not isinstance(course_description, str):
            # An empty CSV cell is loaded as NaN (a float), which the
            # sentence tokenizer cannot process; skip such courses.
            continue

        # Validity is checked on the raw sentence; stripping happens after.
        sentences = [
            sentence.strip()
            for sentence in sent_tokenize(course_description)
            if is_valid_sentence(sentence)
        ]
        for program in programs:
            for sentence in sentences:
                rows.append([sentence, course, program])

    output_df = pd.DataFrame(rows, columns=["sentence", "course", "program"])
    output_df.to_csv(OUTPUT_FILE, index=False)