import re

import nltk
from nltk.corpus import stopwords
from pandas import DataFrame

nltk.download("punkt")
nltk.download("stopwords")

stop_words = stopwords.words("english")


def process(df: DataFrame):
    """Text2KG post-processing."""
    drop_list = []

    for i, row in df.iterrows():
        # remove stopwords (pronouns)
        if (row.subject in stop_words) or (row.object in stop_words):
            drop_list.append(i)

        # remove broken triplets
        elif row.hasnans:
            drop_list.append(i)

        # lowercase nodes/edges, remove articles
        else:
            article_pattern = r'^(the|a|an) (.+)'
            be_pattern = r'^(are|is) (a )?(.+)'

            df.at[i, "subject"] = re.sub(article_pattern, r'\2', row.subject.lower())
            df.at[i, "relation"] = re.sub(be_pattern, r'\3', row.relation.lower())
            df.at[i, "object"] = re.sub(article_pattern, r'\2', row.object.lower())

    return df.drop(drop_list)
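

# Example usage: a minimal sketch with hypothetical triplets, assuming the
# input DataFrame stores one (subject, relation, object) triplet per row,
# matching the column names accessed in process() above.
if __name__ == "__main__":
    import pandas as pd

    triplets = pd.DataFrame(
        [
            ("The Eiffel Tower", "is located in", "Paris"),
            ("it", "has", "a height"),           # dropped: pronoun subject
            ("An engineer", "designed", None),   # dropped: broken triplet (NaN)
        ],
        columns=["subject", "relation", "object"],
    )

    # Expected surviving row: ("eiffel tower", "located in", "paris")
    print(process(triplets))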