andreasmartin committed on
Commit
e547dc0
·
1 Parent(s): 1435c22

deepnote update

Browse files
Files changed (1) hide show
  1. util.py +7 -0
util.py CHANGED
@@ -23,6 +23,7 @@ def read_df(xlsx_url: str, page_content_column: str) -> pd.DataFrame:
23
  df = pd.read_excel(xlsx_url, header=0, keep_default_na=False)
24
  if SPLIT_PAGE_BREAKS:
25
  df = split_page_breaks(df, page_content_column)
 
26
  if SYNONYMS is not None:
27
  df = duplicate_rows_with_synonyms(df, page_content_column, SYNONYMS)
28
  return df
@@ -90,3 +91,9 @@ def duplicate_rows_with_synonyms(df: pd.DataFrame, column: str, synonyms: list[l
90
  new_df = pd.DataFrame(new_rows, columns=df.columns)
91
  new_df = new_df.reset_index(drop=True)
92
  return new_df
 
 
 
 
 
 
 
23
  df = pd.read_excel(xlsx_url, header=0, keep_default_na=False)
24
  if SPLIT_PAGE_BREAKS:
25
  df = split_page_breaks(df, page_content_column)
26
+ df = remove_empty_rows(df, page_content_column)
27
  if SYNONYMS is not None:
28
  df = duplicate_rows_with_synonyms(df, page_content_column, SYNONYMS)
29
  return df
 
91
  new_df = pd.DataFrame(new_rows, columns=df.columns)
92
  new_df = new_df.reset_index(drop=True)
93
  return new_df
94
+
95
+
96
def remove_empty_rows(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Drop rows whose ``column_name`` cell is empty and renumber the index.

    A cell counts as empty when it is missing (``None``/NaN) or when its
    text form consists solely of whitespace. Non-string values such as
    numbers are kept.

    Args:
        df: Frame to filter; it is not modified in place.
        column_name: Name of the column that decides whether a row is kept.

    Returns:
        A new frame containing only the non-empty rows, with a fresh
        0-based index.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    cells = df[column_name]
    # Compare on the text form so numeric or mixed-type columns do not trip
    # the ``.str`` accessor, which only accepts string data.
    has_text = cells.astype(str).str.strip().ne("")
    # Missing cells are masked out explicitly: their text form ("nan",
    # "None") would otherwise look like real content.
    keep = cells.notna() & has_text
    return df[keep].reset_index(drop=True)