Spaces:
Sleeping
Sleeping
Commit
·
e547dc0
1
Parent(s):
1435c22
deepnote update
Browse files
util.py
CHANGED
@@ -23,6 +23,7 @@ def read_df(xlsx_url: str, page_content_column: str) -> pd.DataFrame:
|
|
23 |
df = pd.read_excel(xlsx_url, header=0, keep_default_na=False)
|
24 |
if SPLIT_PAGE_BREAKS:
|
25 |
df = split_page_breaks(df, page_content_column)
|
|
|
26 |
if SYNONYMS is not None:
|
27 |
df = duplicate_rows_with_synonyms(df, page_content_column, SYNONYMS)
|
28 |
return df
|
@@ -90,3 +91,9 @@ def duplicate_rows_with_synonyms(df: pd.DataFrame, column: str, synonyms: list[l
|
|
90 |
new_df = pd.DataFrame(new_rows, columns=df.columns)
|
91 |
new_df = new_df.reset_index(drop=True)
|
92 |
return new_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
df = pd.read_excel(xlsx_url, header=0, keep_default_na=False)
|
24 |
if SPLIT_PAGE_BREAKS:
|
25 |
df = split_page_breaks(df, page_content_column)
|
26 |
+
df = remove_empty_rows(df, page_content_column)
|
27 |
if SYNONYMS is not None:
|
28 |
df = duplicate_rows_with_synonyms(df, page_content_column, SYNONYMS)
|
29 |
return df
|
|
|
91 |
new_df = pd.DataFrame(new_rows, columns=df.columns)
|
92 |
new_df = new_df.reset_index(drop=True)
|
93 |
return new_df
|
94 |
+
|
95 |
+
|
96 |
+
def remove_empty_rows(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Drop rows whose value in *column_name* is empty.

    A value counts as empty when it is missing (``None``/``NaN``) or when
    it is a string that contains nothing but whitespace.  Non-string,
    non-missing values (e.g. numbers) are kept.

    Args:
        df: Frame to filter; it is not modified.
        column_name: Name of the column to inspect.

    Returns:
        A new frame containing only the non-empty rows, with a fresh
        0..n-1 index.
    """
    values = df[column_name]
    # Going through ``astype(str)`` keeps the ``.str`` accessor usable on
    # non-string columns (it raises AttributeError on numeric dtypes), and
    # the ``notna`` mask stops missing values from being treated as text.
    has_content = values.notna() & values.astype(str).str.strip().ne("")
    return df[has_content].reset_index(drop=True)
|