andreasmartin committed on
Commit
05f3fd9
·
1 Parent(s): e547dc0

deepnote update

Browse files
Files changed (1) hide show
  1. util.py +8 -5
util.py CHANGED
@@ -1,5 +1,6 @@
1
  import pandas as pd
2
  from langchain.docstore.document import Document
 
3
 
4
  SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
5
  SHEET_URL_Y = "/edit#gid="
@@ -80,13 +81,15 @@ def duplicate_rows_with_synonyms(df: pd.DataFrame, column: str, synonyms: list[l
80
  new_rows = []
81
  for index, row in df.iterrows():
82
  new_rows.append(row)
 
83
  for synonym_list in synonyms:
84
- for word in row[column].split():
85
- if word in synonym_list:
86
- for synonym in synonym_list:
87
- if synonym != word:
 
88
  new_row = row.copy()
89
- new_row[column] = row[column].replace(word, synonym)
90
  new_rows.append(new_row)
91
  new_df = pd.DataFrame(new_rows, columns=df.columns)
92
  new_df = new_df.reset_index(drop=True)
 
1
  import pandas as pd
2
  from langchain.docstore.document import Document
3
+ import re
4
 
5
  SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
6
  SHEET_URL_Y = "/edit#gid="
 
81
  new_rows = []
82
  for index, row in df.iterrows():
83
  new_rows.append(row)
84
+ text = row[column]
85
  for synonym_list in synonyms:
86
+ for synonym in synonym_list:
87
+ pattern = r'\b(?:{}|{}(?:s|es|ed|ing)?)\b'.format(synonym, synonym)
88
+ if re.search(pattern, text):
89
+ for replacement in synonym_list:
90
+ if replacement != synonym:
91
  new_row = row.copy()
92
+ new_row[column] = re.sub(pattern, replacement, text)
93
  new_rows.append(new_row)
94
  new_df = pd.DataFrame(new_rows, columns=df.columns)
95
  new_df = new_df.reset_index(drop=True)