alexpantex committed on
Commit
2c84f1d
·
verified ·
1 Parent(s): 290938c

Upload scripts/preprocess.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/preprocess.py +1 -1
scripts/preprocess.py CHANGED
@@ -29,7 +29,7 @@ def clean_corpus():
29
  """
30
  if not os.path.isfile(os.path.join(PROCESSED_DATA_PATH, 'prompt_corpus_clean.parquet')): # to speed up the process
31
  os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
32
- df = pd.read_parquet(PROMPTS_URL).sample(10000, random_state=123)
33
  assert 'prompt' in df.columns, "Parquet file must contain a 'prompt' column."
34
  df = df[df['prompt'].notna()][['prompt']] # drop missing rows
35
  df['prompt'] = df['prompt'].apply(preprocess_text) # preprocess each prompt
 
29
  """
30
  if not os.path.isfile(os.path.join(PROCESSED_DATA_PATH, 'prompt_corpus_clean.parquet')): # to speed up the process
31
  os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
32
+ df = pd.read_parquet(PROMPTS_URL).sample(100000, random_state=123)
33
  assert 'prompt' in df.columns, "Parquet file must contain a 'prompt' column."
34
  df = df[df['prompt'].notna()][['prompt']] # drop missing rows
35
  df['prompt'] = df['prompt'].apply(preprocess_text) # preprocess each prompt