Spaces: Sleeping
Upload scripts/preprocess.py with huggingface_hub
Browse files
- scripts/preprocess.py +1 -1
scripts/preprocess.py
CHANGED
@@ -29,7 +29,7 @@ def clean_corpus():
|
|
29 |
"""
|
30 |
if not os.path.isfile(os.path.join(PROCESSED_DATA_PATH, 'prompt_corpus_clean.parquet')): # to speed up the process
|
31 |
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
|
32 |
-
df = pd.read_parquet(PROMPTS_URL).sample(
|
33 |
assert 'prompt' in df.columns, "Parquet file must contain a 'prompt' column."
|
34 |
df = df[df['prompt'].notna()][['prompt']] # drop missing rows
|
35 |
df['prompt'] = df['prompt'].apply(preprocess_text) # preprocess each prompt
|
|
|
29 |
"""
|
30 |
if not os.path.isfile(os.path.join(PROCESSED_DATA_PATH, 'prompt_corpus_clean.parquet')): # to speed up the process
|
31 |
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
|
32 |
+
df = pd.read_parquet(PROMPTS_URL).sample(100000, random_state=123)
|
33 |
assert 'prompt' in df.columns, "Parquet file must contain a 'prompt' column."
|
34 |
df = df[df['prompt'].notna()][['prompt']] # drop missing rows
|
35 |
df['prompt'] = df['prompt'].apply(preprocess_text) # preprocess each prompt
|