import os

import pandas as pd
from datasets import load_dataset


def prepare_default_dataset_causal_language_modeling(path):
    """Prepare the default causal language modeling demo dataset.

    Downloads OpenAssistant/oasst2 (or reads a local parquet copy when
    H2O_LLM_STUDIO_DEMO_DATASETS is set), pairs each assistant reply with its
    prompter message, writes several filtered parquet splits to ``path``, and
    returns the English, rank-0 subset.
    """
    if os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS") is None:
        ds = load_dataset("OpenAssistant/oasst2")
        train = ds["train"].to_pandas()
        val = ds["validation"].to_pandas()
        df = pd.concat([train, val], axis=0).reset_index(drop=True)
    else:
        df = pd.read_parquet(
            os.path.join(
                os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS"),
                "causal_language_modeling.pq",
            )
        )

    # Split the message tree into assistant replies and their prompter parents.
    df_assistant = df[(df.role == "assistant")].copy()
    df_prompter = df[(df.role == "prompter")].copy()
    df_prompter = df_prompter.set_index("message_id")
    df_assistant["output"] = df_assistant["text"].values

    # Look up each assistant reply's parent prompt to form (instruction, output) pairs.
    inputs = []
    parent_ids = []
    for _, row in df_assistant.iterrows():
        input = df_prompter.loc[row.parent_id]
        inputs.append(input.text)
        parent_ids.append(input.parent_id)

    df_assistant["instruction"] = inputs
    df_assistant["parent_id"] = parent_ids

    df_assistant = df_assistant[
        ["instruction", "output", "message_id", "parent_id", "lang", "rank"]
    ].rename(columns={"message_id": "id"})

    # Export filtered variants: English-only and/or top-ranked (rank == 0) replies.
    df_assistant[(df_assistant["rank"] == 0.0) & (df_assistant["lang"] == "en")][
        ["instruction", "output", "id", "parent_id"]
    ].to_parquet(os.path.join(path, "train_full.pq"), index=False)

    df_assistant[df_assistant["lang"] == "en"][
        ["instruction", "output", "id", "parent_id"]
    ].to_parquet(os.path.join(path, "train_full_allrank.pq"), index=False)

    df_assistant[df_assistant["rank"] == 0.0][
        ["instruction", "output", "id", "parent_id"]
    ].to_parquet(os.path.join(path, "train_full_multilang.pq"), index=False)

    df_assistant[["instruction", "output", "id", "parent_id"]].to_parquet(
        os.path.join(path, "train_full_multilang_allrank.pq"), index=False
    )

    return df_assistant[(df_assistant["rank"] == 0.0) & (df_assistant["lang"] == "en")]


def prepare_default_dataset_dpo_modeling() -> pd.DataFrame:
    if os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS") is None:
        df = load_dataset("Intel/orca_dpo_pairs")["train"].to_pandas()
    else:
        df = pd.read_parquet(
            os.path.join(
                os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS"), "dpo_modeling.pq"
            )
        )
    return df


def prepare_default_dataset_classification_modeling() -> pd.DataFrame:
    if os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS") is None:
        df = load_dataset("stanfordnlp/imdb")["train"].to_pandas()
    else:
        df = pd.read_parquet(
            os.path.join(
                os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS"),
                "classification_modeling.pq",
            )
        )
    return df


def prepare_default_dataset_regression_modeling() -> pd.DataFrame:
    if os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS") is None:
        df = load_dataset("nvidia/HelpSteer2")["train"].to_pandas()
    else:
        df = pd.read_parquet(
            os.path.join(
                os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS"),
                "regression_modeling.pq",
            )
        )
    return df
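

# Usage sketch: a minimal, illustrative way to exercise these helpers locally.
# The temporary output directory and the print previews below are assumptions
# for demonstration only, not a documented interface of this module.
if __name__ == "__main__":
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Writes the train_full*.pq splits into tmp_dir and returns the
        # English, rank-0 (instruction, output) pairs.
        df_causal = prepare_default_dataset_causal_language_modeling(tmp_dir)
        print(sorted(os.listdir(tmp_dir)))
        print(df_causal.head())

    # The remaining helpers simply return the raw training split of each dataset.
    print(prepare_default_dataset_dpo_modeling().head())
    print(prepare_default_dataset_classification_modeling().head())
    print(prepare_default_dataset_regression_modeling().head())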