gen-synth-data / pages /3_🌱 Generate Dataset.py
Ben Burtenshaw
fix index error in perspectives
7503ca9
raw
history blame
7.01 kB
import streamlit as st
from defaults import ARGILLA_URL
from hub import push_pipeline_params
from utils import project_sidebar
# Configure the browser tab, then draw the shared project sidebar.
st.set_page_config(page_title="Domain Data Grower", page_icon="🧑‍🌾")
project_sidebar()

################################################################################
# HEADER
################################################################################

st.header("🧑‍🌾 Domain Data Grower")
st.divider()
st.subheader("Step 3. Run the pipeline to generate synthetic data")
st.write("Define the distilabel pipeline for generating the dataset.")

# Project identity and credentials are read from the session state; each value
# is None when the corresponding key has not been set.
hub_username, project_name, hub_token = (
    st.session_state.get(state_key)
    for state_key in ("hub_username", "project_name", "hub_token")
)
###############################################################
# CONFIGURATION
###############################################################
# Pipeline configuration section: collects the inference endpoints, the
# generation parameters and the Argilla connection details. The values bound
# here are read again further down when the configuration is saved.
st.divider()
st.markdown("## 🧰 Pipeline Configuration")

st.write(
    "Now we need to define the configuration for the pipeline that will generate the synthetic data."
)
# Implicit string concatenation (instead of backslash continuations) keeps the
# rendered text independent of how this source file is indented.
st.write(
    "⚠️ Model and parameter choices significantly affect the quality of the generated data. "
    "We recommend that you start with generating a few samples and review the data. Then scale up from there. "
    "You can run the pipeline multiple times with different configurations and append it to the same Argilla dataset."
)

st.markdown("#### 🤖 Inference configuration")

st.write(
    "Add the url of the Huggingface inference API or endpoint that your pipeline should use. You can find compatible models here:"
)

# Collapsible list of suggested endpoints, ordered from most to least
# resource-hungry.
with st.expander("🤗 Recommended Models"):
    st.write("All inference endpoint compatible models can be found via the link below")
    st.link_button(
        "🤗 Inference compatible models on the hub",
        "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
    )
    st.write("🔋Projects with sufficient resources could take advantage of LLama3 70b")
    # The URL must point at the 70B model to match the recommendation above.
    st.code(
        "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B-Instruct"
    )

    st.write("🪫Projects with less resources could take advantage of LLama 3 8b")
    st.code(
        "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
    )

    st.write("🍃Projects with even less resources could use Phi-3-mini-4k-instruct")
    st.code(
        "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct"
    )

    st.write("Note Huggingface Pro gives access to more compute resources")
    st.link_button(
        "🤗 Huggingface Pro",
        "https://huggingface.co/pricing",
    )

# Endpoints used by the two generation steps; both default to Phi-3 mini.
self_instruct_base_url = st.text_input(
    label="Model base URL for instruction generation",
    value="https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
)
domain_expert_base_url = st.text_input(
    label="Model base URL for domain expert response",
    value="https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
)

st.divider()
st.markdown("#### 🧮 Parameters configuration")

# NOTE: the variable name keeps its original spelling ("intruct") because the
# rest of the script refers to it under this name.
self_intruct_num_generations = st.slider(
    "Number of generations for self-instruction", 1, 10, 2
)
domain_expert_num_generations = st.slider(
    "Number of generations for domain expert response", 1, 10, 2
)
self_instruct_temperature = st.slider("Temperature for self-instruction", 0.1, 1.0, 0.9)
domain_expert_temperature = st.slider("Temperature for domain expert", 0.1, 1.0, 0.9)

st.divider()
st.markdown("#### 🔬 Argilla API details to push the generated dataset")

# The dataset name defaults to the project name taken from the session state.
argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
argilla_dataset_name = st.text_input("Argilla Dataset Name", project_name)

st.divider()
###############################################################
# LOCAL
###############################################################
# Run section: once every required value is present, offer a button that
# pushes the configuration to the Hub and then shows how to run the pipeline
# locally.
st.markdown("## Run the pipeline")

st.markdown(
    "Once you've defined the pipeline configuration above, you can run the pipeline from your local machine."
)

# Every one of these values must be truthy before the config can be saved.
required_fields = [
    argilla_api_key,
    argilla_url,
    self_instruct_base_url,
    domain_expert_base_url,
    self_intruct_num_generations,
    domain_expert_num_generations,
    self_instruct_temperature,
    domain_expert_temperature,
    hub_username,
    project_name,
    hub_token,
    argilla_dataset_name,
]

if not all(required_fields):
    # Only prompt for missing values when something is actually missing; the
    # save button is not rendered in that case.
    st.info("Please fill all the required fields.")
elif st.button("💾 Save Pipeline Config"):
    with st.spinner("Pushing pipeline to the Hub..."):
        push_pipeline_params(
            pipeline_params={
                "argilla_api_key": argilla_api_key,
                "argilla_api_url": argilla_url,
                "argilla_dataset_name": argilla_dataset_name,
                "self_instruct_base_url": self_instruct_base_url,
                "domain_expert_base_url": domain_expert_base_url,
                "self_instruct_temperature": self_instruct_temperature,
                "domain_expert_temperature": domain_expert_temperature,
                # Key spelling is kept as-is: whatever reads the pushed params
                # may look it up under this exact name.
                "self_intruct_num_generations": self_intruct_num_generations,
                "domain_expert_num_generations": domain_expert_num_generations,
            },
            hub_username=hub_username,
            hub_token=hub_token,
            project_name=project_name,
        )

    st.success(
        f"Pipeline configuration pushed to the dataset repo {hub_username}/{project_name} on the Hub."
    )

    st.markdown(
        "To run the pipeline locally, you need to have the `distilabel` library installed. You can install it using the following command:"
    )
    st.code(
        """
# Install the distilabel library
pip install distilabel
""",
        language="bash",
    )

    st.markdown(
        "Next, you'll need to clone the pipeline repo and install its requirements:"
    )
    st.code(
        """
git clone https://github.com/huggingface/data-is-better-together
cd data-is-better-together/domain-specific-datasets/pipelines
pip install -r requirements.txt
""",
        language="bash",
    )

    st.markdown("Finally, you can run the pipeline using the following command:")
    st.code(
        f"""
huggingface-cli login
python domain_expert_pipeline.py {hub_username}/{project_name}""",
        language="bash",
    )

    st.markdown(
        "👩‍🚀 If you want to customise the pipeline take a look in `pipeline.py` and the [distilabel docs](https://distilabel.argilla.io/)"
    )
    st.markdown(
        "🚀 Once you've run the pipeline your records will be available in the Argilla space"
    )
    st.link_button("🔗 Argilla Space", argilla_url)

    st.markdown("Once you've reviewed the data, you can publish it on the next page:")
    st.page_link(
        page="pages/4_🔍 Review Generated Data.py",
        label="Review Generated Data",
        icon="🔍",
    )