Spaces:
Sleeping
Sleeping
Ben Burtenshaw
committed on
Commit
·
fc828f1
1
Parent(s):
dfd3683
run pipeline locally
Browse files
- __pycache__/defaults.cpython-311.pyc +0 -0
- __pycache__/domain.cpython-311.pyc +0 -0
- __pycache__/hub.cpython-311.pyc +0 -0
- __pycache__/infer.cpython-311.pyc +0 -0
- __pycache__/pipeline.cpython-311.pyc +0 -0
- __pycache__/utils.cpython-311.pyc +0 -0
- pages/2_👩🏼🔬 Describe Domain.py +3 -1
- pages/3_🌱 Generate Dataset.py +79 -37
- pipeline.yaml +9 -33
- pipeline_params.json +0 -0
- utils.py +30 -3
__pycache__/defaults.cpython-311.pyc
ADDED
Binary file (2.32 kB). View file
|
|
__pycache__/domain.cpython-311.pyc
ADDED
Binary file (4.53 kB). View file
|
|
__pycache__/hub.cpython-311.pyc
ADDED
Binary file (5.78 kB). View file
|
|
__pycache__/infer.cpython-311.pyc
ADDED
Binary file (837 Bytes). View file
|
|
__pycache__/pipeline.cpython-311.pyc
ADDED
Binary file (8.2 kB). View file
|
|
__pycache__/utils.cpython-311.pyc
ADDED
Binary file (4.93 kB). View file
|
|
pages/2_👩🏼🔬 Describe Domain.py
CHANGED
@@ -11,7 +11,7 @@ from defaults import (
|
|
11 |
PIPELINE_PATH,
|
12 |
DATASET_REPO_ID,
|
13 |
)
|
14 |
-
from utils import project_sidebar
|
15 |
|
16 |
|
17 |
st.set_page_config(
|
@@ -212,6 +212,8 @@ domain_data = {
|
|
212 |
"topics": topics,
|
213 |
"examples": examples,
|
214 |
"domain_expert_prompt": domain_expert_prompt,
|
|
|
|
|
215 |
}
|
216 |
|
217 |
with open(SEED_DATA_PATH, "w") as f:
|
|
|
11 |
PIPELINE_PATH,
|
12 |
DATASET_REPO_ID,
|
13 |
)
|
14 |
+
from utils import project_sidebar, create_seed_terms, create_application_instruction
|
15 |
|
16 |
|
17 |
st.set_page_config(
|
|
|
212 |
"topics": topics,
|
213 |
"examples": examples,
|
214 |
"domain_expert_prompt": domain_expert_prompt,
|
215 |
+
"application_instruction": create_application_instruction(domain, examples),
|
216 |
+
"seed_terms": create_seed_terms(topics, perspectives),
|
217 |
}
|
218 |
|
219 |
with open(SEED_DATA_PATH, "w") as f:
|
pages/3_🌱 Generate Dataset.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import streamlit as st
|
2 |
|
3 |
from defaults import ARGILLA_URL
|
4 |
-
from hub import push_pipeline_params
|
5 |
from utils import project_sidebar
|
6 |
|
7 |
st.set_page_config(
|
@@ -20,16 +20,27 @@ st.divider()
|
|
20 |
st.subheader("Step 3. Run the pipeline to generate synthetic data")
|
21 |
st.write("Define the distilabel pipeline for generating the dataset.")
|
22 |
|
23 |
-
###############################################################
|
24 |
-
# CONFIGURATION
|
25 |
-
###############################################################
|
26 |
-
|
27 |
hub_username = st.session_state.get("hub_username")
|
28 |
project_name = st.session_state.get("project_name")
|
29 |
hub_token = st.session_state.get("hub_token")
|
30 |
|
|
|
|
|
|
|
|
|
31 |
st.divider()
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
st.markdown("#### 🤖 Inference configuration")
|
34 |
|
35 |
st.write(
|
@@ -43,13 +54,19 @@ with st.expander("🤗 Recommended Models"):
|
|
43 |
"https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
|
44 |
)
|
45 |
st.write("🔋Projects with sufficient resources could take advantage of LLama3 70b")
|
46 |
-
st.code(
|
|
|
|
|
47 |
|
48 |
st.write("🪫Projects with less resources could take advantage of LLama 3 8b")
|
49 |
-
st.code(
|
|
|
|
|
50 |
|
51 |
-
st.write("🍃Projects with even less resources could
|
52 |
-
st.code(
|
|
|
|
|
53 |
|
54 |
st.write("Note Hugggingface Pro gives access to more compute resources")
|
55 |
st.link_button(
|
@@ -58,10 +75,27 @@ with st.expander("🤗 Recommended Models"):
|
|
58 |
)
|
59 |
|
60 |
|
61 |
-
|
62 |
-
label="
|
63 |
-
value="https://api-inference.huggingface.co/models/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
st.divider()
|
66 |
st.markdown("#### 🔬 Argilla API details to push the generated dataset")
|
67 |
argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
|
@@ -84,30 +118,38 @@ if all(
|
|
84 |
[
|
85 |
argilla_api_key,
|
86 |
argilla_url,
|
87 |
-
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
89 |
project_name,
|
90 |
hub_token,
|
91 |
argilla_dataset_name,
|
92 |
]
|
93 |
-
):
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
|
|
|
|
|
|
111 |
)
|
112 |
|
113 |
st.markdown(
|
@@ -118,7 +160,7 @@ if all(
|
|
118 |
f"""
|
119 |
|
120 |
# Install the distilabel library
|
121 |
-
pip install
|
122 |
"""
|
123 |
)
|
124 |
|
@@ -126,8 +168,8 @@ if all(
|
|
126 |
|
127 |
st.code(
|
128 |
f"""
|
129 |
-
git clone https://
|
130 |
-
cd
|
131 |
pip install -r requirements.txt
|
132 |
"""
|
133 |
)
|
@@ -135,9 +177,9 @@ if all(
|
|
135 |
st.markdown("Finally, you can run the pipeline using the following command:")
|
136 |
|
137 |
st.code(
|
138 |
-
"""
|
139 |
huggingface-cli login
|
140 |
-
python
|
141 |
language="bash",
|
142 |
)
|
143 |
st.markdown(
|
|
|
1 |
import streamlit as st
|
2 |
|
3 |
from defaults import ARGILLA_URL
|
4 |
+
from hub import push_pipeline_params
|
5 |
from utils import project_sidebar
|
6 |
|
7 |
st.set_page_config(
|
|
|
20 |
st.subheader("Step 3. Run the pipeline to generate synthetic data")
|
21 |
st.write("Define the distilabel pipeline for generating the dataset.")
|
22 |
|
|
|
|
|
|
|
|
|
23 |
hub_username = st.session_state.get("hub_username")
|
24 |
project_name = st.session_state.get("project_name")
|
25 |
hub_token = st.session_state.get("hub_token")
|
26 |
|
27 |
+
###############################################################
|
28 |
+
# CONFIGURATION
|
29 |
+
###############################################################
|
30 |
+
|
31 |
st.divider()
|
32 |
|
33 |
+
st.markdown("## 🧰 Pipeline Configuration")
|
34 |
+
|
35 |
+
st.write(
|
36 |
+
"Now we need to define the configuration for the pipeline that will generate the synthetic data."
|
37 |
+
)
|
38 |
+
# Caution shown above the inference settings. Adjacent string literals are
# concatenated by the parser, so no source indentation can leak into the
# rendered message the way a backslash continuation inside a literal allows.
st.write(
    "⚠️ Model and parameter choice significantly affect the quality of the generated data. "
    "We recommend that you start with a few samples and review the data. Then scale up from there."
)
|
42 |
+
|
43 |
+
|
44 |
st.markdown("#### 🤖 Inference configuration")
|
45 |
|
46 |
st.write(
|
|
|
54 |
"https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
|
55 |
)
|
56 |
st.write("🔋Projects with sufficient resources could take advantage of LLama3 70b")
|
57 |
+
# Endpoint for the 70B model named in the st.write call just before this one
# (the 8B endpoint belongs to the next recommendation).
st.code(
    "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B-Instruct"
)
|
60 |
|
61 |
st.write("🪫Projects with less resources could take advantage of LLama 3 8b")
|
62 |
+
st.code(
|
63 |
+
"https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
|
64 |
+
)
|
65 |
|
66 |
+
st.write("🍃Projects with even less resources could use Phi-3-mini-4k-instruct")
|
67 |
+
st.code(
|
68 |
+
"https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct"
|
69 |
+
)
|
70 |
|
71 |
st.write("Note: Hugging Face Pro gives access to more compute resources")
|
72 |
st.link_button(
|
|
|
75 |
)
|
76 |
|
77 |
|
78 |
+
self_instruct_base_url = st.text_input(
|
79 |
+
label="Model base URL for instruction generation",
|
80 |
+
value="https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
|
81 |
+
)
|
82 |
+
domain_expert_base_url = st.text_input(
|
83 |
+
label="Model base URL for domain expert response",
|
84 |
+
value="https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
|
85 |
+
)
|
86 |
+
|
87 |
+
st.divider()
|
88 |
+
st.markdown("#### 🧮 Parameters configuration")
|
89 |
+
|
90 |
+
self_intruct_num_generations = st.slider(
|
91 |
+
"Number of generations for self-instruction", 1, 10, 2
|
92 |
)
|
93 |
+
domain_expert_num_generations = st.slider(
|
94 |
+
"Number of generations for domain expert", 1, 10, 2
|
95 |
+
)
|
96 |
+
self_instruct_temperature = st.slider("Temperature for self-instruction", 0.1, 1.0, 0.9)
|
97 |
+
domain_expert_temperature = st.slider("Temperature for domain expert", 0.1, 1.0, 0.9)
|
98 |
+
|
99 |
st.divider()
|
100 |
st.markdown("#### 🔬 Argilla API details to push the generated dataset")
|
101 |
argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
|
|
|
118 |
[
|
119 |
argilla_api_key,
|
120 |
argilla_url,
|
121 |
+
self_instruct_base_url,
|
122 |
+
domain_expert_base_url,
|
123 |
+
self_intruct_num_generations,
|
124 |
+
domain_expert_num_generations,
|
125 |
+
self_instruct_temperature,
|
126 |
+
domain_expert_temperature,
|
127 |
+
hub_username,
|
128 |
project_name,
|
129 |
hub_token,
|
130 |
argilla_dataset_name,
|
131 |
]
|
132 |
+
) and st.button("💾 Save Pipeline Config"):
|
133 |
+
with st.spinner("Pushing pipeline to the Hub..."):
|
134 |
+
push_pipeline_params(
|
135 |
+
pipeline_params={
|
136 |
+
"argilla_api_key": argilla_api_key,
|
137 |
+
"argilla_api_url": argilla_url,
|
138 |
+
"argilla_dataset_name": argilla_dataset_name,
|
139 |
+
"self_instruct_base_url": self_instruct_base_url,
|
140 |
+
"domain_expert_base_url": domain_expert_base_url,
|
141 |
+
"self_instruct_temperature": self_instruct_temperature,
|
142 |
+
"domain_expert_temperature": domain_expert_temperature,
|
143 |
+
"self_intruct_num_generations": self_intruct_num_generations,
|
144 |
+
"domain_expert_num_generations": domain_expert_num_generations,
|
145 |
+
},
|
146 |
+
hub_username=hub_username,
|
147 |
+
hub_token=hub_token,
|
148 |
+
project_name=project_name,
|
149 |
+
)
|
150 |
+
|
151 |
+
st.success(
|
152 |
+
f"Pipeline configuration pushed to the dataset repo {hub_username}/{project_name} on the Hub."
|
153 |
)
|
154 |
|
155 |
st.markdown(
|
|
|
160 |
f"""
|
161 |
|
162 |
# Install the distilabel library
|
163 |
+
pip install distilabel
|
164 |
"""
|
165 |
)
|
166 |
|
|
|
168 |
|
169 |
st.code(
|
170 |
f"""
|
171 |
+
git clone https://github.com/huggingface/data-is-better-together
|
172 |
+
cd data-is-better-together/domain-specific-datasets/pipelines
|
173 |
pip install -r requirements.txt
|
174 |
"""
|
175 |
)
|
|
|
177 |
st.markdown("Finally, you can run the pipeline using the following command:")
|
178 |
|
179 |
st.code(
|
180 |
+
f"""
|
181 |
huggingface-cli login
|
182 |
+
python domain_expert_pipeline.py {hub_username}/{project_name}""",
|
183 |
language="bash",
|
184 |
)
|
185 |
st.markdown(
|
pipeline.yaml
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
distilabel:
|
2 |
-
version: 1.0.
|
3 |
pipeline:
|
4 |
name: farming
|
5 |
description: null
|
@@ -10,31 +10,7 @@ pipeline:
|
|
10 |
output_mappings: {}
|
11 |
batch_size: 64
|
12 |
data:
|
13 |
-
- input:
|
14 |
-
- input: animal welfare from a Agribusiness perspective
|
15 |
-
- input: animal welfare from a Permaculture perspective
|
16 |
-
- input: animal welfare from a Agroforestery perspective
|
17 |
-
- input: animal welfare from a Conventional Farming perspective
|
18 |
-
- input: economic growth from a Family Farming perspective
|
19 |
-
- input: economic growth from a Agribusiness perspective
|
20 |
-
- input: economic growth from a Permaculture perspective
|
21 |
-
- input: economic growth from a Agroforestery perspective
|
22 |
-
- input: economic growth from a Conventional Farming perspective
|
23 |
-
- input: land from a Family Farming perspective
|
24 |
-
- input: land from a Agribusiness perspective
|
25 |
-
- input: land from a Permaculture perspective
|
26 |
-
- input: land from a Agroforestery perspective
|
27 |
-
- input: land from a Conventional Farming perspective
|
28 |
-
- input: resources from a Family Farming perspective
|
29 |
-
- input: resources from a Agribusiness perspective
|
30 |
-
- input: resources from a Permaculture perspective
|
31 |
-
- input: resources from a Agroforestery perspective
|
32 |
-
- input: resources from a Conventional Farming perspective
|
33 |
-
- input: efficiency from a Family Farming perspective
|
34 |
-
- input: efficiency from a Agribusiness perspective
|
35 |
-
- input: efficiency from a Permaculture perspective
|
36 |
-
- input: efficiency from a Agroforestery perspective
|
37 |
-
- input: efficiency from a Conventional Farming perspective
|
38 |
runtime_parameters_info:
|
39 |
- name: batch_size
|
40 |
optional: true
|
@@ -54,7 +30,7 @@ pipeline:
|
|
54 |
model_id: null
|
55 |
endpoint_name: null
|
56 |
endpoint_namespace: null
|
57 |
-
base_url: https://
|
58 |
tokenizer_id: null
|
59 |
model_display_name: null
|
60 |
use_openai_client: false
|
@@ -75,14 +51,14 @@ pipeline:
|
|
75 |
Blend interrogative (e.g., "What is the significance of x?") and imperative
|
76 |
(e.g., "Detail the process of x.") styles.'
|
77 |
application_description: 'You are an AI assistant than generates queries around
|
78 |
-
the domain of
|
79 |
|
80 |
Your should not expect basic but profound questions from your users.
|
81 |
|
82 |
The queries should reflect a diversity of vision and economic positions and
|
83 |
political positions.
|
84 |
|
85 |
-
The queries may know about different methods of
|
86 |
|
87 |
The queries can be positioned politically, economically, socially, or practically.
|
88 |
|
@@ -163,7 +139,7 @@ pipeline:
|
|
163 |
model_id: null
|
164 |
endpoint_name: null
|
165 |
endpoint_namespace: null
|
166 |
-
base_url: https://
|
167 |
tokenizer_id: null
|
168 |
model_display_name: null
|
169 |
use_openai_client: false
|
@@ -390,7 +366,7 @@ pipeline:
|
|
390 |
model_id: null
|
391 |
endpoint_name: null
|
392 |
endpoint_namespace: null
|
393 |
-
base_url: https://
|
394 |
tokenizer_id: null
|
395 |
model_display_name: null
|
396 |
use_openai_client: false
|
@@ -489,9 +465,9 @@ pipeline:
|
|
489 |
generation: domain_expert_answer
|
490 |
output_mappings: {}
|
491 |
input_batch_size: 50
|
492 |
-
dataset_name:
|
493 |
dataset_workspace: admin
|
494 |
-
api_url: https://argilla-
|
495 |
runtime_parameters_info:
|
496 |
- name: input_batch_size
|
497 |
optional: true
|
|
|
1 |
distilabel:
|
2 |
+
version: 1.0.1
|
3 |
pipeline:
|
4 |
name: farming
|
5 |
description: null
|
|
|
10 |
output_mappings: {}
|
11 |
batch_size: 64
|
12 |
data:
|
13 |
+
- input: punctures from a Retro bikes perspective
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
runtime_parameters_info:
|
15 |
- name: batch_size
|
16 |
optional: true
|
|
|
30 |
model_id: null
|
31 |
endpoint_name: null
|
32 |
endpoint_namespace: null
|
33 |
+
base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta
|
34 |
tokenizer_id: null
|
35 |
model_display_name: null
|
36 |
use_openai_client: false
|
|
|
51 |
Blend interrogative (e.g., "What is the significance of x?") and imperative
|
52 |
(e.g., "Detail the process of x.") styles.'
|
53 |
application_description: 'You are an AI assistant than generates queries around
|
54 |
+
the domain of Bicycle maintenance.
|
55 |
|
56 |
Your should not expect basic but profound questions from your users.
|
57 |
|
58 |
The queries should reflect a diversity of vision and economic positions and
|
59 |
political positions.
|
60 |
|
61 |
+
The queries may know about different methods of Bicycle maintenance.
|
62 |
|
63 |
The queries can be positioned politically, economically, socially, or practically.
|
64 |
|
|
|
139 |
model_id: null
|
140 |
endpoint_name: null
|
141 |
endpoint_namespace: null
|
142 |
+
base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta
|
143 |
tokenizer_id: null
|
144 |
model_display_name: null
|
145 |
use_openai_client: false
|
|
|
366 |
model_id: null
|
367 |
endpoint_name: null
|
368 |
endpoint_namespace: null
|
369 |
+
base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta
|
370 |
tokenizer_id: null
|
371 |
model_display_name: null
|
372 |
use_openai_client: false
|
|
|
465 |
generation: domain_expert_answer
|
466 |
output_mappings: {}
|
467 |
input_batch_size: 50
|
468 |
+
dataset_name: bicycle_maintenance
|
469 |
dataset_workspace: admin
|
470 |
+
api_url: https://burtenshaw-bicycle-maintenance-argilla-space.hf.space
|
471 |
runtime_parameters_info:
|
472 |
- name: input_batch_size
|
473 |
optional: true
|
pipeline_params.json
ADDED
File without changes
|
utils.py
CHANGED
@@ -1,13 +1,13 @@
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
|
3 |
from defaults import (
|
4 |
-
ARGILLA_SPACE_REPO_ID,
|
5 |
PROJECT_NAME,
|
6 |
ARGILLA_URL,
|
7 |
DIBT_PARENT_APP_URL,
|
8 |
DATASET_URL,
|
9 |
DATASET_REPO_ID,
|
10 |
-
ARGILLA_SPACE_REPO_ID,
|
11 |
)
|
12 |
|
13 |
|
@@ -48,8 +48,35 @@ def project_sidebar():
|
|
48 |
st.sidebar.divider()
|
49 |
|
50 |
st.sidebar.link_button("🧑🌾 New Project", DIBT_PARENT_APP_URL)
|
51 |
-
|
52 |
if st.session_state["hub_token"] is None:
|
53 |
st.error("Please provide a Hub token to generate answers")
|
54 |
st.stop()
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from textwrap import dedent
|
2 |
+
|
3 |
import streamlit as st
|
4 |
|
5 |
from defaults import (
|
|
|
6 |
PROJECT_NAME,
|
7 |
ARGILLA_URL,
|
8 |
DIBT_PARENT_APP_URL,
|
9 |
DATASET_URL,
|
10 |
DATASET_REPO_ID,
|
|
|
11 |
)
|
12 |
|
13 |
|
|
|
48 |
st.sidebar.divider()
|
49 |
|
50 |
st.sidebar.link_button("🧑🌾 New Project", DIBT_PARENT_APP_URL)
|
51 |
+
|
52 |
if st.session_state["hub_token"] is None:
|
53 |
st.error("Please provide a Hub token to generate answers")
|
54 |
st.stop()
|
55 |
|
56 |
+
|
57 |
+
def create_seed_terms(topics: list[str], perspectives: list[str]) -> list[str]:
    """Build the seed terms that Self-Instruct starts from.

    Every topic is paired with every perspective (topic-major order) and
    rendered as ``"<topic> from a <perspective> perspective"``.
    """
    seed_terms = []
    for topic in topics:
        for perspective in perspectives:
            seed_terms.append(f"{topic} from a {perspective} perspective")
    return seed_terms
|
65 |
+
|
66 |
+
|
67 |
+
def create_application_instruction(domain: str, examples: list[dict[str, str]]) -> str:
    """Create the application instruction (system prompt) for the Self-Instruct task.

    The prompt is assembled from explicit lines instead of a dedented
    triple-quoted literal: the first sentence used to sit directly after the
    opening quotes, which leaves ``textwrap.dedent`` with no common margin to
    strip, so any source indentation of the following lines would end up in
    the prompt.

    Args:
        domain: Name of the domain, interpolated into the prompt.
        examples: Mappings with ``"question"`` and ``"answer"`` keys; each one
            is appended to the prompt as a bullet pair. ``None`` or an empty
            list adds nothing.

    Returns:
        The prompt text followed by one ``- Question`` / ``- Answer`` pair per
        example.

    Raises:
        KeyError: If an example lacks ``"question"`` or ``"answer"``.
    """
    prompt_lines = [
        f"You are an AI assistant that generates queries around the domain of {domain}.",
        "You should not expect basic but profound questions from your users.",
        "The queries should reflect a diversity of vision and economic positions and political positions.",
        f"The queries may know about different methods of {domain}.",
        "The queries can be positioned politically, economically, socially, or practically.",
        "Also take into account the impact of diverse causes on diverse domains.",
    ]
    system_prompt = "\n".join(prompt_lines)

    # Build all example bullets first and join once instead of growing the
    # string with += inside the loop.
    example_blocks = [
        f"\n- Question: {example['question']}\n- Answer: {example['answer']}\n"
        for example in examples or ()
    ]
    return system_prompt + "".join(example_blocks)
|