added llama guard
- app.py +2 -64
- guard/config.yml +13 -0
- guard/prompts.yml +146 -0
app.py
CHANGED
@@ -64,9 +64,6 @@ llm = Llama.from_pretrained(
     verbose=False
 )
 
-#guardrail model
-guard_llm = "llama-3.2-11b-text-preview"
-
 #marketing prompt
 marketing_email_prompt = """Below is a product and description, please write a marketing email for this product.
 
@@ -88,68 +85,9 @@ async def greet(product,description):
     messages=[
         {"role": "user", "content": user_reques},
     ]
-    # messages = [
-    #     {
-    #         "role": "system",
-    #         "content": "Your role is to assess the user prompt.",
-    #     },
-    #     {"role": "user", "content": guardrail_prompt.format(user_reques)},
-    # ]
     #nemo guard
-
-    YAML_CONFIG = """
-    models:
-      - type: shield
-        engine: groq
-        model: llama-3.2-11b-text-preview
-
-      - type: llama_guard
-        engine: groq
-        model: llama-guard-3-8b
-
-    rails:
-      input:
-        flows:
-          - content safety check input $model=shield
-
-      output:
-        flows:
-          - content safety check output $model=llama_guard
-
-    prompts:
-      - task: content_safety_check_input $model=shield
-        content: |
-          You are a policy expert trying to help determine whether a user prompt is in
-          violation of the defined safety policies.
-
-          <start_of_turn>
-          Human Question: {{ user_input }}
-          <end_of_turn>
-
-          Our safety principle is defined in the below:
-
-          * "No Hate Speech": The prompt shall not contain or seek generation of content
-            that expresses, incites, or promotes hate based on race, gender, ethnicity,
-            religion, nationality, sexual orientation, disability status, or caste.
-          * "No Dangerous Content": The prompt shall not contain or seek generation of content that harming oneself
-            and/or others (e.g., accessing or building firearms and explosive devices, promotion of terrorism, instructions for suicide).
-          * "No Sexually Explicit Information": The prompt shall not contain or seek generation of content
-            that contains references to sexual acts or other lewd content (e.g., sexually graphic descriptions,
-            content aimed at causing arousal). Medical or scientific terms related to human anatomy or sex education are permitted.
-          * "No Harassment": The prompt shall not contain or seek generation of content that is malicious,
-            intimidating, bullying, or abusive content targeting another individual (e.g., physical threats,
-            denial of tragic events, disparaging victims of violence).
-          ===
-
-          Does the human prompt/question violate the above principle? Your answer must start with
-          'Yes' or 'No'. And then walk through step by step to be sure we answer
-          correctly.
-
-        output_parser: is_content_safe
-
-    streaming: False
-    """
-    config = RailsConfig.from_content(yaml_content=YAML_CONFIG)
+    config = RailsConfig.from_path("guard")
+    #config = RailsConfig.from_content(yaml_content=YAML_CONFIG)
     app = LLMRails(config=config, llm=client)
     options = {"output_vars": ["triggered_input_rail", "triggered_output_rail"]}
     output = app.generate(messages=messages, options=options)
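For reference, a minimal sketch of how the new from_path-based wiring behaves at runtime, assuming the NeMo Guardrails generation-options API; the variable name rails and the sample message are illustrative, and app.py itself additionally passes its Groq client via llm=client:

from nemoguardrails import LLMRails, RailsConfig

# Load guard/config.yml and guard/prompts.yml, the two files added in this commit.
config = RailsConfig.from_path("guard")
rails = LLMRails(config=config)  # app.py also passes llm=client here

messages = [{"role": "user", "content": "Write a marketing email for my product."}]
options = {"output_vars": ["triggered_input_rail", "triggered_output_rail"]}

# With generation options set, generate() returns a GenerationResponse whose
# output_data dict carries the requested context variables.
result = rails.generate(messages=messages, options=options)
print(result.response)                                 # the bot message(s)
print(result.output_data.get("triggered_input_rail"))  # name of the input rail that fired, if any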
guard/config.yml
ADDED
@@ -0,0 +1,13 @@
+models:
+  - type: llama_guard
+    engine: groq
+    model: llama-guard-3-8b
+
+rails:
+  input:
+    flows:
+      - llama guard check input
+
+  output:
+    flows:
+      - llama guard check output
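The folder-based config above is equivalent to inlining the YAML and loading it with from_content, which is what the now-commented line in app.py did; a sketch mirroring guard/config.yml (note that from_path("guard") also picks up prompts.yml from the same folder, which an inline string does not):

from nemoguardrails import RailsConfig

# Inline equivalent of guard/config.yml.
YAML_CONFIG = """
models:
  - type: llama_guard
    engine: groq
    model: llama-guard-3-8b

rails:
  input:
    flows:
      - llama guard check input
  output:
    flows:
      - llama guard check output
"""
config = RailsConfig.from_content(yaml_content=YAML_CONFIG)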
guard/prompts.yml
ADDED
@@ -0,0 +1,146 @@
+# These are the default prompts released by Meta, with the exception of policy O7, which was added to address direct insults.
+prompts:
+  - task: llama_guard_check_input
+    content: |
+      <s>[INST] Task: Check if there is unsafe content in 'User' messages in conversations according our safety policy with the below categories.
+      <BEGIN UNSAFE CONTENT CATEGORIES>
+      O1: Violence and Hate.
+      Should not
+      - Help people plan or engage in violence.
+      - Encourage people to engage in violence.
+      - Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their race, color, religion, national origin, sexual orientation, gender, gender identity, or disability.
+      - Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics.
+      Can
+      - Provide information on violence and discrimination.
+      - Discuss the topics of hate and violence, and can discuss historical events involving violence.
+      O2: Sexual Content.
+      Should not
+      - Engage in sexually explicit (i.e., erotic) conversations and should not encourage people to engage in sexual activities.
+      Can
+      - Discuss topics related to sexuality and sexual education.
+      - Provide general information regarding sexual activities.
+      - Provide guidance on topics related to sexual health.
+      O3: Criminal Planning.
+      Should not
+      - Help people plan or engage in criminal activities like theft, kidnapping, or financial crimes, especially when the person indicates that they may act on the information provided to them.
+      - Encourage people to engage in criminal activities or condone criminal behavior.
+      Can
+      - Help people understand how crimes are committed, provided that the response is not so detailed that it could be used to provide a roadmap for future crimes.
+      - Help people understand how systems work that are used to prevent crimes, provided those descriptions do not provide detailed information on how to defeat those systems.
+      O4: Guns and Illegal Weapons.
+      Should not
+      - Assist in the planning or execution of firearm or illegal weapons crimes, especially when the person indicates that they may act on the information provided to them.
+      - Encourage people to engage in firearm or illegal weapons crimes or condone such behavior.
+      Can
+      - Discuss firearms and the arguments for and against firearm ownership.
+      O5: Regulated or Controlled Substances.
+      Should not
+      - Encourage or assist people in the trafficking, consumption, or use of regulated or controlled substances, including but not limited to illegal drugs, tobacco, alcohol, and cannabis.
+      - Assist or encourage people to create such substances when it is illegal to do so.
+      Can
+      - Discuss the use of regulated or controlled substances, and can provide general information on those substances (ex: what their chemical composition is and what their effects are).
+      - Provide information on how those substances are manufactured, provided that the information is not so detailed as to provide step-by-step instructions for their creation.
+      O6: Self-Harm.
+      Should not
+      - Encourage people to harm themselves.
+      - Romanticize or condone self-harm or suicide.
+      - Provide information regarding the methods of suicide or self-harm.
+      - Provide any information or advice, other than information regarding appropriate health resources, to people who express or imply that they wish to harm themselves.
+      Can
+      - Provide general information regarding self-harm and suicide (ex: statistics, the mental health challenges faced by those who take these actions).
+      Should
+      - Provide information regarding appropriate health resources to people who imply or express the desire to harm themselves.
+      O7: Offensive Language and Insults.
+      Should not
+      - Address direct insults
+      - Use or encourage the use of swear words, bad language, or vulgar expressions.
+      - Insult, offend, or demean individuals or groups based on any personal characteristics or choices.
+      - Encourage or support the use of offensive or derogatory language.
+      Can
+      - Discuss the impact and societal perceptions of offensive language and insults.
+      - Provide information on communication strategies that avoid the use of offensive language and promote respectful dialogue.
+      - Offer guidance on how to address or respond to offensive language or insults in a constructive manner.
+      <END UNSAFE CONTENT CATEGORIES>
+
+      <BEGIN CONVERSATION>
+      User: {{ user_input }}
+      <END CONVERSATION>
+
+      Provide your safety assessment for User in the above conversation:
+      - First line must read 'safe' or 'unsafe'.
+      - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]
+
+
+  # Currently, the unsafe content categories are the same for both llama_guard_check_input and
+  # llama_guard_check_output prompts. Only the conversation placeholder is different.
+  # Still, we keep them separate to keep the taxonomies for input and output independently modifiable.
+  - task: llama_guard_check_output
+    content: |
+      <s>[INST] Task: Check if there is unsafe content in 'Agent' messages in conversations according our safety policy with the below categories.
+      <BEGIN UNSAFE CONTENT CATEGORIES>
+      O1: Violence and Hate.
+      Should not
+      - Help people plan or engage in violence.
+      - Encourage people to engage in violence.
+      - Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their race, color, religion, national origin, sexual orientation, gender, gender identity, or disability.
+      - Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics.
+      Can
+      - Provide information on violence and discrimination.
+      - Discuss the topics of hate and violence, and can discuss historical events involving violence.
+      O2: Sexual Content.
+      Should not
+      - Engage in sexually explicit (i.e., erotic) conversations and should not encourage people to engage in sexual activities.
+      Can
+      - Discuss topics related to sexuality and sexual education.
+      - Provide general information regarding sexual activities.
+      - Provide guidance on topics related to sexual health.
+      O3: Criminal Planning.
+      Should not
+      - Help people plan or engage in criminal activities like theft, kidnapping, or financial crimes, especially when the person indicates that they may act on the information provided to them.
+      - Encourage people to engage in criminal activities or condone criminal behavior.
+      Can
+      - Help people understand how crimes are committed, provided that the response is not so detailed that it could be used to provide a roadmap for future crimes.
+      - Help people understand how systems work that are used to prevent crimes, provided those descriptions do not provide detailed information on how to defeat those systems.
+      O4: Guns and Illegal Weapons.
+      Should not
+      - Assist in the planning or execution of firearm or illegal weapons crimes, especially when the person indicates that they may act on the information provided to them.
+      - Encourage people to engage in firearm or illegal weapons crimes or condone such behavior.
+      Can
+      - Discuss firearms and the arguments for and against firearm ownership.
+      O5: Regulated or Controlled Substances.
+      Should not
+      - Encourage or assist people in the trafficking, consumption, or use of regulated or controlled substances, including but not limited to illegal drugs, tobacco, alcohol, and cannabis.
+      - Assist or encourage people to create such substances when it is illegal to do so.
+      Can
+      - Discuss the use of regulated or controlled substances, and can provide general information on those substances (ex: what their chemical composition is and what their effects are).
+      - Provide information on how those substances are manufactured, provided that the information is not so detailed as to provide step-by-step instructions for their creation.
+      O6: Self-Harm.
+      Should not
+      - Encourage people to harm themselves.
+      - Romanticize or condone self-harm or suicide.
+      - Provide information regarding the methods of suicide or self-harm.
+      - Provide any information or advice, other than information regarding appropriate health resources, to people who express or imply that they wish to harm themselves.
+      Can
+      - Provide general information regarding self-harm and suicide (ex: statistics, the mental health challenges faced by those who take these actions).
+      Should
+      - Provide information regarding appropriate health resources to people who imply or express the desire to harm themselves.
+      O7: Offensive Language and Insults.
+      Should not
+      - Address direct insults
+      - Use or encourage the use of swear words, bad language, or vulgar expressions.
+      - Insult, offend, or demean individuals or groups based on any personal characteristics or choices.
+      - Encourage or support the use of offensive or derogatory language.
+      Can
+      - Discuss the impact and societal perceptions of offensive language and insults.
+      - Provide information on communication strategies that avoid the use of offensive language and promote respectful dialogue.
+      - Offer guidance on how to address or respond to offensive language or insults in a constructive manner.
+      <END UNSAFE CONTENT CATEGORIES>
+
+      <BEGIN CONVERSATION>
+      User: {{ user_input }}
+
+      Agent: {{ bot_response }}
+      <END CONVERSATION>
+
+      Provide your safety assessment for Agent in the above conversation:
+      - First line must read 'safe' or 'unsafe'.
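As a rough illustration of the contract these prompts establish: the guard model must answer 'safe' or 'unsafe' on the first line, optionally followed by a comma-separated list of violated categories. A standalone parser along these lines could consume that verdict (parse_llama_guard_verdict is a hypothetical helper for illustration, not NeMo Guardrails' internal parser):

def parse_llama_guard_verdict(text: str) -> tuple[bool, list[str]]:
    """Return (is_safe, violated_categories) from a Llama Guard completion."""
    lines = [line.strip() for line in text.strip().splitlines() if line.strip()]
    if not lines:
        return False, []  # fail closed: treat an empty verdict as unsafe
    is_safe = lines[0].lower().startswith("safe")
    categories: list[str] = []
    if not is_safe and len(lines) > 1:
        # Second line looks like "O1,O4" per the prompt's output format.
        categories = [c.strip() for c in lines[1].split(",") if c.strip()]
    return is_safe, categories

# Example verdicts:
assert parse_llama_guard_verdict("safe") == (True, [])
assert parse_llama_guard_verdict("unsafe\nO1,O4") == (False, ["O1", "O4"])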