dolphinium
committed on
Commit
·
2af2760
1
Parent(s):
6e9ade9
suggestion to obligation on filter query part.
Browse files- llm_prompts.py +51 -28
llm_prompts.py
CHANGED
@@ -32,36 +32,37 @@ def get_analysis_plan_prompt(natural_language_query, chat_history, search_fields
|
|
32 |
# The search_fields are now pre-mapped, so we can use them directly
|
33 |
formatted_fields = "\n".join([f" - {field['field_name']}: {field['field_value']}" for field in search_fields])
|
34 |
dynamic_fields_prompt_section = f"""
|
35 |
-
---
|
36 |
-
### DYNAMIC
|
37 |
|
38 |
-
An external API has
|
39 |
-
**
|
40 |
-
|
41 |
-
|
42 |
|
43 |
-
**
|
44 |
{formatted_fields}
|
45 |
"""
|
46 |
|
47 |
return f"""
|
48 |
You are an expert data analyst and Solr query engineer. Your task is to convert a natural language question into a structured JSON "Analysis Plan". This plan will be used to run two separate, efficient queries: one for aggregate data (facets) and one for finding illustrative examples (grouping).
|
49 |
|
50 |
-
Your most important job is to think like an analyst and choose a `analysis_dimension` that provides a meaningful, non-obvious breakdown of the data.
|
51 |
|
52 |
---
|
53 |
### CONTEXT & RULES
|
54 |
|
55 |
1. **Today's Date for Calculations**: {datetime.datetime.now().date().strftime("%Y-%m-%d")}
|
56 |
-
2. **
|
57 |
-
3. **
|
|
|
58 |
* For `group.sort`: If `analysis_measure` involves a function on a field (e.g., `sum(total_deal_value_in_million)`), you MUST use the full function: `group.sort: 'sum(total_deal_value_in_million) desc'`.
|
59 |
* If `analysis_measure` is 'count', you MUST OMIT the `group.sort` parameter entirely.
|
60 |
* For sorting, NEVER use 'date_year' directly for `sort` in `terms` facets; use 'index asc' or 'index desc' instead. For other sorts, use 'date'.
|
61 |
-
|
62 |
-
* We need to show user **standout examples** for each category chosen.
|
63 |
-
For example: if user asks for "USA approved drugs last 5 years" We need to show user standout examples for each year. In this context: standout means the news with the biggest deals in million for each year for example.
|
64 |
-
|
65 |
|
66 |
---
|
67 |
### HOW TO CHOOSE THE ANALYSIS DIMENSION AND MEASURE (ANALYTICAL STRATEGY)
|
@@ -99,17 +100,24 @@ This is the most critical part of your task. A bad choice leads to a useless, bo
|
|
99 |
### EXAMPLES
|
100 |
|
101 |
**User Query 1:** "What are the top 5 companies by total deal value in 2023?"
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
**Correct JSON Output 1:**
|
103 |
```json
|
104 |
{{
|
105 |
"reasoning": {{
|
106 |
"dimension_choice": "User explicitly asked for 'top 5 companies', so 'company_name' is the correct dimension.",
|
107 |
-
"measure_choice": "User explicitly asked for 'total deal value', so 'sum(total_deal_value_in_million)' is the correct measure."
|
|
|
108 |
}},
|
109 |
"analysis_dimension": "company_name",
|
110 |
"analysis_measure": "sum(total_deal_value_in_million)",
|
111 |
"sort_field_for_examples": "total_deal_value_in_million",
|
112 |
-
"query_filter": "
|
113 |
"quantitative_request": {{
|
114 |
"json.facet": {{
|
115 |
"companies_by_deal_value": {{
|
@@ -126,25 +134,32 @@ This is the most critical part of your task. A bad choice leads to a useless, bo
|
|
126 |
"qualitative_request": {{
|
127 |
"group": true,
|
128 |
"group.field": "company_name",
|
129 |
-
"group.limit":
|
130 |
"group.sort": "sum(total_deal_value_in_million) desc",
|
131 |
"sort": "total_deal_value_in_million desc"
|
132 |
}}
|
133 |
}}
|
134 |
```
|
135 |
|
136 |
-
**User Query 2:** "What are the most common news types for infections
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
**Correct JSON Output 2:**
|
138 |
```json
|
139 |
{{
|
140 |
"reasoning": {{
|
141 |
-
"dimension_choice": "User asked for 'most common news types', so 'news_type' is the correct dimension.",
|
142 |
-
"measure_choice": "User asked for 'most common', which implies counting occurrences. Therefore, the measure is 'count'."
|
|
|
143 |
}},
|
144 |
"analysis_dimension": "news_type",
|
145 |
"analysis_measure": "count",
|
146 |
"sort_field_for_examples": "date",
|
147 |
-
"query_filter": "therapeutic_category_s:infections AND
|
148 |
"quantitative_request": {{
|
149 |
"json.facet": {{
|
150 |
"news_by_type": {{
|
@@ -158,7 +173,7 @@ This is the most critical part of your task. A bad choice leads to a useless, bo
|
|
158 |
"qualitative_request": {{
|
159 |
"group": true,
|
160 |
"group.field": "news_type",
|
161 |
-
"group.limit":
|
162 |
"group.sort": "sum(total_deal_value_in_million) desc",
|
163 |
"sort": "total_deal_value_in_million desc"
|
164 |
}}
|
@@ -167,18 +182,26 @@ This is the most critical part of your task. A bad choice leads to a useless, bo
|
|
167 |
|
168 |
|
169 |
|
170 |
-
**User Query
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
**Correct JSON Output 3:**
|
172 |
```json
|
173 |
{{
|
174 |
"reasoning": {{
|
175 |
-
"dimension_choice": "The user wants to compare
|
176 |
-
"measure_choice": "The user explicitly asks to compare 'deal values', so 'sum(total_deal_value_in_million)' is the correct measure."
|
|
|
177 |
}},
|
178 |
"analysis_dimension": "route_branch",
|
179 |
"analysis_measure": "sum(total_deal_value_in_million)",
|
180 |
"sort_field_for_examples": "total_deal_value_in_million",
|
181 |
-
"query_filter": "drug_delivery_branch_s:(injection OR oral) AND therapeutic_category_s:infections
|
182 |
"quantitative_request": {{
|
183 |
"json.facet": {{
|
184 |
"deal_values_by_route": {{
|
@@ -195,7 +218,7 @@ This is the most critical part of your task. A bad choice leads to a useless, bo
|
|
195 |
"qualitative_request": {{
|
196 |
"group": true,
|
197 |
"group.field": "route_branch",
|
198 |
-
"group.limit":
|
199 |
"group.sort": "sum(total_deal_value_in_million) desc",
|
200 |
"sort": "total_deal_value_in_million desc"
|
201 |
}}
|
@@ -204,7 +227,7 @@ This is the most critical part of your task. A bad choice leads to a useless, bo
|
|
204 |
---
|
205 |
### YOUR TASK
|
206 |
|
207 |
-
Convert the following user query into a single, raw JSON "Analysis Plan" object. Strictly follow all rules, especially the
|
208 |
|
209 |
**Current User Query:** `{natural_language_query}`
|
210 |
"""
|
|
|
32 |
# The search_fields are now pre-mapped, so we can use them directly
|
33 |
formatted_fields = "\n".join([f" - {field['field_name']}: {field['field_value']}" for field in search_fields])
|
34 |
dynamic_fields_prompt_section = f"""
|
35 |
+
---
|
36 |
+
### MANDATORY DYNAMIC FILTERS
|
37 |
|
38 |
+
An external API has identified the following field-value pairs from the user query.
|
39 |
+
**You MUST use ALL of these fields and values to construct the `query_filter`.**
|
40 |
+
- Construct the `query_filter` by combining these key-value pairs using the 'AND' operator.
|
41 |
+
- Do NOT add any other fields or conditions to the `query_filter`. This section is the definitive source for it.
|
42 |
|
43 |
+
**Mandatory Fields for Query Filter:**
|
44 |
{formatted_fields}
|
45 |
"""
|
46 |
|
47 |
return f"""
|
48 |
You are an expert data analyst and Solr query engineer. Your task is to convert a natural language question into a structured JSON "Analysis Plan". This plan will be used to run two separate, efficient queries: one for aggregate data (facets) and one for finding illustrative examples (grouping).
|
49 |
|
50 |
+
Your most important job is to think like an analyst and choose an `analysis_dimension` and `analysis_measure` that provide a meaningful, non-obvious breakdown of the data.
|
51 |
|
52 |
---
|
53 |
### CONTEXT & RULES
|
54 |
|
55 |
1. **Today's Date for Calculations**: {datetime.datetime.now().date().strftime("%Y-%m-%d")}
|
56 |
+
2. **Query Filter Construction**: The `query_filter` MUST be built exclusively from the fields provided in the "MANDATORY DYNAMIC FILTERS" section, if present.
|
57 |
+
3. **Field Usage**: You MUST use the fields described in the 'Field Definitions'. Pay close attention to the definitions to select the correct field, especially the `_s` fields for searching. Do not use fields ending with `_s` in `group.field` or facet `field` unless necessary for the analysis.
|
58 |
+
4. **Crucial Sorting Rules**:
|
59 |
* For `group.sort`: If `analysis_measure` involves a function on a field (e.g., `sum(total_deal_value_in_million)`), you MUST use the full function: `group.sort: 'sum(total_deal_value_in_million) desc'`.
|
60 |
* If `analysis_measure` is 'count', you MUST OMIT the `group.sort` parameter entirely.
|
61 |
* For sorting, NEVER use 'date_year' directly for `sort` in `terms` facets; use 'index asc' or 'index desc' instead. For other sorts, use 'date'.
|
62 |
+
5. **Qualitative Data Group Operation**:
|
63 |
+
* We need to show the user **standout examples** for each category chosen.
|
64 |
+
For example: if the user asks for "USA approved drugs last 5 years", we need to show the user standout examples for each year. In this context, "standout" means, for example, the news items with the biggest deal values (in millions) for each year.
|
65 |
+
6. **Output Format**: Your final output must be a single, raw JSON object. Do not add comments or markdown formatting. The JSON MUST include a `reasoning` object explaining your choices.
|
66 |
|
67 |
---
|
68 |
### HOW TO CHOOSE THE ANALYSIS DIMENSION AND MEASURE (ANALYTICAL STRATEGY)
|
|
|
100 |
### EXAMPLES
|
101 |
|
102 |
**User Query 1:** "What are the top 5 companies by total deal value in 2023?"
|
103 |
+
**API Filter Input 1:**
|
104 |
+
```
|
105 |
+
### MANDATORY DYNAMIC FILTERS
|
106 |
+
**Mandatory Fields for Query Filter:**
|
107 |
+
- date: '2023'
|
108 |
+
```
|
109 |
**Correct JSON Output 1:**
|
110 |
```json
|
111 |
{{
|
112 |
"reasoning": {{
|
113 |
"dimension_choice": "User explicitly asked for 'top 5 companies', so 'company_name' is the correct dimension.",
|
114 |
+
"measure_choice": "User explicitly asked for 'total deal value', so 'sum(total_deal_value_in_million)' is the correct measure.",
|
115 |
+
"filter_choice": "The query filter was constructed from the mandatory fields provided by the API: date (converted to ISO 8601 format)."
|
116 |
}},
|
117 |
"analysis_dimension": "company_name",
|
118 |
"analysis_measure": "sum(total_deal_value_in_million)",
|
119 |
"sort_field_for_examples": "total_deal_value_in_million",
|
120 |
+
"query_filter": "date:[2023-01-01T00:00:00Z TO 2023-12-31T23:59:59Z]",
|
121 |
"quantitative_request": {{
|
122 |
"json.facet": {{
|
123 |
"companies_by_deal_value": {{
|
|
|
134 |
"qualitative_request": {{
|
135 |
"group": true,
|
136 |
"group.field": "company_name",
|
137 |
+
"group.limit": 2,
|
138 |
"group.sort": "sum(total_deal_value_in_million) desc",
|
139 |
"sort": "total_deal_value_in_million desc"
|
140 |
}}
|
141 |
}}
|
142 |
```
|
143 |
|
144 |
+
**User Query 2:** "What are the most common news types for infections in 2025?"
|
145 |
+
**API Filter Input 2:**
|
146 |
+
```### MANDATORY DYNAMIC FILTERS
|
147 |
+
**Mandatory Fields for Query Filter:**
|
148 |
+
- therapeutic_category_s: infections
|
149 |
+
- date: '2025'
|
150 |
+
```
|
151 |
**Correct JSON Output 2:**
|
152 |
```json
|
153 |
{{
|
154 |
"reasoning": {{
|
155 |
+
"dimension_choice": "User asked for 'most common news types', so 'news_type' is the correct dimension. This is not redundant as the filter is on 'therapeutic_category'.",
|
156 |
+
"measure_choice": "User asked for 'most common', which implies counting occurrences. Therefore, the measure is 'count'.",
|
157 |
+
"filter_choice": "The query filter was constructed from the mandatory fields provided by the API: therapeutic_category_s and date (converted to ISO 8601 format)."
|
158 |
}},
|
159 |
"analysis_dimension": "news_type",
|
160 |
"analysis_measure": "count",
|
161 |
"sort_field_for_examples": "date",
|
162 |
+
"query_filter": "therapeutic_category_s:infections AND date:[2025-01-01T00:00:00Z TO *]",
|
163 |
"quantitative_request": {{
|
164 |
"json.facet": {{
|
165 |
"news_by_type": {{
|
|
|
173 |
"qualitative_request": {{
|
174 |
"group": true,
|
175 |
"group.field": "news_type",
|
176 |
+
"group.limit": 2,
|
177 |
"group.sort": "sum(total_deal_value_in_million) desc",
|
178 |
"sort": "total_deal_value_in_million desc"
|
179 |
}}
|
|
|
182 |
|
183 |
|
184 |
|
185 |
+
**User Query 3:** "Compare deal values for injection vs oral related to infection news."
|
186 |
+
**API Filter Input 3:**
|
187 |
+
```
|
188 |
+
### MANDATORY DYNAMIC FILTERS
|
189 |
+
**Mandatory Fields for Query Filter:**
|
190 |
+
- drug_delivery_branch_s: (injection OR oral)
|
191 |
+
- therapeutic_category_s: infections
|
192 |
+
```
|
193 |
**Correct JSON Output 3:**
|
194 |
```json
|
195 |
{{
|
196 |
"reasoning": {{
|
197 |
+
"dimension_choice": "The user wants to compare 'injection' vs 'oral', making 'route_branch' the appropriate analysis dimension.",
|
198 |
+
"measure_choice": "The user explicitly asks to compare 'deal values', so 'sum(total_deal_value_in_million)' is the correct measure.",
|
199 |
+
"filter_choice": "The query filter was constructed directly from the mandatory fields provided by the API: drug_delivery_branch_s and therapeutic_category_s."
|
200 |
}},
|
201 |
"analysis_dimension": "route_branch",
|
202 |
"analysis_measure": "sum(total_deal_value_in_million)",
|
203 |
"sort_field_for_examples": "total_deal_value_in_million",
|
204 |
+
"query_filter": "drug_delivery_branch_s:(injection OR oral) AND therapeutic_category_s:infections",
|
205 |
"quantitative_request": {{
|
206 |
"json.facet": {{
|
207 |
"deal_values_by_route": {{
|
|
|
218 |
"qualitative_request": {{
|
219 |
"group": true,
|
220 |
"group.field": "route_branch",
|
221 |
+
"group.limit": 2,
|
222 |
"group.sort": "sum(total_deal_value_in_million) desc",
|
223 |
"sort": "total_deal_value_in_million desc"
|
224 |
}}
|
|
|
227 |
---
|
228 |
### YOUR TASK
|
229 |
|
230 |
+
Convert the following user query into a single, raw JSON "Analysis Plan" object. Strictly follow all rules, especially the rule for building the `query_filter` from the mandatory dynamic filters. Your JSON output MUST include the `reasoning` field.
|
231 |
|
232 |
**Current User Query:** `{natural_language_query}`
|
233 |
"""
|