Spaces:
Sleeping
Sleeping
dolphinium
committed on
Commit
·
bdcb123
1
Parent(s):
d0ea80f
Update the field definitions in the metadata for clarity, add usage instructions to them, and refine the prompt to improve field usage during facet generation.
Browse files
app.py
CHANGED
@@ -101,15 +101,15 @@ field_metadata = [
|
|
101 |
},
|
102 |
{
|
103 |
"field_name": "company_name",
|
104 |
-
"type": "string (exact match)",
|
105 |
"example_values": ["pfizer inc.", "astrazeneca plc", "roche"],
|
106 |
-
"definition": "The canonical, standardized name of a company.
|
107 |
},
|
108 |
{
|
109 |
"field_name": "company_name_s",
|
110 |
"type": "string (multi-valued, for searching)",
|
111 |
"example_values": ["pfizer inc.", "roche", "f. hoffmann-la roche ag", "nih"],
|
112 |
-
"definition": "A field containing all known names
|
113 |
},
|
114 |
{
|
115 |
"field_name": "territory_hq_s",
|
@@ -121,61 +121,61 @@ field_metadata = [
|
|
121 |
"field_name": "therapeutic_category",
|
122 |
"type": "string (specific)",
|
123 |
"example_values": ["cancer, other", "cancer, nsclc metastatic", "alzheimer's"],
|
124 |
-
"definition": "The specific disease or therapeutic area being targeted
|
125 |
},
|
126 |
{
|
127 |
"field_name": "therapeutic_category_s",
|
128 |
"type": "string (multi-valued, for searching)",
|
129 |
"example_values": ["cancer", "oncology", "infections", "cns"],
|
130 |
-
"definition": "Broader, multi-valued therapeutic categories and their synonyms. **Use this field for broad category searches**
|
131 |
},
|
132 |
{
|
133 |
"field_name": "compound_name",
|
134 |
-
"type": "string (exact match)",
|
135 |
"example_values": ["opdivo injection solution", "keytruda injection solution"],
|
136 |
-
"definition": "The specific, full trade
|
137 |
},
|
138 |
{
|
139 |
"field_name": "compound_name_s",
|
140 |
"type": "string (multi-valued, for searching)",
|
141 |
"example_values": ["nivolumab injection solution", "opdivo injection solution", "ono-4538 injection solution"],
|
142 |
-
"definition": "A field
|
143 |
},
|
144 |
{
|
145 |
"field_name": "molecule_name",
|
146 |
-
"type": "string (exact match)",
|
147 |
"example_values": ["cannabidiol", "paclitaxel", "pembrolizumab"],
|
148 |
-
"definition": "The generic, non-proprietary name of the active molecule.
|
149 |
},
|
150 |
{
|
151 |
"field_name": "molecule_name_s",
|
152 |
"type": "string (multi-valued, for searching)",
|
153 |
"example_values": ["cbd", "s1-220", "a1002n5s"],
|
154 |
-
"definition": "A field
|
155 |
},
|
156 |
{
|
157 |
"field_name": "highest_phase",
|
158 |
"type": "string (categorical)",
|
159 |
"example_values": ["marketed", "phase 2", "phase 1"],
|
160 |
-
"definition": "The highest stage of development a drug has ever reached.
|
161 |
},
|
162 |
{
|
163 |
"field_name": "drug_delivery_branch_s",
|
164 |
"type": "string (multi-valued, for searching)",
|
165 |
"example_values": ["injection", "parenteral", "oral", "injection, other", "oral, other"],
|
166 |
-
"definition": "The method of drug administration
|
167 |
},
|
168 |
{
|
169 |
"field_name": "drug_delivery_branch",
|
170 |
-
"type": "string (categorical, specific)",
|
171 |
"example_values": ["injection, other", "prefilled syringes", "np liposome", "oral enteric/delayed release"],
|
172 |
-
"definition": "The most specific category of drug delivery technology
|
173 |
},
|
174 |
{
|
175 |
"field_name": "route_branch",
|
176 |
"type": "string (categorical)",
|
177 |
"example_values": ["injection", "oral", "topical", "inhalation"],
|
178 |
-
"definition": "The
|
179 |
},
|
180 |
{
|
181 |
"field_name": "molecule_api_group",
|
@@ -193,13 +193,13 @@ field_metadata = [
|
|
193 |
"field_name": "date",
|
194 |
"type": "date",
|
195 |
"example_values": ["2020-10-22T00:00:00Z"],
|
196 |
-
"definition": "The full publication date and time in ISO 8601 format. Use for precise date range queries
|
197 |
},
|
198 |
{
|
199 |
"field_name": "date_year",
|
200 |
"type": "number (year)",
|
201 |
"example_values": [2020, 2021, 2022],
|
202 |
-
"definition": "The 4-digit year of publication. **Use this for queries involving whole years** (e.g., 'in 2023', 'last year', 'since 2020')
|
203 |
},
|
204 |
{
|
205 |
"field_name": "total_deal_value_in_million",
|
@@ -244,11 +244,14 @@ You are an expert Solr query engineer who converts natural language questions in
|
|
244 |
### CONTEXT & RULES
|
245 |
|
246 |
1. **Today's Date for Calculations**: {datetime.datetime.now().date().strftime("%Y-%m-%d")}
|
247 |
-
2. **Field Usage**: You MUST use the fields described in the 'Field Definitions' section. Pay close attention to the definitions to select the correct field.
|
248 |
-
3. **
|
249 |
-
|
250 |
-
|
251 |
-
|
|
|
|
|
|
|
252 |
|
253 |
---
|
254 |
### FIELD DEFINITIONS (Your Source of Truth)
|
|
|
101 |
},
|
102 |
{
|
103 |
"field_name": "company_name",
|
104 |
+
"type": "string (exact match, for faceting)",
|
105 |
"example_values": ["pfizer inc.", "astrazeneca plc", "roche"],
|
106 |
+
"definition": "The canonical, standardized name of a company. **Crucially, you MUST use this field for `terms` faceting** to group results by a unique company. Do NOT use this for searching."
|
107 |
},
|
108 |
{
|
109 |
"field_name": "company_name_s",
|
110 |
"type": "string (multi-valued, for searching)",
|
111 |
"example_values": ["pfizer inc.", "roche", "f. hoffmann-la roche ag", "nih"],
|
112 |
+
"definition": "A field containing all known names and synonyms for a company. **You MUST use this field for all `query` parameter searches involving a company name** to ensure comprehensive results. Do NOT use for `terms` faceting."
|
113 |
},
|
114 |
{
|
115 |
"field_name": "territory_hq_s",
|
|
|
121 |
"field_name": "therapeutic_category",
|
122 |
"type": "string (specific)",
|
123 |
"example_values": ["cancer, other", "cancer, nsclc metastatic", "alzheimer's"],
|
124 |
+
"definition": "The specific disease or therapeutic area being targeted. Use for very specific disease queries."
|
125 |
},
|
126 |
{
|
127 |
"field_name": "therapeutic_category_s",
|
128 |
"type": "string (multi-valued, for searching)",
|
129 |
"example_values": ["cancer", "oncology", "infections", "cns"],
|
130 |
+
"definition": "Broader, multi-valued therapeutic categories and their synonyms. **Use this field for broad category searches** in the `query` parameter."
|
131 |
},
|
132 |
{
|
133 |
"field_name": "compound_name",
|
134 |
+
"type": "string (exact match, for faceting)",
|
135 |
"example_values": ["opdivo injection solution", "keytruda injection solution"],
|
136 |
+
"definition": "The specific, full trade name of a drug. **Use this field for `terms` faceting** on compounds."
|
137 |
},
|
138 |
{
|
139 |
"field_name": "compound_name_s",
|
140 |
"type": "string (multi-valued, for searching)",
|
141 |
"example_values": ["nivolumab injection solution", "opdivo injection solution", "ono-4538 injection solution"],
|
142 |
+
"definition": "A field with all known trade names and synonyms for a drug. **Use this field for all `query` parameter searches** involving a compound name."
|
143 |
},
|
144 |
{
|
145 |
"field_name": "molecule_name",
|
146 |
+
"type": "string (exact match, for faceting)",
|
147 |
"example_values": ["cannabidiol", "paclitaxel", "pembrolizumab"],
|
148 |
+
"definition": "The generic, non-proprietary name of the active molecule. **Use this field for `terms` faceting** on molecules."
|
149 |
},
|
150 |
{
|
151 |
"field_name": "molecule_name_s",
|
152 |
"type": "string (multi-valued, for searching)",
|
153 |
"example_values": ["cbd", "s1-220", "a1002n5s"],
|
154 |
+
"definition": "A field with all known generic names and synonyms for a molecule. **Use this field for all `query` parameter searches** involving a molecule name."
|
155 |
},
|
156 |
{
|
157 |
"field_name": "highest_phase",
|
158 |
"type": "string (categorical)",
|
159 |
"example_values": ["marketed", "phase 2", "phase 1"],
|
160 |
+
"definition": "The highest stage of development a drug has ever reached."
|
161 |
},
|
162 |
{
|
163 |
"field_name": "drug_delivery_branch_s",
|
164 |
"type": "string (multi-valued, for searching)",
|
165 |
"example_values": ["injection", "parenteral", "oral", "injection, other", "oral, other"],
|
166 |
+
"definition": "The method of drug administration. **Use this for `query` parameter searches about route of administration** as it contains broader, search-friendly terms."
|
167 |
},
|
168 |
{
|
169 |
"field_name": "drug_delivery_branch",
|
170 |
+
"type": "string (categorical, specific, for faceting)",
|
171 |
"example_values": ["injection, other", "prefilled syringes", "np liposome", "oral enteric/delayed release"],
|
172 |
+
"definition": "The most specific category of drug delivery technology. **Use this field for `terms` faceting** on specific delivery technologies."
|
173 |
},
|
174 |
{
|
175 |
"field_name": "route_branch",
|
176 |
"type": "string (categorical)",
|
177 |
"example_values": ["injection", "oral", "topical", "inhalation"],
|
178 |
+
"definition": "The primary route of drug administration. Good for faceting on exact routes."
|
179 |
},
|
180 |
{
|
181 |
"field_name": "molecule_api_group",
|
|
|
193 |
"field_name": "date",
|
194 |
"type": "date",
|
195 |
"example_values": ["2020-10-22T00:00:00Z"],
|
196 |
+
"definition": "The full publication date and time in ISO 8601 format. Use for precise date range queries."
|
197 |
},
|
198 |
{
|
199 |
"field_name": "date_year",
|
200 |
"type": "number (year)",
|
201 |
"example_values": [2020, 2021, 2022],
|
202 |
+
"definition": "The 4-digit year of publication. **Use this for queries involving whole years** (e.g., 'in 2023', 'last year', 'since 2020')."
|
203 |
},
|
204 |
{
|
205 |
"field_name": "total_deal_value_in_million",
|
|
|
244 |
### CONTEXT & RULES
|
245 |
|
246 |
1. **Today's Date for Calculations**: {datetime.datetime.now().date().strftime("%Y-%m-%d")}
|
247 |
+
2. **Field Usage**: You MUST use the fields described in the 'Field Definitions' section. Pay close attention to the definitions to select the correct field.
|
248 |
+
3. **Facet vs. Query Field Distinction**: This is critical.
|
249 |
+
* For searching in the main `query` parameter, ALWAYS use the multi-valued search fields (ending in `_s`, like `company_name_s`) to get comprehensive results.
|
250 |
+
* For grouping in a `terms` facet, ALWAYS use the canonical, single-value field (e.g., `company_name`, `molecule_name`) to ensure unique and accurate grouping.
|
251 |
+
4. **No `count(*)`**: Do NOT use functions like `count(*)`. The default facet bucket count is sufficient for counting documents.
|
252 |
+
5. **Allowed Aggregations**: For statistical facets (`stats` or `stat` type), only use these functions: `sum`, `avg`, `min`, `max`, `unique`. The primary metric field is `total_deal_value_in_million`.
|
253 |
+
6. **Term Facet Limits**: Every `terms` facet MUST include a `limit` key. Default to `limit: 10` unless the user specifies a different number of top results.
|
254 |
+
7. **Output Format**: Your final output must be a single, raw JSON object and nothing else.
|
255 |
|
256 |
---
|
257 |
### FIELD DEFINITIONS (Your Source of Truth)
|