dolphinium committed on
Commit
bdcb123
·
1 Parent(s): d0ea80f

Update the field definitions and usage instructions in the metadata for clarity, and update the prompt to improve field usage during facet generation.

Browse files
Files changed (1) hide show
  1. app.py +26 -23
app.py CHANGED
@@ -101,15 +101,15 @@ field_metadata = [
101
  },
102
  {
103
  "field_name": "company_name",
104
- "type": "string (exact match)",
105
  "example_values": ["pfizer inc.", "astrazeneca plc", "roche"],
106
- "definition": "The canonical, standardized name of a company. Good for comparison. For searching, prefer 'company_name_s' to catch all variations."
107
  },
108
  {
109
  "field_name": "company_name_s",
110
  "type": "string (multi-valued, for searching)",
111
  "example_values": ["pfizer inc.", "roche", "f. hoffmann-la roche ag", "nih"],
112
- "definition": "A field containing all known names, synonyms, and abbreviations for a company. **Use this field for all queries involving a company name** to ensure comprehensive results (e.g., searching for 'roche' will also find 'f. hoffmann-la roche ag')."
113
  },
114
  {
115
  "field_name": "territory_hq_s",
@@ -121,61 +121,61 @@ field_metadata = [
121
  "field_name": "therapeutic_category",
122
  "type": "string (specific)",
123
  "example_values": ["cancer, other", "cancer, nsclc metastatic", "alzheimer's"],
124
- "definition": "The specific disease or therapeutic area being targeted, often in a detailed, hierarchical format. Use for very specific disease queries."
125
  },
126
  {
127
  "field_name": "therapeutic_category_s",
128
  "type": "string (multi-valued, for searching)",
129
  "example_values": ["cancer", "oncology", "infections", "cns"],
130
- "definition": "Broader, multi-valued therapeutic categories and their synonyms. **Use this field for broad category searches** (e.g., 'cancer', 'cns', 'immune') as it is designed for easier searching."
131
  },
132
  {
133
  "field_name": "compound_name",
134
- "type": "string (exact match)",
135
  "example_values": ["opdivo injection solution", "keytruda injection solution"],
136
- "definition": "The specific, full trade or development name of a drug/compound, including its formulation. For searching, prefer 'compound_name_s'."
137
  },
138
  {
139
  "field_name": "compound_name_s",
140
  "type": "string (multi-valued, for searching)",
141
  "example_values": ["nivolumab injection solution", "opdivo injection solution", "ono-4538 injection solution"],
142
- "definition": "A field containing all known trade names, development codes, and synonyms for a drug/compound. **Use this field for all queries involving a compound name**."
143
  },
144
  {
145
  "field_name": "molecule_name",
146
- "type": "string (exact match)",
147
  "example_values": ["cannabidiol", "paclitaxel", "pembrolizumab"],
148
- "definition": "The generic, non-proprietary name of the active molecule. For searching, prefer 'molecule_name_s'."
149
  },
150
  {
151
  "field_name": "molecule_name_s",
152
  "type": "string (multi-valued, for searching)",
153
  "example_values": ["cbd", "s1-220", "a1002n5s"],
154
- "definition": "A field containing all known generic names, development codes, and synonyms for a molecule. **Use this field for all queries involving a molecule name**."
155
  },
156
  {
157
  "field_name": "highest_phase",
158
  "type": "string (categorical)",
159
  "example_values": ["marketed", "phase 2", "phase 1"],
160
- "definition": "The highest stage of development a drug has ever reached. Distinct from 'event_type' which is about the event in the news."
161
  },
162
  {
163
  "field_name": "drug_delivery_branch_s",
164
  "type": "string (multi-valued, for searching)",
165
  "example_values": ["injection", "parenteral", "oral", "injection, other", "oral, other"],
166
- "definition": "The method of drug administration (e.g., injection, oral). **Use this for queries about route of administration** as it contains broader, search-friendly terms. contains synonyms."
167
  },
168
  {
169
  "field_name": "drug_delivery_branch",
170
- "type": "string (categorical, specific)",
171
  "example_values": ["injection, other", "prefilled syringes", "np liposome", "oral enteric/delayed release"],
172
- "definition": "The most specific category of drug delivery technology or formulation. This provides a detailed, granular breakdown (e.g., 'prefilled syringes', 'np liposome'). Use this field only when a user asks for very specific technologies or for detailed faceting on the technology itself."
173
  },
174
  {
175
  "field_name": "route_branch",
176
  "type": "string (categorical)",
177
  "example_values": ["injection", "oral", "topical", "inhalation"],
178
- "definition": "The specific, primary route of drug administration. This field is good for faceting on exact routes, for example comparing values for injection vs oral. it is usually better to use the broader 'drug_delivery_branch_s' field which contains synonyms and parent categories for search operations."
179
  },
180
  {
181
  "field_name": "molecule_api_group",
@@ -193,13 +193,13 @@ field_metadata = [
193
  "field_name": "date",
194
  "type": "date",
195
  "example_values": ["2020-10-22T00:00:00Z"],
196
- "definition": "The full publication date and time in ISO 8601 format. Use for precise date range queries (e.g., 'last 30 days', 'between date A and date B')."
197
  },
198
  {
199
  "field_name": "date_year",
200
  "type": "number (year)",
201
  "example_values": [2020, 2021, 2022],
202
- "definition": "The 4-digit year of publication. **Use this for queries involving whole years** (e.g., 'in 2023', 'last year', 'since 2020') as it is more efficient."
203
  },
204
  {
205
  "field_name": "total_deal_value_in_million",
@@ -244,11 +244,14 @@ You are an expert Solr query engineer who converts natural language questions in
244
  ### CONTEXT & RULES
245
 
246
  1. **Today's Date for Calculations**: {datetime.datetime.now().date().strftime("%Y-%m-%d")}
247
- 2. **Field Usage**: You MUST use the fields described in the 'Field Definitions' section. Pay close attention to the definitions to select the correct field. For instance, use `company_name_s` for searching companies and `date_year` for year-based queries.
248
- 3. **No `count(*)`**: Do NOT use functions like `count(*)`. The default facet bucket count is sufficient for counting documents.
249
- 4. **Allowed Aggregations**: For statistical facets (`stats` or `stat` type), only use these functions: `sum`, `avg`, `min`, `max`, `unique`. The primary metric field is `total_deal_value_in_million`.
250
- 5. **Term Facet Limits**: Every `terms` facet MUST include a `limit` key. Default to `limit: 10` unless the user specifies a different number of top results.
251
- 6. **Output Format**: Your final output must be a single, raw JSON object and nothing else. Do not add comments, explanations, or markdown formatting like ```json.
 
 
 
252
 
253
  ---
254
  ### FIELD DEFINITIONS (Your Source of Truth)
 
101
  },
102
  {
103
  "field_name": "company_name",
104
+ "type": "string (exact match, for faceting)",
105
  "example_values": ["pfizer inc.", "astrazeneca plc", "roche"],
106
+ "definition": "The canonical, standardized name of a company. **Crucially, you MUST use this field for `terms` faceting** to group results by a unique company. Do NOT use this for searching."
107
  },
108
  {
109
  "field_name": "company_name_s",
110
  "type": "string (multi-valued, for searching)",
111
  "example_values": ["pfizer inc.", "roche", "f. hoffmann-la roche ag", "nih"],
112
+ "definition": "A field containing all known names and synonyms for a company. **You MUST use this field for all `query` parameter searches involving a company name** to ensure comprehensive results. Do NOT use for `terms` faceting."
113
  },
114
  {
115
  "field_name": "territory_hq_s",
 
121
  "field_name": "therapeutic_category",
122
  "type": "string (specific)",
123
  "example_values": ["cancer, other", "cancer, nsclc metastatic", "alzheimer's"],
124
+ "definition": "The specific disease or therapeutic area being targeted. Use for very specific disease queries."
125
  },
126
  {
127
  "field_name": "therapeutic_category_s",
128
  "type": "string (multi-valued, for searching)",
129
  "example_values": ["cancer", "oncology", "infections", "cns"],
130
+ "definition": "Broader, multi-valued therapeutic categories and their synonyms. **Use this field for broad category searches** in the `query` parameter."
131
  },
132
  {
133
  "field_name": "compound_name",
134
+ "type": "string (exact match, for faceting)",
135
  "example_values": ["opdivo injection solution", "keytruda injection solution"],
136
+ "definition": "The specific, full trade name of a drug. **Use this field for `terms` faceting** on compounds."
137
  },
138
  {
139
  "field_name": "compound_name_s",
140
  "type": "string (multi-valued, for searching)",
141
  "example_values": ["nivolumab injection solution", "opdivo injection solution", "ono-4538 injection solution"],
142
+ "definition": "A field with all known trade names and synonyms for a drug. **Use this field for all `query` parameter searches** involving a compound name."
143
  },
144
  {
145
  "field_name": "molecule_name",
146
+ "type": "string (exact match, for faceting)",
147
  "example_values": ["cannabidiol", "paclitaxel", "pembrolizumab"],
148
+ "definition": "The generic, non-proprietary name of the active molecule. **Use this field for `terms` faceting** on molecules."
149
  },
150
  {
151
  "field_name": "molecule_name_s",
152
  "type": "string (multi-valued, for searching)",
153
  "example_values": ["cbd", "s1-220", "a1002n5s"],
154
+ "definition": "A field with all known generic names and synonyms for a molecule. **Use this field for all `query` parameter searches** involving a molecule name."
155
  },
156
  {
157
  "field_name": "highest_phase",
158
  "type": "string (categorical)",
159
  "example_values": ["marketed", "phase 2", "phase 1"],
160
+ "definition": "The highest stage of development a drug has ever reached."
161
  },
162
  {
163
  "field_name": "drug_delivery_branch_s",
164
  "type": "string (multi-valued, for searching)",
165
  "example_values": ["injection", "parenteral", "oral", "injection, other", "oral, other"],
166
+ "definition": "The method of drug administration. **Use this for `query` parameter searches about route of administration** as it contains broader, search-friendly terms."
167
  },
168
  {
169
  "field_name": "drug_delivery_branch",
170
+ "type": "string (categorical, specific, for faceting)",
171
  "example_values": ["injection, other", "prefilled syringes", "np liposome", "oral enteric/delayed release"],
172
+ "definition": "The most specific category of drug delivery technology. **Use this field for `terms` faceting** on specific delivery technologies."
173
  },
174
  {
175
  "field_name": "route_branch",
176
  "type": "string (categorical)",
177
  "example_values": ["injection", "oral", "topical", "inhalation"],
178
+ "definition": "The primary route of drug administration. Good for faceting on exact routes."
179
  },
180
  {
181
  "field_name": "molecule_api_group",
 
193
  "field_name": "date",
194
  "type": "date",
195
  "example_values": ["2020-10-22T00:00:00Z"],
196
+ "definition": "The full publication date and time in ISO 8601 format. Use for precise date range queries."
197
  },
198
  {
199
  "field_name": "date_year",
200
  "type": "number (year)",
201
  "example_values": [2020, 2021, 2022],
202
+ "definition": "The 4-digit year of publication. **Use this for queries involving whole years** (e.g., 'in 2023', 'last year', 'since 2020')."
203
  },
204
  {
205
  "field_name": "total_deal_value_in_million",
 
244
  ### CONTEXT & RULES
245
 
246
  1. **Today's Date for Calculations**: {datetime.datetime.now().date().strftime("%Y-%m-%d")}
247
+ 2. **Field Usage**: You MUST use the fields described in the 'Field Definitions' section. Pay close attention to the definitions to select the correct field.
248
+ 3. **Facet vs. Query Field Distinction**: This is critical.
249
+ * For searching in the main `query` parameter, ALWAYS use the multi-valued search fields (ending in `_s`, like `company_name_s`) to get comprehensive results.
250
+ * For grouping in a `terms` facet, ALWAYS use the canonical, single-value field (e.g., `company_name`, `molecule_name`) to ensure unique and accurate grouping.
251
+ 4. **No `count(*)`**: Do NOT use functions like `count(*)`. The default facet bucket count is sufficient for counting documents.
252
+ 5. **Allowed Aggregations**: For statistical facets (`stats` or `stat` type), only use these functions: `sum`, `avg`, `min`, `max`, `unique`. The primary metric field is `total_deal_value_in_million`.
253
+ 6. **Term Facet Limits**: Every `terms` facet MUST include a `limit` key. Default to `limit: 10` unless the user specifies a different number of top results.
254
+ 7. **Output Format**: Your final output must be a single, raw JSON object and nothing else.
255
 
256
  ---
257
  ### FIELD DEFINITIONS (Your Source of Truth)