Spaces:

dolphinium
/

pc-ai-data-analyst

Sleeping

App Files Files Community

dolphinium committed on Jul 16

Commit

8290c25

1 Parent(s): 840c57d

enhance viz code generation prompt

Browse files

Files changed (1) hide show

app.py +161 -20

app.py CHANGED Viewed

@@ -328,33 +328,174 @@ def llm_generate_visualization_code(query_context, facet_data):
     """Generates Python code for visualization based on query and data."""
     prompt = f"""
 You are a Python Data Visualization expert specializing in Matplotlib and Seaborn.
-Your task is to generate Python code to create a single, insightful visualization.
-**Context:**
-1.  **User's Analytical Goal:** "{query_context}"
-2.  **Aggregated Data (from Solr Facets):**
-    ```json
-    {json.dumps(facet_data, indent=2)}
-    ```
-**Instructions:**
-1.  **Goal:** Write Python code to generate a chart that best visualizes the answer to the user's goal using the provided data.
-2.  **Data Access:** The data is available in a Python dictionary named `facet_data`. Your code must parse this dictionary.
-3.  **Code Requirements:**
-    *   Start with `import matplotlib.pyplot as plt` and `import seaborn as sns`.
-    *   Use `plt.style.use('seaborn-v0_8-whitegrid')` and `fig, ax = plt.subplots(figsize=(12, 7))`. Plot using the `ax` object.
-    *   Always include a clear `ax.set_title(...)`, `ax.set_xlabel(...)`, and `ax.set_ylabel(...)`.
-    *   Dynamically find the primary facet key and extract the 'buckets'.
-    *   For each bucket, extract the 'val' (label) and the relevant metric ('count' or a nested metric).
-    *   Use `plt.tight_layout()` and rotate x-axis labels if needed.
-4.  **Output Format:** ONLY output raw Python code. Do not wrap it in ```python ... ```. Do not include `plt.show()` or any explanation.
 """
     try:
-        response = llm_model.generate_content(prompt)
         code = re.sub(r'^```python\s*|\s*```$', '', response.text, flags=re.MULTILINE)
         return code
     except Exception as e:
-        print(f"Error in llm_generate_visualization_code: {e}")
         return None
 def execute_viz_code_and_get_path(viz_code, facet_data):

     """Generates Python code for visualization based on query and data."""
     prompt = f"""
 You are a Python Data Visualization expert specializing in Matplotlib and Seaborn.
+Your task is to generate robust, error-free Python code to create a single, insightful visualization based on the user's query and the provided Solr facet data.
+**User's Analytical Goal:**
+"{query_context}"
+**Aggregated Data (from Solr Facets):**
+```json
+{json.dumps(facet_data, indent=2)}
+```
+---
+### **CRITICAL INSTRUCTIONS: CODE GENERATION RULES**
+You MUST follow these rules to avoid errors.
+**1. Identify the Data Structure FIRST:**
+Before writing any code, analyze the `facet_data` JSON to determine its structure. There are three common patterns. Choose the correct template below.
+   *   **Pattern A: Simple `terms` Facet.** The JSON has ONE main key (besides "count") which contains a list of "buckets". Each bucket has a "val" and a "count". Use this for standard bar charts.
+   *   **Pattern B: Multiple `query` Facets.** The JSON has MULTIPLE keys (besides "count"), and each key is an object containing metrics like "count" or "sum(...)". Use this for comparing a few distinct items (e.g., "oral vs injection").
+   *   **Pattern C: Nested `terms` Facet.** The JSON has one main key with a list of "buckets", but inside EACH bucket, there are nested metric objects. This is used for grouped comparisons (e.g., "compare 2024 vs 2025 across categories"). This almost always requires `pandas`.
+**2. Use the Correct Parsing Template:**
+---
+**TEMPLATE FOR PATTERN A (Simple Bar Chart from `terms` facet):**
+```python
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+plt.style.use('seaborn-v0_8-whitegrid')
+fig, ax = plt.subplots(figsize=(12, 8))
+# Dynamically find the main facet key (the one with 'buckets')
+facet_key = None
+for key, value in facet_data.items():
+    if isinstance(value, dict) and 'buckets' in value:
+        facet_key = key
+        break
+if facet_key:
+    buckets = facet_data[facet_key].get('buckets', [])
+    # Check if buckets contain data
+    if buckets:
+        df = pd.DataFrame(buckets)
+        # Check for a nested metric or use 'count'
+        if 'total_deal_value' in df.columns and pd.api.types.is_dict_like(df['total_deal_value'].iloc):
+             # Example for nested sum metric
+             df['value'] = df['total_deal_value'].apply(lambda x: x.get('sum', 0))
+             y_axis_label = 'Sum of Total Deal Value'
+        else:
+             df.rename(columns={{'count': 'value'}}, inplace=True)
+             y_axis_label = 'Count'
+        sns.barplot(data=df, x='val', y='value', ax=ax, palette='viridis')
+        ax.set_xlabel('Category')
+        ax.set_ylabel(y_axis_label)
+    else:
+        ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')
+ax.set_title('Your Insightful Title Here')
+# Correct way to rotate labels to prevent errors
+plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
+plt.tight_layout()
+```
+---
+**TEMPLATE FOR PATTERN B (Comparison Bar Chart from `query` facets):**
+```python
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+plt.style.use('seaborn-v0_8-whitegrid')
+fig, ax = plt.subplots(figsize=(10, 6))
+labels = []
+values = []
+# Iterate through top-level keys, skipping the 'count'
+for key, data_dict in facet_data.items():
+    if key == 'count' or not isinstance(data_dict, dict):
+        continue
+    # Extract the label (e.g., 'oral_deals' -> 'Oral')
+    label = key.replace('_deals', '').replace('_', ' ').title()
+    # Find the metric value, which is NOT 'count'
+    metric_value = 0
+    for sub_key, sub_value in data_dict.items():
+        if sub_key != 'count':
+            metric_value = sub_value
+            break # Found the metric
+    labels.append(label)
+    values.append(metric_value)
+if labels:
+    sns.barplot(x=labels, y=values, ax=ax, palette='mako')
+    ax.set_ylabel('Total Deal Value') # Or other metric name
+    ax.set_xlabel('Category')
+else:
+    ax.text(0.5, 0.5, 'No query facet data to plot.', ha='center')
+ax.set_title('Your Insightful Title Here')
+plt.tight_layout()
+```
+---
+**TEMPLATE FOR PATTERN C (Grouped Bar Chart from nested `terms` facet):**
+```python
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+plt.style.use('seaborn-v0_8-whitegrid')
+fig, ax = plt.subplots(figsize=(14, 8))
+# Find the key that has the buckets
+facet_key = None
+for key, value in facet_data.items():
+    if isinstance(value, dict) and 'buckets' in value:
+        facet_key = key
+        break
+if facet_key and facet_data[facet_key].get('buckets'):
+    # This list comprehension is robust for parsing nested metrics
+    plot_data = []
+    for bucket in facet_data[facet_key]['buckets']:
+        category = bucket['val']
+        # Find all nested metrics (e.g., total_deal_value_2025)
+        for sub_key, sub_value in bucket.items():
+            if isinstance(sub_value, dict) and 'sum' in sub_value:
+                # Extracts year from 'total_deal_value_2025' -> '2025'
+                year = sub_key.split('_')[-1]
+                value = sub_value['sum']
+                plot_data.append({{'Category': category, 'Year': year, 'Value': value}})
+    if plot_data:
+        df = pd.DataFrame(plot_data)
+        sns.barplot(data=df, x='Category', y='Value', hue='Year', ax=ax)
+        ax.set_ylabel('Total Deal Value')
+        ax.set_xlabel('Business Model')
+        # Correct way to rotate labels to prevent errors
+        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
+    else:
+        ax.text(0.5, 0.5, 'No nested data found to plot.', ha='center')
+else:
+    ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')
+ax.set_title('Your Insightful Title Here')
+plt.tight_layout()
+```
+---
+**3. Final Code Generation:**
+- **DO NOT** include `plt.show()`.
+- **DO** set a dynamic and descriptive `ax.set_title()`, `ax.set_xlabel()`, and `ax.set_ylabel()`.
+- **DO NOT** wrap the code in ```python ... ```. Output only the raw Python code.
+- Adapt the chosen template to the specific keys and metrics in the provided `facet_data`.
+**Your Task:**
+Now, generate the Python code.
 """
     try:
+        # Use temperature 0 and allow up to 2048 output tokens for potentially complex generation
+        generation_config = genai.types.GenerationConfig(temperature=0, max_output_tokens=2048)
+        response = llm_model.generate_content(prompt, generation_config=generation_config)
+        # Clean the response to remove markdown formatting
         code = re.sub(r'^```python\s*|\s*```$', '', response.text, flags=re.MULTILINE)
         return code
     except Exception as e:
+        print(f"Error in llm_generate_visualization_code: {e}\nRaw response: {response.text}")
         return None
 def execute_viz_code_and_get_path(viz_code, facet_data):