File size: 12,648 Bytes
964d389
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce38001
964d389
 
 
 
 
 
 
 
 
 
ce38001
 
 
 
 
 
964d389
ce38001
 
 
 
 
 
 
 
 
964d389
 
 
ce38001
 
 
964d389
ce38001
 
 
964d389
ce38001
964d389
ce38001
 
964d389
ce38001
 
 
 
 
964d389
 
 
 
 
 
 
 
 
 
 
 
 
 
ce38001
964d389
ce38001
 
 
964d389
 
 
 
 
 
 
ce38001
964d389
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce38001
964d389
 
 
7fac3e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
def ask_gpt4o_for_visualization(query, df, llm):
    """
    Ask the LLM which chart(s) would best answer ``query``.

    The prompt lists every column of ``df`` and requests a JSON reply.

    Args:
        query: The user's question, embedded verbatim in the prompt.
        df: DataFrame whose column names are offered to the model.
        llm: Object exposing ``generate(prompt)``; its return value is
            passed straight to ``json.loads``.

    Returns:
        The decoded JSON value, or ``None`` (after showing a Streamlit
        error) when the reply is not valid JSON.
    """
    available = ', '.join(df.columns)
    request_text = f"""
    Analyze the query and suggest one or more relevant visualizations.
    Query: "{query}"
    Available Columns: {available}
    Respond in this JSON format (as a list if multiple suggestions):
    [
      {{
        "chart_type": "bar/box/line/scatter",
        "x_axis": "column_name",
        "y_axis": "column_name",
        "group_by": "optional_column_name"
      }}
    ]
    """
    # The model call sits outside the try block on purpose: only JSON
    # decoding problems are handled here, anything else propagates.
    raw_reply = llm.generate(request_text)
    try:
        parsed = json.loads(raw_reply)
    except json.JSONDecodeError:
        st.error("⚠️ GPT-4o failed to generate a valid suggestion.")
        parsed = None
    return parsed

def add_stats_to_figure(fig, df, y_axis, chart_type):
    """
    Add statistical annotations for ``df[y_axis]`` to a Plotly figure.

    The overlay depends on the chart type (compared case-insensitively):

    - ``bar`` / ``line``: a statistics box plus horizontal reference lines
      at the min, median, mean and max.
    - ``scatter``: the statistics box only.
    - ``box``: nothing is added.
    - ``pie`` / ``heatmap``: nothing is added; an info message is shown.
    - anything else: nothing is added; a warning is shown.

    Args:
        fig: Figure exposing ``add_annotation`` and ``add_hline``.
        df: DataFrame holding the plotted data.
        y_axis: Name of the column to summarise.
        chart_type: Chart type name, e.g. ``"bar"``.

    Returns:
        The same ``fig`` object. It is returned untouched (with a Streamlit
        warning) when the column is missing, non-numeric or has no values,
        and untouched when an identical statistics box is already present.
    """
    # A missing/None column would otherwise surface as a bare KeyError.
    if y_axis is None or y_axis not in df.columns:
        st.warning(f"⚠️ Cannot compute statistics: column '{y_axis}' is not in the data.")
        return fig

    # Check if the y-axis column is numeric
    if not pd.api.types.is_numeric_dtype(df[y_axis]):
        st.warning(f"⚠️ Cannot compute statistics for non-numeric column: {y_axis}")
        return fig

    series = df[y_axis]
    if series.count() == 0:
        # Every statistic would be NaN; reference lines at NaN are meaningless.
        st.warning(f"⚠️ Cannot compute statistics for empty column: {y_axis}")
        return fig

    min_val = series.min()
    max_val = series.max()
    avg_val = series.mean()
    median_val = series.median()
    std_dev_val = series.std()  # NaN when there is a single observation

    def _fmt(value):
        # NOTE(review): the "$" prefix assumes a currency column — confirm.
        return "n/a" if pd.isna(value) else f"${value:,.2f}"

    # Plotly annotation text is rendered as a small HTML subset: line breaks
    # need <br> and bold needs <b>; "\n" and Markdown "**" are not rendered.
    stats_text = "<br>".join([
        "📊 <b>Statistics</b>",
        "",
        f"- <b>Min:</b> {_fmt(min_val)}",
        f"- <b>Max:</b> {_fmt(max_val)}",
        f"- <b>Average:</b> {_fmt(avg_val)}",
        f"- <b>Median:</b> {_fmt(median_val)}",
        f"- <b>Std Dev:</b> {_fmt(std_dev_val)}",
    ])

    kind = str(chart_type or "").strip().lower()

    if kind in ("bar", "line", "scatter"):
        # Skip when this exact box is already on the figure, so applying the
        # helper twice to one figure does not stack duplicate overlays.
        existing = getattr(getattr(fig, "layout", None), "annotations", None) or ()
        if any(getattr(note, "text", None) == stats_text for note in existing):
            return fig

        fig.add_annotation(
            text=stats_text,
            xref="paper", yref="paper",
            x=1.02, y=1,
            showarrow=False,
            align="left",
            font=dict(size=12, color="black"),
            bordercolor="gray",
            borderwidth=1,
            bgcolor="rgba(255, 255, 255, 0.85)",
        )

        # Reference lines only for bar and line charts; scatter plots get
        # the statistics box alone.
        if kind != "scatter":
            reference_lines = (
                (min_val, "dot", "red", "Min", "bottom right"),
                (median_val, "dash", "orange", "Median", "top right"),
                (avg_val, "dashdot", "green", "Avg", "top right"),
                (max_val, "dot", "blue", "Max", "top right"),
            )
            for value, dash, color, label, position in reference_lines:
                fig.add_hline(
                    y=value,
                    line_dash=dash,
                    line_color=color,
                    annotation_text=label,
                    annotation_position=position,
                )

    elif kind == "box":
        # Box plots inherently show distribution; no extra stats needed
        pass

    elif kind == "pie":
        # Pie charts represent proportions, not suitable for stats
        st.info("📊 Pie charts represent proportions. Additional stats are not applicable.")

    elif kind == "heatmap":
        # Heatmaps already reflect data intensity
        st.info("📊 Heatmaps inherently reflect distribution. No additional stats added.")

    else:
        st.warning(f"⚠️ No statistical overlays applied for unsupported chart type: '{chart_type}'.")

    return fig


# Dynamically generate Plotly visualizations based on GPT-4o suggestions
def generate_visualization(suggestion, df):
    """
    Generate a Plotly Express figure from one GPT-4o suggestion.

    Args:
        suggestion: Dict with optional keys ``chart_type`` (defaults to
            ``"bar"``), ``x_axis``, ``y_axis``, ``group_by`` and ``title``.
        df: DataFrame to plot.

    Returns:
        The figure with statistics overlays applied by
        ``add_stats_to_figure``, or ``None`` after a Streamlit
        warning/error when the suggestion cannot be rendered.

    If the Y-axis is missing it is inferred: the first of a few well-known
    metric columns that is numeric and differs from the X-axis, otherwise
    the first remaining numeric column.
    """
    # Plotly Express has no ``px.heatmap``; the closest x/y chart is
    # ``px.density_heatmap``.
    chart_aliases = {"heatmap": "density_heatmap"}

    # ``or "bar"`` also covers an explicit JSON null, which ``.get`` with a
    # default would pass through and crash on ``.lower()``.
    chart_type = str(suggestion.get("chart_type") or "bar").strip().lower()
    x_axis = suggestion.get("x_axis")
    y_axis = suggestion.get("y_axis")
    group_by = suggestion.get("group_by")

    # Step 1: Infer Y-axis if not provided
    if not y_axis:
        # Avoid using the same column for both axes
        candidates = [
            col for col in df.select_dtypes(include='number').columns
            if col != x_axis
        ]

        # Smart guess: prioritize salary or relevant metrics if available
        priority_columns = ("salary_in_usd", "income", "earnings", "revenue")
        y_axis = next((col for col in priority_columns if col in candidates), None)

        # Fallback to the first numeric column if no priority columns exist
        if y_axis is None and candidates:
            y_axis = candidates[0]

    # Step 2: Validate axes
    if not x_axis or not y_axis:
        st.warning("⚠️ Unable to determine appropriate columns for visualization.")
        return None

    # The model may name columns that do not exist; report that clearly
    # instead of relying on Plotly's error. Names must be strings because
    # they are title-cased below.
    missing = [
        col for col in (x_axis, y_axis)
        if not isinstance(col, str) or col not in df.columns
    ]
    if missing:
        st.warning(f"⚠️ Suggested column(s) not found in the data: {missing}")
        return None

    # Step 3: Dynamically select the Plotly function
    px_name = chart_aliases.get(chart_type, chart_type)
    # The name comes from model output: refuse private attributes and
    # anything on ``px`` that is not callable (e.g. submodules).
    plotly_function = None if px_name.startswith("_") else getattr(px, px_name, None)
    if not callable(plotly_function):
        st.warning(f"⚠️ Unsupported chart type '{chart_type}' suggested by GPT-4o.")
        return None

    # Step 4: Prepare dynamic plot arguments
    if chart_type == "pie":
        # px.pie takes names/values rather than x/y.
        plot_args = {"data_frame": df, "names": x_axis, "values": y_axis}
    else:
        plot_args = {"data_frame": df, "x": x_axis, "y": y_axis}
    # px.density_heatmap has no ``color`` argument, so grouping is skipped.
    if chart_type != "heatmap" and isinstance(group_by, str) and group_by in df.columns:
        plot_args["color"] = group_by

    x_label = x_axis.replace('_', ' ').title()
    y_label = y_axis.replace('_', ' ').title()
    title = suggestion.get("title") or f"{chart_type.title()} Plot of {y_label} by {x_label}"

    # Broad catch is deliberate: this is the UI boundary, and any Plotly
    # failure should become a message instead of crashing the app.
    try:
        # Step 5: Generate the visualization
        fig = plotly_function(**plot_args)
        fig.update_layout(
            title=title,
            xaxis_title=x_label,
            yaxis_title=y_label,
        )

        # Step 6: Apply statistics intelligently
        fig = add_stats_to_figure(fig, df, y_axis, chart_type)

        return fig

    except Exception as e:
        st.error(f"⚠️ Failed to generate visualization: {e}")
        return None


def generate_multiple_visualizations(suggestions, df):
    """
    Generate one figure per suggestion, skipping those that fail.

    Handles both single and multiple suggestions: a lone dict is treated
    as a one-element list, and ``None`` or an empty list yields ``[]``.

    Args:
        suggestions: A list of suggestion dicts, a single dict, or ``None``.
        df: DataFrame passed through to ``generate_visualization``.

    Returns:
        A list of the figures that could be generated. It never contains
        ``None``, and it is empty when nothing could be rendered.
    """
    if not suggestions:
        return []
    if isinstance(suggestions, dict):
        # Iterating a dict would yield its keys, not suggestions.
        suggestions = [suggestions]

    visualizations = []
    for suggestion in suggestions:
        if not isinstance(suggestion, dict):
            # Malformed model output; generate_visualization needs ``.get``.
            continue
        # Statistics are applied inside generate_visualization (its Step 6),
        # which knows the resolved y-axis even when it had to be inferred.
        # Re-applying them here would duplicate the overlays and would fail
        # on suggestions without a usable "y_axis" entry.
        fig = generate_visualization(suggestion, df)
        if fig is not None:
            visualizations.append(fig)

    # No fallback retry: re-running a suggestion that already failed gives
    # the same failure, and the old code could then append ``None``.
    return visualizations


def handle_visualization_suggestions(suggestions, df):
    """
    Render the chart(s) described by ``suggestions`` in the Streamlit app.

    A list with more than one entry is handed to
    ``generate_multiple_visualizations``; a single dict, or a list with
    exactly one entry, goes through ``generate_visualization``. Any other
    input produces no figures. A warning is shown when nothing could be
    generated; otherwise each figure is displayed at container width.
    """
    charts = []
    # ``count`` is None for anything that is not a list.
    count = len(suggestions) if isinstance(suggestions, list) else None

    if count is not None and count > 1:
        # Several suggestions: build them as a batch.
        charts = generate_multiple_visualizations(suggestions, df)
    elif count == 1 or isinstance(suggestions, dict):
        # Exactly one suggestion, given either bare or wrapped in a list.
        chosen = suggestions if count is None else suggestions[0]
        single_fig = generate_visualization(chosen, df)
        if single_fig:
            charts.append(single_fig)

    # Nothing usable came out of the suggestion(s).
    if not charts:
        st.warning("⚠️ Unable to generate any visualization based on the suggestion.")

    # Show whatever was produced.
    for chart in charts:
        st.plotly_chart(chart, use_container_width=True)





# ----------------- Revised ask_gpt4o_for_visualization (rebinds the name defined above) -----------------

def ask_gpt4o_for_visualization(query, df, llm, retries=2):
    """
    Ask the LLM for chart suggestions and return the valid ones.

    The prompt lists the numeric and non-numeric columns of ``df`` and asks
    for a JSON list of suggestion objects. The reply is decoded (an optional
    Markdown code fence around the JSON is tolerated) and validated; invalid
    replies are retried.

    Args:
        query: The user's question, embedded verbatim in the prompt.
        df: DataFrame whose columns are offered to the model.
        llm: Object exposing ``generate(prompt)`` that returns the reply
            text (assumed to be a ``str`` — confirm against the client used).
        retries: Extra attempts after the first one; negative values are
            treated as 0 so at least one attempt is always made.

    Returns:
        A non-empty list of dicts that each contain ``chart_type``,
        ``x_axis`` and ``y_axis``, or ``None`` when every attempt failed.
    """
    import json

    required_keys = {"chart_type", "x_axis", "y_axis"}

    # Identify numeric and categorical columns
    numeric_columns = df.select_dtypes(include='number').columns.tolist()
    categorical_columns = df.select_dtypes(exclude='number').columns.tolist()

    # Enhanced Prompt with More Examples
    prompt = f"""
    Analyze the following query and suggest the most suitable visualization(s) using the dataset.

    **Query:** "{query}"

    **Numeric Columns (for Y-axis):** {', '.join(numeric_columns) if numeric_columns else 'None'}
    **Categorical Columns (for X-axis or grouping):** {', '.join(categorical_columns) if categorical_columns else 'None'}

    Suggest visualizations in this exact JSON format:
    [
      {{
        "chart_type": "bar/box/line/scatter/pie/heatmap",
        "x_axis": "categorical_or_time_column",
        "y_axis": "numeric_column",
        "group_by": "optional_column_for_grouping",
        "title": "Title of the chart",
        "description": "Why this chart is suitable"
      }}
    ]

    **Examples:**  
    - For salary distribution:  
      {{
        "chart_type": "box",
        "x_axis": "job_title",
        "y_axis": "salary_in_usd",
        "group_by": "experience_level",
        "title": "Salary Distribution by Job Title and Experience",
        "description": "A box plot showing salary ranges across job titles and experience levels."
      }}

    - For company size comparison:  
      {{
        "chart_type": "bar",
        "x_axis": "company_size",
        "y_axis": "salary_in_usd",
        "group_by": null,
        "title": "Average Salary by Company Size",
        "description": "A bar chart comparing the average salaries across different company sizes."
      }}

    - For revenue trends over time:  
      {{
        "chart_type": "line",
        "x_axis": "year",
        "y_axis": "revenue",
        "group_by": null,
        "title": "Revenue Growth Over Years",
        "description": "A line chart showing the trend of revenue over the years."
      }}

    - For market share breakdown:  
      {{
        "chart_type": "pie",
        "x_axis": "market_segment",
        "y_axis": null,
        "group_by": null,
        "title": "Market Share by Segment",
        "description": "A pie chart showing the distribution of market share across various segments."
      }}

    - For correlation analysis:  
      {{
        "chart_type": "scatter",
        "x_axis": "years_of_experience",
        "y_axis": "salary_in_usd",
        "group_by": "job_title",
        "title": "Experience vs Salary by Job Title",
        "description": "A scatter plot showing the relationship between years of experience and salary across job titles."
      }}

    - For data density:  
      {{
        "chart_type": "heatmap",
        "x_axis": "department",
        "y_axis": "region",
        "group_by": null,
        "title": "Employee Distribution by Department and Region",
        "description": "A heatmap showing the concentration of employees across departments and regions."
      }}

    Only suggest visualizations that make sense for the data and the query.
    """

    # Always make at least one attempt, even for retries <= 0.
    retries = max(int(retries), 0)

    for attempt in range(retries + 1):
        try:
            # Generate response from the model
            response = llm.generate(prompt)

            # Load JSON response; chat models often wrap it in ``` fences.
            suggestions = json.loads(_strip_code_fence(response))
        except json.JSONDecodeError:
            st.warning(f"⚠️ Attempt {attempt + 1}: GPT-4o returned invalid JSON.")
        except Exception as e:
            # UI boundary: report any client failure and fall through to retry.
            st.error(f"⚠️ Error during GPT-4o call: {e}")
        else:
            # Validate response structure
            if isinstance(suggestions, dict):
                if required_keys <= suggestions.keys():
                    return [suggestions]
                st.warning("⚠️ GPT-4o's suggestion is incomplete.")
            else:
                # Only dict entries can be suggestions. Testing ``key in s``
                # on a string would be a substring match, and on a number
                # it would raise TypeError.
                candidates = suggestions if isinstance(suggestions, list) else []
                valid_suggestions = [
                    s for s in candidates
                    if isinstance(s, dict) and required_keys <= s.keys()
                ]
                if valid_suggestions:
                    return valid_suggestions
                st.warning("⚠️ GPT-4o did not suggest valid visualizations.")
            # A structurally invalid reply is as retryable as invalid JSON,
            # so fall through to the retry below instead of giving up.

        # Retry if necessary
        if attempt < retries:
            st.info("🔄 Retrying visualization suggestion...")

    st.error("❌ Failed to generate a valid visualization after multiple attempts.")
    return None


def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown ``` fence, if any."""
    if not isinstance(text, str):
        # Leave bytes/other values for json.loads to accept or reject.
        return text
    stripped = text.strip()
    if stripped.startswith("```"):
        # Drop the opening fence line, including an optional language tag.
        newline_at = stripped.find("\n")
        stripped = stripped[newline_at + 1:] if newline_at != -1 else stripped[3:]
        stripped = stripped.rstrip()
        if stripped.endswith("```"):
            stripped = stripped[:-3]
    return stripped.strip()