Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -234,7 +234,6 @@ def update_vector_store_with_plot(plot_text, existing_vector_store):
|
|
234 |
return existing_vector_store
|
235 |
|
236 |
def extract_plot_data(plot_info, df):
|
237 |
-
# Updated to handle Plotly.js JSON
|
238 |
plot_type = plot_info["type"]
|
239 |
x_col = plot_info["x"]
|
240 |
y_col = plot_info["y"] if "y" in plot_info else None
|
@@ -272,16 +271,6 @@ def extract_plot_data(plot_info, df):
|
|
272 |
plot_text += f"{col1} vs {col2}: {corr.loc[col2, col1]:.2f}\n"
|
273 |
return plot_text
|
274 |
|
275 |
-
def generate_3d_scatter_plot(params):
|
276 |
-
df = st.session_state.cleaned_data
|
277 |
-
match = re.search(r"([\w\s]+)\s+vs\s+([\w\s]+)\s+vs\s+([\w\s]+)", params)
|
278 |
-
if match and len(match.groups()) >= 3:
|
279 |
-
x_axis, y_axis, z_axis = match.group(1).strip(), match.group(2).strip(), match.group(3).strip()
|
280 |
-
if x_axis in df.columns and y_axis in df.columns and z_axis in df.columns:
|
281 |
-
fig = px.scatter_3d(df, x=x_axis, y=y_axis, z=z_axis, title=f'3D Scatter Plot of {x_axis} vs {y_axis} vs {z_axis}')
|
282 |
-
return fig.to_json()
|
283 |
-
return None
|
284 |
-
|
285 |
def get_chatbot_response(user_input, app_mode, vector_store=None, model="llama3-70b-8192"):
|
286 |
system_prompt = (
|
287 |
"You are an AI assistant in Data-Vision Pro, a data analysis app with RAG capabilities. "
|
@@ -379,34 +368,6 @@ def display_dataset_preview():
|
|
379 |
st.dataframe(st.session_state.cleaned_data.head(10), use_container_width=True)
|
380 |
st.markdown("---")
|
381 |
|
382 |
-
def suggest_data_cleaning(df):
|
383 |
-
suggestions = []
|
384 |
-
if df.isna().sum().sum() > 0:
|
385 |
-
for col in df.columns:
|
386 |
-
na_count = df[col].isna().sum()
|
387 |
-
if na_count > 0:
|
388 |
-
if na_count / df.shape[0] > 0.5:
|
389 |
-
suggestions.append(f"- Drop column '{col}' (>{50}% missing values)")
|
390 |
-
else:
|
391 |
-
suggestions.append(f"- Impute missing values in column '{col}' ({na_count} missing values)")
|
392 |
-
return "\n".join(suggestions) if suggestions else "No automatic cleaning suggestions."
|
393 |
-
|
394 |
-
def parse_command(command):
|
395 |
-
# ... (Previous command parser) ...
|
396 |
-
elif "show a 3d scatter plot" in command or "3d scatter plot of" in command:
|
397 |
-
params = command.replace("show a 3d scatter plot of", "").replace("3d scatter plot of", "").strip()
|
398 |
-
return generate_3d_scatter_plot, params
|
399 |
-
# ... (rest of the function is same)
|
400 |
-
|
401 |
-
def parse_multistep_command(command):
|
402 |
-
steps = command.split(';')
|
403 |
-
parsed_steps = []
|
404 |
-
for step in steps:
|
405 |
-
func, param = parse_command(step.strip())
|
406 |
-
if func:
|
407 |
-
parsed_steps.append((func, param))
|
408 |
-
return parsed_steps
|
409 |
-
|
410 |
# Main App
|
411 |
def main():
|
412 |
# Header
|
@@ -601,89 +562,84 @@ def main():
|
|
601 |
new_df[scale_cols] = scaler.fit_transform(new_df[scale_cols])
|
602 |
update_cleaned_data(new_df)
|
603 |
|
604 |
-
|
605 |
-
st.header("🔍
|
606 |
if 'cleaned_data' not in st.session_state:
|
607 |
st.warning("Please upload and clean data first.")
|
608 |
st.stop()
|
609 |
df = st.session_state.cleaned_data.copy()
|
610 |
|
611 |
-
|
612 |
-
|
613 |
-
|
614 |
-
|
615 |
-
|
616 |
-
|
617 |
-
|
618 |
-
|
619 |
-
|
620 |
-
|
621 |
-
|
622 |
-
|
623 |
-
|
624 |
-
|
625 |
-
|
626 |
-
|
627 |
-
|
628 |
-
|
629 |
-
|
630 |
-
|
631 |
-
|
632 |
-
|
633 |
-
|
634 |
-
|
635 |
-
|
636 |
-
|
637 |
-
|
638 |
-
|
639 |
-
|
640 |
-
|
641 |
-
|
642 |
-
|
643 |
-
|
644 |
-
|
645 |
-
|
646 |
-
|
647 |
-
|
648 |
-
|
649 |
-
|
650 |
-
|
651 |
-
|
652 |
-
|
653 |
-
|
654 |
-
fig = px.
|
655 |
-
|
656 |
-
|
657 |
-
|
658 |
-
|
659 |
-
|
660 |
-
|
661 |
-
|
662 |
-
|
663 |
-
|
664 |
-
|
665 |
-
|
666 |
-
|
667 |
-
|
668 |
-
|
669 |
-
|
670 |
-
|
671 |
-
|
672 |
-
|
673 |
-
|
674 |
-
|
675 |
-
|
676 |
-
|
677 |
-
|
678 |
-
|
679 |
-
|
680 |
-
|
681 |
-
|
682 |
-
st.text(plot_text)
|
683 |
-
|
684 |
-
except Exception as e:
|
685 |
-
st.error(f"Couldn't generate plot: {str(e)}")
|
686 |
-
|
687 |
|
688 |
# Chatbot Section
|
689 |
st.markdown("---")
|
|
|
234 |
return existing_vector_store
|
235 |
|
236 |
def extract_plot_data(plot_info, df):
|
|
|
237 |
plot_type = plot_info["type"]
|
238 |
x_col = plot_info["x"]
|
239 |
y_col = plot_info["y"] if "y" in plot_info else None
|
|
|
271 |
plot_text += f"{col1} vs {col2}: {corr.loc[col2, col1]:.2f}\n"
|
272 |
return plot_text
|
273 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
274 |
def get_chatbot_response(user_input, app_mode, vector_store=None, model="llama3-70b-8192"):
|
275 |
system_prompt = (
|
276 |
"You are an AI assistant in Data-Vision Pro, a data analysis app with RAG capabilities. "
|
|
|
368 |
st.dataframe(st.session_state.cleaned_data.head(10), use_container_width=True)
|
369 |
st.markdown("---")
|
370 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
371 |
# Main App
|
372 |
def main():
|
373 |
# Header
|
|
|
562 |
new_df[scale_cols] = scaler.fit_transform(new_df[scale_cols])
|
563 |
update_cleaned_data(new_df)
|
564 |
|
565 |
+
elif app_mode == "EDA":
|
566 |
+
st.header("🔍 Interactive Data Explorer")
|
567 |
if 'cleaned_data' not in st.session_state:
|
568 |
st.warning("Please upload and clean data first.")
|
569 |
st.stop()
|
570 |
df = st.session_state.cleaned_data.copy()
|
571 |
|
572 |
+
enhance_section_title("Dataset Overview")
|
573 |
+
with st.container():
|
574 |
+
col1, col2, col3, col4 = st.columns(4)
|
575 |
+
col1.metric("Total Rows", df.shape[0])
|
576 |
+
col2.metric("Total Columns", df.shape[1])
|
577 |
+
missing_percentage = df.isna().sum().sum() / df.size * 100
|
578 |
+
col3.metric("Missing Values", f"{df.isna().sum().sum()} ({missing_percentage:.1f}%)")
|
579 |
+
col4.metric("Duplicates", df.duplicated().sum())
|
580 |
+
|
581 |
+
tab1, tab2, tab3 = st.tabs(["Quick Preview", "Column Types", "Missing Matrix"])
|
582 |
+
with tab1:
|
583 |
+
st.write("First few rows of the dataset:")
|
584 |
+
st.dataframe(df.head(), use_container_width=True)
|
585 |
+
with tab2:
|
586 |
+
st.write("Column Data Types:")
|
587 |
+
type_counts = df.dtypes.value_counts().reset_index()
|
588 |
+
type_counts.columns = ['Type', 'Count']
|
589 |
+
st.dataframe(type_counts, use_container_width=True)
|
590 |
+
with tab3:
|
591 |
+
st.write("Missing Values Matrix:")
|
592 |
+
fig_missing = px.imshow(df.isna(), color_continuous_scale=['#e0e0e0', '#66c2a5'])
|
593 |
+
fig_missing.update_layout(coloraxis_colorscale=[[0, 'lightgrey'], [1, '#FF4B4B']])
|
594 |
+
st.plotly_chart(fig_missing, use_container_width=True)
|
595 |
+
|
596 |
+
enhance_section_title("Interactive Visualization Builder")
|
597 |
+
with st.container():
|
598 |
+
col1, col2 = st.columns([1, 3])
|
599 |
+
with col1:
|
600 |
+
plot_type = st.selectbox("Choose visualization type", [
|
601 |
+
"Scatter Plot", "Histogram", "Box Plot", "Line Chart", "Bar Chart", "Correlation Matrix"
|
602 |
+
])
|
603 |
+
x_axis = st.selectbox("X-axis", df.columns) if plot_type != "Correlation Matrix" else None
|
604 |
+
y_axis = st.selectbox("Y-axis", df.columns) if plot_type in ["Scatter Plot", "Box Plot", "Line Chart"] else None
|
605 |
+
color_by = st.selectbox("Color encoding", ["None"] + df.columns.tolist(), format_func=lambda x: "No color" if x == "None" else x) if plot_type != "Correlation Matrix" else None
|
606 |
+
|
607 |
+
with col2:
|
608 |
+
try:
|
609 |
+
fig = None
|
610 |
+
if plot_type == "Scatter Plot" and x_axis and y_axis:
|
611 |
+
fig = px.scatter(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None, title=f'Scatter Plot of {x_axis} vs {y_axis}')
|
612 |
+
elif plot_type == "Histogram" and x_axis:
|
613 |
+
fig = px.histogram(df, x=x_axis, color=color_by if color_by != "None" else None, nbins=30, title=f'Histogram of {x_axis}')
|
614 |
+
elif plot_type == "Box Plot" and x_axis and y_axis:
|
615 |
+
fig = px.box(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None, title=f'Box Plot of {x_axis} vs {y_axis}')
|
616 |
+
elif plot_type == "Line Chart" and x_axis and y_axis:
|
617 |
+
fig = px.line(df, x=x_axis, y=y_axis, color=color_by if color_by != "None" else None, title=f'Line Chart of {x_axis} vs {y_axis}')
|
618 |
+
elif plot_type == "Bar Chart" and x_axis:
|
619 |
+
fig = px.bar(df, x=x_axis, color=color_by if color_by != "None" else None, title=f'Bar Chart of {x_axis}')
|
620 |
+
elif plot_type == "Correlation Matrix":
|
621 |
+
numeric_df = df.select_dtypes(include=np.number)
|
622 |
+
if len(numeric_df.columns) > 1:
|
623 |
+
corr = numeric_df.corr()
|
624 |
+
fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu_r', zmin=-1, zmax=1, title='Correlation Matrix')
|
625 |
+
|
626 |
+
if fig:
|
627 |
+
fig.update_layout(template="plotly_white")
|
628 |
+
st.plotly_chart(fig, use_container_width=True)
|
629 |
+
st.session_state.last_plot = {
|
630 |
+
"type": plot_type,
|
631 |
+
"x": x_axis,
|
632 |
+
"y": y_axis,
|
633 |
+
"data": df[[x_axis, y_axis]].to_json() if y_axis else df[[x_axis]].to_json()
|
634 |
+
}
|
635 |
+
plot_text = extract_plot_data(st.session_state.last_plot, df)
|
636 |
+
st.session_state.vector_store = update_vector_store_with_plot(plot_text, st.session_state.vector_store)
|
637 |
+
with st.expander("Extracted Plot Data"):
|
638 |
+
st.text(plot_text)
|
639 |
+
else:
|
640 |
+
st.error("Please provide required inputs for the selected plot type.")
|
641 |
+
except Exception as e:
|
642 |
+
st.error(f"Couldn't create visualization: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
643 |
|
644 |
# Chatbot Section
|
645 |
st.markdown("---")
|