Spaces:

DrishtiSharma
/

sql-rag

Running

App Files Files Community

DrishtiSharma committed on Jan 13

Commit

9dc25a4

verified ·

1 Parent(s): 7ff7723

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -106

app.py CHANGED Viewed

@@ -191,11 +191,12 @@ COLUMN_SYNONYMS = {
 }
-# Fuzzy match to map query terms to dataset columns
 def fuzzy_match_columns(query, n=2):
     query = query.lower()
     all_synonyms = {synonym: col for col, synonyms in COLUMN_SYNONYMS.items() for synonym in synonyms}
-    words = query.replace("and", "").replace("vs", "").split()  # Remove "and"/"vs" for better matching
     matched_columns = []
     for word in words:
@@ -203,70 +204,70 @@ def fuzzy_match_columns(query, n=2):
         for match in matches:
             matched_columns.append(all_synonyms[match])
-    # Remove duplicates while preserving order
-    matched_columns = list(dict.fromkeys(matched_columns))
-    return matched_columns
-# Visualization generator with dynamic groupby handling
 def generate_visual_from_query(query, df):
     try:
-        # Step 1: Fuzzy match columns mentioned in the query
         matched_columns = fuzzy_match_columns(query)
-        # Step 2: Detect groupby intent (handling "and", "vs", "by")
-        if "and" in query or "vs" in query or "by" in query or len(matched_columns) > 1:
-            if len(matched_columns) >= 2:
-                x_axis = matched_columns[0]
-                group_by = matched_columns[1]
-            else:
-                x_axis, group_by = matched_columns[0], None
         else:
-            x_axis = matched_columns[0] if matched_columns else None
-            group_by = None
-        # Step 3: Visualization logic
-        if "distribution" in query and x_axis:
             fig = px.box(df, x=x_axis, y="salary_in_usd", color=group_by,
                          title=f"Salary Distribution by {x_axis.replace('_', ' ').title()}"
                                + (f" and {group_by.replace('_', ' ').title()}" if group_by else ""))
-            return fig
         elif "average" in query or "mean" in query:
             grouped_df = df.groupby([x_axis] + ([group_by] if group_by else []))["salary_in_usd"].mean().reset_index()
             fig = px.bar(grouped_df, x=x_axis, y="salary_in_usd", color=group_by,
-                         barmode="group",
                          title=f"Average Salary by {x_axis.replace('_', ' ').title()}"
                                + (f" and {group_by.replace('_', ' ').title()}" if group_by else ""))
-            return fig
-        elif "trend" in query and "work_year" in df.columns and x_axis:
             grouped_df = df.groupby(["work_year", x_axis])["salary_in_usd"].mean().reset_index()
             fig = px.line(grouped_df, x="work_year", y="salary_in_usd", color=x_axis,
-                          title=f"Salary Trend over Years by {x_axis.replace('_', ' ').title()}")
-            return fig
         elif "remote" in query:
             grouped_df = df.groupby(["remote_ratio"] + ([group_by] if group_by else []))["salary_in_usd"].mean().reset_index()
             fig = px.bar(grouped_df, x="remote_ratio", y="salary_in_usd", color=group_by,
-                         barmode="group", title="Remote Work Impact on Salary")
-            return fig
-        elif "company size" in query:
-            grouped_df = df.groupby(["company_size"] + ([group_by] if group_by else []))["salary_in_usd"].mean().reset_index()
-            fig = px.bar(grouped_df, x="company_size", y="salary_in_usd", color=group_by,
-                         title=f"Salary by Company Size"
-                               + (f" and {group_by.replace('_', ' ').title()}" if group_by else ""))
-            return fig
-        elif "country" in query or "location" in query:
-            grouped_df = df.groupby(["employee_residence"] + ([group_by] if group_by else []))["salary_in_usd"].mean().reset_index()
-            fig = px.bar(grouped_df, x="employee_residence", y="salary_in_usd", color=group_by,
-                         title=f"Salary by Employee Residence"
-                               + (f" and {group_by.replace('_', ' ').title()}" if group_by else ""))
-            return fig
         else:
-            st.warning("❓ No suitable visualization detected. Please refine your query.")
             return None
     except Exception as e:
@@ -274,71 +275,6 @@ def generate_visual_from_query(query, df):
         return None
-"""def map_query_to_column(query):
-    query = query.lower()
-    all_synonyms = {synonym: col for col, synonyms in COLUMN_SYNONYMS.items() for synonym in synonyms}
-    matches = get_close_matches(query, all_synonyms.keys(), n=1, cutoff=0.6)
-    if matches:
-        return all_synonyms[matches[0]]
-    else:
-        for col, synonyms in COLUMN_SYNONYMS.items():
-            if any(term in query for term in synonyms):
-                return col
-    return None"""
-"""# Visualization generator with synonym handling
-def generate_visual_from_query(query, df):
-    try:
-        query = query.lower()
-        # Map user terms to actual dataset columns
-        col1 = map_query_to_column(query)
-        col2 = None  # For dual-column charts
-        # Handle common queries
-        if "distribution" in query and col1:
-            fig = px.box(df, x=col1, y="salary_in_usd", title=f"Salary Distribution by {col1.replace('_', ' ').title()}")
-            return fig
-        elif "average salary" in query and col1:
-            grouped_df = df.groupby(col1)["salary_in_usd"].mean().reset_index()
-            fig = px.bar(grouped_df, x=col1, y="salary_in_usd", title=f"Average Salary by {col1.replace('_', ' ').title()}")
-            return fig
-        elif "remote" in query:
-            grouped_df = df.groupby("remote_ratio")["salary_in_usd"].mean().reset_index()
-            fig = px.bar(grouped_df, x="remote_ratio", y="salary_in_usd", title="Remote Work Impact on Salary")
-            return fig
-        elif "company size" in query or "organization size" in query:
-            grouped_df = df.groupby("company_size")["salary_in_usd"].mean().reset_index()
-            fig = px.bar(grouped_df, x="company_size", y="salary_in_usd", title="Salary by Company Size")
-            return fig
-        elif "country" in query or "location" in query:
-            grouped_df = df.groupby("employee_residence")["salary_in_usd"].mean().reset_index()
-            fig = px.bar(grouped_df, x="employee_residence", y="salary_in_usd", title="Salary by Employee Residence")
-            return fig
-        else:
-            st.warning("❓ I couldn't understand the query for visualization. Try asking about salary distribution, experience level, remote work, etc.")
-            return None
-    except Exception as e:
-        st.error(f"Error generating visualization: {e}")
-        return None"""
 # SQL-RAG Analysis
 if st.session_state.df is not None:
     temp_dir = tempfile.TemporaryDirectory()

 }
+# Fuzzy matcher for mapping query terms to dataset columns
 def fuzzy_match_columns(query, n=2):  # maps the words of a free-text query to column names via COLUMN_SYNONYMS
     query = query.lower()
     all_synonyms = {synonym: col for col, synonyms in COLUMN_SYNONYMS.items() for synonym in synonyms}  # inverted lookup: synonym -> column
+    words = query.replace("and", "").replace("vs", "").replace("by", "").split()  # NOTE(review): substring replace, so "and"/"vs"/"by" are also stripped from inside longer words (e.g. "brand" -> "br")
     matched_columns = []
     for word in words:  # NOTE(review): the line that assigns `matches` is elided from this diff view (hunk boundary)
         for match in matches:
             matched_columns.append(all_synonyms[match])
+    return list(dict.fromkeys(matched_columns))  # de-duplicate while keeping first-seen order
+# Statistical annotations for plots
+def add_stats_to_figure(fig, df, y_axis):  # annotates `fig` with min/max/mean of df[y_axis] and returns the same figure object
+    min_salary = df[y_axis].min()  # NOTE(review): NaN for an empty or all-NaN column, which would render as "$nan" in the text below
+    max_salary = df[y_axis].max()
+    avg_salary = df[y_axis].mean()
+    fig.add_annotation(
+        text=f"Min: ${min_salary:,.2f} | Max: ${max_salary:,.2f} | Avg: ${avg_salary:,.2f}",
+        xref="paper", yref="paper",  # paper coordinates: position is relative to the plotting area, not the data axes
+        x=0.5, y=1.1,  # horizontally centred, slightly above the top edge of the plotting area
+        showarrow=False,
+        font=dict(size=12, color="black"),
+        bgcolor="rgba(255, 255, 255, 0.7)"
+    )
+    return fig
+# Visualization generator
 def generate_visual_from_query(query, df):  # builds a Plotly figure for a free-text salary query; returns None when nothing matches or an exception is caught
     try:
         matched_columns = fuzzy_match_columns(query)
+        # Detect and handle multiple grouping columns
+        if len(matched_columns) >= 2:
+            x_axis, group_by = matched_columns[0], matched_columns[1]  # first match drives the x axis, second is passed as the colour grouping
+        elif len(matched_columns) == 1:
+            x_axis, group_by = matched_columns[0], None
         else:
+            st.warning("❓ No matching columns found. Try rephrasing your query.")
+            return None  # NOTE(review): this early exit also applies to "remote" queries, whose branch below does not use x_axis
+        # Handle distribution queries
+        if "distribution" in query:  # NOTE(review): `query` is not lower-cased in this function, so the keyword checks are case-sensitive unless the caller normalises it
             fig = px.box(df, x=x_axis, y="salary_in_usd", color=group_by,
                          title=f"Salary Distribution by {x_axis.replace('_', ' ').title()}"
                                + (f" and {group_by.replace('_', ' ').title()}" if group_by else ""))
+            return add_stats_to_figure(fig, df, "salary_in_usd")
+        # Handle average salary queries
         elif "average" in query or "mean" in query:
             grouped_df = df.groupby([x_axis] + ([group_by] if group_by else []))["salary_in_usd"].mean().reset_index()
             fig = px.bar(grouped_df, x=x_axis, y="salary_in_usd", color=group_by,
                          title=f"Average Salary by {x_axis.replace('_', ' ').title()}"
                                + (f" and {group_by.replace('_', ' ').title()}" if group_by else ""))
+            return add_stats_to_figure(fig, df, "salary_in_usd")  # NOTE(review): annotation stats are computed from the raw df, not from grouped_df
+        # Handle salary trends over time
+        elif "trend" in query and "work_year" in df.columns:  # group_by is not used in this branch; lines are coloured by x_axis
             grouped_df = df.groupby(["work_year", x_axis])["salary_in_usd"].mean().reset_index()
             fig = px.line(grouped_df, x="work_year", y="salary_in_usd", color=x_axis,
+                          title=f"Salary Trend Over Years by {x_axis.replace('_', ' ').title()}")
+            return add_stats_to_figure(fig, df, "salary_in_usd")
+        # Handle remote work queries
         elif "remote" in query:  # x_axis is unused in this branch; the x axis is always remote_ratio
             grouped_df = df.groupby(["remote_ratio"] + ([group_by] if group_by else []))["salary_in_usd"].mean().reset_index()
             fig = px.bar(grouped_df, x="remote_ratio", y="salary_in_usd", color=group_by,
+                         title="Remote Work Impact on Salary")
+            return add_stats_to_figure(fig, df, "salary_in_usd")
+        # Default behavior if query doesn't match anything specific
         else:
+            st.warning("❓ No suitable visualization generated. Try refining your query.")
             return None
     except Exception as e:  # NOTE(review): at least one line after this is elided by the diff view (hunk boundary); `e` is unused in the visible lines
         return None
 # SQL-RAG Analysis
 if st.session_state.df is not None:
     temp_dir = tempfile.TemporaryDirectory()