Spaces:
Sleeping
Sleeping
# core/llm.py
# -*- coding: utf-8 -*-
#
# PROJECT: CognitiveEDA v5.0 - The QuantumLeap Intelligence Platform
#
# DESCRIPTION: Handles all interactions with the Google Gemini large language model.
# This module encapsulates prompt engineering and API communication.
import logging | |
from typing import Any, Dict | |
import google.generativeai as genai | |
from core.analyzer import DataAnalyzer | |
from core.config import settings | |
class GeminiNarrativeGenerator:
    """
    A client for generating data analysis narratives using the Gemini API.

    The instance configures the ``google.generativeai`` SDK with the supplied
    key and keeps a ``GenerativeModel`` built from ``settings.GEMINI_MODEL``.
    Failures during generation are reported as Markdown strings rather than
    raised, so callers always receive displayable text.
    """

    def __init__(self, api_key: str):
        """
        Initializes the Gemini client.

        Args:
            api_key: The Google API key for authentication.
        """
        self.api_key = api_key
        genai.configure(api_key=self.api_key)
        self.model = genai.GenerativeModel(settings.GEMINI_MODEL)

    def _build_prompt(self, analyzer: "DataAnalyzer", context: Dict[str, Any]) -> str:
        """
        Constructs a detailed, context-aware prompt for the AI.

        Args:
            analyzer: Object exposing ``metadata`` (a mapping read for the
                keys ``shape``, ``data_quality_score``, ``total_missing``,
                ``high_corr_pairs`` and ``datetime_cols``) and ``df`` (whose
                first five rows are rendered with ``to_markdown``).
            context: Flags ``is_timeseries``, ``has_text`` and
                ``is_clusterable``; each truthy flag adds one guidance bullet.
                When no value in the mapping is truthy, a generic
                tabular-data bullet is used instead.

        Returns:
            The full prompt text sent to the model.
        """
        meta = analyzer.metadata
        data_snippet_md = analyzer.df.head(5).to_markdown(index=False)

        # Collect the context bullets in order, then join once.
        context_lines = ["**PRIMARY ANALYSIS CONTEXT:**\n"]
        if context.get('is_timeseries'):
            context_lines.append("- **Time-Series Detected:** The primary focus should be on temporal patterns, trends, seasonality, and stationarity. Suggest forecasting models like ARIMA or Prophet.\n")
        if context.get('has_text'):
            context_lines.append("- **Long-Form Text Detected:** Highlight the potential for Natural Language Processing (NLP) tasks such as sentiment analysis, topic modeling (LDA), or named-entity recognition (NER).\n")
        if context.get('is_clusterable'):
            context_lines.append("- **High-Dimensional Numeric Data Detected:** The dataset is a strong candidate for clustering. Discuss customer segmentation, anomaly detection, or grouping.\n")
        if not any(context.values()):
            context_lines.append("- **General Tabular Data:** Focus on distributions, correlations, and suitability for standard classification or regression models (e.g., Logistic Regression, Random Forest, XGBoost).\n")
        context_prompt = "".join(context_lines)

        # The template body is kept flush-left on purpose: any indentation
        # inside the triple-quoted string would become part of the prompt.
        return f"""
**ROLE:** You are "Cognitive Analyst," an elite AI data scientist from a top-tier consultancy. Your analysis is sharp, insightful, and business-oriented.
**TASK:** Generate a comprehensive, multi-part data discovery report in Markdown. Your insights must be directly actionable.
{context_prompt}
---
**DATASET METADATA:**
- **Shape:** {meta['shape'][0]} rows, {meta['shape'][1]} columns.
- **Data Quality Score:** {meta['data_quality_score']}% (A measure of non-missing cells).
- **Total Missing Values:** {meta['total_missing']:,}
- **Highly Correlated Pairs (Spearman > 0.8):** {meta['high_corr_pairs'] if meta['high_corr_pairs'] else 'None detected.'}
- **Data Snippet (First 5 Rows):**
{data_snippet_md}
---
**REQUIRED REPORT STRUCTURE (Strictly follow this Markdown format):**
# π AI Data Discovery & Strategy Report
## π 1. Executive Summary
* **Primary Business Objective:** (Deduce the most probable business goal. Is it customer churn prediction, sales forecasting, market segmentation?)
* **Key Insight:** (State the single most surprising or valuable finding from the metadata. E.g., "The high correlation between `ad_spend` and `revenue` suggests a strong ROI, but the presence of missing values in `ad_spend` could be skewing this.")
* **Overall Readiness:** (Give a verdict on the data's quality and its readiness for machine learning. E.g., "Requires moderate cleaning," or "Excellent quality, ready for advanced modeling.")
## π§ 2. Deep Dive & Quality Assessment
* **Structural Profile:** (Describe the dataset's composition: # numeric, # categorical, # datetime, # text features. Mention any notable column names.)
* **Data Integrity Audit:** (Elaborate on the `Data Quality Score`. Discuss the impact of the {meta['total_missing']:,} missing values. Are they concentrated in key columns? Suggest an imputation strategy like median/mode fill or a more advanced one like MICE.)
* **Redundancy & Multicollinearity:** (Comment on the detected high-correlation pairs. Explain the risk of multicollinearity in linear models and suggest a potential action, like dropping one of the features.)
## π‘ 3. Strategic Recommendations
* **Data Enhancement:** (Suggest a specific, valuable feature to engineer. E.g., "Create a `day_of_week` feature from the `{meta['datetime_cols'][0] if meta['datetime_cols'] else 'date'}` column to capture weekly patterns.")
* **Hypothesis to Test:** (Propose a clear, testable hypothesis. E.g., "Hypothesis: Customers acquired on weekends (`day_of_week` = Sat/Sun) have a higher lifetime value.")
* **Next Analytical Step:** (Based on the context, recommend a specific, advanced analysis. E.g., "Build a K-Means clustering model on the numeric features to identify distinct customer segments," or "Apply a `seasonal_decompose` on the time-series to validate the observed seasonality.")
"""

    def generate_narrative(self, analyzer: "DataAnalyzer") -> str:
        """
        Generates and returns the AI-powered narrative.

        Args:
            analyzer: An instance of the DataAnalyzer class. Its ``metadata``
                entries ``datetime_cols``, ``text_cols`` and ``numeric_cols``
                decide which context flags are passed to the prompt builder.

        Returns:
            A markdown string containing the AI-generated report. If the
            response carries no content, a "blocked" notice naming the
            reported block reason is returned; if anything raises, a
            "failed" notice embedding the exception text is returned.
        """
        logging.info("Generating AI narrative...")
        try:
            metadata = analyzer.metadata
            context = {
                'is_timeseries': bool(metadata['datetime_cols']),
                'has_text': bool(metadata['text_cols']),
                'is_clusterable': len(metadata['numeric_cols']) > 2,
            }
            prompt = self._build_prompt(analyzer, context)
            response = self.model.generate_content(prompt)

            # Check the candidate list before touching `response.parts`.
            # NOTE(review): recent google-generativeai releases appear to raise
            # ValueError from the `parts` accessor when no candidates are
            # returned (the blocked-prompt case), which would bypass this
            # branch and surface as a generic failure - confirm against the
            # pinned SDK version.
            if not response.candidates or not response.parts:
                feedback = response.prompt_feedback
                reason = feedback.block_reason.name if feedback else "Unknown"
                logging.warning("AI response blocked. Reason: %s", reason)
                return f"β **AI Report Generation Blocked by Safety Settings**\n**Reason:** `{reason}`."

            logging.info("AI narrative generated successfully.")
            return response.text
        except Exception as e:
            # Boundary handler: the caller expects displayable text, so the
            # error is logged with its traceback and reported in-band.
            logging.error("Gemini API call failed: %s", e, exc_info=True)
            return f"β **AI Report Generation Failed**\n**Error:** An unexpected error occurred while communicating with the API. Please check your API key and network connection. Details: `{str(e)}`"