Spaces:
Sleeping
Sleeping
Create core/llm.py
Browse files- core/llm.py +111 -0
core/llm.py
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# core/llm.py
|
2 |
+
|
3 |
+
# -*- coding: utf-8 -*-
|
4 |
+
#
|
5 |
+
# PROJECT: CognitiveEDA v5.0 - The QuantumLeap Intelligence Platform
|
6 |
+
#
|
7 |
+
# DESCRIPTION: Handles all interactions with the Google Gemini large language model.
|
8 |
+
# This module encapsulates prompt engineering and API communication.
|
9 |
+
|
10 |
+
import logging
|
11 |
+
from typing import Any, Dict
|
12 |
+
|
13 |
+
import google.generativeai as genai
|
14 |
+
from core.analyzer import DataAnalyzer
|
15 |
+
from core.config import settings
|
16 |
+
|
17 |
+
class GeminiNarrativeGenerator:
    """
    A client for generating data analysis narratives using the Gemini API.

    The generator builds a Markdown prompt from a ``DataAnalyzer``'s metadata
    and a five-row preview of its DataFrame, sends it to the configured
    Gemini model and returns the model's text. ``generate_narrative`` reports
    blocked responses and errors as Markdown strings instead of raising.
    """

    def __init__(self, api_key: str):
        """
        Initializes the Gemini client.

        Args:
            api_key: The Google API key for authentication.
        """
        self.api_key = api_key
        # NOTE(review): configure() is called on the SDK module, so the key is
        # presumably shared process-wide -- confirm before creating several
        # generators with different keys.
        genai.configure(api_key=self.api_key)
        # The model name comes from the project settings, not from the caller.
        self.model = genai.GenerativeModel(settings.GEMINI_MODEL)

    def _build_prompt(self, analyzer: "DataAnalyzer", context: Dict[str, Any]) -> str:
        """
        Constructs a detailed, context-aware prompt for the AI.

        Args:
            analyzer: Object exposing ``metadata`` (a mapping with the keys
                ``shape``, ``data_quality_score``, ``total_missing``,
                ``high_corr_pairs`` and ``datetime_cols``) and ``df`` (an
                object supporting ``head(5).to_markdown(index=False)``).
            context: Flags steering the analysis focus. The keys read here
                are ``is_timeseries``, ``has_text`` and ``is_clusterable``;
                when every value in the mapping is falsy a generic
                tabular-data hint is used instead.

        Returns:
            The full prompt text to send to the model.

        Raises:
            KeyError: If ``analyzer.metadata`` lacks one of the keys above.
        """
        meta = analyzer.metadata
        data_snippet_md = analyzer.df.head(5).to_markdown(index=False)

        # Resolve every interpolated value up front so the template below
        # contains plain names only.
        shape = meta['shape']
        quality_score = meta['data_quality_score']
        total_missing = meta['total_missing']
        high_corr_pairs = meta['high_corr_pairs'] or 'None detected.'
        datetime_cols = meta['datetime_cols']
        # Used only inside an example sentence; falls back to a generic name.
        example_date_col = datetime_cols[0] if datetime_cols else 'date'

        context_lines = ["**PRIMARY ANALYSIS CONTEXT:**\n"]
        if context.get('is_timeseries'):
            context_lines.append("- **Time-Series Detected:** The primary focus should be on temporal patterns, trends, seasonality, and stationarity. Suggest forecasting models like ARIMA or Prophet.\n")
        if context.get('has_text'):
            context_lines.append("- **Long-Form Text Detected:** Highlight the potential for Natural Language Processing (NLP) tasks such as sentiment analysis, topic modeling (LDA), or named-entity recognition (NER).\n")
        if context.get('is_clusterable'):
            context_lines.append("- **High-Dimensional Numeric Data Detected:** The dataset is a strong candidate for clustering. Discuss customer segmentation, anomaly detection, or grouping.\n")
        if not any(context.values()):
            context_lines.append("- **General Tabular Data:** Focus on distributions, correlations, and suitability for standard classification or regression models (e.g., Logistic Regression, Random Forest, XGBoost).\n")
        context_prompt = "".join(context_lines)

        # NOTE(review): the glyphs that lead the report headings below look
        # like mis-decoded emoji -- confirm against the original file's
        # encoding before changing them; they are reproduced as found.
        return f"""
        **ROLE:** You are "Cognitive Analyst," an elite AI data scientist from a top-tier consultancy. Your analysis is sharp, insightful, and business-oriented.

        **TASK:** Generate a comprehensive, multi-part data discovery report in Markdown. Your insights must be directly actionable.

        {context_prompt}
        ---
        **DATASET METADATA:**
        - **Shape:** {shape[0]} rows, {shape[1]} columns.
        - **Data Quality Score:** {quality_score}% (A measure of non-missing cells).
        - **Total Missing Values:** {total_missing:,}
        - **Highly Correlated Pairs (Spearman > 0.8):** {high_corr_pairs}
        - **Data Snippet (First 5 Rows):**
        {data_snippet_md}
        ---
        **REQUIRED REPORT STRUCTURE (Strictly follow this Markdown format):**

        # π AI Data Discovery & Strategy Report

        ## π 1. Executive Summary
        * **Primary Business Objective:** (Deduce the most probable business goal. Is it customer churn prediction, sales forecasting, market segmentation?)
        * **Key Insight:** (State the single most surprising or valuable finding from the metadata. E.g., "The high correlation between `ad_spend` and `revenue` suggests a strong ROI, but the presence of missing values in `ad_spend` could be skewing this.")
        * **Overall Readiness:** (Give a verdict on the data's quality and its readiness for machine learning. E.g., "Requires moderate cleaning," or "Excellent quality, ready for advanced modeling.")

        ## π§ 2. Deep Dive & Quality Assessment
        * **Structural Profile:** (Describe the dataset's composition: # numeric, # categorical, # datetime, # text features. Mention any notable column names.)
        * **Data Integrity Audit:** (Elaborate on the `Data Quality Score`. Discuss the impact of the {total_missing:,} missing values. Are they concentrated in key columns? Suggest an imputation strategy like median/mode fill or a more advanced one like MICE.)
        * **Redundancy & Multicollinearity:** (Comment on the detected high-correlation pairs. Explain the risk of multicollinearity in linear models and suggest a potential action, like dropping one of the features.)

        ## π‘ 3. Strategic Recommendations
        * **Data Enhancement:** (Suggest a specific, valuable feature to engineer. E.g., "Create a `day_of_week` feature from the `{example_date_col}` column to capture weekly patterns.")
        * **Hypothesis to Test:** (Propose a clear, testable hypothesis. E.g., "Hypothesis: Customers acquired on weekends (`day_of_week` = Sat/Sun) have a higher lifetime value.")
        * **Next Analytical Step:** (Based on the context, recommend a specific, advanced analysis. E.g., "Build a K-Means clustering model on the numeric features to identify distinct customer segments," or "Apply a `seasonal_decompose` on the time-series to validate the observed seasonality.")
        """

    @staticmethod
    def _describe_block_reason(response: Any) -> str:
        """Returns a printable block reason for a response that has no parts."""
        feedback = response.prompt_feedback
        if not feedback:
            return "Unknown"
        block_reason = feedback.block_reason
        # Enum-like reasons expose ``.name``; fall back to the raw value so an
        # int or other non-enum reason does not raise and get misreported as
        # an API failure by the caller's broad ``except``.
        reason_name = getattr(block_reason, "name", None)
        if reason_name is not None:
            return reason_name
        return "Unknown" if block_reason is None else str(block_reason)

    def generate_narrative(self, analyzer: "DataAnalyzer") -> str:
        """
        Generates and returns the AI-powered narrative.

        Args:
            analyzer: An instance of the DataAnalyzer class. Its metadata
                keys ``datetime_cols``, ``text_cols`` and ``numeric_cols``
                decide which context hints go into the prompt.

        Returns:
            A markdown string containing the AI-generated report. If the
            response has no parts, or any exception occurs, a markdown
            message describing the problem is returned instead.
        """
        logging.info("Generating AI narrative...")
        try:
            metadata = analyzer.metadata
            context = {
                'is_timeseries': bool(metadata['datetime_cols']),
                'has_text': bool(metadata['text_cols']),
                # More than two numeric columns is the threshold for
                # suggesting clustering.
                'is_clusterable': len(metadata['numeric_cols']) > 2,
            }
            prompt = self._build_prompt(analyzer, context)
            response = self.model.generate_content(prompt)

            # A response without parts is treated as blocked.
            if not response.parts:
                reason = self._describe_block_reason(response)
                logging.warning("AI response blocked. Reason: %s", reason)
                return f"β **AI Report Generation Blocked by Safety Settings**\n**Reason:** `{reason}`."

            logging.info("AI narrative generated successfully.")
            return response.text
        except Exception as e:
            # Deliberate catch-all: the caller gets a displayable message and
            # the traceback goes to the log.
            logging.exception("Gemini API call failed: %s", e)
            return f"β **AI Report Generation Failed**\n**Error:** An unexpected error occurred while communicating with the API. Please check your API key and network connection. Details: `{e}`"
|