mgbam committed on
Commit
b834ff0
·
verified ·
1 Parent(s): 2a9e2e4

Create core/llm.py

Browse files
Files changed (1) hide show
  1. core/llm.py +111 -0
core/llm.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/llm.py
2
+
3
+ # -*- coding: utf-8 -*-
4
+ #
5
+ # PROJECT: CognitiveEDA v5.0 - The QuantumLeap Intelligence Platform
6
+ #
7
+ # DESCRIPTION: Handles all interactions with the Google Gemini large language model.
8
+ # This module encapsulates prompt engineering and API communication.
9
+
10
+ import logging
11
+ from typing import Any, Dict
12
+
13
+ import google.generativeai as genai
14
+ from core.analyzer import DataAnalyzer
15
+ from core.config import settings
16
+
17
class GeminiNarrativeGenerator:
    """
    A client for generating data analysis narratives using the Gemini API.

    The generator builds a Markdown prompt from a ``DataAnalyzer``'s metadata
    and the first five rows of its DataFrame, sends it to the configured
    Gemini model, and returns the model's Markdown answer. Failures are
    reported as user-facing Markdown strings rather than raised.
    """

    # (context flag, prompt bullet) pairs. A bullet is emitted, in this order,
    # for every flag that is truthy in the context dict.
    _CONTEXT_HINTS = (
        (
            'is_timeseries',
            "- **Time-Series Detected:** The primary focus should be on temporal patterns, trends, seasonality, and stationarity. Suggest forecasting models like ARIMA or Prophet.\n",
        ),
        (
            'has_text',
            "- **Long-Form Text Detected:** Highlight the potential for Natural Language Processing (NLP) tasks such as sentiment analysis, topic modeling (LDA), or named-entity recognition (NER).\n",
        ),
        (
            'is_clusterable',
            "- **High-Dimensional Numeric Data Detected:** The dataset is a strong candidate for clustering. Discuss customer segmentation, anomaly detection, or grouping.\n",
        ),
    )

    # Fallback bullet used when no context value at all is truthy.
    _GENERAL_HINT = "- **General Tabular Data:** Focus on distributions, correlations, and suitability for standard classification or regression models (e.g., Logistic Regression, Random Forest, XGBoost).\n"

    def __init__(self, api_key: str) -> None:
        """
        Initializes the Gemini client.

        Passes the key to ``genai.configure`` and creates a
        ``genai.GenerativeModel`` for ``settings.GEMINI_MODEL``.

        Args:
            api_key: The Google API key for authentication.
        """
        self.api_key = api_key
        genai.configure(api_key=self.api_key)
        self.model = genai.GenerativeModel(settings.GEMINI_MODEL)

    def _build_prompt(self, analyzer: "DataAnalyzer", context: Dict[str, Any]) -> str:
        """
        Constructs a detailed, context-aware prompt for the AI.

        Args:
            analyzer: Object exposing ``metadata`` (a mapping with the keys
                ``shape``, ``data_quality_score``, ``total_missing``,
                ``high_corr_pairs`` and ``datetime_cols``) and ``df`` (whose
                first five rows are rendered with ``to_markdown``).
            context: Flags selecting which analysis hints are included. If no
                value in the mapping is truthy, a general tabular-data hint
                is used instead.

        Returns:
            The full prompt text, including the required report structure.

        Raises:
            KeyError: If ``analyzer.metadata`` lacks one of the keys above.
        """
        meta = analyzer.metadata
        data_snippet_md = analyzer.df.head(5).to_markdown(index=False)

        bullets = [text for flag, text in self._CONTEXT_HINTS if context.get(flag)]
        if not any(context.values()):
            bullets.append(self._GENERAL_HINT)
        context_prompt = "**PRIMARY ANALYSIS CONTEXT:**\n" + "".join(bullets)

        corr_pairs = meta['high_corr_pairs'] or 'None detected.'
        date_col = meta['datetime_cols'][0] if meta['datetime_cols'] else 'date'

        # The template is deliberately flush-left: Markdown headings must start
        # at column 0, and the multi-line table in ``data_snippet_md`` would be
        # only partially indented if the surrounding text carried an indent.
        return f"""
**ROLE:** You are "Cognitive Analyst," an elite AI data scientist from a top-tier consultancy. Your analysis is sharp, insightful, and business-oriented.

**TASK:** Generate a comprehensive, multi-part data discovery report in Markdown. Your insights must be directly actionable.

{context_prompt}
---
**DATASET METADATA:**
- **Shape:** {meta['shape'][0]} rows, {meta['shape'][1]} columns.
- **Data Quality Score:** {meta['data_quality_score']}% (A measure of non-missing cells).
- **Total Missing Values:** {meta['total_missing']:,}
- **Highly Correlated Pairs (Spearman > 0.8):** {corr_pairs}
- **Data Snippet (First 5 Rows):**
{data_snippet_md}
---
**REQUIRED REPORT STRUCTURE (Strictly follow this Markdown format):**

# 🚀 AI Data Discovery & Strategy Report

## 📄 1. Executive Summary
* **Primary Business Objective:** (Deduce the most probable business goal. Is it customer churn prediction, sales forecasting, market segmentation?)
* **Key Insight:** (State the single most surprising or valuable finding from the metadata. E.g., "The high correlation between `ad_spend` and `revenue` suggests a strong ROI, but the presence of missing values in `ad_spend` could be skewing this.")
* **Overall Readiness:** (Give a verdict on the data's quality and its readiness for machine learning. E.g., "Requires moderate cleaning," or "Excellent quality, ready for advanced modeling.")

## 🧐 2. Deep Dive & Quality Assessment
* **Structural Profile:** (Describe the dataset's composition: # numeric, # categorical, # datetime, # text features. Mention any notable column names.)
* **Data Integrity Audit:** (Elaborate on the `Data Quality Score`. Discuss the impact of the {meta['total_missing']:,} missing values. Are they concentrated in key columns? Suggest an imputation strategy like median/mode fill or a more advanced one like MICE.)
* **Redundancy & Multicollinearity:** (Comment on the detected high-correlation pairs. Explain the risk of multicollinearity in linear models and suggest a potential action, like dropping one of the features.)

## 💡 3. Strategic Recommendations
* **Data Enhancement:** (Suggest a specific, valuable feature to engineer. E.g., "Create a `day_of_week` feature from the `{date_col}` column to capture weekly patterns.")
* **Hypothesis to Test:** (Propose a clear, testable hypothesis. E.g., "Hypothesis: Customers acquired on weekends (`day_of_week` = Sat/Sun) have a higher lifetime value.")
* **Next Analytical Step:** (Based on the context, recommend a specific, advanced analysis. E.g., "Build a K-Means clustering model on the numeric features to identify distinct customer segments," or "Apply a `seasonal_decompose` on the time-series to validate the observed seasonality.")
"""

    def generate_narrative(self, analyzer: "DataAnalyzer") -> str:
        """
        Generates and returns the AI-powered narrative.

        Any exception raised while building the prompt or calling the model
        is logged and converted into an error message, so this method does
        not propagate ``Exception`` subclasses to the caller.

        Args:
            analyzer: An instance of the DataAnalyzer class.

        Returns:
            A markdown string containing the AI-generated report, or a
            Markdown-formatted message describing why no report was produced.
        """
        logging.info("Generating AI narrative...")
        try:
            metadata = analyzer.metadata
            context = {
                'is_timeseries': bool(metadata['datetime_cols']),
                'has_text': bool(metadata['text_cols']),
                'is_clusterable': len(metadata['numeric_cols']) > 2,
            }
            prompt = self._build_prompt(analyzer, context)
            response = self.model.generate_content(prompt)

            if not response.parts:
                # Walk the feedback chain defensively: any missing link must
                # degrade to "Unknown" rather than raise and be misreported
                # below as an API communication failure.
                feedback = getattr(response, "prompt_feedback", None)
                block_reason = getattr(feedback, "block_reason", None)
                reason = getattr(block_reason, "name", None) or "Unknown"
                logging.warning("AI response blocked. Reason: %s", reason)
                return f"❌ **AI Report Generation Blocked by Safety Settings**\n**Reason:** `{reason}`."

            narrative = response.text
            logging.info("AI narrative generated successfully.")
            return narrative
        except Exception as e:
            # Deliberately broad: this is the boundary to the UI, which expects
            # a displayable string instead of an exception.
            logging.exception("Gemini API call failed: %s", e)
            return f"❌ **AI Report Generation Failed**\n**Error:** An unexpected error occurred while communicating with the API. Please check your API key and network connection. Details: `{e}`"