Spaces:

mgbam
/

CognitiveEDA

Sleeping

App Files Files Community

CognitiveEDA / core /llm.py

mgbam

Create core/llm.py

b834ff0 verified 10 days ago

raw

history blame contribute delete

6.79 kB

	# core/llm.py

	# -*- coding: utf-8 -*-
	#
	# PROJECT: CognitiveEDA v5.0 - The QuantumLeap Intelligence Platform
	#
	# DESCRIPTION: Handles all interactions with the Google Gemini large language model.
	# This module encapsulates prompt engineering and API communication.

	import logging
	from typing import Any, Dict

	import google.generativeai as genai
	from core.analyzer import DataAnalyzer
	from core.config import settings

	class GeminiNarrativeGenerator:
	    """Client that asks the Gemini API for a data-analysis narrative.

	    The class has two jobs: assembling the analysis prompt
	    (``_build_prompt``) and sending it to the model
	    (``generate_narrative``).
	    """

	    def __init__(self, api_key: str):
	        """Configure the Gemini SDK and create the model handle.

	        Args:
	            api_key: The Google API key for authentication.
	        """
	        self.api_key = api_key
	        # configure() is a module-level SDK call, so the key is registered
	        # for the SDK as a whole rather than for this instance only.
	        genai.configure(api_key=self.api_key)
	        self.model = genai.GenerativeModel(settings.GEMINI_MODEL)

	    # The DataAnalyzer annotations below are quoted so that they are not
	    # evaluated when the methods are defined.
	    def _build_prompt(self, analyzer: "DataAnalyzer", context: Dict[str, Any]) -> str:
	        """Construct the context-aware Markdown prompt sent to the model.

	        Args:
	            analyzer: Provides ``metadata`` (the keys ``shape``,
	                ``data_quality_score``, ``total_missing``,
	                ``high_corr_pairs`` and ``datetime_cols`` are read) and
	                ``df``, whose first five rows are embedded as a Markdown
	                table.
	            context: Flags read via ``is_timeseries``, ``has_text`` and
	                ``is_clusterable``. When no value in the mapping is truthy,
	                the generic tabular-data guidance is used instead.

	        Returns:
	            The prompt text. It starts and ends with a newline, and none of
	            the template lines carries leading indentation, so Markdown
	            headings and bullets are not misread as indented code.
	        """
	        meta = analyzer.metadata
	        data_snippet_md = analyzer.df.head(5).to_markdown(index=False)

	        context_prompt = "PRIMARY ANALYSIS CONTEXT:\n"
	        if context.get('is_timeseries'):
	            context_prompt += "- Time-Series Detected: The primary focus should be on temporal patterns, trends, seasonality, and stationarity. Suggest forecasting models like ARIMA or Prophet.\n"
	        if context.get('has_text'):
	            context_prompt += "- Long-Form Text Detected: Highlight the potential for Natural Language Processing (NLP) tasks such as sentiment analysis, topic modeling (LDA), or named-entity recognition (NER).\n"
	        if context.get('is_clusterable'):
	            context_prompt += "- High-Dimensional Numeric Data Detected: The dataset is a strong candidate for clustering. Discuss customer segmentation, anomaly detection, or grouping.\n"
	        if not any(context.values()):
	            context_prompt += "- General Tabular Data: Focus on distributions, correlations, and suitability for standard classification or regression models (e.g., Logistic Regression, Random Forest, XGBoost).\n"

	        # Values that need a fallback are resolved up front so the template
	        # lines below stay simple.
	        total_missing = meta['total_missing']
	        high_corr = meta['high_corr_pairs'] if meta['high_corr_pairs'] else 'None detected.'
	        date_col = meta['datetime_cols'][0] if meta['datetime_cols'] else 'date'

	        # One entry per prompt line; the empty first and last entries give
	        # the leading and trailing newline. context_prompt already ends with
	        # a newline, which yields the blank line before the first "---".
	        sections = (
	            "",
	            'ROLE: You are "Cognitive Analyst," an elite AI data scientist from a top-tier consultancy. Your analysis is sharp, insightful, and business-oriented.',
	            "",
	            "TASK: Generate a comprehensive, multi-part data discovery report in Markdown. Your insights must be directly actionable.",
	            "",
	            context_prompt,
	            "---",
	            "DATASET METADATA:",
	            f"- Shape: {meta['shape'][0]} rows, {meta['shape'][1]} columns.",
	            f"- Data Quality Score: {meta['data_quality_score']}% (A measure of non-missing cells).",
	            f"- Total Missing Values: {total_missing:,}",
	            f"- Highly Correlated Pairs (Spearman > 0.8): {high_corr}",
	            "- Data Snippet (First 5 Rows):",
	            data_snippet_md,
	            "---",
	            "REQUIRED REPORT STRUCTURE (Strictly follow this Markdown format):",
	            "",
	            "# 🚀 AI Data Discovery & Strategy Report",
	            "",
	            "## 📄 1. Executive Summary",
	            "* Primary Business Objective: (Deduce the most probable business goal. Is it customer churn prediction, sales forecasting, market segmentation?)",
	            '* Key Insight: (State the single most surprising or valuable finding from the metadata. E.g., "The high correlation between `ad_spend` and `revenue` suggests a strong ROI, but the presence of missing values in `ad_spend` could be skewing this.")',
	            '* Overall Readiness: (Give a verdict on the data\'s quality and its readiness for machine learning. E.g., "Requires moderate cleaning," or "Excellent quality, ready for advanced modeling.")',
	            "",
	            "## 🧐 2. Deep Dive & Quality Assessment",
	            "* Structural Profile: (Describe the dataset's composition: # numeric, # categorical, # datetime, # text features. Mention any notable column names.)",
	            f"* Data Integrity Audit: (Elaborate on the `Data Quality Score`. Discuss the impact of the {total_missing:,} missing values. Are they concentrated in key columns? Suggest an imputation strategy like median/mode fill or a more advanced one like MICE.)",
	            "* Redundancy & Multicollinearity: (Comment on the detected high-correlation pairs. Explain the risk of multicollinearity in linear models and suggest a potential action, like dropping one of the features.)",
	            "",
	            "## 💡 3. Strategic Recommendations",
	            f'* Data Enhancement: (Suggest a specific, valuable feature to engineer. E.g., "Create a `day_of_week` feature from the `{date_col}` column to capture weekly patterns.")',
	            '* Hypothesis to Test: (Propose a clear, testable hypothesis. E.g., "Hypothesis: Customers acquired on weekends (`day_of_week` = Sat/Sun) have a higher lifetime value.")',
	            '* Next Analytical Step: (Based on the context, recommend a specific, advanced analysis. E.g., "Build a K-Means clustering model on the numeric features to identify distinct customer segments," or "Apply a `seasonal_decompose` on the time-series to validate the observed seasonality.")',
	            "",
	        )
	        return "\n".join(sections)

	    def generate_narrative(self, analyzer: "DataAnalyzer") -> str:
	        """Generate and return the AI-powered narrative.

	        Any ``Exception`` raised while deriving the context, building the
	        prompt or calling the model is logged and converted into a
	        user-facing error message instead of being propagated.

	        Args:
	            analyzer: An instance of the DataAnalyzer class. Its metadata
	                keys ``datetime_cols``, ``text_cols`` and ``numeric_cols``
	                decide which analysis context is put into the prompt.

	        Returns:
	            The model's Markdown report on success; otherwise a message
	            starting with "❌" that says the response was blocked (empty
	            ``response.parts``) or that the request failed.
	        """
	        logging.info("Generating AI narrative...")
	        try:
	            metadata = analyzer.metadata
	            context = {
	                'is_timeseries': bool(metadata['datetime_cols']),
	                'has_text': bool(metadata['text_cols']),
	                # More than two numeric columns are required for the
	                # clustering hint.
	                'is_clusterable': len(metadata['numeric_cols']) > 2,
	            }
	            prompt = self._build_prompt(analyzer, context)
	            response = self.model.generate_content(prompt)

	            # A response without parts is treated as blocked; the reason is
	            # taken from the prompt feedback when there is one.
	            if not response.parts:
	                feedback = response.prompt_feedback
	                reason = feedback.block_reason.name if feedback else "Unknown"
	                logging.warning("AI response blocked. Reason: %s", reason)
	                return f"❌ AI Report Generation Blocked by Safety Settings\nReason: `{reason}`."

	            logging.info("AI narrative generated successfully.")
	            return response.text
	        except Exception as e:
	            # Deliberately broad: callers receive a displayable string
	            # rather than an exception. The traceback goes to the log.
	            logging.error("Gemini API call failed: %s", e, exc_info=True)
	            return f"❌ AI Report Generation Failed\nError: An unexpected error occurred while communicating with the API. Please check your API key and network connection. Details: `{str(e)}`"