import pandas as pd
import numpy as np
import gradio as gr
import sqlite3
import os
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import logging

# Configure the root logger at import time with an INFO threshold; the
# module-level logging.error(...) calls below go through this root logger.
logging.basicConfig(level=logging.INFO)

class DataQualitySystem:
    """Load tabular files, compute quality metrics, flag anomalies and
    persist metrics / user feedback in a SQLite database.

    Attributes:
        db_name: Path of the SQLite database file used for persistence.
    """

    def __init__(self, db_name='data_quality.db'):
        """Create the system and make sure the database tables exist.

        Args:
            db_name: Path of the SQLite database file. Defaults to
                ``'data_quality.db'`` in the current working directory.
        """
        self.db_name = db_name
        self.setup_database()

    def _run_in_transaction(self, statements):
        """Execute ``(sql, params)`` pairs in one transaction.

        The connection is always closed, even when a statement raises.
        """
        conn = sqlite3.connect(self.db_name)
        try:
            # The connection context manager commits on success and rolls
            # back on error; it does not close the connection, hence the
            # surrounding try/finally.
            with conn:
                for sql, params in statements:
                    conn.execute(sql, params)
        finally:
            conn.close()

    def setup_database(self):
        """Create the ``quality_metrics`` and ``user_feedback`` tables if
        they do not exist yet."""
        self._run_in_transaction([
            (
                '''
            CREATE TABLE IF NOT EXISTS quality_metrics 
            (timestamp TEXT, metric TEXT, value REAL)
        ''',
                (),
            ),
            (
                '''
            CREATE TABLE IF NOT EXISTS user_feedback 
            (timestamp TEXT, row_index INTEGER, feedback TEXT)  
        ''',
                (),
            ),
        ])

    def load_and_process_data(self, file):
        """Read a CSV/XLSX file, run the quality checks and anomaly detection.

        Args:
            file: An object exposing the file path as ``.name`` (as the
                upload widget provides), a plain path string, or an
                ``os.PathLike``. ``None`` means nothing was uploaded.

        Returns:
            A ``(dataframe, message)`` tuple. ``dataframe`` is ``None`` when
            the file is missing, unsupported, or processing failed;
            ``message`` is a human-readable status in every case.
        """
        if file is None:
            return None, "No file uploaded. Please select a CSV or XLSX file."

        try:
            if isinstance(file, os.PathLike):
                # Path objects also have ``.name`` but it is only the
                # basename, so resolve them through the fspath protocol.
                file_path = os.fspath(file)
            else:
                file_path = getattr(file, 'name', file)
            file_path = str(file_path)

            # Compare case-insensitively so "DATA.CSV" is accepted too.
            lowered = file_path.lower()
            if lowered.endswith('.csv'):
                df = pd.read_csv(file_path)
            elif lowered.endswith('.xlsx'):
                df = pd.read_excel(file_path)
            else:
                return None, "Unsupported file format. Please use CSV or XLSX."

            # Metrics are computed before anomaly detection so the added
            # 'anomaly' columns do not influence them.
            metrics = self.initial_data_checks(df)
            df_with_anomalies = self.detect_anomalies(df)
            self.store_quality_metrics(metrics)

            return df_with_anomalies, "Data processed successfully!"
        except Exception as e:
            # UI boundary: report the failure to the user instead of raising,
            # but keep the traceback in the log.
            logging.exception("Error processing file: %s", e)
            return None, f"Error processing file: {str(e)}"

    def initial_data_checks(self, df):
        """Compute basic quality metrics for ``df``.

        Returns:
            A dict with ``total_rows``, ``null_values`` (total missing
            cells), ``duplicate_entries`` (fully duplicated rows) and one
            ``std_dev_<column>`` entry per numeric column.
        """
        metrics = {
            'total_rows': len(df),
            'null_values': int(df.isnull().sum().sum()),
            'duplicate_entries': int(df.duplicated().sum()),
        }

        # Calculate standard deviation for numeric columns
        for col in df.select_dtypes(include=[np.number]).columns:
            metrics[f'std_dev_{col}'] = df[col].std()

        return metrics

    def detect_anomalies(self, df):
        """Score rows with an IsolationForest over the numeric columns.

        Adds ``anomaly`` (the ``fit_predict`` labels) and ``anomaly_score``
        (the ``score_samples`` values) columns to ``df`` in place and returns
        it. When there are no numeric columns or no rows, ``df`` is returned
        unchanged.
        """
        numeric_df = df.select_dtypes(include=[np.number])
        if numeric_df.shape[0] == 0 or numeric_df.shape[1] == 0:
            # Nothing to fit a model on.
            return df

        # Treat infinities as missing, then fill gaps with the column median
        # (0.0 for all-missing columns) so incomplete rows can still be
        # scored. Frames without missing/infinite values are unaffected.
        # NOTE(review): scikit-learn estimators generally reject NaN/inf
        # input - confirm against the installed version.
        features = numeric_df.replace([np.inf, -np.inf], np.nan)
        features = features.fillna(features.median()).fillna(0.0)

        scaled_data = StandardScaler().fit_transform(features)
        model = IsolationForest(contamination=0.1, random_state=42)
        df['anomaly'] = model.fit_predict(scaled_data)
        df['anomaly_score'] = model.score_samples(scaled_data)
        return df

    def store_quality_metrics(self, metrics):
        """Persist every metric as a ``(timestamp, metric, value)`` row.

        All rows share one timestamp and are written in a single transaction.
        """
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        insert_sql = 'INSERT INTO quality_metrics (timestamp, metric, value) VALUES (?, ?, ?)'
        self._run_in_transaction([
            (insert_sql, (timestamp, metric, float(value)))
            for metric, value in metrics.items()
        ])

    def save_feedback(self, index, feedback):
        """Store one user feedback entry for the row at ``index``.

        Returns:
            A confirmation message for display in the UI.
        """
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self._run_in_transaction([
            (
                'INSERT INTO user_feedback (timestamp, row_index, feedback) VALUES (?, ?, ?)',
                (timestamp, index, feedback),
            ),
        ])
        return "Feedback saved successfully!"

    def generate_report(self, df):
        """Build the report figures for a processed DataFrame.

        Returns:
            A ``(summary_fig, anomaly_fig, missing_fig)`` tuple. All three
            are ``None`` when ``df`` is ``None``. ``summary_fig`` is ``None``
            when there are no numeric columns and ``anomaly_fig`` is ``None``
            when ``df`` has no ``anomaly_score`` column.
        """
        if df is None:
            return None, None, None

        # Create summary statistics plot. describe() raises on a frame
        # without columns, so skip the table when nothing is numeric.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            summary_stats = df[numeric_cols].describe()
            summary_fig = go.Figure(data=[
                go.Table(
                    header=dict(values=['Statistic'] + list(summary_stats.columns)),
                    cells=dict(values=[summary_stats.index] + [summary_stats[col].tolist() for col in summary_stats.columns])
                )
            ])
        else:
            summary_fig = None

        # Create anomaly distribution plot
        if 'anomaly_score' in df.columns:
            anomaly_fig = px.histogram(df, x='anomaly_score',
                                       title='Distribution of Anomaly Scores')
        else:
            anomaly_fig = None

        # Create missing values plot
        missing_data = df.isnull().sum()
        missing_fig = px.bar(x=missing_data.index, y=missing_data.values,
                             title='Missing Values by Column')

        return summary_fig, anomaly_fig, missing_fig

def create_gradio_interface():
    """Build the Gradio Blocks UI around a ``DataQualitySystem`` instance.

    The UI offers a file upload with a "Process Data" button, three plot
    tabs (summary statistics, anomaly distribution, missing values) and a
    row-level feedback form. Both buttons report their status in the same
    "Status" textbox.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    system = DataQualitySystem()

    def process_file(file):
        """Process the uploaded file and return status plus three figures."""
        df, message = system.load_and_process_data(file)
        if df is None:
            return message, None, None, None
        try:
            summary_fig, anomaly_fig, missing_fig = system.generate_report(df)
        except Exception as e:
            # UI boundary: show the failure in the status box rather than
            # letting the handler crash, and keep the traceback in the log.
            logging.exception("Error generating report: %s", e)
            return f"Error generating report: {e}", None, None, None
        return message, summary_fig, anomaly_fig, missing_fig

    def submit_feedback(index, feedback):
        """Validate the row index and store the feedback entry."""
        # The number widget may hand over None (empty field) or a float.
        if index is None:
            return "Please enter a row index before submitting feedback."
        if not float(index).is_integer() or index < 0:
            return "Row index must be a non-negative whole number."
        return system.save_feedback(int(index), feedback)

    # Create the interface
    with gr.Blocks() as app:
        gr.Markdown("# Data Quality Assurance System")

        with gr.Row():
            file_input = gr.File(label="Upload Data File (CSV or XLSX)")

        with gr.Row():
            process_btn = gr.Button("Process Data")

        output_message = gr.Textbox(label="Status")

        with gr.Tabs():
            with gr.TabItem("Summary Statistics"):
                summary_plot = gr.Plot()
            with gr.TabItem("Anomaly Distribution"):
                anomaly_plot = gr.Plot()
            with gr.TabItem("Missing Values"):
                missing_plot = gr.Plot()

        with gr.Row():
            feedback_index = gr.Number(label="Row Index")
            feedback_text = gr.Textbox(label="Feedback")
            feedback_btn = gr.Button("Submit Feedback")

        # Set up event handlers
        process_btn.click(
            process_file,
            inputs=[file_input],
            outputs=[output_message, summary_plot, anomaly_plot, missing_plot]
        )

        feedback_btn.click(
            submit_feedback,
            inputs=[feedback_index, feedback_text],
            outputs=[output_message]
        )

    return app

# Launch the interface only when this file is run as a script; importing the
# module defines the class and builder function without launching the UI.
if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch()