FireShadow committed
Commit 1721aea · 0 parents

Initial clean commit

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete changeset.
Files changed (50)
  1. .env.example +0 -0
  2. .gitattributes +37 -0
  3. .gitignore +217 -0
  4. README copy.md +198 -0
  5. README.md +13 -0
  6. app.py +339 -0
  7. auto_causal/__init__.py +50 -0
  8. auto_causal/agent.py +394 -0
  9. auto_causal/components/__init__.py +28 -0
  10. auto_causal/components/dataset_analyzer.py +853 -0
  11. auto_causal/components/decision_tree.py +366 -0
  12. auto_causal/components/decision_tree_llm.py +218 -0
  13. auto_causal/components/explanation_generator.py +404 -0
  14. auto_causal/components/input_parser.py +456 -0
  15. auto_causal/components/method_validator.py +327 -0
  16. auto_causal/components/output_formatter.py +138 -0
  17. auto_causal/components/query_interpreter.py +580 -0
  18. auto_causal/components/state_manager.py +40 -0
  19. auto_causal/config.py +97 -0
  20. auto_causal/methods/__init__.py +44 -0
  21. auto_causal/methods/backdoor_adjustment/__init__.py +0 -0
  22. auto_causal/methods/backdoor_adjustment/diagnostics.py +92 -0
  23. auto_causal/methods/backdoor_adjustment/estimator.py +105 -0
  24. auto_causal/methods/backdoor_adjustment/llm_assist.py +176 -0
  25. auto_causal/methods/causal_method.py +88 -0
  26. auto_causal/methods/diff_in_means/__init__.py +0 -0
  27. auto_causal/methods/diff_in_means/diagnostics.py +60 -0
  28. auto_causal/methods/diff_in_means/estimator.py +107 -0
  29. auto_causal/methods/diff_in_means/llm_assist.py +95 -0
  30. auto_causal/methods/difference_in_differences/diagnostics.py +345 -0
  31. auto_causal/methods/difference_in_differences/estimator.py +463 -0
  32. auto_causal/methods/difference_in_differences/llm_assist.py +362 -0
  33. auto_causal/methods/difference_in_differences/utils.py +65 -0
  34. auto_causal/methods/generalized_propensity_score/__init__.py +3 -0
  35. auto_causal/methods/generalized_propensity_score/diagnostics.py +196 -0
  36. auto_causal/methods/generalized_propensity_score/estimator.py +386 -0
  37. auto_causal/methods/generalized_propensity_score/llm_assist.py +208 -0
  38. auto_causal/methods/instrumental_variable/__init__.py +1 -0
  39. auto_causal/methods/instrumental_variable/diagnostics.py +218 -0
  40. auto_causal/methods/instrumental_variable/estimator.py +370 -0
  41. auto_causal/methods/instrumental_variable/llm_assist.py +240 -0
  42. auto_causal/methods/linear_regression/__init__.py +0 -0
  43. auto_causal/methods/linear_regression/diagnostics.py +76 -0
  44. auto_causal/methods/linear_regression/estimator.py +355 -0
  45. auto_causal/methods/linear_regression/llm_assist.py +146 -0
  46. auto_causal/methods/propensity_score/__init__.py +13 -0
  47. auto_causal/methods/propensity_score/base.py +80 -0
  48. auto_causal/methods/propensity_score/diagnostics.py +74 -0
  49. auto_causal/methods/propensity_score/llm_assist.py +45 -0
  50. auto_causal/methods/propensity_score/matching.py +341 -0
.env.example ADDED
File without changes
.gitattributes ADDED
@@ -0,0 +1,37 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.pdf filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,217 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be added to the global gitignore or merged into this project gitignore. For a PyCharm
+ # project, it is recommended to include directory-based project settings:
+ .idea/
+
+ # VS Code
+ .vscode/
+
+ # Data files
+ *.csv
+ *.xlsx
+ *.xls
+ *.json
+ *.parquet
+ *.pickle
+ *.pkl
+ *.h5
+ *.hdf5
+
+ # Model files
+ *.model
+ *.joblib
+ *.sav
+
+ # Output directories
+ outputs/
+ results/
+ logs/
+ checkpoints/
+ wandb/
+
+ # Temporary files
+ *.tmp
+ *.temp
+ *~
+
+ # OS generated files
+ .DS_Store
+ .DS_Store?
+ ._*
+ .Spotlight-V100
+ .Trashes
+ ehthumbs.db
+ Thumbs.db
+
+ # LLM API keys and secrets
+ .env.local
+ .env.production
+ secrets.json
+ api_keys.txt
+
+ # Experiment tracking
+ mlruns/
+ .mlflow/
+
+ # Large files (adjust sizes as needed)
+ *.zip
+ *.tar.gz
+ *.rar
+
+ # Project specific
+ tests/output/
README copy.md ADDED
@@ -0,0 +1,198 @@
+ <h1 align="center">
+ <img src="blob/main/asset/cais.png" width="400" alt="CAIS" />
+ <br>
+ Causal AI Scientist: Facilitating Causal Data Science with
+ Large Language Models
+ </h1>
+ <!-- <p align="center">
+ <a href="https://causalcopilot.com/"><b>[Demo]</b></a> •
+ <a href="https://github.com/Lancelot39/Causal-Copilot"><b>[Code]</b></a> •
+ <a href=""><b>[arXiv (coming soon)]</b></a>
+ </p> -->
+
+ **Causal AI Scientist (CAIS)** is an LLM-powered tool for generating data-driven answers to natural language causal queries. It takes as input a natural language query (for example, "Does participating in a job training program lead to higher income?"), an accompanying dataset, and a description of that dataset. CAIS then frames a suitable causal estimation problem by selecting appropriate treatment and outcome variables, chooses a suitable method for causal effect estimation, implements it, runs diagnostic tests, and finally interprets the numerical results in the context of the original query.
+
+ This repo includes instructions both for using the tool to perform causal analysis on a dataset of interest and for reproducing the results from our paper.
+
+ **Note**: This repository is a work in progress and will be updated with additional instructions and files.
+
+ <!-- ## 1. Introduction
+
+ Causal effect estimation is central to evidence-based decision-making across domains like social sciences, healthcare, and economics. However, it requires specialized expertise to select the right inference method, identify valid variables, and validate results.
+
+ **CAIS (Causal AI Scientist)** automates this process using Large Language Models (LLMs) to:
+ - Parse a natural language causal query.
+ - Analyze the dataset characteristics.
+ - Select the appropriate causal inference method via a decision tree and prompting strategies.
+ - Execute the method using pre-defined code templates.
+ - Validate and interpret the results.
+
+ <div style="text-align: center;">
+ <img src="blob/main/asset/CAIS-arch.png" width="990" alt="CAIS" />
+ </div>
+
+ **Key Features:**
+ - End-to-end causal estimation with minimal user input.
+ - Supports a wide range of methods:
+   - **Econometric:** Difference-in-Differences (DiD), Instrumental Variables (IV), Ordinary Least Squares (OLS), Regression Discontinuity Design (RDD).
+   - **Causal Graph-based:** Backdoor adjustment, Frontdoor adjustment.
+ - Combines structured reasoning (decision tree) with LLM-powered interpretation.
+ - Works on clean textbook datasets, messy real-world datasets, and synthetic scenarios.
+
+
+ CAIS consists of three main stages, powered by a **decision-tree-driven reasoning pipeline**:
+
+ ### **Stage 1: Variable and Method Selection**
+ 1. **Dataset & Query Analysis**
+    - The LLM inspects the dataset description, variable names, and statistical summaries.
+    - Identifies treatment, outcome, and covariates.
+ 2. **Property Detection**
+    - Uses targeted prompts to detect dataset properties:
+      - Randomized vs. observational
+      - Presence of temporal/running variables
+      - Availability of valid instruments
+ 3. **Decision Tree Traversal**
+    - Traverses a predefined causal inference decision tree (Fig. B in the paper).
+    - Maps detected properties to the most appropriate estimation method.
+
+ ---
+
+ ### **Stage 2: Causal Inference Execution**
+ 1. **Template-based Code Generation**
+    - Predefined Python templates for each method (e.g., DiD, IV, OLS).
+    - Variables from Stage 1 are substituted into the templates.
+ 2. **Diagnostics & Validation**
+    - Runs statistical tests and checks assumptions where applicable.
+    - Handles basic data preprocessing (e.g., type conversion for DoWhy).
+
+ ---
+
+ ### **Stage 3: Result Interpretation**
+ - The LLM interprets numerical results and diagnostics in the context of the user’s causal query.
+ - Outputs:
+   - Estimated causal effect (ATE, ATT, or LATE).
+   - Standard errors, confidence intervals.
+   - Plain-language explanation.
+
+ ---
+ ## 3. Evaluation
+
+ We evaluate **CAIS** across three diverse dataset collections:
+ 1. **QRData (Textbook Examples)** – curated, clean datasets with known causal effects.
+ 2. **Real-World Studies** – empirical datasets from research papers (economics, health, political science).
+ 3. **Synthetic Data** – generated with controlled causal structures to ensure balanced method coverage.
+
+ ### **Metrics**
+ We assess CAIS on:
+ - **Method Selection Accuracy (MSA)** – the percentage of cases in which CAIS selects the correct inference method, as given by the reference.
+ - **Mean Relative Error (MRE)** – the average relative error between CAIS’s estimated causal effect and the reference value.
+
+
+ <p align="center">
+ <table>
+ <tr>
+ <td align="center">
+ <img src="blob/main/asset/CAIS-MRE.png" width="450" alt="CAIS MRE"/>
+ </td>
+ <td align="center">
+ <img src="blob/main/asset/CAIS-msa.png" width="450" alt="CAIS MSA"/>
+ </td>
+ </tr>
+ </table>
+ </p>
+ -->
+
+ ## Getting Started
+
+ #### 🔧 Environment Installation
+
+
+ **Prerequisites:**
+ - **Python 3.10** (create a new conda environment first)
+ - Required Python libraries (specified in `requirements.txt`)
+
+
+ **Step 1: Copy the example configuration**
+ ```bash
+ cp .env.example .env
+ ```
+
+ **Step 2: Create a Python 3.10 environment**
+ ```bash
+ # Create a new conda environment with Python 3.10
+ conda create -n auto_causal python=3.10
+ conda activate auto_causal
+ pip install -r requirements.txt
+ ```
+
+ **Step 3: Set up the auto_causal library**
+ ```bash
+ pip install -e .
+ ```
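+
+ As a quick sanity check that the editable install worked (a minimal sketch, assuming the dependencies from `requirements.txt` are installed so the package imports cleanly):
+ ```python
+ # Verify that the auto_causal package is importable after `pip install -e .`
+ import auto_causal
+
+ print(auto_causal.__version__)  # the package declares version "0.1.0"
+ ```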
+
+ ## Dataset Information
+
+ All datasets used to evaluate CAIS and the baseline models are available in the `data/` directory. Specifically:
+
+ * `all_data`: Folder containing all CSV files from the QRData and real-world study collections.
+ * `synthetic_data`: Folder containing all CSV files corresponding to the synthetic datasets.
+ * `qr_info.csv`: Metadata for the QRData files. For each file, this includes the filename, description, causal query, reference causal effect, intended inference method, and additional remarks.
+ * `real_info.csv`: Metadata for the real-world datasets.
+ * `synthetic_info.csv`: Metadata for the synthetic datasets.
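+
+ To preview any of these metadata files before launching an analysis, they can be loaded with pandas (a minimal sketch; the field contents are described above, but confirm the exact column names against the CSV header):
+ ```python
+ import pandas as pd
+
+ # Inspect the QRData metadata described above
+ qr_info = pd.read_csv("data/qr_info.csv")
+ print(qr_info.columns.tolist())
+ print(qr_info.head())
+ ```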
+
+ ## Run
+ To execute CAIS, run:
+ ```bash
+ python main/run_cais.py \
+     --metadata_path {path_to_metadata} \
+     --data_dir {path_to_data_folder} \
+     --output_dir {output_folder} \
+     --output_name {output_filename} \
+     --llm_name {llm_name}
+ ```
+ Args:
+
+ * `metadata_path` (str): Path to the CSV file containing the queries, dataset descriptions, and data file names
+ * `data_dir` (str): Path to the folder containing the data in CSV format
+ * `output_dir` (str): Path to the folder where the output JSON results will be saved
+ * `output_name` (str): Name of the JSON file where the outputs will be saved
+ * `llm_name` (str): Name of the LLM to be used (e.g., 'gpt-4', 'claude-3', etc.)
+
+ A specific example:
+ ```bash
+ python main/run_cais.py \
+     --metadata_path "data/qr_info.csv" \
+     --data_dir "data/all_data" \
+     --output_dir "output" \
+     --output_name "results_qr_4o" \
+     --llm_name "gpt-4o-mini"
+ ```
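+
+ CAIS can also be invoked programmatically through the `run_causal_analysis` entry point, which is how the Gradio demo in `app.py` calls it. A minimal sketch (the dataset path here is hypothetical, and the exact shape of the returned dictionary should be checked against your own run):
+ ```python
+ from auto_causal.agent import run_causal_analysis
+
+ result = run_causal_analysis(
+     query="Does participating in a job training program lead to higher income?",
+     dataset_path="data/all_data/your_dataset.csv",  # hypothetical file name
+     dataset_description="Observational data on program participation and income.",
+ )
+
+ # app.py reads nested fields such as result["results"]["results"]["method_used"]
+ # and the effect estimates / confidence intervals from the same structure.
+ print(result)
+ ```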
+
+
+ ## Reproducing paper results
+ **Will be updated soon**
+
+ **⚠️ Important Notes:**
+ - Keep your `.env` file secure and never commit it to version control.
+
+ ## License
+
+ Distributed under the MIT License. See `LICENSE` for more information.
+
+
+
+ <!--## Contributors
+
+
+
+ **Core Contributors**: Vishal Verma, Sawal Acharya, Devansh Bhardwaj
+
+ **Other Contributors**: Zhijing Jin, Ana Hagihat, Samuel Simko
+
+ ---
+
+ ## Contact
+
+ For additional information, questions, or feedback, please contact **[Vishal Verma]([email protected])**, **[Sawal Acharya]([email protected])**, or **[Devansh Bhardwaj]([email protected])**. We welcome contributions! Come and join us!
+ -->
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Causal AI Scientist
+ emoji: 🌍
+ colorFrom: green
+ colorTo: pink
+ sdk: gradio
+ sdk_version: 5.41.1
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,339 @@
+ import os
+ import sys
+ import json
+ from pathlib import Path
+ import gradio as gr
+ import time
+
+ # Make your repo importable (expecting a folder named causal-agent at repo root)
+ sys.path.append(str(Path(__file__).parent / "causal-agent"))
+
+ from auto_causal.agent import run_causal_analysis  # uses env for provider/model
+
+ # -------- LLM config (OpenAI only; key via HF Secrets) --------
+ os.environ.setdefault("LLM_PROVIDER", "openai")
+ os.environ.setdefault("LLM_MODEL", "gpt-4o")
+
+ # Lazy import to avoid import-time errors if key missing
+ def _get_openai_client():
+     if os.getenv("LLM_PROVIDER", "openai") != "openai":
+         raise RuntimeError("Only LLM_PROVIDER=openai is supported in this demo.")
+     if not os.getenv("OPENAI_API_KEY"):
+         raise RuntimeError("Missing OPENAI_API_KEY (set as a Space Secret).")
+     try:
+         # OpenAI SDK v1+
+         from openai import OpenAI
+         return OpenAI()
+     except Exception as e:
+         raise RuntimeError(f"OpenAI SDK not available: {e}")
+
+ # -------- System prompt you asked for (verbatim) --------
+ SYSTEM_PROMPT = """You are an expert in statistics and causal inference.
+ You will be given:
+ 1) The original research question.
+ 2) The analysis method used.
+ 3) The estimated effects, confidence intervals, standard errors, and p-values for each treatment group compared to the control group.
+ 4) A brief dataset description.
+
+ Your task is to produce a clear, concise, and non-technical summary that:
+ - Directly answers the research question.
+ - States whether the effect is statistically significant.
+ - Quantifies the effect size and explains what it means in practical terms (e.g., percentage point change).
+ - Mentions the method used in one sentence.
+ - Optionally ranks the treatment effects from largest to smallest if multiple treatments exist.
+
+ Formatting rules:
+ - Use bullet points or short paragraphs.
+ - Report effect sizes to two decimal places.
+ - Clearly state the interpretation in plain English without technical jargon.
+
+ Example Output Structure:
+ - **Method:** [Name of method + 1-line rationale]
+ - **Key Finding:** [Main answer to the research question]
+ - **Details:**
+ - [Treatment name]: +X.XX percentage points (95% CI: [L, U]), p < 0.001 — [Significance comment]
+ - …
+ - **Rank Order of Effects:** [Largest → Smallest]
+ """
+
+ def _extract_minimal_payload(agent_result: dict) -> dict:
+     """
+     Extract the minimal, LLM-friendly payload from run_causal_analysis output.
+     Falls back gracefully if any fields are missing.
+     """
+     # Try both top-level and nested (your JSON showed both patterns)
+     res = agent_result or {}
+     results = res.get("results", {}) if isinstance(res.get("results"), dict) else {}
+     inner = results.get("results", {}) if isinstance(results.get("results"), dict) else {}
+     vars_ = results.get("variables", {}) if isinstance(results.get("variables"), dict) else {}
+     dataset_analysis = results.get("dataset_analysis", {}) if isinstance(results.get("dataset_analysis"), dict) else {}
+
+     # Pull best-available fields
+     question = (
+         results.get("original_query")
+         or dataset_analysis.get("original_query")
+         or res.get("query")
+         or "N/A"
+     )
+     method = (
+         inner.get("method_used")
+         or res.get("method_used")
+         or results.get("method_used")
+         or "N/A"
+     )
+
+     effect_estimate = (
+         inner.get("effect_estimate")
+         or res.get("effect_estimate")
+         or {}
+     )
+     confidence_interval = (
+         inner.get("confidence_interval")
+         or res.get("confidence_interval")
+         or {}
+     )
+     standard_error = (
+         inner.get("standard_error")
+         or res.get("standard_error")
+         or {}
+     )
+     p_value = (
+         inner.get("p_value")
+         or res.get("p_value")
+         or {}
+     )
+
+     dataset_desc = (
+         results.get("dataset_description")
+         or res.get("dataset_description")
+         or "N/A"
+     )
+
+     return {
+         "original_question": question,
+         "method_used": method,
+         "estimates": {
+             "effect_estimate": effect_estimate,
+             "confidence_interval": confidence_interval,
+             "standard_error": standard_error,
+             "p_value": p_value,
+         },
+         "dataset_description": dataset_desc,
+     }
+
+ def _format_effects_md(effect_estimate: dict) -> str:
+     """
+     Minimal human-readable view of effect estimates for display.
+     """
+     if not effect_estimate or not isinstance(effect_estimate, dict):
+         return "_No effect estimates found._"
+     # Render as bullet list
+     lines = []
+     for k, v in effect_estimate.items():
+         try:
+             lines.append(f"- **{k}**: {float(v):+.4f}")
+         except Exception:
+             lines.append(f"- **{k}**: {v}")
+     return "\n".join(lines)
+
+ def _summarize_with_llm(payload: dict) -> str:
+     """
+     Calls OpenAI with the provided SYSTEM_PROMPT and the JSON payload as the user message.
+     Returns the model's text, or raises on error.
+     """
+     client = _get_openai_client()
+     model_name = os.getenv("LLM_MODEL", "gpt-4o-mini")
+
+     user_content = (
+         "Summarize the following causal analysis results:\n\n"
+         + json.dumps(payload, indent=2, ensure_ascii=False)
+     )
+
+     # Use Chat Completions for broad compatibility
+     resp = client.chat.completions.create(
+         model=model_name,
+         messages=[
+             {"role": "system", "content": SYSTEM_PROMPT},
+             {"role": "user", "content": user_content},
+         ],
+         temperature=0
+     )
+     text = resp.choices[0].message.content.strip()
+     return text
+
+ def run_agent(query: str, csv_path: str, dataset_description: str):
+     """
+     Modified to use yield for progressive updates and immediate feedback
+     """
+     # Immediate feedback - show processing has started
+     processing_html = """
+     <div style='padding: 15px; border: 1px solid #ddd; border-radius: 8px; margin: 5px 0; background-color: #333333;'>
+         <div style='font-size: 16px; margin-bottom: 5px;'>🔄 Analysis in Progress...</div>
+         <div style='font-size: 14px; color: #666;'>This may take 1-2 minutes depending on dataset size</div>
+     </div>
+     """
+
+     yield (
+         processing_html,  # method_out
+         processing_html,  # effects_out
+         processing_html,  # explanation_out
+         {"status": "Processing started..."}  # raw_results
+     )
+
+     # Input validation
+     if not os.getenv("OPENAI_API_KEY"):
+         error_html = "<div style='padding: 10px; border: 1px solid #dc3545; border-radius: 5px; color: #dc3545; background-color: #333333;'>⚠️ Set a Space Secret named OPENAI_API_KEY</div>"
+         yield (error_html, "", "", {})
+         return
+
+     if not csv_path:
+         error_html = "<div style='padding: 10px; border: 1px solid #ffc107; border-radius: 5px; color: #856404; background-color: #333333;'>Please upload a CSV dataset.</div>"
+         yield (error_html, "", "", {})
+         return
+
+     try:
+         # Update status to show causal analysis is running
+         analysis_html = """
+         <div style='padding: 15px; border: 1px solid #ddd; border-radius: 8px; margin: 5px 0; background-color: #333333;'>
+             <div style='font-size: 16px; margin-bottom: 5px;'>📊 Running Causal Analysis...</div>
+             <div style='font-size: 14px; color: #666;'>Analyzing dataset and selecting optimal method</div>
+         </div>
+         """
+
+         yield (
+             analysis_html,
+             analysis_html,
+             analysis_html,
+             {"status": "Running causal analysis..."}
+         )
+
+         result = run_causal_analysis(
+             query=(query or "What is the effect of treatment T on outcome Y controlling for X?").strip(),
+             dataset_path=csv_path,
+             dataset_description=(dataset_description or "").strip(),
+         )
+
+         # Update to show LLM summarization step
+         llm_html = """
+         <div style='padding: 15px; border: 1px solid #ddd; border-radius: 8px; margin: 5px 0; background-color: #333333;'>
+             <div style='font-size: 16px; margin-bottom: 5px;'>🤖 Generating Summary...</div>
+             <div style='font-size: 14px; color: #666;'>Creating human-readable interpretation</div>
+         </div>
+         """
+
+         yield (
+             llm_html,
+             llm_html,
+             llm_html,
+             {"status": "Generating explanation...", "raw_analysis": result if isinstance(result, dict) else {}}
+         )
+
+     except Exception as e:
+         error_html = f"<div style='padding: 10px; border: 1px solid #dc3545; border-radius: 5px; color: #dc3545; background-color: #333333;'>❌ Error: {e}</div>"
+         yield (error_html, "", "", {})
+         return
+
+     try:
+         payload = _extract_minimal_payload(result if isinstance(result, dict) else {})
+         method = payload.get("method_used", "N/A")
+
+         # Format method output with simple styling
+         method_html = f"""
+         <div style='padding: 15px; border: 1px solid #ddd; border-radius: 8px; margin: 5px 0; background-color: #333333;'>
+             <h3 style='margin: 0 0 10px 0; font-size: 18px;'>Selected Method</h3>
+             <p style='margin: 0; font-size: 16px;'>{method}</p>
+         </div>
+         """
+
+         # Format effects with simple styling
+         effect_estimate = payload.get("estimates", {}).get("effect_estimate", {})
+         if effect_estimate:
+             effects_html = "<div style='padding: 15px; border: 1px solid #ddd; border-radius: 8px; margin: 5px 0; background-color: #333333;'>"
+             effects_html += "<h3 style='margin: 0 0 10px 0; font-size: 18px;'>Effect Estimates</h3>"
+             # for k, v in effect_estimate.items():
+             #     try:
+             #         value = f"{float(v):+.4f}"
+             #         effects_html += f"<div style='margin: 8px 0; padding: 8px; border: 1px solid #eee; border-radius: 4px; background-color: #ffffff;'><strong>{k}:</strong> <span style='font-size: 16px;'>{value}</span></div>"
+             #     except:
+             effects_html += f"<div style='margin: 8px 0; padding: 8px; border: 1px solid #eee; border-radius: 4px; background-color: #333333;'>{effect_estimate}</div>"
+             effects_html += "</div>"
+         else:
+             effects_html = "<div style='padding: 10px; border: 1px solid #ddd; border-radius: 8px; color: #666; font-style: italic; background-color: #333333;'>No effect estimates found</div>"
+
+         # Generate explanation and format it
+         try:
+             explanation = _summarize_with_llm(payload)
+             explanation_html = f"""
+             <div style='padding: 15px; border: 1px solid #ddd; border-radius: 8px; margin: 5px 0; background-color: #333333;'>
+                 <h3 style='margin: 0 0 15px 0; font-size: 18px;'>Detailed Explanation</h3>
+                 <div style='line-height: 1.6; white-space: pre-wrap;'>{explanation}</div>
+             </div>
+             """
+         except Exception as e:
+             explanation_html = f"<div style='padding: 10px; border: 1px solid #ffc107; border-radius: 5px; color: #856404; background-color: #333333;'>⚠️ LLM summary failed: {e}</div>"
+
+     except Exception as e:
+         error_html = f"<div style='padding: 10px; border: 1px solid #dc3545; border-radius: 5px; color: #dc3545; background-color: #333333;'>❌ Failed to parse results: {e}</div>"
+         yield (error_html, "", "", {})
+         return
+
+     # Final result
+     yield (method_html, effects_html, explanation_html, result if isinstance(result, dict) else {})
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# Causal Agent")
+     gr.Markdown("Upload your dataset and ask causal questions in natural language. The system will automatically select the appropriate causal inference method and provide clear explanations.")
+
+     with gr.Row():
+         query = gr.Textbox(
+             label="Your causal question (natural language)",
+             placeholder="e.g., What is the effect of attending the program (T) on income (Y), controlling for education and age?",
+             lines=2,
+         )
+
+     with gr.Row():
+         csv_file = gr.File(
+             label="Dataset (CSV)",
+             file_types=[".csv"],
+             type="filepath"
+         )
+
+     dataset_description = gr.Textbox(
+         label="Dataset description (optional)",
+         placeholder="Brief schema, how it was collected, time period, units, treatment/outcome variables, etc.",
+         lines=4,
+     )
+
+     run_btn = gr.Button("Run analysis", variant="primary")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             method_out = gr.HTML(label="Selected Method")
+         with gr.Column(scale=1):
+             effects_out = gr.HTML(label="Effect Estimates")
+
+     with gr.Row():
+         explanation_out = gr.HTML(label="Detailed Explanation")
+
+     # Add the collapsible raw results section
+     with gr.Accordion("Raw Results (Advanced)", open=False):
+         raw_results = gr.JSON(label="Complete Analysis Output", show_label=False)
+
+     run_btn.click(
+         fn=run_agent,
+         inputs=[query, csv_file, dataset_description],
+         outputs=[method_out, effects_out, explanation_out, raw_results],
+         show_progress=True
+     )
+
+     gr.Markdown(
+         """
+         **Tips:**
+         - Be specific about your treatment, outcome, and control variables
+         - Include relevant context in the dataset description
+         - The analysis may take 1-2 minutes for complex datasets
+         """
+     )
+
+ if __name__ == "__main__":
+     demo.queue().launch()
auto_causal/__init__.py ADDED
@@ -0,0 +1,50 @@
+ """
+ Auto Causal module for causal inference.
+
+ This module provides automated causal inference capabilities
+ through a pipeline that selects and applies appropriate causal methods.
+ """
+
+ __version__ = "0.1.0"
+
+ # Import components
+ from auto_causal.components import (
+     parse_input,
+     analyze_dataset,
+     interpret_query,
+     validate_method,
+     generate_explanation,
+     format_output,
+     create_workflow_state_update
+ )
+
+ # Import tools
+ from auto_causal.tools import (
+     input_parser_tool,
+     dataset_analyzer_tool,
+     query_interpreter_tool,
+     method_selector_tool,
+     method_validator_tool,
+     method_executor_tool,
+     explanation_generator_tool,
+     output_formatter_tool
+ )
+
+ # Import the main agent function
+ from .agent import run_causal_analysis
+
+ # Remove backward compatibility for old pipeline
+ # try:
+ #     from .pipeline import CausalInferencePipeline
+ # except ImportError:
+ #     # Define a placeholder class if the old pipeline doesn't exist
+ #     class CausalInferencePipeline:
+ #         """Placeholder for CausalInferencePipeline."""
+ #
+ #         def __init__(self, *args, **kwargs):
+ #             pass
+
+ # Update __all__ to export the main function
+ __all__ = [
+     'run_causal_analysis'
+ ]
auto_causal/agent.py ADDED
@@ -0,0 +1,394 @@
1
+ """
2
+ LangChain agent for the auto_causal module.
3
+
4
+ This module configures a LangChain agent with specialized tools for causal inference,
5
+ allowing for an interactive approach to analyzing datasets and applying appropriate
6
+ causal inference methods.
7
+ """
8
+
9
+ import logging
10
+ from typing import Dict, List, Any, Optional
11
+ from langchain.agents.react.agent import create_react_agent
12
+ from langchain.agents import AgentExecutor, create_structured_chat_agent, create_tool_calling_agent
13
+ from langchain.chains.conversation.memory import ConversationBufferMemory
14
+ from langchain_core.messages import SystemMessage, HumanMessage
15
+ from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder
16
+ from langchain.tools import tool
17
+ # Import the callback handler
18
+ from langchain.callbacks.tracers.stdout import ConsoleCallbackHandler
19
+ # Import tool rendering utility
20
+ from langchain.tools.render import render_text_description
21
+ # Import LCEL components
22
+ from langchain.agents.format_scratchpad.tools import format_to_tool_messages
23
+ from langchain.agents.output_parsers.tools import ToolsAgentOutputParser
24
+ from langchain_core.runnables import RunnablePassthrough
25
+ from langchain_core.language_models import BaseChatModel
26
+ from langchain_anthropic.chat_models import convert_to_anthropic_tool
27
+ import os
28
+ # Import actual tools from the tools directory
29
+ from auto_causal.tools.input_parser_tool import input_parser_tool
30
+ from auto_causal.tools.dataset_analyzer_tool import dataset_analyzer_tool
31
+ from auto_causal.tools.query_interpreter_tool import query_interpreter_tool
32
+ from auto_causal.tools.method_selector_tool import method_selector_tool
33
+ from auto_causal.tools.method_validator_tool import method_validator_tool
34
+ from auto_causal.tools.method_executor_tool import method_executor_tool
35
+ from auto_causal.tools.explanation_generator_tool import explanation_generator_tool
36
+ from auto_causal.tools.output_formatter_tool import output_formatter_tool
37
+ #from auto_causal.prompts import SYSTEM_PROMPT # Assuming SYSTEM_PROMPT is defined here or imported
38
+ from langchain_core.output_parsers import StrOutputParser
39
+ # Import the centralized factory function
40
+ from .config import get_llm_client
41
+ #from .prompts import SYSTEM_PROMPT
42
+ from langchain_core.messages import AIMessage, AIMessageChunk
43
+ import re
44
+ import json
45
+ from typing import Union
46
+ from langchain_core.output_parsers import BaseOutputParser
47
+ from langchain.schema import AgentAction, AgentFinish
48
+ from langchain_anthropic.output_parsers import ToolsOutputParser
49
+ from langchain.agents.react.output_parser import ReActOutputParser
50
+ from langchain.agents import AgentOutputParser
51
+ from langchain.agents.agent import AgentAction, AgentFinish, OutputParserException
52
+ import re
53
+ from typing import Union, List
54
+ from auto_causal.models import *
55
+
56
+ from langchain_core.agents import AgentAction, AgentFinish
57
+ from langchain_core.exceptions import OutputParserException
58
+
59
+ from langchain.agents.agent import AgentOutputParser
60
+ from langchain.agents.mrkl.prompt import FORMAT_INSTRUCTIONS
61
+
62
+ FINAL_ANSWER_ACTION = "Final Answer:"
63
+ MISSING_ACTION_AFTER_THOUGHT_ERROR_MESSAGE = (
64
+ "Invalid Format: Missing 'Action:' after 'Thought:'"
65
+ )
66
+ MISSING_ACTION_INPUT_AFTER_ACTION_ERROR_MESSAGE = (
67
+ "Invalid Format: Missing 'Action Input:' after 'Action:'"
68
+ )
69
+ FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE = (
70
+ "Parsing LLM output produced both a final answer and parse-able actions"
71
+ )
72
+
73
+
74
+ class ReActMultiInputOutputParser(AgentOutputParser):
75
+ """Parses ReAct-style output that may contain multiple tool calls."""
76
+
77
+ def get_format_instructions(self) -> str:
78
+ # You can reuse the original FORMAT_INSTRUCTIONS,
79
+ # but let the model know it may emit multiple actions.
80
+ return FORMAT_INSTRUCTIONS + (
81
+ "\n\nIf you need to call more than one tool, simply repeat:\n"
82
+ "Action: <tool_name>\n"
83
+ "Action Input: <json or text>\n"
84
+ "…for each tool in sequence."
85
+ )
86
+
87
+ @property
88
+ def _type(self) -> str:
89
+ return "react-multi-input"
90
+
91
+ def parse(self, text: str) -> Union[List[AgentAction], AgentFinish]:
92
+ includes_answer = FINAL_ANSWER_ACTION in text
93
+ print('-------------------')
94
+ print(text)
95
+ print('-------------------')
96
+ # Grab every Action / Action Input block
97
+ pattern = (
98
+ r"Action\s*\d*\s*:[\s]*(.*?)\s*"
99
+ r"Action\s*\d*\s*Input\s*\d*\s*:[\s]*(.*?)(?=(?:Action\s*\d*\s*:|$))"
100
+ )
101
+ matches = list(re.finditer(pattern, text, re.DOTALL))
102
+
103
+ # If we found tool calls…
104
+ if matches:
105
+ if includes_answer:
106
+ # both a final answer *and* tool calls is ambiguous
107
+ raise OutputParserException(
108
+ f"{FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE}: {text}"
109
+ )
110
+
111
+ actions: List[AgentAction] = []
112
+ for m in matches:
113
+ tool_name = m.group(1).strip()
114
+ tool_input = m.group(2).strip().strip('"')
115
+ print('\n--------------------------')
116
+ print(tool_input)
117
+ print('--------------------------')
118
+ actions.append(AgentAction(tool_name, json.loads(tool_input), text))
119
+
120
+ return actions
121
+
122
+ # Otherwise, if there's a final answer, finish
123
+ if includes_answer:
124
+ answer = text.split(FINAL_ANSWER_ACTION, 1)[1].strip()
125
+ return AgentFinish({"output": answer}, text)
126
+
127
+ # No calls and no final answer → figure out which error to throw
128
+ if not re.search(r"Action\s*\d*\s*Input\s*\d*:", text):
129
+ raise OutputParserException(
130
+ f"Could not parse LLM output: `{text}`",
131
+ observation=MISSING_ACTION_INPUT_AFTER_ACTION_ERROR_MESSAGE,
132
+ llm_output=text,
133
+ send_to_llm=True,
134
+ )
135
+
136
+ # Fallback
137
+ raise OutputParserException(f"Could not parse LLM output: `{text}`")
138
+
139
+ # Set up basic logging
140
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
141
+ logger = logging.getLogger(__name__)
142
+
143
+
144
+ def create_agent_prompt(tools: List[tool]) -> ChatPromptTemplate:
145
+ """Create the prompt template for the causal inference agent, emphasizing workflow and data handoff.
146
+ (This is the version required by the LCEL agent structure below)
147
+ """
148
+ # Get the tool descriptions
149
+ tool_description = render_text_description(tools)
150
+ tool_names = ", ".join([t.name for t in tools])
151
+
152
+ # Define the system prompt template string
153
+ system_template = """
154
+ You are a causal inference expert helping users answer causal questions by following a strict workflow using specialized tools.
155
+
156
+ Remember that you always have to generate the Thought, Action and Action Input block.
157
+ TOOLS:
158
+ ------
159
+ You have access to the following tools:
160
+
161
+ {tools}
162
+
163
+ To use a tool, please use the following format:
164
+
165
+ Thought: Do I need to use a tool? Yes
166
+ Action: the action to take, should be one of [{tool_names}]
167
+ Action Input: the input to the action, as a single, valid JSON object string. Check the tool definition for required arguments and structure.
168
+ Observation: the result of the action, often containing structured data like 'variables', 'dataset_analysis', 'method_info', etc.
169
+
170
+ When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the format:
171
+
172
+ Thought: Do I need to use a tool? No
173
+ Final Answer: [your response here]
174
+
175
+ DO NOT UNDER ANY CIRCUMSTANCE CALL MORE THAN ONE TOOL IN A STEP
176
+
177
+ **IMPORTANT TOOL USAGE:**
178
+ 1. **Action Input Format:** The value for 'Action Input' MUST be a single, valid JSON object string. Do NOT include any other text or formatting around the JSON string.
179
+ 2. **Argument Gathering:** You MUST gather ALL required arguments for the Action Input JSON from the initial Human input AND the 'Observation' outputs of PREVIOUS steps. Look carefully at the required arguments for the tool you are calling.
180
+ 3. **Data Handoff:** The 'Observation' from a previous step often contains structured data needed by the next tool. For example, the 'variables' output from `query_interpreter_tool` contains fields like `treatment_variable`, `outcome_variable`, `covariates`, `time_variable`, `instrument_variable`, `running_variable`, `cutoff_value`, and `is_rct`. When calling `method_selector_tool`, you MUST construct its required `variables` input argument by including **ALL** these relevant fields identified by the `query_interpreter_tool` in the previous Observation. Similarly, pass the full `dataset_analysis`, `dataset_description`, and `original_query` when required by the next tool.
181
+
182
+ IMPORTANT WORKFLOW:
183
+ -------------------
184
+ You must follow this exact workflow, selecting the appropriate tool for each step:
185
+
186
+ 1. ALWAYS start with `input_parser_tool` to understand the query
187
+ 2. THEN use `dataset_analyzer_tool` to analyze the dataset
188
+ 3. THEN use `query_interpreter_tool` to identify variables (output includes `variables` and `dataset_analysis`)
189
+ 4. THEN use `method_selector_tool` (input requires `variables` and `dataset_analysis` from previous step)
190
+ 5. THEN use `method_validator_tool` (input requires `method_info` and `variables` from previous step)
191
+ 6. THEN use `method_executor_tool` (input requires `method`, `variables`, `dataset_path`)
192
+ 7. THEN use `explanation_generator_tool` (input requires results, method_info, variables, etc.)
193
+ 8. FINALLY use `output_formatter_tool` to return the results
194
+
195
+ REASONING PROCESS:
196
+ ------------------
197
+ EXPLICITLY REASON about:
198
+ 1. What step you're currently on (based on previous tool's Observation)
199
+ 2. Why you're selecting a particular tool (should follow the workflow)
200
+ 3. How the output of the previous tool (especially structured data like `variables`, `dataset_analysis`, `method_info`) informs the inputs required for the current tool.
201
+
202
+ IMPORTANT RULES:
203
+ 1. Do not make more than one tool call in a single step.
204
+ 2. Do not include ``` in your output at all.
205
+ 3. Don't use action names like default_api.dataset_analyzer_tool; instead, use tool names like dataset_analyzer_tool.
206
+ 4. Always start Thought, Action, and Observation on a new line.
207
+ 5. Don't use '\\' before double quotes
208
+ 6. Don't include ```json for Action Input. Also ensure that Action Input is valid JSON. Do not add any text after Action Input.
209
+ 7. You have to always choose one of the tools unless it's the final answer.
210
+ Begin!
211
+ """
212
+
213
+ # Create the prompt template
214
+ prompt = ChatPromptTemplate.from_messages([
215
+ ("system", system_template),
216
+ MessagesPlaceholder("chat_history", optional=True), # Use MessagesPlaceholder
217
+ # MessagesPlaceholder("agent_scratchpad"),
218
+
219
+ ("human", "{input}\n Thought:{agent_scratchpad}"),
220
+ # ("ai", "{agent_scratchpad}"),
221
+ # MessagesPlaceholder("agent_scratchpad" ), # Use MessagesPlaceholder
222
+ # "agent_scratchpad"
223
+ ])
224
+ return prompt
225
+
226
+ def create_causal_agent(llm: BaseChatModel) -> AgentExecutor:
227
+ """
228
+ Create and configure the LangChain agent with causal inference tools.
229
+ (Using explicit LCEL construction, compatible with shared LLM client)
230
+ """
231
+ # Define tools available to the agent
232
+ agent_tools = [
233
+ input_parser_tool,
234
+ dataset_analyzer_tool,
235
+ query_interpreter_tool,
236
+ method_selector_tool,
237
+ method_validator_tool,
238
+ method_executor_tool,
239
+ explanation_generator_tool,
240
+ output_formatter_tool
241
+ ]
242
+ # anthropic_agent_tools = [ convert_to_anthropic_tool(anthropic_tool) for anthropic_tool in agent_tools]
243
+ # Create the prompt using the helper
244
+ prompt = create_agent_prompt(agent_tools)
245
+ # Bind tools to the LLM (using the passed shared instance)
246
+
247
+
248
+ # Create memory
249
+ # Consider if memory needs to be passed in or created here
250
+ memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
251
+
252
+ # Manually construct the agent runnable using LCEL
253
+ from langchain_anthropic.output_parsers import ToolsOutputParser
254
+ from langchain.agents.output_parsers.json import JSONAgentOutputParser
255
+ # from langchain.agents.react.output_parser import MultiActionAgentOutputParsers ReActMultiInputOutputParser
256
+ provider = os.getenv("LLM_PROVIDER", "openai")
257
+ if provider == "gemini":
258
+ base_parser=ReActMultiInputOutputParser()
259
+ llm_with_tools = llm.bind_tools(agent_tools)
260
+ else:
261
+ base_parser=ToolsAgentOutputParser()
262
+ llm_with_tools = llm.bind_tools(agent_tools, tool_choice="any")
263
+ agent = create_react_agent(llm_with_tools, agent_tools, prompt, output_parser=base_parser)
264
+
265
+
266
+ # Create executor (should now work with the manually constructed agent)
267
+ executor = AgentExecutor(
268
+ agent=agent,
269
+ tools=agent_tools,
270
+ memory=memory, # Pass the memory object
271
+ verbose=True,
272
+ callbacks=[ConsoleCallbackHandler()], # Optional: for console debugging
273
+ handle_parsing_errors=True, # Let AE handle parsing errors
274
+ max_retries = 100
275
+ )
276
+
277
+ return executor
278
+
279
+ def run_causal_analysis(query: str, dataset_path: str,
280
+ dataset_description: Optional[str] = None,
281
+ api_key: Optional[str] = None) -> Dict[str, Any]:
282
+ """
283
+ Run causal analysis on a dataset based on a user query.
284
+
285
+ Args:
286
+ query: User's causal question
287
+ dataset_path: Path to the dataset
288
+ dataset_description: Optional textual description of the dataset
289
+ api_key: Optional OpenAI API key (DEPRECATED - will be ignored)
290
+
291
+ Returns:
292
+ Dictionary containing the final formatted analysis results from the agent's last step.
293
+ """
294
+ # Log the start of the analysis
295
+ logger.info("Starting causal analysis run...")
296
+
297
+ try:
298
+ # --- Instantiate the shared LLM client ---
299
+ model_name = os.getenv("LLM_MODEL", "gpt-4")
300
+ if model_name in ['o3', 'o4-mini', 'o3-mini']:
301
+ print('-------------------------')
302
+ shared_llm = get_llm_client()
303
+ else:
304
+ shared_llm = get_llm_client(temperature=0) # Or read provider/model from env
305
+
306
+ # --- Dependency Injection Note (REMAINS RELEVANT) ---
307
+ # If tools need the LLM, they must be adapted. Example using partial:
308
+ # from functools import partial
309
+ # from .components import input_parser
310
+ # # Assume input_parser.parse_input needs llm
311
+ # input_parser_tool_with_llm = tool(partial(input_parser.parse_input, llm=shared_llm))
312
+ # Use input_parser_tool_with_llm in the tools list passed to the agent below.
313
+ # Similar adjustments needed for decision_tree._recommend_ps_method if used.
314
+ # --- End Note ---
315
+
316
+ # --- Create agent using the shared LLM ---
317
+ # agent_executor = create_causal_agent(shared_llm)
318
+
319
+ # Construct input, including description if available
320
+ # IMPORTANT: Agent now expects 'input' and potentially 'chat_history'
321
+ # The input needs to contain all initial info the first tool might need.
322
+ input_text = f"My question is: {query}\n"
323
+ input_text += f"The dataset is located at: {dataset_path}\n"
324
+ if dataset_description:
325
+ input_text += f"Dataset Description: {dataset_description}\n"
326
+ input_text += "Please perform the causal analysis following the workflow."
327
+
328
+ # Log the constructed input text
329
+ logger.info(f"Constructed input for agent: \n{input_text}")
330
+
331
+ input_parsing_result = input_parser_tool(input_text)
332
+ dataset_analysis_result = dataset_analyzer_tool.func(dataset_path=input_parsing_result["dataset_path"], dataset_description=input_parsing_result["dataset_description"], original_query=input_parsing_result["original_query"]).analysis_results
333
+ query_info = QueryInfo(
334
+ query_text=input_parsing_result["original_query"],
335
+ potential_treatments=input_parsing_result["extracted_variables"].get("treatment"),
336
+ potential_outcomes=input_parsing_result["extracted_variables"].get("outcome"),
337
+ covariates_hints=input_parsing_result["extracted_variables"].get("covariates_mentioned"),
338
+ instrument_hints=input_parsing_result["extracted_variables"].get("instruments_mentioned")
339
+ )
340
+
341
+ query_interpreter_output = query_interpreter_tool.func(query_info=query_info, dataset_analysis=dataset_analysis_result, dataset_description=input_parsing_result["dataset_description"], original_query = input_parsing_result["original_query"]).variables
342
+ method_selector_output = method_selector_tool.func(variables=query_interpreter_output,
343
+ dataset_analysis=dataset_analysis_result,
344
+ dataset_description=input_parsing_result["dataset_description"],
345
+ original_query = input_parsing_result["original_query"],
346
+ excluded_methods=None)
347
+ method_info = MethodInfo(
348
+ **method_selector_output['method_info']
349
+ )
350
+ method_validator_input = MethodValidatorInput(
351
+ method_info=method_info,
352
+ variables=query_interpreter_output,
353
+ dataset_analysis=dataset_analysis_result,
354
+ dataset_description=input_parsing_result["dataset_description"],
355
+ original_query = input_parsing_result["original_query"]
356
+ )
357
+ method_validator_output = method_validator_tool.func(method_validator_input)
358
+ method_executor_input = MethodExecutorInput(
359
+ **method_validator_output
360
+ )
361
+ method_executor_output = method_executor_tool.func(method_executor_input, original_query = input_parsing_result["original_query"])
362
+
363
+ explainer_output = explanation_generator_tool.func( method_info=method_info,
364
+ validation_info=method_validator_output,
365
+ variables=query_interpreter_output,
366
+ results=method_executor_output,
367
+ dataset_analysis=dataset_analysis_result,
368
+ dataset_description=input_parsing_result["dataset_description"],
369
+ original_query = input_parsing_result["original_query"])
370
+ result = explainer_output
371
+ result['results']['results']["method_used"] = method_validator_output['method']
372
+ logger.info(result)
373
+ logger.info("Causal analysis run finished.")
374
+
375
+ # Ensure result is a dict and extract the 'output' part
376
+ if isinstance(result, dict):
377
+ final_output = result
378
+ if isinstance(final_output, dict):
379
+ return final_output # Return only the dictionary from the final tool
380
+ else:
381
+ logger.error(f"Agent result['output'] was not a dictionary: {type(final_output)}. Returning error dict.")
382
+ return {"error": "Agent did not produce the expected dictionary output in the 'output' key.", "raw_agent_result": result}
383
+ else:
384
+ logger.error(f"Agent returned non-dict type: {type(result)}. Returning error dict.")
385
+ return {"error": "Agent did not return expected dictionary output.", "raw_output": str(result)}
386
+
387
+ except ValueError as e:
388
+ logger.error(f"Configuration Error: {e}")
389
+ # Return an error dictionary in case of exception too
390
+ return {"error": f"Error: Configuration issue - {e}"} # Ensure consistent error return type
391
+ except Exception as e:
392
+ logger.error(f"An unexpected error occurred during causal analysis: {e}", exc_info=True)
393
+ # Return an error dictionary in case of exception too
394
+ return {"error": f"An unexpected error occurred: {e}"}
auto_causal/components/__init__.py ADDED
@@ -0,0 +1,28 @@
+ """
+ Auto Causal components package.
+
+ This package contains the core components for the auto_causal module,
+ each handling a specific part of the causal inference workflow.
+ """
+
+ from auto_causal.components.input_parser import parse_input
+ from auto_causal.components.dataset_analyzer import analyze_dataset
+ from auto_causal.components.query_interpreter import interpret_query
+ from auto_causal.components.decision_tree import select_method
+ from auto_causal.components.method_validator import validate_method
+ from auto_causal.components.explanation_generator import generate_explanation
+ from auto_causal.components.output_formatter import format_output
+ from auto_causal.components.state_manager import create_workflow_state_update
+
+ __all__ = [
+     "parse_input",
+     "analyze_dataset",
+     "interpret_query",
+     "select_method",
+     "validate_method",
+     "generate_explanation",
+     "format_output",
+     "create_workflow_state_update"
+ ]
+
+ # This file makes Python treat the directory as a package.
auto_causal/components/dataset_analyzer.py ADDED
@@ -0,0 +1,853 @@
1
+ """
2
+ Dataset analyzer component for causal inference.
3
+
4
+ This module provides functionality to analyze datasets to detect characteristics
5
+ relevant for causal inference methods, including temporal structure, potential
6
+ instrumental variables, discontinuities, and variable relationships.
7
+ """
8
+
9
+ import os
10
+ import pandas as pd
11
+ import numpy as np
12
+ from typing import Dict, List, Any, Optional, Tuple
13
+ from scipy import stats
14
+ import logging
15
+ import json
16
+ from langchain_core.language_models import BaseChatModel
17
+ from auto_causal.utils.llm_helpers import llm_identify_temporal_and_unit_vars
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ def _calculate_per_group_stats(df: pd.DataFrame, potential_treatments: List[str]) -> Dict[str, Dict]:
22
+ """Calculates summary stats for numeric covariates grouped by potential binary treatments."""
23
+ stats_dict = {}
24
+ numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
25
+
26
+ for treat_var in potential_treatments:
27
+ if treat_var not in df.columns:
28
+ logger.warning(f"Potential treatment '{treat_var}' not found in DataFrame columns.")
29
+ continue
30
+
31
+ # Ensure treatment is binary (0/1 or similar)
32
+ unique_vals = df[treat_var].dropna().unique()
33
+ if len(unique_vals) != 2:
34
+ logger.info(f"Skipping stats for potential treatment '{treat_var}' as it is not binary ({len(unique_vals)} unique values).")
35
+ continue
36
+
37
+ # Attempt to map values to 0 and 1 if possible
38
+ try:
39
+ # Ensure boolean is converted to int
40
+ if df[treat_var].dtype == 'bool':
41
+ df[treat_var] = df[treat_var].astype(int)
42
+ unique_vals = df[treat_var].dropna().unique()
43
+
44
+ # Basic check if values are interpretable as 0/1
45
+ if not set(unique_vals).issubset({0, 1}):
46
+ # Attempt conversion if possible (e.g., True/False strings?)
47
+ logger.warning(f"Potential treatment '{treat_var}' has values {unique_vals}, not {0, 1}. Cannot calculate group stats reliably.")
48
+ continue
49
+ except Exception as e:
50
+ logger.warning(f"Could not process potential treatment '{treat_var}' values ({unique_vals}): {e}")
51
+ continue
52
+
53
+ logger.info(f"Calculating group stats for treatment: '{treat_var}'")
54
+ treat_stats = {'group_sizes': {}, 'covariate_stats': {}}
55
+
56
+ try:
57
+ grouped = df.groupby(treat_var)
58
+ sizes = grouped.size()
59
+ treat_stats['group_sizes']['treated'] = int(sizes.get(1, 0))
60
+ treat_stats['group_sizes']['control'] = int(sizes.get(0, 0))
61
+
62
+ if treat_stats['group_sizes']['treated'] == 0 or treat_stats['group_sizes']['control'] == 0:
63
+ logger.warning(f"Treatment '{treat_var}' has zero samples in one group. Skipping covariate stats.")
64
+ stats_dict[treat_var] = treat_stats
65
+ continue
66
+
67
+ # Calculate mean and std for numeric covariates
68
+ cov_stats = grouped[numeric_cols].agg(['mean', 'std']).unstack()
69
+
70
+ for cov in numeric_cols:
71
+ if cov == treat_var: continue # Skip treatment variable itself
72
+
73
+ # Keys follow pandas' unstack order: (covariate, statistic, group)
+ mean_control = cov_stats.get((cov, 'mean', 0), np.nan)
74
+ std_control = cov_stats.get((cov, 'std', 0), np.nan)
75
+ mean_treated = cov_stats.get((cov, 'mean', 1), np.nan)
76
+ std_treated = cov_stats.get((cov, 'std', 1), np.nan)
77
+
78
+ treat_stats['covariate_stats'][cov] = {
79
+ 'mean_control': float(mean_control) if pd.notna(mean_control) else None,
80
+ 'std_control': float(std_control) if pd.notna(std_control) else None,
81
+ 'mean_treat': float(mean_treated) if pd.notna(mean_treated) else None,
82
+ 'std_treat': float(std_treated) if pd.notna(std_treated) else None,
83
+ }
84
+ stats_dict[treat_var] = treat_stats
85
+ except Exception as e:
86
+ logger.error(f"Error calculating stats for treatment '{treat_var}': {e}", exc_info=True)
87
+ # Store partial info if possible
88
+ if treat_var not in stats_dict:
89
+ stats_dict[treat_var] = {'error': str(e)}
90
+ elif 'error' not in stats_dict[treat_var]:
91
+ stats_dict[treat_var]['error'] = str(e)
92
+
93
+ return stats_dict
94
+
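A quick illustration of the structure _calculate_per_group_stats returns on a toy DataFrame (a sketch, not part of the committed file; the numbers simply follow from the toy data):

import pandas as pd

toy = pd.DataFrame({"treat": [0, 1, 0, 1], "age": [30, 35, 40, 45]})
stats = _calculate_per_group_stats(toy, ["treat"])
# stats["treat"]["group_sizes"]                            -> {"treated": 2, "control": 2}
# stats["treat"]["covariate_stats"]["age"]["mean_control"] -> 35.0
# stats["treat"]["covariate_stats"]["age"]["mean_treat"]   -> 40.0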
95
+ def analyze_dataset(
96
+ dataset_path: str,
97
+ llm_client: Optional[BaseChatModel] = None,
98
+ dataset_description: Optional[str] = None,
99
+ original_query: Optional[str] = None
100
+ ) -> Dict[str, Any]:
101
+ """
102
+ Analyze a dataset to identify important characteristics for causal inference.
103
+
104
+ Args:
105
+ dataset_path: Path to the dataset file
106
+ llm_client: Optional LLM client for enhanced analysis
107
+ dataset_description: Optional description of the dataset for context
108
+
109
+ Returns:
110
+ Dict containing dataset analysis results:
111
+ - dataset_info: Basic information about the dataset
112
+ - columns: List of column names
113
+ - potential_treatments: List of potential treatment variables (possibly LLM augmented)
114
+ - potential_outcomes: List of potential outcome variables (possibly LLM augmented)
115
+ - temporal_structure_detected: Whether temporal structure was detected
116
+ - panel_data_detected: Whether panel data structure was detected
117
+ - potential_instruments_detected: Whether potential instruments were detected
118
+ - discontinuities_detected: Whether discontinuities were detected
119
+ - llm_augmentation: Status of LLM augmentation if used
120
+ """
121
+ llm_augmentation = "Not used" if not llm_client else "Initialized"
122
+
123
+ # Check if file exists
124
+ if not os.path.exists(dataset_path):
125
+ logger.error(f"Dataset file not found at {dataset_path}")
126
+ return {"error": f"Dataset file not found at {dataset_path}"}
127
+
128
+ try:
129
+ # Load the dataset
130
+ df = pd.read_csv(dataset_path)
131
+
132
+ # Basic dataset information
133
+ sample_size = len(df)
134
+ columns_list = df.columns.tolist()
135
+ num_covariates = len(columns_list) - 2 # Rough estimate (total - T - Y)
136
+ dataset_info = {
137
+ "num_rows": sample_size,
138
+ "num_columns": len(columns_list),
139
+ "file_path": dataset_path,
140
+ "file_name": os.path.basename(dataset_path)
141
+ }
142
+
143
+ # --- Detailed Analysis (Keep internal) ---
144
+ column_types_detailed = {col: str(df[col].dtype) for col in df.columns}
145
+ missing_values_detailed = df.isnull().sum().to_dict()
146
+ column_categories_detailed = _categorize_columns(df)
147
+ column_nunique_counts_detailed = {col: df[col].nunique() for col in df.columns} # Calculate nunique
148
+ numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
149
+ correlations_detailed = df[numeric_cols].corr() if numeric_cols else pd.DataFrame()
150
+ temporal_structure_detailed = detect_temporal_structure(df, llm_client, dataset_description, original_query)
151
+
152
+ # First, identify potential treatment and outcome variables
153
+ potential_variables = _identify_potential_variables(
154
+ df,
155
+ column_categories_detailed,
156
+ llm_client=llm_client,
157
+ dataset_description=dataset_description
158
+ )
159
+
160
+ if llm_client:
161
+ llm_augmentation = "Used for variable identification"
162
+
163
+ # Then use that info to help find potential instrumental variables
164
+ potential_instruments_detailed = find_potential_instruments(
165
+ df,
166
+ llm_client=llm_client,
167
+ potential_treatments=potential_variables.get("potential_treatments", []),
168
+ potential_outcomes=potential_variables.get("potential_outcomes", []),
169
+ dataset_description=dataset_description
170
+ )
171
+
172
+ # Other analyses
173
+ discontinuities_detailed = detect_discontinuities(df)
174
+ variable_relationships_detailed = assess_variable_relationships(df, correlations_detailed)
175
+
176
+ # Calculate per-group stats for potential binary treatments
177
+ potential_binary_treatments = [
178
+ t for t in potential_variables["potential_treatments"]
179
+ if column_categories_detailed.get(t) == 'binary'
180
+ or column_categories_detailed.get(t) == 'binary_categorical'
181
+ ]
182
+ per_group_stats = _calculate_per_group_stats(df.copy(), potential_binary_treatments)
183
+
184
+ # --- Summarized Analysis (For Output) ---
185
+
186
+ # Get boolean flags and essential lists
187
+ has_temporal = temporal_structure_detailed.get("has_temporal_structure", False)
188
+ is_panel = temporal_structure_detailed.get("is_panel_data", False)
189
+ logger.info(f"iv is {potential_instruments_detailed}")
190
+ has_instruments = len(potential_instruments_detailed) > 0
191
+ has_discontinuities = discontinuities_detailed.get("has_discontinuities", False)
192
+
193
+ # --- Extract only instrument names for the final output ---
194
+ potential_instrument_names = [
195
+ inst_dict.get('variable')
196
+ for inst_dict in potential_instruments_detailed
197
+ if isinstance(inst_dict, dict) and 'variable' in inst_dict
198
+ ]
199
+ logger.info(f"iv is {potential_instrument_names}")
200
+ # --- Final Output Dictionary (Highly Summarized) ---
201
+ return {
202
+ "dataset_info": dataset_info, # Keep basic info
203
+ "columns": columns_list,
204
+ "potential_treatments": potential_variables["potential_treatments"],
205
+ "potential_outcomes": potential_variables["potential_outcomes"],
206
+ # Return concise flags instead of detailed dicts/lists
207
+ "temporal_structure_detected": has_temporal,
208
+ "panel_data_detected": is_panel,
209
+ "potential_instruments_detected": has_instruments,
210
+ "discontinuities_detected": has_discontinuities,
211
+ # Use the extracted list of names here
212
+ "potential_instruments": potential_instrument_names,
213
+ "discontinuities": discontinuities_detailed,
214
+ "temporal_structure": temporal_structure_detailed,
215
+ "column_categories": column_categories_detailed,
216
+ "column_nunique_counts": column_nunique_counts_detailed, # Add nunique counts to output
217
+ "sample_size": sample_size,
218
+ "num_covariates_estimate": num_covariates,
219
+ "llm_augmentation": llm_augmentation
220
+ }
221
+
222
+ except Exception as e:
223
+ logger.error(f"Error analyzing dataset '{dataset_path}': {e}", exc_info=True)
224
+ return {
225
+ "error": f"Error analyzing dataset: {str(e)}",
226
+ "llm_augmentation": llm_augmentation
227
+ }
228
+
229
+
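A minimal usage sketch for analyze_dataset (the path and description are placeholders):

analysis = analyze_dataset(
    dataset_path="data/study.csv",   # placeholder path
    llm_client=None,                 # heuristics only; pass a chat model to enable LLM augmentation
    dataset_description="Observational study of a job training program",
)
if "error" not in analysis:
    print(analysis["potential_treatments"], analysis["potential_outcomes"])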
230
+ def _categorize_columns(df: pd.DataFrame) -> Dict[str, str]:
231
+ """
232
+ Categorize columns into types relevant for causal inference.
233
+
234
+ Args:
235
+ df: DataFrame to analyze
236
+
237
+ Returns:
238
+ Dict mapping column names to their types
239
+ """
240
+ result = {}
241
+
242
+ for col in df.columns:
243
+ # Check if column is numeric
244
+ if pd.api.types.is_numeric_dtype(df[col]):
245
+ # Count number of unique values
246
+ n_unique = df[col].nunique()
247
+
248
+ # Binary numeric variable
249
+ if n_unique == 2:
250
+ result[col] = "binary"
251
+ # Likely categorical represented as numeric
252
+ elif n_unique < 10:
253
+ result[col] = "categorical_numeric"
254
+ # Discrete numeric (integers)
255
+ elif pd.api.types.is_integer_dtype(df[col]):
256
+ result[col] = "discrete_numeric"
257
+ # Continuous numeric
258
+ else:
259
+ result[col] = "continuous_numeric"
260
+
261
+ # Check for datetime
262
+ elif pd.api.types.is_datetime64_any_dtype(df[col]) or _is_date_string(df, col):
263
+ result[col] = "datetime"
264
+
265
+ # Check for categorical
266
+ elif pd.api.types.is_categorical_dtype(df[col]) or df[col].nunique() < 20:
267
+ if df[col].nunique() == 2:
268
+ result[col] = "binary_categorical"
269
+ else:
270
+ result[col] = "categorical"
271
+
272
+ # Must be text or other
273
+ else:
274
+ result[col] = "text_or_other"
275
+
276
+ return result
277
+
278
+
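A toy illustration of the labels _categorize_columns assigns (illustrative only):

import pandas as pd

toy = pd.DataFrame({
    "treated": [0, 1, 1, 0],             # numeric with 2 unique values -> "binary"
    "region": ["N", "S", "E", "W"],      # strings with few unique values -> "categorical"
    "income": [41.2, 52.9, 38.5, 60.1],  # numeric; fewer than 10 unique values in this toy data -> "categorical_numeric"
})
print(_categorize_columns(toy))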
279
+ def _is_date_string(df: pd.DataFrame, col: str) -> bool:
280
+ """
281
+ Check if a column contains date strings.
282
+
283
+ Args:
284
+ df: DataFrame to check
285
+ col: Column name to check
286
+
287
+ Returns:
288
+ True if the column appears to contain date strings
289
+ """
290
+ # Try to convert to datetime
291
+ if not pd.api.types.is_string_dtype(df[col]):
292
+ return False
293
+
294
+ # Check sample of values
295
+ sample = df[col].dropna().sample(min(10, len(df[col].dropna()))).tolist()
296
+
297
+ try:
298
+ for val in sample:
299
+ pd.to_datetime(val)
300
+ return bool(sample)  # an all-null column yields an empty sample and is not treated as dates
301
+ except Exception:
302
+ return False
303
+
304
+
305
+ def _identify_potential_variables(
306
+ df: pd.DataFrame,
307
+ column_categories: Dict[str, str],
308
+ llm_client: Optional[BaseChatModel] = None,
309
+ dataset_description: Optional[str] = None
310
+ ) -> Dict[str, List[str]]:
311
+ """
312
+ Identify potential treatment and outcome variables in the dataset, using LLM if available.
313
+ Falls back to heuristic method if LLM fails or is not available.
314
+
315
+ Args:
316
+ df: DataFrame to analyze
317
+ column_categories: Dictionary mapping column names to their types
318
+ llm_client: Optional LLM client for enhanced identification
319
+ dataset_description: Optional description of the dataset for context
320
+
321
+ Returns:
322
+ Dict with potential treatment and outcome variables
323
+ """
324
+ # Try LLM approach if client is provided
325
+ if llm_client:
326
+ try:
327
+ logger.info("Using LLM to identify potential treatment and outcome variables")
328
+
329
+ # Create a concise prompt with just column information
330
+ columns_list = df.columns.tolist()
331
+ column_types = {col: str(df[col].dtype) for col in columns_list}
332
+
333
+ # Get binary columns for extra context
334
+ binary_cols = [col for col in columns_list
335
+ if pd.api.types.is_numeric_dtype(df[col]) and df[col].nunique() == 2]
336
+
337
+ # Add dataset description if available
338
+ description_text = f"\nDataset Description: {dataset_description}" if dataset_description else ""
339
+
340
+ prompt = f"""
341
+ You are an expert causal inference data scientist. Identify potential treatment and outcome variables from this dataset.{description_text}
342
+
343
+ Dataset columns:
344
+ {columns_list}
345
+
346
+ Column types:
347
+ {column_types}
348
+
349
+ Binary columns (good treatment candidates):
350
+ {binary_cols}
351
+
352
+ Instructions:
353
+ 1. Identify TREATMENT variables: interventions, treatments, programs, policies, or binary state changes.
354
+ Look for binary variables or names with 'treatment', 'intervention', 'program', 'policy', etc.
355
+
356
+ 2. Identify OUTCOME variables: results, effects, or responses to treatments.
357
+ Look for numeric variables (especially non-binary) or names with 'outcome', 'result', 'effect', 'score', etc.
358
+
359
+ Return ONLY a valid JSON object with two lists: "potential_treatments" and "potential_outcomes".
360
+ Example: {{"potential_treatments": ["treatment_a", "program_b"], "potential_outcomes": ["result_score", "outcome_measure"]}}
361
+ """
362
+
363
+ # Call the LLM and parse the response
364
+ response = llm_client.invoke(prompt)
365
+ response_text = response.content if hasattr(response, 'content') else str(response)
366
+
367
+ # Extract JSON from the response text
368
+ import re
369
+ json_match = re.search(r'{.*}', response_text, re.DOTALL)
370
+
371
+ if json_match:
372
+ result = json.loads(json_match.group(0))
373
+
374
+ # Validate the response
375
+ if (isinstance(result, dict) and
376
+ "potential_treatments" in result and
377
+ "potential_outcomes" in result and
378
+ isinstance(result["potential_treatments"], list) and
379
+ isinstance(result["potential_outcomes"], list)):
380
+
381
+ # Ensure all suggestions are valid columns
382
+ valid_treatments = [col for col in result["potential_treatments"] if col in df.columns]
383
+ valid_outcomes = [col for col in result["potential_outcomes"] if col in df.columns]
384
+
385
+ if valid_treatments and valid_outcomes:
386
+ logger.info(f"LLM identified {len(valid_treatments)} treatments and {len(valid_outcomes)} outcomes")
387
+ return {
388
+ "potential_treatments": valid_treatments,
389
+ "potential_outcomes": valid_outcomes
390
+ }
391
+ else:
392
+ logger.warning("LLM suggested invalid columns, falling back to heuristic method")
393
+ else:
394
+ logger.warning("Invalid LLM response format, falling back to heuristic method")
395
+ else:
396
+ logger.warning("Could not extract JSON from LLM response, falling back to heuristic method")
397
+
398
+ except Exception as e:
399
+ logger.error(f"Error in LLM identification: {e}", exc_info=True)
400
+ logger.info("Falling back to heuristic method")
401
+
402
+ # Fallback to heuristic method
403
+ logger.info("Using heuristic method to identify potential treatment and outcome variables")
404
+
405
+ # Identify potential treatment variables
406
+ potential_treatments = []
407
+
408
+ # Look for binary variables (good treatment candidates)
409
+ binary_cols = [col for col in df.columns
410
+ if pd.api.types.is_numeric_dtype(df[col]) and df[col].nunique() == 2]
411
+
412
+ # Look for variables with names suggesting treatment
413
+ treatment_keywords = ['treatment', 'treat', 'intervention', 'program', 'policy',
414
+ 'exposed', 'assigned', 'received', 'participated']
415
+
416
+ for col in df.columns:
417
+ col_lower = col.lower()
418
+ if any(keyword in col_lower for keyword in treatment_keywords):
419
+ potential_treatments.append(col)
420
+
421
+ # Add binary variables if we don't have enough candidates
422
+ if len(potential_treatments) < 3:
423
+ for col in binary_cols:
424
+ if col not in potential_treatments:
425
+ potential_treatments.append(col)
426
+ if len(potential_treatments) >= 3:
427
+ break
428
+
429
+ # Identify potential outcome variables
430
+ potential_outcomes = []
431
+
432
+ # Look for numeric variables that aren't binary
433
+ numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
434
+ non_binary_numeric = [col for col in numeric_cols if col not in binary_cols]
435
+
436
+ # Look for variables with names suggesting outcomes
437
+ outcome_keywords = ['outcome', 'result', 'effect', 'response', 'score', 'performance',
438
+ 'achievement', 'success', 'failure', 'improvement']
439
+
440
+ for col in df.columns:
441
+ col_lower = col.lower()
442
+ if any(keyword in col_lower for keyword in outcome_keywords):
443
+ potential_outcomes.append(col)
444
+
445
+ # Add numeric non-binary variables if we don't have enough candidates
446
+ if len(potential_outcomes) < 3:
447
+ for col in non_binary_numeric:
448
+ if col not in potential_outcomes and col not in potential_treatments:
449
+ potential_outcomes.append(col)
450
+ if len(potential_outcomes) >= 3:
451
+ break
452
+
453
+ return {
454
+ "potential_treatments": potential_treatments,
455
+ "potential_outcomes": potential_outcomes
456
+ }
457
+
458
+
459
+ def detect_temporal_structure(
460
+ df: pd.DataFrame,
461
+ llm_client: Optional[BaseChatModel] = None,
462
+ dataset_description: Optional[str] = None,
463
+ original_query: Optional[str] = None
464
+ ) -> Dict[str, Any]:
465
+ """
466
+ Detect temporal structure in the dataset, using LLM for enhanced identification.
467
+
468
+ Args:
469
+ df: DataFrame to analyze
470
+ llm_client: Optional LLM client for enhanced identification
471
+ dataset_description: Optional description of the dataset for context
472
+
473
+ Returns:
474
+ Dict with information about temporal structure:
475
+ - has_temporal_structure: Whether temporal structure exists
476
+ - temporal_columns: Primary time column identified (or list if multiple from heuristic)
477
+ - is_panel_data: Whether data is in panel format
478
+ - time_column: Primary time column identified for panel data
479
+ - id_column: Primary unit ID column identified for panel data
480
+ - time_periods: Number of time periods (if panel data)
481
+ - units: Number of unique units (if panel data)
482
+ - identification_method: How time/unit vars were identified ('LLM', 'Heuristic', 'None')
483
+ """
484
+ result = {
485
+ "has_temporal_structure": False,
486
+ "temporal_columns": [], # Will store primary time column or heuristic list
487
+ "is_panel_data": False,
488
+ "time_column": None,
489
+ "id_column": None,
490
+ "time_periods": None,
491
+ "units": None,
492
+ "identification_method": "None"
493
+ }
494
+
495
+ # --- Step 1: Heuristic identification (currently disabled; kept below for reference) ---
496
+ #heuristic_datetime_cols = []
497
+ #for col in df.columns:
498
+ # if pd.api.types.is_datetime64_any_dtype(df[col]):
499
+ # heuristic_datetime_cols.append(col)
500
+ # elif pd.api.types.is_string_dtype(df[col]):
501
+ # try:
502
+ # if pd.to_datetime(df[col], errors='coerce').notna().any():
503
+ # heuristic_datetime_cols.append(col)
504
+ # except:
505
+ # pass # Ignore conversion errors
506
+
507
+ #time_keywords = ['year', 'month', 'day', 'date', 'time', 'period', 'quarter', 'week']
508
+ #for col in df.columns:
509
+ # col_lower = col.lower()
510
+ # if any(keyword in col_lower for keyword in time_keywords) and col not in heuristic_datetime_cols:
511
+ # heuristic_datetime_cols.append(col)
512
+
513
+ #id_keywords = ['id', 'individual', 'person', 'unit', 'entity', 'firm', 'company', 'state', 'country']
514
+ #heuristic_potential_id_cols = []
515
+ #for col in df.columns:
516
+ # col_lower = col.lower()
517
+ # # Exclude columns already identified as time-related by heuristics
518
+ # if any(keyword in col_lower for keyword in id_keywords) and col not in heuristic_datetime_cols:
519
+ # heuristic_potential_id_cols.append(col)
520
+
521
+ # --- Step 2: LLM-assisted identification ---
522
+ llm_identified_time_var = None
523
+ llm_identified_unit_var = None
524
+ heuristic_datetime_cols = []
525
+ heuristic_potential_id_cols = []
526
+ dataset_summary = df.describe(include='all')
527
+
528
+ if llm_client:
529
+ logger.info("Attempting LLM-assisted identification of temporal/unit variables.")
530
+ column_names = df.columns.tolist()
531
+ column_dtypes_dict = {col: str(df[col].dtype) for col in column_names}
532
+
533
+ try:
534
+ llm_suggestions = llm_identify_temporal_and_unit_vars(
535
+ column_names=column_names,
536
+ column_dtypes=column_dtypes_dict,
537
+ dataset_description=dataset_description if dataset_description else "No dataset description provided.",
538
+ dataset_summary=dataset_summary,
539
+ heuristic_time_candidates=heuristic_datetime_cols,
540
+ heuristic_id_candidates=heuristic_potential_id_cols,
541
+ query=original_query if original_query else "No query provided.",
542
+ llm=llm_client
543
+ )
544
+ llm_identified_time_var = llm_suggestions.get("time_variable")
545
+ llm_identified_unit_var = llm_suggestions.get("unit_variable")
546
+ result["identification_method"] = "LLM"
547
+
548
+ if not llm_identified_time_var and not llm_identified_unit_var:
549
+ result["identification_method"] = "LLM_NoIdentification"
550
+ except Exception as e:
551
+ logger.warning(f"LLM call for temporal/unit vars failed: {e}. Falling back to heuristics.")
552
+ result["identification_method"] = "Heuristic_LLM_Error"
553
+ else:
554
+ result["identification_method"] = "Heuristic_NoLLM"
555
+
556
+ # --- Step 3: Combine LLM and Heuristic Results ---
557
+ final_time_var = None
558
+ final_unit_var = None
559
+
560
+ if llm_identified_time_var:
561
+ final_time_var = llm_identified_time_var
562
+ logger.info(f"Prioritizing LLM identified time variable: {final_time_var}")
563
+ elif heuristic_datetime_cols:
564
+ final_time_var = heuristic_datetime_cols[0] # Fallback to first heuristic time col
565
+ logger.info(f"Using heuristic time variable: {final_time_var}")
566
+
567
+ if llm_identified_unit_var:
568
+ final_unit_var = llm_identified_unit_var
569
+ logger.info(f"Prioritizing LLM identified unit variable: {final_unit_var}")
570
+ elif heuristic_potential_id_cols:
571
+ final_unit_var = heuristic_potential_id_cols[0] # Fallback to first heuristic ID col
572
+ logger.info(f"Using heuristic unit variable: {final_unit_var}")
573
+
574
+ # Update results based on final selections
575
+ if final_time_var:
576
+ result["has_temporal_structure"] = True
577
+ result["temporal_columns"] = [final_time_var] # Store as a list with the primary time var
578
+ result["time_column"] = final_time_var
579
+ else: # If no time var found by LLM or heuristic, use original heuristic list for temporal_columns
580
+ if heuristic_datetime_cols:
581
+ result["has_temporal_structure"] = True
582
+ result["temporal_columns"] = heuristic_datetime_cols
583
+ # time_column remains None
584
+
585
+ if final_unit_var:
586
+ result["id_column"] = final_unit_var
587
+
588
+ # --- Step 4: Update Panel Data Logic (based on final_time_var and final_unit_var) ---
589
+ if final_time_var and final_unit_var:
590
+ # Check if there are multiple time periods per unit using the identified variables
591
+ try:
592
+ # Ensure columns exist before groupby
593
+ if final_time_var in df.columns and final_unit_var in df.columns:
594
+ if df.groupby(final_unit_var)[final_time_var].nunique().mean() > 1.0:
595
+ result["is_panel_data"] = True
596
+ result["time_periods"] = df[final_time_var].nunique()
597
+ result["units"] = df[final_unit_var].nunique()
598
+ logger.info(f"Panel data detected: Time='{final_time_var}', Unit='{final_unit_var}', Periods={result['time_periods']}, Units={result['units']}")
599
+ else:
600
+ logger.info("Not panel data: Each unit does not have multiple time periods.")
601
+ else:
602
+ logger.warning(f"Final time ('{final_time_var}') or unit ('{final_unit_var}') var not in DataFrame. Cannot confirm panel structure.")
603
+ except Exception as e:
604
+ logger.error(f"Error checking panel data structure with time='{final_time_var}', unit='{final_unit_var}': {e}")
605
+ result["is_panel_data"] = False # Default to false on error
606
+ else:
607
+ logger.info("Not panel data: Missing either time or unit variable for panel structure.")
608
+
609
+ logger.debug(f"Final temporal structure detection result: {result}")
610
+ return result
611
+
612
+
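For reference, a sketch of the dictionary detect_temporal_structure returns for yearly state-level panel data (values are illustrative):

# {
#     "has_temporal_structure": True,
#     "temporal_columns": ["year"],
#     "is_panel_data": True,
#     "time_column": "year",
#     "id_column": "state",
#     "time_periods": 20,
#     "units": 50,
#     "identification_method": "LLM",
# }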
613
+ def find_potential_instruments(
614
+ df: pd.DataFrame,
615
+ llm_client: Optional[BaseChatModel] = None,
616
+ potential_treatments: List[str] = None,
617
+ potential_outcomes: List[str] = None,
618
+ dataset_description: Optional[str] = None
619
+ ) -> List[Dict[str, Any]]:
620
+ """
621
+ Find potential instrumental variables in the dataset, using LLM if available.
622
+ Falls back to heuristic method if LLM fails or is not available.
623
+
624
+ Args:
625
+ df: DataFrame to analyze
626
+ llm_client: Optional LLM client for enhanced identification
627
+ potential_treatments: Optional list of potential treatment variables
628
+ potential_outcomes: Optional list of potential outcome variables
629
+ dataset_description: Optional description of the dataset for context
630
+
631
+ Returns:
632
+ List of potential instrumental variables with their properties
633
+ """
634
+ # Try LLM approach if client is provided
635
+ if llm_client:
636
+ try:
637
+ logger.info("Using LLM to identify potential instrumental variables")
638
+
639
+ # Create a concise prompt with just column information
640
+ columns_list = df.columns.tolist()
641
+
642
+ # Exclude known treatment and outcome variables from consideration
643
+ excluded_columns = []
644
+ if potential_treatments:
645
+ excluded_columns.extend(potential_treatments)
646
+ if potential_outcomes:
647
+ excluded_columns.extend(potential_outcomes)
648
+
649
+ # Filter columns to exclude treatments and outcomes
650
+ candidate_columns = [col for col in columns_list if col not in excluded_columns]
651
+
652
+ if not candidate_columns:
653
+ logger.warning("No eligible columns for instrumental variables after filtering treatments and outcomes")
654
+ return []
655
+
656
+ # Get column types for context
657
+ column_types = {col: str(df[col].dtype) for col in candidate_columns}
658
+
659
+ # Add dataset description if available
660
+ description_text = f"\nDataset Description: {dataset_description}" if dataset_description else ""
661
+
662
+ prompt = f"""
663
+ You are an expert causal inference data scientist. Identify potential instrumental variables from this dataset.{description_text}
664
+
665
+ DEFINITION: Instrumental variables must:
666
+ 1. Be correlated with the treatment variable (relevance)
667
+ 2. Only affect the outcome through the treatment (exclusion restriction)
668
+ 3. Not be correlated with unmeasured confounders (exogeneity)
669
+
670
+ Treatment variables: {potential_treatments if potential_treatments else "Unknown"}
671
+ Outcome variables: {potential_outcomes if potential_outcomes else "Unknown"}
672
+
673
+ Available columns (excluding treatments and outcomes):
674
+ {candidate_columns}
675
+
676
+ Column types:
677
+ {column_types}
678
+
679
+ Look for variables likely to be:
680
+ - Random assignments
681
+ - Policy changes
682
+ - Geographic or temporal variations
683
+ - Variables with names containing: 'instrument', 'iv', 'assigned', 'random', 'lottery', 'exogenous'
684
+
685
+ Return ONLY a JSON array of objects, each with "variable", "reason", and "data_type" fields.
686
+ Example:
687
+ [
688
+ {{"variable": "random_assignment", "reason": "Random assignment variable", "data_type": "int64"}},
689
+ {{"variable": "distance_to_facility", "reason": "Geographic variation", "data_type": "float64"}}
690
+ ]
691
+ """
692
+
693
+ # Call the LLM and parse the response
694
+ response = llm_client.invoke(prompt)
695
+ response_text = response.content if hasattr(response, 'content') else str(response)
696
+
697
+ # Extract JSON from the response text
698
+ import re
699
+ json_match = re.search(r'\[\s*{.*}\s*\]', response_text, re.DOTALL)
700
+
701
+ if json_match:
702
+ result = json.loads(json_match.group(0))
703
+
704
+ # Validate the response
705
+ if isinstance(result, list) and len(result) > 0:
706
+ # Filter for valid entries
707
+ valid_instruments = []
708
+ for item in result:
709
+ if not isinstance(item, dict) or "variable" not in item:
710
+ continue
711
+
712
+ if item["variable"] not in df.columns:
713
+ continue
714
+
715
+ # Ensure all required fields are present
716
+ if "reason" not in item:
717
+ item["reason"] = "Identified by LLM"
718
+ if "data_type" not in item:
719
+ item["data_type"] = str(df[item["variable"]].dtype)
720
+
721
+ valid_instruments.append(item)
722
+
723
+ if valid_instruments:
724
+ logger.info(f"LLM identified {len(valid_instruments)} potential instrumental variables {valid_instruments}")
725
+ return valid_instruments
726
+ else:
727
+ logger.warning("No valid instruments found by LLM, falling back to heuristic method")
728
+ else:
729
+ logger.warning("Invalid LLM response format, falling back to heuristic method")
730
+ else:
731
+ logger.warning("Could not extract JSON from LLM response, falling back to heuristic method")
732
+
733
+ except Exception as e:
734
+ logger.error(f"Error in LLM identification of instruments: {e}", exc_info=True)
735
+ logger.info("Falling back to heuristic method")
736
+
737
+ # Fallback to heuristic method
738
+ logger.info("Using heuristic method to identify potential instrumental variables")
739
+ potential_instruments = []
740
+
741
+ # Look for variables with instrumental-related names
742
+ instrument_keywords = ['instrument', 'iv', 'assigned', 'random', 'lottery', 'exogenous']
743
+
744
+ for col in df.columns:
745
+ # Skip treatment and outcome variables
746
+ if potential_treatments and col in potential_treatments:
747
+ continue
748
+ if potential_outcomes and col in potential_outcomes:
749
+ continue
750
+
751
+ col_lower = col.lower()
752
+ if any(keyword in col_lower for keyword in instrument_keywords):
753
+ instrument_info = {
754
+ "variable": col,
755
+ "reason": f"Name contains instrument-related keyword",
756
+ "data_type": str(df[col].dtype)
757
+ }
758
+ potential_instruments.append(instrument_info)
759
+
760
+ return potential_instruments
761
+
762
+
763
+ def detect_discontinuities(df: pd.DataFrame) -> Dict[str, Any]:
764
+ """
765
+ Identify discontinuities in continuous variables (for RDD).
766
+
767
+ Args:
768
+ df: DataFrame to analyze
769
+
770
+ Returns:
771
+ Dict with information about detected discontinuities
772
+ """
773
+ discontinuities = []
774
+
775
+ # For each numeric column, check for potential discontinuities
776
+ numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
777
+
778
+ for col in numeric_cols:
779
+ # Skip columns with too many unique values
780
+ if df[col].nunique() > 100:
781
+ continue
782
+
783
+ values = df[col].dropna().sort_values().values
784
+
785
+ # Calculate gaps between consecutive values
786
+ if len(values) > 10:
787
+ gaps = np.diff(values)
788
+ mean_gap = np.mean(gaps)
789
+ std_gap = np.std(gaps)
790
+
791
+ # Look for unusually large gaps (potential discontinuities)
792
+ large_gaps = np.where(gaps > mean_gap + 2*std_gap)[0]
793
+
794
+ if len(large_gaps) > 0:
795
+ for idx in large_gaps:
796
+ cutpoint = (values[idx] + values[idx+1]) / 2
797
+ discontinuities.append({
798
+ "variable": col,
799
+ "cutpoint": float(cutpoint),
800
+ "gap_size": float(gaps[idx]),
801
+ "mean_gap": float(mean_gap)
802
+ })
803
+
804
+ return {
805
+ "has_discontinuities": len(discontinuities) > 0,
806
+ "discontinuities": discontinuities
807
+ }
808
+
809
+
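A small sketch of what the gap heuristic in detect_discontinuities flags on synthetic data (illustrative):

import numpy as np
import pandas as pd

toy = pd.DataFrame({"score": np.concatenate([np.linspace(0, 10, 50), np.linspace(20, 30, 50)])})
result = detect_discontinuities(toy)
# result["has_discontinuities"] -> True, with one entry for "score" and a cutpoint near 15.0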
810
+ def assess_variable_relationships(df: pd.DataFrame, corr_matrix: pd.DataFrame) -> Dict[str, Any]:
811
+ """
812
+ Assess relationships between variables in the dataset.
813
+
814
+ Args:
815
+ df: DataFrame to analyze
816
+ corr_matrix: Precomputed correlation matrix for numeric columns
817
+
818
+ Returns:
819
+ Dict with information about variable relationships:
820
+ - strongly_correlated_pairs: Pairs of strongly correlated variables
821
+ - potential_confounders: Variables that might be confounders
822
+ """
823
+ result = {"strongly_correlated_pairs": [], "potential_confounders": []}
824
+
825
+ numeric_cols = corr_matrix.columns.tolist()
826
+ if len(numeric_cols) < 2:
827
+ return result
828
+
829
+ # Use the precomputed correlation matrix
830
+ corr_matrix_abs = corr_matrix.abs()
831
+
832
+ # Find strongly correlated variable pairs
833
+ for i in range(len(numeric_cols)):
834
+ for j in range(i+1, len(numeric_cols)):
835
+ if abs(corr_matrix_abs.iloc[i, j]) > 0.7: # Correlation threshold
836
+ result["strongly_correlated_pairs"].append({
837
+ "variables": [numeric_cols[i], numeric_cols[j]],
838
+ "correlation": float(corr_matrix.iloc[i, j])
839
+ })
840
+
841
+ # Identify potential confounders (variables correlated with multiple others)
842
+ confounder_counts = {col: 0 for col in numeric_cols}
843
+
844
+ for pair in result["strongly_correlated_pairs"]:
845
+ confounder_counts[pair["variables"][0]] += 1
846
+ confounder_counts[pair["variables"][1]] += 1
847
+
848
+ # Variables correlated with multiple others are potential confounders
849
+ for col, count in confounder_counts.items():
850
+ if count >= 2:
851
+ result["potential_confounders"].append({"variable": col, "num_correlations": count})
852
+
853
+ return result
auto_causal/components/decision_tree.py ADDED
@@ -0,0 +1,366 @@
1
+ """
2
+ Decision tree component for selecting causal inference methods.
3
+
4
+ This module implements the decision tree logic to select the most appropriate
5
+ causal inference method based on dataset characteristics and available variables.
6
+ """
7
+
8
+ import logging
9
+ from typing import Dict, List, Any, Optional
10
+ import pandas as pd
11
+
12
+ # define method names
13
+ BACKDOOR_ADJUSTMENT = "backdoor_adjustment"
14
+ LINEAR_REGRESSION = "linear_regression"
15
+ DIFF_IN_MEANS = "diff_in_means"
16
+ DIFF_IN_DIFF = "difference_in_differences"
17
+ REGRESSION_DISCONTINUITY = "regression_discontinuity_design"
18
+ PROPENSITY_SCORE_MATCHING = "propensity_score_matching"
19
+ INSTRUMENTAL_VARIABLE = "instrumental_variable"
20
+ CORRELATION_ANALYSIS = "correlation_analysis"
21
+ PROPENSITY_SCORE_WEIGHTING = "propensity_score_weighting"
22
+ GENERALIZED_PROPENSITY_SCORE = "generalized_propensity_score"
23
+ FRONTDOOR_ADJUSTMENT = "frontdoor_adjustment"
24
+
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+ # method assumptions mapping
29
+ METHOD_ASSUMPTIONS = {
30
+ BACKDOOR_ADJUSTMENT: [
31
+ "no unmeasured confounders (conditional ignorability given covariates)",
32
+ "correct model specification for outcome conditional on treatment and covariates",
33
+ "positivity/overlap (for all covariate values, units could potentially receive either treatment level)"
34
+ ],
35
+ LINEAR_REGRESSION: [
36
+ "linear relationship between treatment, covariates, and outcome",
37
+ "no unmeasured confounders (if observational)",
38
+ "correct model specification",
39
+ "homoscedasticity of errors",
40
+ "normally distributed errors (for inference)"
41
+ ],
42
+ DIFF_IN_MEANS: [
43
+ "treatment is randomly assigned (or as-if random)",
44
+ "no spillover effects",
45
+ "stable unit treatment value assumption (SUTVA)"
46
+ ],
47
+ DIFF_IN_DIFF: [
48
+ "parallel trends between treatment and control groups before treatment",
49
+ "no spillover effects between groups",
50
+ "no anticipation effects before treatment",
51
+ "stable composition of treatment and control groups",
52
+ "treatment timing is exogenous"
53
+ ],
54
+ REGRESSION_DISCONTINUITY: [
55
+ "units cannot precisely manipulate the running variable around the cutoff",
56
+ "continuity of conditional expectation functions of potential outcomes at the cutoff",
57
+ "no other changes occurring precisely at the cutoff"
58
+ ],
59
+ PROPENSITY_SCORE_MATCHING: [
60
+ "no unmeasured confounders (conditional ignorability)",
61
+ "sufficient overlap (common support) between treatment and control groups",
62
+ "correct propensity score model specification"
63
+ ],
64
+ INSTRUMENTAL_VARIABLE: [
65
+ "instrument is correlated with treatment (relevance)",
66
+ "instrument affects outcome only through treatment (exclusion restriction)",
67
+ "instrument is independent of unmeasured confounders (exogeneity/independence)"
68
+ ],
69
+ CORRELATION_ANALYSIS: [
70
+ "data represents a sample from the population of interest",
71
+ "variables are measured appropriately"
72
+ ],
73
+ PROPENSITY_SCORE_WEIGHTING: [
74
+ "no unmeasured confounders (conditional ignorability)",
75
+ "sufficient overlap (common support) between treatment and control groups",
76
+ "correct propensity score model specification",
77
+ "weights correctly specified (e.g., ATE, ATT)"
78
+ ],
79
+ GENERALIZED_PROPENSITY_SCORE: [
80
+ "conditional mean independence",
81
+ "positivity/common support for GPS",
82
+ "correct specification of the GPS model",
83
+ "correct specification of the outcome model",
84
+ "no unmeasured confounders affecting both treatment and outcome, given X",
85
+ "treatment variable is continuous"
86
+ ],
87
+ FRONTDOOR_ADJUSTMENT: [
88
+ "mediator is affected by treatment and affects outcome",
89
+ "mediator is not affected by any confounders of the treatment-outcome relationship"
90
+ ]
91
+ }
92
+
93
+
94
+ def select_method(dataset_properties: Dict[str, Any], excluded_methods: Optional[List[str]] = None) -> Dict[str, Any]:
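+ """
+ Select a causal inference method from dataset properties.
+
+ Requires 'treatment_variable' and 'outcome_variable' in dataset_properties; optional keys
+ such as 'instrument_variable', 'running_variable'/'cutoff_value', 'time_variable', 'is_rct',
+ 'has_temporal_structure', 'frontdoor_criterion', 'covariates', 'covariate_overlap_score'
+ and 'treatment_variable_type' refine the choice. Methods listed in excluded_methods are
+ filtered out. Returns a dict with the selected method, its justification and assumptions,
+ ranked alternatives, and the applied exclusions.
+ """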
95
+ excluded_methods = set(excluded_methods or [])
96
+ logger.info(f"Excluded methods: {sorted(excluded_methods)}")
97
+
98
+ treatment = dataset_properties.get("treatment_variable")
99
+ outcome = dataset_properties.get("outcome_variable")
100
+ if not treatment or not outcome:
101
+ raise ValueError("Both treatment and outcome variables must be specified")
102
+
103
+ instrument_var = dataset_properties.get("instrument_variable")
104
+ running_var = dataset_properties.get("running_variable")
105
+ cutoff_val = dataset_properties.get("cutoff_value")
106
+ time_var = dataset_properties.get("time_variable")
107
+ is_rct = dataset_properties.get("is_rct", False)
108
+ has_temporal = dataset_properties.get("has_temporal_structure", False)
109
+ frontdoor = dataset_properties.get("frontdoor_criterion", False)
110
+ covariate_overlap_result = dataset_properties.get("covariate_overlap_score")
111
+ covariates = dataset_properties.get("covariates", [])
112
+ treatment_variable_type = dataset_properties.get("treatment_variable_type", "binary")
113
+
114
+ # Helpers to collect candidates
115
+ candidates = [] # list of (method, priority_index)
116
+ justifications: Dict[str, str] = {}
117
+ assumptions: Dict[str, List[str]] = {}
118
+
119
+ def add(method: str, justification: str, prio_order: List[str]):
120
+ if method in justifications: # already added
121
+ return
122
+ justifications[method] = justification
123
+ assumptions[method] = METHOD_ASSUMPTIONS[method]
124
+ # priority index from provided order (fallback large if not present)
125
+ try:
126
+ idx = prio_order.index(method)
127
+ except ValueError:
128
+ idx = 10**6
129
+ candidates.append((method, idx))
130
+
131
+ # ----- Build candidate set (no returns here) -----
132
+
133
+ # RCT branch
134
+ if is_rct:
135
+ logger.info("Dataset is from a randomized controlled trial (RCT)")
136
+ rct_priority = [INSTRUMENTAL_VARIABLE, LINEAR_REGRESSION, DIFF_IN_MEANS]
137
+
138
+ if instrument_var and instrument_var != treatment:
139
+ add(INSTRUMENTAL_VARIABLE,
140
+ f"RCT encouragement: instrument '{instrument_var}' differs from treatment '{treatment}'.",
141
+ rct_priority)
142
+
143
+ if covariates:
144
+ add(LINEAR_REGRESSION,
145
+ "RCT with covariates—use OLS for precision.",
146
+ rct_priority)
147
+ else:
148
+ add(DIFF_IN_MEANS,
149
+ "Pure RCT without covariates—difference-in-means.",
150
+ rct_priority)
151
+
152
+ # Observational branch
153
+ obs_priority_binary = [
154
+ INSTRUMENTAL_VARIABLE,
155
+ PROPENSITY_SCORE_MATCHING,
156
+ PROPENSITY_SCORE_WEIGHTING,
157
+ FRONTDOOR_ADJUSTMENT,
158
+ LINEAR_REGRESSION,
159
+ ]
160
+ obs_priority_nonbinary = [
161
+ INSTRUMENTAL_VARIABLE,
162
+ FRONTDOOR_ADJUSTMENT,
163
+ LINEAR_REGRESSION,
164
+ ]
165
+
166
+ # Common early structural signals first (still only add as candidates)
167
+ if has_temporal and time_var:
168
+ add(DIFF_IN_DIFF,
169
+ f"Temporal structure via '{time_var}'—consider Difference-in-Differences (assumes parallel trends).",
170
+ [DIFF_IN_DIFF]) # highest among itself
171
+
172
+ if running_var and cutoff_val is not None:
173
+ add(REGRESSION_DISCONTINUITY,
174
+ f"Running variable '{running_var}' with cutoff {cutoff_val}—consider RDD.",
175
+ [REGRESSION_DISCONTINUITY])
176
+
177
+ # Binary vs non-binary pathways
178
+ if treatment_variable_type == "binary":
179
+ if instrument_var:
180
+ add(INSTRUMENTAL_VARIABLE,
181
+ f"Instrumental variable '{instrument_var}' available.",
182
+ obs_priority_binary)
183
+
184
+ # Propensity score methods only if covariates exist
185
+ if covariates:
186
+ if covariate_overlap_result is not None:
187
+ ps_method = (PROPENSITY_SCORE_WEIGHTING
188
+ if covariate_overlap_result < 0.1
189
+ else PROPENSITY_SCORE_MATCHING)
190
+ else:
191
+ ps_method = PROPENSITY_SCORE_MATCHING
192
+ add(ps_method,
193
+ "Covariates observed; PS method chosen based on overlap.",
194
+ obs_priority_binary)
195
+
196
+ if frontdoor:
197
+ add(FRONTDOOR_ADJUSTMENT,
198
+ "Front-door criterion satisfied.",
199
+ obs_priority_binary)
200
+
201
+ add(LINEAR_REGRESSION,
202
+ "OLS as a fallback specification.",
203
+ obs_priority_binary)
204
+
205
+ else:
206
+ logger.info(f"Non-binary treatment variable detected: {treatment_variable_type}")
207
+ if instrument_var:
208
+ add(INSTRUMENTAL_VARIABLE,
209
+ f"Instrument '{instrument_var}' candidate for non-binary treatment.",
210
+ obs_priority_nonbinary)
211
+ if frontdoor:
212
+ add(FRONTDOOR_ADJUSTMENT,
213
+ "Front-door criterion satisfied.",
214
+ obs_priority_nonbinary)
215
+ add(LINEAR_REGRESSION,
216
+ "Fallback for non-binary treatment without stronger identification.",
217
+ obs_priority_nonbinary)
218
+
219
+ # ----- Centralized exclusion handling -----
220
+ # Remove excluded
221
+ filtered = [(m, p) for (m, p) in candidates if m not in excluded_methods]
222
+
223
+ # If nothing survives, attempt a safe fallback not excluded
224
+ if not filtered:
225
+ logger.warning(f"All candidates excluded. Candidates were: {[m for m,_ in candidates]}. Excluded: {sorted(excluded_methods)}")
226
+ fallback_order = [
227
+ LINEAR_REGRESSION,
228
+ DIFF_IN_MEANS,
229
+ PROPENSITY_SCORE_MATCHING,
230
+ PROPENSITY_SCORE_WEIGHTING,
231
+ DIFF_IN_DIFF,
232
+ REGRESSION_DISCONTINUITY,
233
+ INSTRUMENTAL_VARIABLE,
234
+ FRONTDOOR_ADJUSTMENT,
235
+ ]
236
+ fallback = next((m for m in fallback_order if m in justifications and m not in excluded_methods), None)
237
+ if not fallback:
238
+ # truly nothing left; raise with context
239
+ raise RuntimeError("No viable method remains after exclusions.")
240
+ selected_method = fallback
241
+ alternatives = []
242
+ justifications[selected_method] = justifications.get(selected_method, "Fallback after exclusions.")
243
+ else:
244
+ # Pick by smallest priority index, then stable by insertion
245
+ filtered.sort(key=lambda x: x[1])
246
+ selected_method = filtered[0][0]
247
+ alternatives = [m for (m, _) in filtered[1:] if m != selected_method]
248
+
249
+ logger.info(f"Selected method: {selected_method}; alternatives: {alternatives}")
250
+
251
+ return {
252
+ "selected_method": selected_method,
253
+ "method_justification": justifications[selected_method],
254
+ "method_assumptions": assumptions[selected_method],
255
+ "alternatives": alternatives,
256
+ "excluded_methods": sorted(excluded_methods),
257
+ }
258
+
259
+
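An example invocation of select_method for observational data with a binary treatment and covariates (a sketch; the variable names are placeholders):

decision = select_method({
    "treatment_variable": "treated",
    "outcome_variable": "earnings",
    "covariates": ["age", "education"],
    "treatment_variable_type": "binary",
    "is_rct": False,
    "covariate_overlap_score": 0.4,
})
# decision["selected_method"] -> "propensity_score_matching"
# decision["alternatives"]    -> ["linear_regression"]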
260
+
261
+ def rule_based_select_method(dataset_analysis, variables, is_rct, llm, dataset_description, original_query, excluded_methods=None):
262
+ """
263
+ Wrapper function to select a causal method based on dataset properties and the query.
264
+
265
+ Args:
266
+ dataset_analysis (Dict): results of dataset analysis
267
+ variables (Dict): dictionary of variable names and types
268
+ is_rct (bool): whether the dataset is from a randomized controlled trial
269
+ llm (BaseChatModel): language model instance for generating prompts
270
+ dataset_description (str): description of the dataset
271
+ original_query (str): the original user query
272
+ excluded_methods (List[str], optional): list of methods to exclude from selection
273
+ """
274
+
275
+ logger.info("Running rule-based method selection")
276
+
277
+
278
+ properties = {"treatment_variable": variables.get("treatment_variable"), "instrument_variable":variables.get("instrument_variable"),
279
+ "covariates": variables.get("covariates", []), "outcome_variable": variables.get("outcome_variable"),
280
+ "time_variable": variables.get("time_variable"), "running_variable": variables.get("running_variable"),
281
+ "treatment_variable_type": variables.get("treatment_variable_type", "binary"),
282
+ "has_temporal_structure": dataset_analysis.get("temporal_structure", False).get("has_temporal_structure", False),
283
+ "frontdoor_criterion": variables.get("frontdoor_criterion", False),
284
+ "cutoff_value": variables.get("cutoff_value"),
285
+ "covariate_overlap_score": variables.get("covariate_overlap_result", 0)}
286
+
287
+ properties["is_rct"] = is_rct
288
+ logger.info(f"Dataset properties for method selection: {properties}")
289
+
290
+ return select_method(properties, excluded_methods)
291
+
292
+
293
+
294
+ class DecisionTreeEngine:
295
+ """
296
+ Engine for applying decision trees to select appropriate causal methods.
297
+
298
+ This class wraps the functional decision tree implementation to provide
299
+ an object-oriented interface for method selection.
300
+ """
301
+
302
+ def __init__(self, verbose=False):
303
+ self.verbose = verbose
304
+
305
+ def select_method(self, df: pd.DataFrame, treatment: str, outcome: str, covariates: List[str],
306
+ dataset_analysis: Dict[str, Any], query_details: Dict[str, Any]) -> Dict[str, Any]:
307
+ """
308
+ Apply decision tree to select appropriate causal method.
309
+ """
310
+
311
+ if self.verbose:
312
+ print(f"Applying decision tree for treatment: {treatment}, outcome: {outcome}")
313
+ print(f"Available covariates: {covariates}")
314
+
315
+ treatment_variable_type = query_details.get("treatment_variable_type")
316
+ covariate_overlap_result = query_details.get("covariate_overlap_result")
317
+ info = {"treatment_variable": treatment, "outcome_variable": outcome,
318
+ "covariates": covariates, "time_variable": query_details.get("time_variable"),
319
+ "group_variable": query_details.get("group_variable"),
320
+ "instrument_variable": query_details.get("instrument_variable"),
321
+ "running_variable": query_details.get("running_variable"),
322
+ "cutoff_value": query_details.get("cutoff_value"),
323
+ "is_rct": query_details.get("is_rct", False),
324
+ "has_temporal_structure": dataset_analysis.get("temporal_structure", False).get("has_temporal_structure", False),
325
+ "frontdoor_criterion": query_details.get("frontdoor_criterion", False),
326
+ "covariate_overlap_score": covariate_overlap_result,
327
+ "treatment_variable_type": treatment_variable_type}
328
+
329
+ result = select_method(info)
330
+
331
+ if self.verbose:
332
+ print(f"Selected method: {result['selected_method']}")
333
+ print(f"Justification: {result['method_justification']}")
334
+
335
+ result["decision_path"] = self._get_decision_path(result["selected_method"])
336
+ return result
337
+
338
+
339
+ def _get_decision_path(self, method):
340
+ if method == "linear_regression":
341
+ return ["Check if randomized experiment", "Data appears to be from a randomized experiment with covariates"]
342
+ elif method == "propensity_score_matching":
343
+ return ["Check if randomized experiment", "Data is observational",
344
+ "Check for sufficient covariate overlap", "Sufficient overlap exists"]
345
+ elif method == "propensity_score_weighting":
346
+ return ["Check if randomized experiment", "Data is observational",
347
+ "Check for sufficient covariate overlap", "Low overlap—weighting preferred"]
348
+ elif method == "backdoor_adjustment":
349
+ return ["Check if randomized experiment", "Data is observational",
350
+ "Check for sufficient covariate overlap", "Adjusting for covariates"]
351
+ elif method == "instrumental_variable":
352
+ return ["Check if randomized experiment", "Data is observational",
353
+ "Check for instrumental variables", "Instrument is available"]
354
+ elif method == "regression_discontinuity_design":
355
+ return ["Check if randomized experiment", "Data is observational",
356
+ "Check for discontinuity", "Discontinuity exists"]
357
+ elif method == "difference_in_differences":
358
+ return ["Check if randomized experiment", "Data is observational",
359
+ "Check for temporal structure", "Panel data structure exists"]
360
+ elif method == "frontdoor_adjustment":
361
+ return ["Check if randomized experiment", "Data is observational",
362
+ "Check front-door criterion", "Front-door path identified"]
363
+ elif method == "diff_in_means":
364
+ return ["Check if randomized experiment", "Pure RCT without covariates"]
365
+ else:
366
+ return ["Default method selection"]
auto_causal/components/decision_tree_llm.py ADDED
@@ -0,0 +1,218 @@
1
+ """
2
+ LLM-based Decision tree component for selecting causal inference methods.
3
+
4
+ This module implements the decision tree logic via an LLM prompt
5
+ to select the most appropriate causal inference method based on
6
+ dataset characteristics and available variables.
7
+ """
8
+
9
+ import logging
10
+ import json
11
+ from typing import Dict, Any, Optional, List
12
+
13
+ from langchain_core.messages import HumanMessage
14
+ from langchain_core.language_models import BaseChatModel
15
+
16
+ # Import constants and assumptions from the original decision_tree module
17
+ from .decision_tree import (
18
+ METHOD_ASSUMPTIONS,
19
+ BACKDOOR_ADJUSTMENT,
20
+ LINEAR_REGRESSION,
21
+ DIFF_IN_MEANS,
22
+ DIFF_IN_DIFF,
23
+ REGRESSION_DISCONTINUITY,
24
+ PROPENSITY_SCORE_MATCHING,
25
+ INSTRUMENTAL_VARIABLE,
26
+ CORRELATION_ANALYSIS,
27
+ PROPENSITY_SCORE_WEIGHTING,
28
+ GENERALIZED_PROPENSITY_SCORE
29
+ )
30
+
31
+ # Configure logging
32
+ logger = logging.getLogger(__name__)
33
+
34
+ # Define a list of all known methods for the LLM prompt
35
+ ALL_METHODS = [
36
+ DIFF_IN_MEANS,
37
+ LINEAR_REGRESSION,
38
+ DIFF_IN_DIFF,
39
+ REGRESSION_DISCONTINUITY,
40
+ INSTRUMENTAL_VARIABLE,
41
+ PROPENSITY_SCORE_MATCHING,
42
+ PROPENSITY_SCORE_WEIGHTING,
43
+ GENERALIZED_PROPENSITY_SCORE,
44
+ BACKDOOR_ADJUSTMENT, # Often a general approach rather than a specific model.
45
+ CORRELATION_ANALYSIS,
46
+ ]
47
+
48
+ METHOD_DESCRIPTIONS_FOR_LLM = {
49
+ DIFF_IN_MEANS: "Appropriate for Randomized Controlled Trials (RCTs) with no covariates. Compares the average outcome between treated and control groups.",
50
+ LINEAR_REGRESSION: "Can be used for RCTs with covariates to increase precision, or for observational data assuming linear relationships and no unmeasured confounders. Models the outcome as a linear function of treatment and covariates.",
51
+ DIFF_IN_DIFF: "Suitable for observational data with a temporal structure (e.g., panel data with pre/post treatment periods). Requires the 'parallel trends' assumption: treatment and control groups would have followed similar trends in the outcome in the absence of treatment.",
52
+ REGRESSION_DISCONTINUITY: "Applicable when treatment assignment is determined by whether an observed 'running variable' crosses a specific cutoff point. Assumes individuals cannot precisely manipulate the running variable.",
53
+ INSTRUMENTAL_VARIABLE: "Used when there's an 'instrument' variable that is correlated with the treatment, affects the outcome only through the treatment, and is not confounded with the outcome. Useful for handling unobserved confounding.",
54
+ PROPENSITY_SCORE_MATCHING: "For observational data with covariates. Estimates the probability of receiving treatment (propensity score) for each unit and then matches treated and control units with similar scores. Aims to create balanced groups.",
55
+ PROPENSITY_SCORE_WEIGHTING: "Similar to PSM, for observational data with covariates. Uses propensity scores to weight units to create a pseudo-population where confounders are balanced. Can estimate ATE, ATT, or ATC.",
56
+ GENERALIZED_PROPENSITY_SCORE: "An extension of propensity scores for continuous treatment variables. Aims to estimate the dose-response function, assuming unconfoundedness given covariates.",
57
+ BACKDOOR_ADJUSTMENT: "A general strategy for causal inference in observational studies that involves statistically controlling for all common causes (confounders) of the treatment and outcome. Specific methods like regression or matching implement this.",
58
+ CORRELATION_ANALYSIS: "A fallback method when causal inference is not feasible due to data limitations (e.g., no clear design, no covariates for adjustment). Measures the statistical association between variables, but does not imply causation."
59
+ }
60
+
61
+
62
+ class DecisionTreeLLMEngine:
63
+ """
64
+ Engine for applying an LLM-based decision tree to select appropriate causal methods.
65
+ """
66
+
67
+ def __init__(self, verbose: bool = False):
68
+ """
69
+ Initialize the LLM decision tree engine.
70
+
71
+ Args:
72
+ verbose: Whether to print verbose information.
73
+ """
74
+ self.verbose = verbose
75
+
76
+ def _construct_prompt(self, dataset_analysis: Dict[str, Any], variables: Dict[str, Any], is_rct: bool, excluded_methods: Optional[List[str]] = None) -> str:
77
+ """
78
+ Constructs the detailed prompt for the LLM.
79
+ """
80
+ # Filter out excluded methods
81
+ excluded_methods = excluded_methods or []
82
+ available_methods = [method for method in ALL_METHODS if method not in excluded_methods]
83
+ methods_list_str = "\n".join([f"- {method}: {METHOD_DESCRIPTIONS_FOR_LLM[method]}" for method in available_methods if method in METHOD_DESCRIPTIONS_FOR_LLM])
84
+
85
+ excluded_info = ""
86
+ if excluded_methods:
87
+ excluded_info = f"\nEXCLUDED METHODS (do not select these): {', '.join(excluded_methods)}\nReason: These methods failed validation in previous attempts.\n"
88
+
89
+ prompt = f"""You are an expert in causal inference. Your task is to select the most appropriate causal inference method based on the provided dataset analysis and variable information.
90
+
91
+ Dataset Analysis:
92
+ {json.dumps(dataset_analysis, indent=2)}
93
+
94
+ Identified Variables:
95
+ {json.dumps(variables, indent=2)}
96
+
97
+ Is the data from a Randomized Controlled Trial (RCT)? {'Yes' if is_rct else 'No'}{excluded_info}
98
+
99
+ Available Causal Inference Methods and their descriptions:
100
+ {methods_list_str}
101
+
102
+ Instructions:
103
+ 1. Carefully review all the provided information: dataset analysis, variables, and RCT status.
104
+ 2. Reason step-by-step to determine the most suitable method. Consider the hierarchy of methods (e.g., specific designs like DiD, RDD, IV before general adjustment methods).
105
+ 3. Explain your reasoning for selecting a particular method.
106
+ 4. Identify any potential alternative methods if applicable.
107
+ 5. State the key assumptions for your *selected* method by referring to the general list of assumptions for all methods that will be provided to you separately (you don't need to list them here, just be aware that you need to select a method for which assumptions are known).
108
+
109
+ Output your final decision as a JSON object with the following exact keys:
110
+ - "selected_method": string (must be one of {', '.join(available_methods)})
111
+ - "method_justification": string (your detailed reasoning)
112
+ - "alternative_methods": list of strings (alternative method names, can be empty)
113
+
114
+ Example JSON output format:
115
+ {{
116
+ "selected_method": "difference_in_differences",
117
+ "method_justification": "The dataset has a clear time variable and group variable, indicating a panel structure suitable for DiD. The parallel trends assumption will need to be checked.",
118
+ "alternative_methods": ["instrumental_variable"]
119
+ }}
120
+
121
+ Please provide only the JSON object in your response.
122
+ """
123
+ return prompt
124
+
125
+ def select_method_llm(self, dataset_analysis: Dict[str, Any], variables: Dict[str, Any], is_rct: bool = False, llm: Optional[BaseChatModel] = None, excluded_methods: Optional[List[str]] = None) -> Dict[str, Any]:
126
+ """
127
+ Apply LLM-based decision tree to select appropriate causal method.
128
+
129
+ Args:
130
+ dataset_analysis: Dataset analysis results.
131
+ variables: Identified variables from query_interpreter.
132
+ is_rct: Boolean indicating if the data comes from an RCT.
133
+ llm: Langchain BaseChatModel instance for making the call.
134
+ excluded_methods: Optional list of method names to exclude from selection.
135
+
136
+ Returns:
137
+ Dict with selected method, justification, and assumptions.
138
+ Example:
139
+ {
140
+ "selected_method": "difference_in_differences",
141
+ "method_justification": "Reasoning...",
142
+ "method_assumptions": ["Assumption 1", ...],
143
+ "alternative_methods": ["instrumental_variable"]
144
+ }
145
+ """
146
+ if not llm:
147
+ logger.error("LLM client not provided to DecisionTreeLLMEngine. Cannot select method.")
148
+ return {
149
+ "selected_method": CORRELATION_ANALYSIS,
150
+ "method_justification": "LLM client not provided. Defaulting to Correlation Analysis as causal inference method selection is not possible. This indicates association, not causation.",
151
+ "method_assumptions": METHOD_ASSUMPTIONS.get(CORRELATION_ANALYSIS, []),
152
+ "alternative_methods": []
153
+ }
154
+
155
+ prompt = self._construct_prompt(dataset_analysis, variables, is_rct, excluded_methods)
156
+ if self.verbose:
157
+ logger.info("LLM Prompt for method selection:")
158
+ logger.info(prompt)
159
+
160
+ messages = [HumanMessage(content=prompt)]
161
+
162
+ llm_output_str = "" # Initialize llm_output_str here
163
+ try:
164
+ response = llm.invoke(messages)
165
+ llm_output_str = response.content.strip()
166
+
167
+ if self.verbose:
168
+ logger.info(f"LLM Raw Output: {llm_output_str}")
169
+
170
+ # Attempt to parse the JSON output
171
+ # The LLM might sometimes include explanations outside the JSON block.
172
+ # Try to extract JSON from within ```json ... ``` if present.
173
+ if "```json" in llm_output_str:
174
+ json_str = llm_output_str.split("```json")[1].split("```")[0].strip()
175
+ elif "```" in llm_output_str and not llm_output_str.startswith("{"):  # fenced output without a json tag, e.g. ```{...}```
176
+ json_str = llm_output_str.split("```")[1].strip()
177
+ else: # Assume the entire string is the JSON if no triple backticks
178
+ json_str = llm_output_str
179
+
180
+ parsed_response = json.loads(json_str)
181
+
182
+ selected_method = parsed_response.get("selected_method")
183
+ justification = parsed_response.get("method_justification", "No justification provided by LLM.")
184
+ alternatives = parsed_response.get("alternative_methods", [])
185
+
186
+ if selected_method and selected_method in METHOD_ASSUMPTIONS:
187
+ logger.info(f"LLM selected method: {selected_method}")
188
+ return {
189
+ "selected_method": selected_method,
190
+ "method_justification": justification,
191
+ "method_assumptions": METHOD_ASSUMPTIONS[selected_method],
192
+ "alternative_methods": alternatives
193
+ }
194
+ else:
195
+ logger.warning(f"LLM selected an invalid or unknown method '{selected_method}' (not found in METHOD_ASSUMPTIONS). Raw response: {llm_output_str}")
196
+ fallback_justification = f"LLM output was problematic (selected: {selected_method}). Defaulting to Correlation Analysis. LLM Raw Response: {llm_output_str}"
197
+ selected_method = CORRELATION_ANALYSIS
198
+ justification = fallback_justification
199
+
200
+ except json.JSONDecodeError as e:
201
+ logger.error(f"Failed to parse JSON response from LLM: {e}. Raw response: {llm_output_str}", exc_info=True)
202
+ fallback_justification = f"LLM response was not valid JSON. Defaulting to Correlation Analysis. Error: {e}. LLM Raw Response: {llm_output_str}"
203
+ selected_method = CORRELATION_ANALYSIS
204
+ justification = fallback_justification
205
+ alternatives = []
206
+ except Exception as e:
207
+ logger.error(f"Error during LLM call for method selection: {e}. Raw response: {llm_output_str}", exc_info=True)
208
+ fallback_justification = f"An unexpected error occurred during LLM method selection. Defaulting to Correlation Analysis. Error: {e}. LLM Raw Response: {llm_output_str}"
209
+ selected_method = CORRELATION_ANALYSIS
210
+ justification = fallback_justification
211
+ alternatives = []
212
+
213
+ return {
214
+ "selected_method": selected_method,
215
+ "method_justification": justification,
216
+ "method_assumptions": METHOD_ASSUMPTIONS.get(selected_method, []),
217
+ "alternative_methods": alternatives
218
+ }
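+
+ # Illustrative usage sketch (the dictionary contents and the ChatOpenAI model
+ # below are hypothetical placeholders; any LangChain chat model works):
+ #
+ # from langchain_openai import ChatOpenAI
+ #
+ # engine = DecisionTreeLLMEngine(verbose=True)
+ # decision = engine.select_method_llm(
+ #     dataset_analysis={"temporal_structure": {"has_temporal_structure": True}},
+ #     variables={"treatment_variable": "policy", "outcome_variable": "sales"},
+ #     is_rct=False,
+ #     llm=ChatOpenAI(model="gpt-4o-mini", temperature=0),
+ # )
+ # decision["selected_method"], decision["method_justification"]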
auto_causal/components/explanation_generator.py ADDED
@@ -0,0 +1,404 @@
1
+ """
2
+ Explanation generator component for causal inference methods.
3
+
4
+ This module generates explanations for causal inference methods, including
5
+ what the method does, its assumptions, and how it will be applied to the dataset.
6
+ """
7
+
8
+ from typing import Dict, Any, List, Optional
9
+ from langchain_core.language_models import BaseChatModel # For LLM type hint
10
+
11
+
12
+ def generate_explanation(
13
+ method_info: Dict[str, Any],
14
+ validation_result: Dict[str, Any],
15
+ variables: Dict[str, Any],
16
+ results: Dict[str, Any],
17
+ dataset_analysis: Optional[Dict[str, Any]] = None,
18
+ dataset_description: Optional[str] = None,
19
+ llm: Optional[BaseChatModel] = None
20
+ ) -> Dict[str, str]:
21
+ """
22
+ Generates a comprehensive explanation text for the causal analysis.
23
+
24
+ Args:
25
+ method_info: Dictionary containing selected method details.
26
+ validation_result: Dictionary containing method validation results.
27
+ variables: Dictionary containing identified variables.
28
+ results: Dictionary containing numerical results from the method execution.
29
+ dataset_analysis: Optional dictionary with dataset analysis details.
30
+ dataset_description: Optional string describing the dataset.
31
+ llm: Optional language model instance (for potential future use in generation).
32
+
33
+ Returns:
34
+ Dictionary containing the final explanation text.
35
+ """
36
+ method = method_info.get("method_name")
37
+
38
+ # Handle potential None for validation_result
39
+ if validation_result and validation_result.get("valid") is False:
40
+ method = validation_result.get("recommended_method", method)
41
+
42
+ # Get components
43
+ method_explanation = get_method_explanation(method)
44
+ assumption_explanations = explain_assumptions(method_info.get("assumptions", []))
45
+ application_explanation = explain_application(method, variables.get("treatment_variable"),
46
+ variables.get("outcome_variable"),
47
+ variables.get("covariates", []), variables)
48
+ limitations_explanation = explain_limitations(method, validation_result.get("concerns", []) if validation_result else [])
49
+ interpretation_guide = generate_interpretation_guide(method, variables.get("treatment_variable"),
50
+ variables.get("outcome_variable"))
51
+
52
+ # --- Extract Numerical Results ---
53
+ effect_estimate = results.get("effect_estimate")
54
+ effect_se = results.get("effect_se")
55
+ ci = results.get("confidence_interval")
56
+ p_value = results.get("p_value") # Assuming method executor returns p_value
57
+
58
+ # --- Assemble Final Text ---
59
+ final_text = f"**Method Used:** {method_info.get('method_name', method)}\n\n"
60
+ final_text += f"**Method Explanation:**\n{method_explanation}\n\n"
61
+
62
+ # Add Results Section
63
+ final_text += "**Results:**\n"
64
+ if effect_estimate is not None:
65
+ final_text += f"- Estimated Causal Effect: {effect_estimate:.4f}\n"
66
+ if effect_se is not None:
67
+ final_text += f"- Standard Error: {effect_se:.4f}\n"
68
+ if ci and ci[0] is not None and ci[1] is not None:
69
+ final_text += f"- 95% Confidence Interval: [{ci[0]:.4f}, {ci[1]:.4f}]\n"
70
+ if p_value is not None:
71
+ final_text += f"- P-value: {p_value:.4f}\n"
72
+ final_text += "\n"
73
+
74
+ final_text += f"**Interpretation Guide:**\n{interpretation_guide}\n\n"
75
+ final_text += f"**Assumptions:**\n"
76
+ for item in assumption_explanations:
77
+ final_text += f"- {item['assumption']}: {item['explanation']}\n"
78
+ final_text += "\n"
79
+ final_text += f"**Limitations:**\n{limitations_explanation}\n\n"
80
+
81
+ return {
82
+ "final_explanation_text": final_text
83
+ # Return only the final text, the tool wrapper adds workflow state
84
+ }
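+
+ # Illustrative call (a sketch; the dictionaries below are minimal hypothetical
+ # inputs rather than the full structures produced by the upstream components):
+ #
+ # explanation = generate_explanation(
+ #     method_info={"method_name": "linear_regression", "assumptions": []},
+ #     validation_result={"valid": True, "concerns": []},
+ #     variables={"treatment_variable": "discount", "outcome_variable": "revenue",
+ #                "covariates": ["region", "segment"]},
+ #     results={"effect_estimate": 1.23, "effect_se": 0.45,
+ #              "confidence_interval": (0.35, 2.11), "p_value": 0.006},
+ # )
+ # print(explanation["final_explanation_text"])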
85
+
86
+
87
+ def get_method_explanation(method: str) -> str:
88
+ """
89
+ Get explanation for what the method does.
90
+
91
+ Args:
92
+ method: Causal inference method name
93
+
94
+ Returns:
95
+ String explaining what the method does
96
+ """
97
+ explanations = {
98
+ "propensity_score_matching": (
99
+ "Propensity Score Matching is a statistical technique that attempts to estimate the effect "
100
+ "of a treatment by accounting for covariates that predict receiving the treatment. "
101
+ "It creates matched sets of treated and untreated subjects who share similar characteristics, "
102
+ "allowing for a more fair comparison between groups."
103
+ ),
104
+ "regression_adjustment": (
105
+ "Regression Adjustment is a method that uses regression models to estimate causal effects "
106
+ "by controlling for covariates. It models the outcome as a function of the treatment and "
107
+ "other potential confounding variables, allowing the isolation of the treatment effect."
108
+ ),
109
+ "instrumental_variable": (
110
+ "The Instrumental Variable method addresses issues of endogeneity or unmeasured confounding "
111
+ "by using an 'instrument' - a variable that affects the treatment but not the outcome directly. "
112
+ "It effectively finds the natural experiment hidden in your data to estimate causal effects."
113
+ ),
114
+ "difference_in_differences": (
115
+ "Difference-in-Differences compares the changes in outcomes over time between a group that "
116
+ "receives a treatment and a group that does not. It controls for time-invariant unobserved "
117
+ "confounders by looking at differences in trends rather than absolute values."
118
+ ),
119
+ "regression_discontinuity": (
120
+ "Regression Discontinuity Design exploits a threshold or cutoff rule that determines treatment "
121
+ "assignment. By comparing observations just above and below this threshold, where treatment "
122
+ "status changes but other characteristics remain similar, it estimates the local causal effect."
123
+ ),
124
+ "backdoor_adjustment": (
125
+ "Backdoor Adjustment controls for confounding variables that create 'backdoor paths' between "
126
+ "treatment and outcome variables in a causal graph. By conditioning on these variables, "
127
+ "it blocks the non-causal associations, allowing for identification of the causal effect."
128
+ ),
129
+ }
130
+
131
+ return explanations.get(method,
132
+ f"The {method} method is a causal inference technique used to estimate "
133
+ f"causal effects from observational data.")
134
+
135
+
136
+ def explain_assumptions(assumptions: List[str]) -> List[Dict[str, str]]:
137
+ """
138
+ Explain each assumption of the method.
139
+
140
+ Args:
141
+ assumptions: List of assumption names
142
+
143
+ Returns:
144
+ List of dictionaries with assumption name and explanation
145
+ """
146
+ assumption_details = {
147
+ "Treatment is randomly assigned": (
148
+ "This assumes that treatment assignment is not influenced by any factors "
149
+ "related to the outcome, similar to a randomized controlled trial. "
150
+ "In observational data, this assumption rarely holds without conditioning on confounders."
151
+ ),
152
+ "No systematic differences between treatment and control groups": (
153
+ "Treatment and control groups should be balanced on all relevant characteristics "
154
+ "except for the treatment itself. Any systematic differences could bias the estimate."
155
+ ),
156
+ "No unmeasured confounders (conditional ignorability)": (
157
+ "All variables that simultaneously affect the treatment and outcome are measured and "
158
+ "included in the analysis. If important confounders are missing, the estimated causal "
159
+ "effect will be biased."
160
+ ),
161
+ "Sufficient overlap between treatment and control groups": (
162
+ "For each combination of covariate values, there should be both treated and untreated "
163
+ "units. Without overlap, the model must extrapolate, which can lead to biased estimates."
164
+ ),
165
+ "Treatment assignment is not deterministic given covariates": (
166
+ "No combination of covariates should perfectly predict treatment assignment. "
167
+ "If treatment is deterministic for some units, causal comparisons become impossible."
168
+ ),
169
+ "Instrument is correlated with treatment (relevance)": (
170
+ "The instrumental variable must have a clear and preferably strong effect on the "
171
+ "treatment variable. Weak instruments lead to imprecise and potentially biased estimates."
172
+ ),
173
+ "Instrument affects outcome only through treatment (exclusion restriction)": (
174
+ "The instrumental variable must not directly affect the outcome except through its "
175
+ "effect on the treatment. If this assumption fails, the causal estimate will be biased."
176
+ ),
177
+ "Instrument is as good as randomly assigned (exogeneity)": (
178
+ "The instrumental variable must not be correlated with any confounders of the "
179
+ "treatment-outcome relationship. It should be as good as randomly assigned."
180
+ ),
181
+ "Parallel trends between treatment and control groups": (
182
+ "In the absence of treatment, the difference between treatment and control groups "
183
+ "would have remained constant over time. This is the key identifying assumption for "
184
+ "difference-in-differences and cannot be directly tested for the post-treatment period."
185
+ ),
186
+ "No spillover effects between groups": (
187
+ "The treatment of one unit should not affect the outcomes of other units. "
188
+ "If spillovers exist, they can bias the estimated treatment effect."
189
+ ),
190
+ "No anticipation effects before treatment": (
191
+ "Units should not change their behavior in anticipation of future treatment. "
192
+ "If anticipation effects exist, the pre-treatment trends may already reflect treatment effects."
193
+ ),
194
+ "Stable composition of treatment and control groups": (
195
+ "The composition of treatment and control groups should remain stable over time. "
196
+ "If units move between groups based on outcomes, this can bias the estimates."
197
+ ),
198
+ "Units cannot precisely manipulate their position around the cutoff": (
199
+ "In regression discontinuity, units must not be able to precisely control their position "
200
+ "relative to the cutoff. If they can, the randomization-like property of the design fails."
201
+ ),
202
+ "No other variables change discontinuously at the cutoff": (
203
+ "Any discontinuity in outcomes at the cutoff should be attributable only to the change "
204
+ "in treatment status. If other relevant variables also change at the cutoff, the causal "
205
+ "interpretation is compromised."
206
+ ),
207
+ "The relationship between running variable and outcome is continuous at the cutoff": (
208
+ "In the absence of treatment, the relationship between the running variable and the "
209
+ "outcome would be continuous at the cutoff. This allows attributing any observed "
210
+ "discontinuity to the treatment effect."
211
+ ),
212
+ "The model correctly specifies the relationship between variables": (
213
+ "The functional form of the relationship between variables in the model should correctly "
214
+ "capture the true relationship in the data. Misspecification can lead to biased estimates."
215
+ ),
216
+ "No reverse causality": (
217
+ "The treatment must cause the outcome, not the other way around. If the outcome affects "
218
+ "the treatment, the estimated relationship will not have a causal interpretation."
219
+ ),
220
+ }
221
+
222
+ return [
223
+ {"assumption": assumption, "explanation": assumption_details.get(assumption,
224
+ "This is a key assumption for the selected causal inference method.")}
225
+ for assumption in assumptions
226
+ ]
227
+
228
+
229
+ def explain_application(method: str, treatment: str, outcome: str,
230
+ covariates: List[str], variables: Dict[str, Any]) -> str:
231
+ """
232
+ Explain how the method will be applied to the dataset.
233
+
234
+ Args:
235
+ method: Causal inference method name
236
+ treatment: Treatment variable name
237
+ outcome: Outcome variable name
238
+ covariates: List of covariate names
239
+ variables: Dictionary of identified variables
240
+
241
+ Returns:
242
+ String explaining the application
243
+ """
244
+ covariate_str = ", ".join(covariates[:3])
245
+ if len(covariates) > 3:
246
+ covariate_str += f", and {len(covariates) - 3} other variables"
247
+
248
+ applications = {
249
+ "propensity_score_matching": (
250
+ f"I will estimate the propensity scores (probability of receiving treatment) for each "
251
+ f"observation based on the covariates ({covariate_str}). Then, I'll match treated and "
252
+ f"untreated units with similar propensity scores to create balanced comparison groups. "
253
+ f"Finally, I'll calculate the difference in {outcome} between these matched groups to "
254
+ f"estimate the causal effect of {treatment}."
255
+ ),
256
+ "regression_adjustment": (
257
+ f"I will build a regression model with {outcome} as the dependent variable and "
258
+ f"{treatment} as the independent variable of interest, while controlling for "
259
+ f"potential confounders ({covariate_str}). The coefficient of {treatment} will "
260
+ f"represent the estimated causal effect after adjusting for these covariates."
261
+ ),
262
+ "instrumental_variable": (
263
+ f"I will use {variables.get('instrument_variable')} as an instrumental variable for "
264
+ f"{treatment}. First, I'll estimate how the instrument affects {treatment} (first stage). "
265
+ f"Then, I'll use these predictions to estimate how changes in {treatment} that are induced "
266
+ f"by the instrument affect {outcome} (second stage). This two-stage approach helps "
267
+ f"address potential unmeasured confounding."
268
+ ),
269
+ "difference_in_differences": (
270
+ f"I will compare the change in {outcome} before and after the intervention for the "
271
+ f"group receiving {treatment}, relative to the change in a control group that didn't "
272
+ f"receive the treatment. This approach controls for time-invariant confounders and "
273
+ f"common time trends that affect both groups."
274
+ ),
275
+ "regression_discontinuity": (
276
+ f"I will focus on observations close to the cutoff value "
277
+ f"({variables.get('cutoff_value')}) of the running variable "
278
+ f"({variables.get('running_variable')}), where treatment assignment changes. "
279
+ f"By comparing outcomes just above and below this threshold, I can estimate "
280
+ f"the local causal effect of {treatment} on {outcome}."
281
+ ),
282
+ "backdoor_adjustment": (
283
+ f"I will control for the identified confounding variables ({covariate_str}) to "
284
+ f"block all backdoor paths between {treatment} and {outcome}. This may involve "
285
+ f"stratification, regression adjustment, or inverse probability weighting, depending "
286
+ f"on the data characteristics."
287
+ ),
288
+ }
289
+
290
+ return applications.get(method,
291
+ f"I will apply the {method} method to estimate the causal effect of "
292
+ f"{treatment} on {outcome}, controlling for relevant confounding factors "
293
+ f"where appropriate.")
294
+
295
+
296
+ def explain_limitations(method: str, concerns: List[str]) -> str:
297
+ """
298
+ Explain the limitations of the method based on validation concerns.
299
+
300
+ Args:
301
+ method: Causal inference method name
302
+ concerns: List of concerns from validation
303
+
304
+ Returns:
305
+ String explaining the limitations
306
+ """
307
+ method_limitations = {
308
+ "propensity_score_matching": (
309
+ "Propensity Score Matching can only account for observed confounders, and its "
310
+ "effectiveness depends on having good overlap between treatment and control groups. "
311
+ "It may also be sensitive to model specification for the propensity score estimation."
312
+ ),
313
+ "regression_adjustment": (
314
+ "Regression Adjustment relies heavily on correct model specification and can only "
315
+ "control for observed confounders. Extrapolation to regions with limited data can lead "
316
+ "to unreliable estimates, and the method may be sensitive to outliers."
317
+ ),
318
+ "instrumental_variable": (
319
+ "Instrumental Variable estimation can be imprecise with weak instruments and is "
320
+ "sensitive to violations of the exclusion restriction. The estimated effect is a local "
321
+ "average treatment effect for 'compliers', which may not generalize to the entire population."
322
+ ),
323
+ "difference_in_differences": (
324
+ "Difference-in-Differences relies on the parallel trends assumption, which cannot be fully "
325
+ "tested for the post-treatment period. It may be sensitive to the choice of comparison group "
326
+ "and can be biased if there are time-varying confounders or anticipation effects."
327
+ ),
328
+ "regression_discontinuity": (
329
+ "Regression Discontinuity provides estimates that are local to the cutoff point and may not "
330
+ "generalize to units far from this threshold. It also requires sufficient data around the "
331
+ "cutoff and is sensitive to the choice of bandwidth and functional form."
332
+ ),
333
+ "backdoor_adjustment": (
334
+ "Backdoor Adjustment requires correctly identifying all confounding variables and their "
335
+ "relationships. It depends on the assumption of no unmeasured confounders and may be "
336
+ "sensitive to model misspecification in complex settings."
337
+ ),
338
+ }
339
+
340
+ base_limitation = method_limitations.get(method,
341
+ f"The {method} method has general limitations in terms of its assumptions and applicability.")
342
+
343
+ # Add specific concerns if any
344
+ if concerns:
345
+ concern_text = " Additionally, specific concerns for this analysis include: " + \
346
+ "; ".join(concerns) + "."
347
+ return base_limitation + concern_text
348
+
349
+ return base_limitation
350
+
351
+
352
+ def generate_interpretation_guide(method: str, treatment: str, outcome: str) -> str:
353
+ """
354
+ Generate guide for interpreting the results.
355
+
356
+ Args:
357
+ method: Causal inference method name
358
+ treatment: Treatment variable name
359
+ outcome: Outcome variable name
360
+
361
+ Returns:
362
+ String with interpretation guide
363
+ """
364
+ interpretation_guides = {
365
+ "propensity_score_matching": (
366
+ f"The estimated effect represents the Average Treatment Effect (ATE) or the Average "
367
+ f"Treatment Effect on the Treated (ATT), depending on the specific matching approach. "
368
+ f"It can be interpreted as the expected change in {outcome} if a unit were to receive "
369
+ f"{treatment}, compared to not receiving it, for units with similar covariate values."
370
+ ),
371
+ "regression_adjustment": (
372
+ f"The coefficient of {treatment} in the regression model represents the estimated "
373
+ f"average causal effect on {outcome}, holding all included covariates constant. "
374
+ f"For binary treatments, it's the expected difference in outcomes between treated "
375
+ f"and untreated units with the same covariate values."
376
+ ),
377
+ "instrumental_variable": (
378
+ f"The estimated effect represents the Local Average Treatment Effect (LATE) for 'compliers' "
379
+ f"- units whose treatment status is influenced by the instrument. It can be interpreted as "
380
+ f"the average effect of {treatment} on {outcome} for this specific subpopulation."
381
+ ),
382
+ "difference_in_differences": (
383
+ f"The estimated effect represents the average causal impact of {treatment} on {outcome}, "
384
+ f"under the assumption that treatment and control groups would have followed parallel "
385
+ f"trends in the absence of treatment. It accounts for both time-invariant differences "
386
+ f"between groups and common time trends."
387
+ ),
388
+ "regression_discontinuity": (
389
+ f"The estimated effect represents the local causal impact of {treatment} on {outcome} "
390
+ f"at the cutoff point. It can be interpreted as the expected difference in outcomes "
391
+ f"for units just above versus just below the threshold, where treatment status changes."
392
+ ),
393
+ "backdoor_adjustment": (
394
+ f"The estimated effect represents the average causal effect of {treatment} on {outcome} "
395
+ f"after controlling for all identified confounding variables. It can be interpreted as "
396
+ f"the expected difference in outcomes if a unit were to receive versus not receive the "
397
+ f"treatment, holding all confounding factors constant."
398
+ ),
399
+ }
400
+
401
+ return interpretation_guides.get(method,
402
+ f"The estimated effect represents the causal impact of {treatment} on {outcome}, "
403
+ f"given the assumptions of the method are met. Careful consideration of these "
404
+ f"assumptions is needed for valid causal interpretation.")
auto_causal/components/input_parser.py ADDED
@@ -0,0 +1,456 @@
1
+ """
2
+ Input parser component for extracting information from causal queries.
3
+
4
+ This module provides functionality to parse user queries and extract key
5
+ elements such as the causal question, relevant variables, and constraints.
6
+ """
7
+
8
+ import re
9
+ import os
10
+ import json
11
+ import logging # Added for better logging
12
+ from typing import Dict, List, Any, Optional, Union
13
+ import pandas as pd
14
+ from pydantic import BaseModel, Field, ValidationError
15
+ from functools import partial # Import partial
16
+
17
+ # Add dotenv import
18
+ from dotenv import load_dotenv
19
+
20
+ # LangChain Imports
21
+ from langchain_openai import ChatOpenAI # Example, replace if using another provider
22
+ from langchain_core.messages import HumanMessage, SystemMessage
23
+ from langchain_core.exceptions import OutputParserException # Correct path
24
+ from langchain_core.language_models import BaseChatModel # Import BaseChatModel
25
+
26
+ # --- Load .env file ---
27
+ load_dotenv() # Load environment variables from .env file
28
+
29
+ # --- Configure Logging ---
30
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
31
+ logger = logging.getLogger(__name__)
32
+
33
+ # --- Instantiate LLM Client ---
34
+ # Ensure OPENAI_API_KEY environment variable is set
35
+ # Consider making model name configurable
36
+ try:
37
+ # Using with_structured_output later, so instantiate base model here
38
+ llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
39
+ # Add a check or allow configuration for different providers if needed
40
+ except ImportError:
41
+ logger.error("langchain_openai not installed. Please install it to use OpenAI models.")
42
+ llm = None
43
+ except Exception as e:
44
+ logger.error(f"Error initializing LLM: {e}. Input parsing will rely on fallbacks.")
45
+ llm = None
46
+
47
+ # --- Pydantic Models for Structured Output ---
48
+ class ParsedVariables(BaseModel):
49
+ treatment: List[str] = Field(default_factory=list, description="Variable(s) representing the treatment/intervention.")
50
+ outcome: List[str] = Field(default_factory=list, description="Variable(s) representing the outcome/result.")
51
+ covariates_mentioned: Optional[List[str]] = Field(default_factory=list, description="Covariate/control variable(s) explicitly mentioned in the query.")
52
+ grouping_vars: Optional[List[str]] = Field(default_factory=list, description="Variable(s) identifying groups or units for analysis.")
53
+ instruments_mentioned: Optional[List[str]] = Field(default_factory=list, description="Potential instrumental variable(s) mentioned.")
54
+
55
+ class ParsedQueryInfo(BaseModel):
56
+ query_type: str = Field(..., description="Type of query (e.g., EFFECT_ESTIMATION, COUNTERFACTUAL, CORRELATION, DESCRIPTIVE, OTHER). Required.")
57
+ variables: ParsedVariables = Field(..., description="Variables identified in the query.")
58
+ constraints: Optional[List[str]] = Field(default_factory=list, description="Constraints or conditions mentioned (e.g., 'X > 10', 'country = USA').")
59
+ dataset_path_mentioned: Optional[str] = Field(None, description="Dataset path explicitly mentioned in the query, if any.")
60
+
61
+ # Add Pydantic model for path extraction
62
+ class ExtractedPath(BaseModel):
63
+ dataset_path: Optional[str] = Field(None, description="File path or URL for the dataset mentioned in the query.")
64
+
65
+ # --- End Pydantic Models ---
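+
+ # Example of the structure these models enforce (all values hypothetical):
+ #
+ # ParsedQueryInfo(
+ #     query_type="EFFECT_ESTIMATION",
+ #     variables=ParsedVariables(treatment=["discount"], outcome=["revenue"]),
+ #     constraints=["year >= 2020"],
+ #     dataset_path_mentioned="data/sales.csv",
+ # )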
66
+
67
+ def _build_llm_prompt(query: str, dataset_info: Optional[Dict] = None) -> str:
68
+ """Builds the prompt for the LLM to extract query information."""
69
+ dataset_context = "No dataset context provided."
70
+ if dataset_info:
71
+ columns = dataset_info.get('columns', [])
72
+ column_details = "\n".join([f"- {col} (Type: {dataset_info.get('column_types', {}).get(col, 'Unknown')})" for col in columns])
73
+ sample_rows = dataset_info.get('sample_rows', 'Not available')
74
+ # Ensure sample rows are formatted reasonably
75
+ if isinstance(sample_rows, list):
76
+ sample_rows_str = json.dumps(sample_rows[:3], indent=2) # Show first 3 sample rows
77
+ elif isinstance(sample_rows, str):
78
+ sample_rows_str = sample_rows
79
+ else:
80
+ sample_rows_str = 'Not available'
81
+
82
+ dataset_context = f"""
83
+ Dataset Context:
84
+ Columns:
85
+ {column_details}
86
+ Sample Rows (first few):
87
+ {sample_rows_str}
88
+ """
89
+
90
+ prompt = f"""
91
+ Analyze the following causal query **strictly in the context of the provided dataset information (if available)**. Identify the query type, key variables (mapping query terms to actual column names when possible), constraints, and any explicitly mentioned dataset path.
92
+
93
+ User Query: "{query}"
94
+
95
+ {dataset_context}
96
+
97
98
+ Guidance for Identifying Query Type:
99
+ - EFFECT_ESTIMATION: Look for keywords like 'effect', 'impact', 'influence', 'cause', 'affect', 'consequence'. Also consider questions asking "how does X affect Y?" or comparing outcomes between groups based on an intervention.
100
+ - COUNTERFACTUAL: Look for hypothetical scenarios, often using phrases like 'what if', 'if X had been', 'would Y have changed', 'imagine if', 'counterfactual'.
101
+ - CORRELATION: Look for keywords like 'correlation', 'association', 'relationship', 'linked to', 'related to'. These queries ask about statistical relationships without necessarily implying causality.
102
+ - DESCRIPTIVE: These queries ask for summaries, descriptions, trends, or statistics about the data without investigating causal links or relationships (e.g., "Show sales over time", "What is the average age?").
103
+ - OTHER: Use this if the query does not fit any of the above categories.
104
+
105
+ Choose the most appropriate type from: EFFECT_ESTIMATION, COUNTERFACTUAL, CORRELATION, DESCRIPTIVE, OTHER.
106
+
107
+ Variable Roles to Identify:
108
+ - treatment: The intervention or variable whose effect is being studied.
109
+ - outcome: The result or variable being measured.
110
+ - covariates_mentioned: Variables explicitly mentioned to control for or adjust for.
111
+ - grouping_vars: Variables identifying specific subgroups for analysis (e.g., 'for men', 'in the sales department').
112
+ - instruments_mentioned: Variables explicitly mentioned as potential instruments.
113
+
114
+ Constraints: Conditions applied to the analysis (e.g., filters on columns, specific time periods).
115
+
116
+ Dataset Path Mentioned: Extract the file path or URL if explicitly stated in the query.
117
+
118
+ **Output ONLY a valid JSON object** matching this exact schema (no explanations, notes, or surrounding text):
119
+ ```json
120
+ {{
121
+ "query_type": "<Identified Query Type>",
122
+ "variables": {{
123
+ "treatment": ["<Treatment Variable(s) Mentioned>"],
124
+ "outcome": ["<Outcome Variable(s) Mentioned>"],
125
+ "covariates_mentioned": ["<Covariate(s) Mentioned>"],
126
+ "grouping_vars": ["<Grouping Variable(s) Mentioned>"],
127
+ "instruments_mentioned": ["<Instrument(s) Mentioned>"]
128
+ }},
129
+ "constraints": ["<Constraint 1>", "<Constraint 2>"],
130
+ "dataset_path_mentioned": "<Path Mentioned or null>"
131
+ }}
132
+ ```
133
+ If Dataset Context is provided, ensure variable names in the output JSON correspond to actual column names where possible. If no context is provided, or if a mentioned variable doesn't map directly, use the phrasing from the query.
134
+ Respond with only the JSON object.
135
+ """
136
+ return prompt
137
+
138
+ def _validate_llm_output(parsed_info: ParsedQueryInfo, dataset_info: Optional[Dict] = None) -> bool:
139
+ """Perform basic assertions on the parsed LLM output."""
140
+ # 1. Check required fields exist (Pydantic handles this on parsing)
141
+ # 2. Check query type is one of the allowed types (can add enum to Pydantic later)
142
+ allowed_types = {"EFFECT_ESTIMATION", "COUNTERFACTUAL", "CORRELATION", "DESCRIPTIVE", "OTHER"}
143
+ logger.debug(f"Validating parsed LLM output: {parsed_info}")
144
+ assert parsed_info.query_type in allowed_types, f"Invalid query_type: {parsed_info.query_type}"
145
+
146
+ # 3. Check that if it's an effect query, treatment and outcome are likely present
147
+ if parsed_info.query_type == "EFFECT_ESTIMATION":
148
+ # Check that the lists are not empty
149
+ assert parsed_info.variables.treatment, "Treatment variable list is empty for effect query."
150
+ assert parsed_info.variables.outcome, "Outcome variable list is empty for effect query."
151
+
152
+ # 4. If dataset_info provided, check if extracted variables exist in columns
153
+ if dataset_info and (columns := dataset_info.get('columns')):
154
+ all_extracted_vars = set()
155
+ for var_list in parsed_info.variables.model_dump().values(): # Iterate through variable lists
156
+ if var_list: # Ensure var_list is not None or empty
157
+ all_extracted_vars.update(var_list)
158
+
159
+ unknown_vars = all_extracted_vars - set(columns)
160
+ # Allow for non-column variables if context is missing? Maybe relax this.
161
+ # For now, strict check if columns are provided.
162
+ if unknown_vars:
163
+ logger.warning(f"LLM mentioned variables potentially not in dataset columns: {unknown_vars}")
164
+ # Decide if this should be a hard failure (AssertionError) or just a warning.
165
+ # Let's make it a hard failure for now to enforce mapping.
166
+ raise AssertionError(f"LLM hallucinated variables not in dataset columns: {unknown_vars}")
167
+
168
+ logger.info("LLM output validation passed.")
169
+ return True
170
+
171
+ def _extract_query_information_with_llm(query: str, dataset_info: Optional[Dict] = None, llm: Optional[BaseChatModel] = None, max_retries: int = 3) -> Optional[ParsedQueryInfo]:
172
+ """Extracts query type, variables, and constraints using LLM with retries and validation."""
173
+ if not llm:
174
+ logger.error("LLM client not provided. Cannot perform LLM extraction.")
175
+ return None
176
+
177
+ last_error = None
178
+ # Bind the Pydantic model to the LLM for structured output
179
+ structured_llm = llm.with_structured_output(ParsedQueryInfo)
180
+
181
+ # Initial prompt construction
182
+ system_prompt_content = _build_llm_prompt(query, dataset_info)
183
+ messages = [HumanMessage(content=system_prompt_content)] # Start with just the detailed prompt as Human message
184
+
185
+ for attempt in range(max_retries):
186
+ logger.info(f"LLM Extraction Attempt {attempt + 1}/{max_retries}...")
187
+ try:
188
+ # --- Invoke LangChain LLM with structured output (using passed llm) ---
189
+ parsed_info = structured_llm.invoke(messages)
190
+ # ---------------------------------------------------
191
+ logger.debug(f"Messages sent to LLM: {messages}")
+ logger.debug(f"Structured LLM response: {parsed_info}")
194
+ # Perform custom assertions/validation
195
+ if _validate_llm_output(parsed_info, dataset_info):
196
+ return parsed_info # Success!
197
+
198
+ # Catch errors specific to structured output parsing or Pydantic validation
199
+ except (OutputParserException, ValidationError, AssertionError) as e:
200
+ logger.warning(f"Validation/Parsing Error (Attempt {attempt + 1}): {e}")
201
+ last_error = e
202
+ # Add feedback message for retry
203
+ messages.append(SystemMessage(content=f"Your previous response failed validation: {str(e)}. Please revise your response to be valid JSON conforming strictly to the schema and ensure variable names exist in the dataset context."))
204
+ continue # Go to next retry
205
+ except Exception as e: # Catch other potential LLM API errors
206
+ logger.error(f"Unexpected LLM Error (Attempt {attempt + 1}): {e}", exc_info=True)
207
+ last_error = e
208
+ break # Stop retrying on unexpected API errors
209
+
210
+ logger.error(f"LLM extraction failed after {max_retries} attempts.")
211
+ if last_error:
212
+ logger.error(f"Last error: {last_error}")
213
+ return None # Indicate failure
214
+
215
+ # Add helper function to call LLM for path - needs llm argument
216
+ def _call_llm_for_path(query: str, llm: Optional[BaseChatModel] = None, max_retries: int = 2) -> Optional[str]:
217
+ """Uses LLM as a fallback to extract just the dataset path."""
218
+ if not llm:
219
+ logger.warning("LLM client not provided. Cannot perform LLM path fallback.")
220
+ return None
221
+
222
+ logger.info("Attempting LLM fallback for dataset path extraction...")
223
+ path_extractor_llm = llm.with_structured_output(ExtractedPath)
224
+ prompt = f"Extract the dataset file path (e.g., /path/to/file.csv or https://...) mentioned in the following query. Respond ONLY with the JSON object.\nQuery: \"{query}\""
225
+ messages = [HumanMessage(content=prompt)]
226
+ last_error = None
227
+
228
+ for attempt in range(max_retries):
229
+ try:
230
+ parsed_info = path_extractor_llm.invoke(messages)
231
+ if parsed_info.dataset_path:
232
+ logger.info(f"LLM fallback extracted path: {parsed_info.dataset_path}")
233
+ return parsed_info.dataset_path
234
+ else:
235
+ logger.info("LLM fallback did not find a path.")
236
+ return None # LLM explicitly found no path
237
+ except (OutputParserException, ValidationError) as e:
238
+ logger.warning(f"LLM path extraction parsing/validation error (Attempt {attempt+1}): {e}")
239
+ last_error = e
240
+ messages.append(SystemMessage(content=f"Parsing Error: {e}. Please ensure you provide valid JSON with only the 'dataset_path' key."))
241
+ continue
242
+ except Exception as e:
243
+ logger.error(f"Unexpected LLM Error during path fallback (Attempt {attempt+1}): {e}", exc_info=True)
244
+ last_error = e
245
+ break # Don't retry on unexpected errors
246
+
247
+ logger.error(f"LLM path fallback failed after {max_retries} attempts. Last error: {last_error}")
248
+ return None
249
+
250
+ # Renamed and modified function for regex path extraction + LLM fallback - needs llm argument
251
+ def extract_dataset_path(query: str, llm: Optional[BaseChatModel] = None) -> Optional[str]:
252
+ """
253
+ Extract dataset path from the query using regex patterns, with LLM fallback.
254
+
255
+ Args:
256
+ query: The user's causal question text
257
+ llm: The shared LLM client instance for fallback.
258
+
259
+ Returns:
260
+ String with dataset path or None if not found
261
+ """
262
+ # --- Regex Part (existing logic) ---
263
+ # Check for common patterns indicating dataset paths
264
+ path_patterns = [
265
+ # More specific patterns first
266
+ r"(?:dataset|data|file) (?:at|in|from|located at) [\"\']?([^\"\'.,\s]+\.csv(?:[\\/][^\"\'.,\s]+)*)[\"\']?", # Handles subdirs in path
267
+ r"(?:use|using|analyze|analyse) (?:the |)(?:dataset|data|file) [\"\']?([^\"\'.,\s]+\.csv(?:[\\/][^\"\'.,\s]+)*)[\"\']?",
268
+ # Simpler patterns
269
+ r"[\"']([^\"']+\.csv(?:[\\/][^\"\'.,\s]+)*)[\"']", # Path in quotes
270
+ r"([a-zA-Z0-9_/.:-]+[\\/][a-zA-Z0-9_.:-]+\.csv)", # More generic path-like structure ending in .csv
271
+ r"([^\"\'.,\s]+\.csv)" # Just a .csv file name (least specific)
272
+ ]
273
+
274
+ for pattern in path_patterns:
275
+ matches = re.search(pattern, query, re.IGNORECASE)
276
+ if matches:
277
+ path = matches.group(1).strip()
278
+
279
+ # Basic check if it looks like a path
280
+ if '/' in path or '\\' in path or os.path.exists(path):
281
+ # Check if this is a valid file path immediately
282
+ if os.path.exists(path):
283
+ logger.info(f"Regex found existing path: {path}")
284
+ return path
285
+
286
+ # Check if it's in common data directories
287
+ data_dir_paths = ["data/", "datasets/", "causalscientist/data/"]
288
+ for data_dir in data_dir_paths:
289
+ potential_path = os.path.join(data_dir, os.path.basename(path))
290
+ if os.path.exists(potential_path):
291
+ logger.info(f"Regex found path in {data_dir}: {potential_path}")
292
+ return potential_path
293
+
294
+ # If not found but looks like a path, return it anyway - let downstream handle non-existence
295
+ logger.info(f"Regex found potential path (existence not verified): {path}")
296
+ return path
297
+ # Else: it might just be a word ending in .csv, ignore unless it exists
298
+ elif os.path.exists(path):
299
+ logger.info(f"Regex found existing path (simple pattern): {path}")
300
+ return path
301
+
302
+ # --- LLM Fallback ---
303
+ logger.info("Regex did not find dataset path. Trying LLM fallback...")
304
+ llm_fallback_path = _call_llm_for_path(query, llm=llm)
305
+ if llm_fallback_path:
306
+ # Optional: Add existence check here too? Or let downstream handle it.
307
+ # For now, return what LLM found.
308
+ return llm_fallback_path
309
+
310
+ logger.info("No dataset path found via regex or LLM fallback.")
311
+ return None
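+
+ # Example (illustrative file name; the result depends on which paths exist locally):
+ #
+ # extract_dataset_path("Estimate the effect of training using data/jobs.csv", llm=llm)
+ # # -> "data/jobs.csv" if a regex pattern matches, otherwise whatever the LLM fallback finds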
312
+
313
+ def parse_input(query: str, dataset_path_arg: Optional[str] = None, dataset_info: Optional[Dict] = None, llm: Optional[BaseChatModel] = None) -> Dict[str, Any]:
314
+ """
315
+ Parse the user's causal query using LLM and regex.
316
+
317
+ Args:
318
+ query: The user's causal question text.
319
+ dataset_path_arg: Path to dataset if provided directly as an argument.
320
+ dataset_info: Dictionary with dataset context (columns, types, etc.).
321
+ llm: The shared LLM client instance.
322
+
323
+ Returns:
324
+ Dict containing parsed query information.
325
+ """
326
+ result = {
327
+ "original_query": query,
328
+ "dataset_path": dataset_path_arg, # Start with argument path
329
+ "query_type": "OTHER", # Default values
330
+ "extracted_variables": {},
331
+ "constraints": []
332
+ }
333
+
334
+ # --- 1. Use LLM for core NLP tasks ---
335
+ parsed_llm_info = _extract_query_information_with_llm(query, dataset_info, llm=llm)
336
+
337
+ if parsed_llm_info:
338
+ result["query_type"] = parsed_llm_info.query_type
339
+ result["extracted_variables"] = {k: v if v is not None else [] for k, v in parsed_llm_info.variables.model_dump().items()}
340
+ result["constraints"] = parsed_llm_info.constraints if parsed_llm_info.constraints is not None else []
341
+ llm_mentioned_path = parsed_llm_info.dataset_path_mentioned
342
+ else:
343
+ logger.warning("LLM-based query information extraction failed.")
344
+ llm_mentioned_path = None
345
+ # Consider falling back to old regex methods here if critical
346
+ # logger.info("Falling back to regex-based parsing (if implemented).")
347
+
348
+ # --- 2. Determine Dataset Path (Hybrid Approach) ---
349
+ final_dataset_path = dataset_path_arg # Priority 1: Explicit argument
350
+
351
+ # Pass llm instance to the path extractor for its fallback mechanism
352
+ path_extractor = partial(extract_dataset_path, llm=llm)
353
+
354
+ if not final_dataset_path:
355
+ # Priority 2: Path mentioned in query (extracted by main LLM call)
356
+ if llm_mentioned_path and os.path.exists(llm_mentioned_path):
357
+ logger.info(f"Using dataset path mentioned by LLM: {llm_mentioned_path}")
358
+ final_dataset_path = llm_mentioned_path
359
+ elif llm_mentioned_path: # Check data dirs if path not absolute
360
+ data_dir_paths = ["data/", "datasets/", "causalscientist/data/"]
361
+ base_name = os.path.basename(llm_mentioned_path)
362
+ for data_dir in data_dir_paths:
363
+ potential_path = os.path.join(data_dir, base_name)
364
+ if os.path.exists(potential_path):
365
+ logger.info(f"Using dataset path mentioned by LLM (found in {data_dir}): {potential_path}")
366
+ final_dataset_path = potential_path
367
+ break
368
+ if not final_dataset_path:
369
+ logger.warning(f"LLM mentioned path '{llm_mentioned_path}' but it was not found.")
370
+
371
+ if not final_dataset_path:
372
+ # Priority 3: Path extracted by dedicated Regex + LLM fallback function
373
+ logger.info("Attempting dedicated dataset path extraction (Regex + LLM Fallback)...")
374
+ extracted_path = path_extractor(query) # Call the partial function with llm bound
375
+ if extracted_path:
376
+ final_dataset_path = extracted_path
377
+
378
+ result["dataset_path"] = final_dataset_path
379
+
380
+ # Check if a path was found ultimately
381
+ if not result["dataset_path"]:
382
+ logger.warning("Could not determine dataset path from query or arguments.")
383
+ else:
384
+ logger.info(f"Final dataset path determined: {result['dataset_path']}")
385
+
386
+ return result
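+
+ # Example usage (a sketch; assumes the module-level `llm` client initialised above
+ # is available, and the column names / CSV path are placeholders):
+ #
+ # parsed = parse_input(
+ #     query="What is the effect of discount on revenue? Use data/sales.csv",
+ #     dataset_info={"columns": ["discount", "revenue", "region"],
+ #                   "column_types": {"discount": "int64", "revenue": "float64"}},
+ #     llm=llm,
+ # )
+ # parsed["query_type"], parsed["extracted_variables"], parsed["dataset_path"]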
387
+
388
+ # --- Old Regex-based functions (Commented out or removed) ---
389
+ # def determine_query_type(query: str) -> str:
390
+ # ... (implementation removed)
391
+
392
+ # def extract_variables(query: str) -> Dict[str, Any]:
393
+ # ... (implementation removed)
394
+
395
+ # def detect_constraints(query: str) -> List[str]:
396
+ # ... (implementation removed)
397
+ # --- End Old Functions ---
398
+
399
+ # Renamed function for regex path extraction
400
+ def extract_dataset_path_regex(query: str) -> Optional[str]:
401
+ """
402
+ Extract dataset path from the query using regex patterns.
403
+
404
+ Args:
405
+ query: The user's causal question text
406
+
407
+ Returns:
408
+ String with dataset path or None if not found
409
+ """
410
+ # Check for common patterns indicating dataset paths
411
+ path_patterns = [
412
+ # More specific patterns first
413
+ r"(?:dataset|data|file) (?:at|in|from|located at) [\"\']?([^\"\'.,\s]+\.csv(?:[\\/][^\"\'.,\s]+)*)[\"\']?", # Handles subdirs in path
414
+ r"(?:use|using|analyze|analyse) (?:the |)(?:dataset|data|file) [\"\']?([^\"\'.,\s]+\.csv(?:[\\/][^\"\'.,\s]+)*)[\"\']?",
415
+ # Simpler patterns
416
+ r"[\"']([^\"']+\.csv(?:[\\/][^\"\'.,\s]+)*)[\"']", # Path in quotes
417
+ r"([a-zA-Z0-9_/.:-]+[\\/][a-zA-Z0-9_.:-]+\.csv)", # More generic path-like structure ending in .csv
418
+ r"([^\"\'.,\s]+\.csv)" # Just a .csv file name (least specific)
419
+ ]
420
+
421
+ for pattern in path_patterns:
422
+ matches = re.search(pattern, query, re.IGNORECASE)
423
+ if matches:
424
+ path = matches.group(1).strip()
425
+
426
+ # Basic check if it looks like a path
427
+ if '/' in path or '\\' in path or os.path.exists(path):
428
+ # Check if this is a valid file path immediately
429
+ if os.path.exists(path):
430
+ logger.info(f"Regex found existing path: {path}")
431
+ return path
432
+
433
+ # Check if it's in common data directories
434
+ data_dir_paths = ["data/", "datasets/", "causalscientist/data/"]
435
+ # Also check relative to current dir (often useful)
436
+ # base_name = os.path.basename(path)
437
+ for data_dir in data_dir_paths:
438
+ potential_path = os.path.join(data_dir, os.path.basename(path))
439
+ if os.path.exists(potential_path):
440
+ logger.info(f"Regex found path in {data_dir}: {potential_path}")
441
+ return potential_path
442
+
443
+ # If not found but looks like a path, return it anyway - let downstream handle non-existence
444
+ logger.info(f"Regex found potential path (existence not verified): {path}")
445
+ return path
446
+ # Else: it might just be a word ending in .csv, ignore unless it exists
447
+ elif os.path.exists(path):
448
+ logger.info(f"Regex found existing path (simple pattern): {path}")
449
+ return path
450
+
451
+ # TODO: Optional: Add LLM fallback call here if regex fails
452
+ # if no path found:
453
+ # llm_fallback_path = call_llm_for_path(query)
454
+ # return llm_fallback_path
455
+
456
+ return None
auto_causal/components/method_validator.py ADDED
@@ -0,0 +1,327 @@
1
+ """
2
+ Method validator component for causal inference methods.
3
+
4
+ This module validates the selected causal inference method against
5
+ dataset characteristics and available variables.
6
+ """
7
+
8
+ from typing import Dict, List, Any, Optional
9
+
10
+
11
+ def validate_method(method_info: Dict[str, Any], dataset_analysis: Dict[str, Any],
12
+ variables: Dict[str, Any]) -> Dict[str, Any]:
13
+ """
14
+ Validate the selected causal method against dataset characteristics.
15
+
16
+ Args:
17
+ method_info: Information about the selected method from decision_tree
18
+ dataset_analysis: Dataset analysis results from dataset_analyzer
19
+ variables: Identified variables from query_interpreter
20
+
21
+ Returns:
22
+ Dict with validation results:
23
+ - valid: Boolean indicating if method is valid
24
+ - concerns: List of concerns/issues with the selected method
25
+ - alternative_suggestions: Alternative methods if the selected method is problematic
26
+ - recommended_method: Updated method recommendation if issues are found
27
+ """
28
+ method = method_info.get("selected_method")
29
+ assumptions = method_info.get("method_assumptions", [])
30
+
31
+ # Get required variables
32
+ treatment = variables.get("treatment_variable")
33
+ outcome = variables.get("outcome_variable")
34
+ covariates = variables.get("covariates", [])
35
+ time_variable = variables.get("time_variable")
36
+ group_variable = variables.get("group_variable")
37
+ instrument_variable = variables.get("instrument_variable")
38
+ running_variable = variables.get("running_variable")
39
+ cutoff_value = variables.get("cutoff_value")
40
+
41
+ # Initialize validation result
42
+ validation_result = {
43
+ "valid": True,
44
+ "concerns": [],
45
+ "alternative_suggestions": [],
46
+ "recommended_method": method,
47
+ }
48
+
49
+ # Common validations for all methods
50
+ if treatment is None:
51
+ validation_result["valid"] = False
52
+ validation_result["concerns"].append("Treatment variable is not identified")
53
+
54
+ if outcome is None:
55
+ validation_result["valid"] = False
56
+ validation_result["concerns"].append("Outcome variable is not identified")
57
+
58
+ # Method-specific validations
59
+ if method == "propensity_score_matching":
60
+ validate_propensity_score_matching(validation_result, dataset_analysis, variables)
61
+
62
+ elif method == "regression_adjustment":
63
+ validate_regression_adjustment(validation_result, dataset_analysis, variables)
64
+
65
+ elif method == "instrumental_variable":
66
+ validate_instrumental_variable(validation_result, dataset_analysis, variables)
67
+
68
+ elif method == "difference_in_differences":
69
+ validate_difference_in_differences(validation_result, dataset_analysis, variables)
70
+
71
+ elif method == "regression_discontinuity_design":
72
+ validate_regression_discontinuity(validation_result, dataset_analysis, variables)
73
+
74
+ elif method == "backdoor_adjustment":
75
+ validate_backdoor_adjustment(validation_result, dataset_analysis, variables)
76
+
77
+ # If there are serious concerns, recommend alternatives
78
+ if not validation_result["valid"]:
79
+ validation_result["recommended_method"] = recommend_alternative(
80
+ method, validation_result["concerns"], method_info.get("alternatives", [])
81
+ )
82
+
83
+ # Make sure assumptions are listed in the validation result
84
+ validation_result["assumptions"] = assumptions
85
+ print("--------------------------")
86
+ print("Validation result:", validation_result)
87
+ print("--------------------------")
88
+ return validation_result
89
+
90
+
91
+ def validate_propensity_score_matching(validation_result: Dict[str, Any],
92
+ dataset_analysis: Dict[str, Any],
93
+ variables: Dict[str, Any]) -> None:
94
+ """
95
+ Validate propensity score matching method requirements.
96
+
97
+ Args:
98
+ validation_result: Current validation result to update
99
+ dataset_analysis: Dataset analysis results
100
+ variables: Identified variables
101
+ """
102
+ treatment = variables.get("treatment_variable")
103
+ covariates = variables.get("covariates", [])
104
+
105
+ # Check if treatment is binary using column_categories
106
+ is_binary = dataset_analysis.get("column_categories", {}).get(treatment) == "binary"
107
+
108
+ # Fallback to check if the column has only two unique values (0 and 1)
109
+ if not is_binary:
110
+ column_types = dataset_analysis.get("column_types", {})
111
+ if column_types.get(treatment) in ("int64", "int32"):
112
+ # Heuristic fallback: assume an integer-typed treatment is binary (the 0/1 values are not actually verified here)
113
+ is_binary = True
114
+
115
+ if not is_binary:
116
+ validation_result["valid"] = False
117
+ validation_result["concerns"].append(
118
+ "Treatment variable is not binary, which is required for propensity score matching"
119
+ )
120
+
121
+ # Check if there are sufficient covariates
122
+ if len(covariates) < 2:
123
+ validation_result["concerns"].append(
124
+ "Few covariates identified, which may limit the effectiveness of propensity score matching"
125
+ )
126
+
127
+ # Check for sufficient overlap
128
+ variable_relationships = dataset_analysis.get("variable_relationships", {})
129
+ treatment_imbalance = variable_relationships.get("treatment_imbalance", 0.5)
130
+
131
+ if treatment_imbalance < 0.1 or treatment_imbalance > 0.9:
132
+ validation_result["concerns"].append(
133
+ "Treatment groups are highly imbalanced, which may lead to poor matching quality"
134
+ )
135
+ validation_result["alternative_suggestions"].append("regression_adjustment")
136
+
137
+
138
+ def validate_regression_adjustment(validation_result: Dict[str, Any],
139
+ dataset_analysis: Dict[str, Any],
140
+ variables: Dict[str, Any]) -> None:
141
+ """
142
+ Validate regression adjustment method requirements.
143
+
144
+ Args:
145
+ validation_result: Current validation result to update
146
+ dataset_analysis: Dataset analysis results
147
+ variables: Identified variables
148
+ """
149
+ outcome = variables.get("outcome_variable")
150
+
151
+ # Check outcome type for appropriate regression model
152
+ outcome_data = dataset_analysis.get("variable_types", {}).get(outcome, {})
153
+ outcome_type = outcome_data.get("type")
154
+
155
+ if outcome_type == "categorical" and outcome_data.get("n_categories", 0) > 2:
156
+ validation_result["concerns"].append(
157
+ "Outcome is categorical with multiple categories, which may require multinomial regression"
158
+ )
159
+
160
+ # Check for potential nonlinear relationships
161
+ nonlinear_relationships = dataset_analysis.get("nonlinear_relationships", False)
162
+
163
+ if nonlinear_relationships:
164
+ validation_result["concerns"].append(
165
+ "Potential nonlinear relationships detected, which may require more flexible models"
166
+ )
167
+
168
+
169
+ def validate_instrumental_variable(validation_result: Dict[str, Any],
170
+ dataset_analysis: Dict[str, Any],
171
+ variables: Dict[str, Any]) -> None:
172
+ """
173
+ Validate instrumental variable method requirements.
174
+
175
+ Args:
176
+ validation_result: Current validation result to update
177
+ dataset_analysis: Dataset analysis results
178
+ variables: Identified variables
179
+ """
180
+ instrument_variable = variables.get("instrument_variable")
181
+ treatment = variables.get("treatment_variable")
182
+
183
+ if instrument_variable is None:
184
+ validation_result["valid"] = False
185
+ validation_result["concerns"].append(
186
+ "No instrumental variable identified, which is required for this method"
187
+ )
188
+ validation_result["alternative_suggestions"].append("propensity_score_matching")
189
+ return
190
+
191
+ # Check for instrument strength (correlation with treatment)
192
+ variable_relationships = dataset_analysis.get("variable_relationships", {})
193
+ instrument_correlation = next(
194
+ (corr.get("correlation", 0) for corr in variable_relationships.get("correlations", [])
195
+ if (corr.get("var1") == instrument_variable and corr.get("var2") == treatment)
196
+ or (corr.get("var1") == treatment and corr.get("var2") == instrument_variable)),
197
+ 0
198
+ )
199
+
200
+ if abs(instrument_correlation) < 0.2:
201
+ validation_result["concerns"].append(
202
+ "Instrument appears weak (low correlation with treatment), which may lead to bias"
203
+ )
204
+ validation_result["alternative_suggestions"].append("propensity_score_matching")
205
+
206
+
207
+ def validate_difference_in_differences(validation_result: Dict[str, Any],
208
+ dataset_analysis: Dict[str, Any],
209
+ variables: Dict[str, Any]) -> None:
210
+ """
211
+ Validate difference-in-differences method requirements.
212
+
213
+ Args:
214
+ validation_result: Current validation result to update
215
+ dataset_analysis: Dataset analysis results
216
+ variables: Identified variables
217
+ """
218
+ time_variable = variables.get("time_variable")
219
+ group_variable = variables.get("group_variable")
220
+
221
+ if time_variable is None:
222
+ validation_result["valid"] = False
223
+ validation_result["concerns"].append(
224
+ "No time variable identified, which is required for difference-in-differences"
225
+ )
226
+ validation_result["alternative_suggestions"].append("propensity_score_matching")
227
+
228
+ if group_variable is None:
229
+ validation_result["valid"] = False
230
+ validation_result["concerns"].append(
231
+ "No group variable identified, which is required for difference-in-differences"
232
+ )
233
+ validation_result["alternative_suggestions"].append("propensity_score_matching")
234
+
235
+ # Check for parallel trends
236
+ temporal_structure = dataset_analysis.get("temporal_structure", {})
237
+ parallel_trends = temporal_structure.get("parallel_trends", False)
238
+
239
+ if not parallel_trends:
240
+ validation_result["concerns"].append(
241
+ "No evidence of parallel trends, which is a key assumption for difference-in-differences"
242
+ )
243
+ validation_result["alternative_suggestions"].append("synthetic_control")
244
+
245
+
246
+ def validate_regression_discontinuity(validation_result: Dict[str, Any],
247
+ dataset_analysis: Dict[str, Any],
248
+ variables: Dict[str, Any]) -> None:
249
+ """
250
+ Validate regression discontinuity method requirements.
251
+
252
+ Args:
253
+ validation_result: Current validation result to update
254
+ dataset_analysis: Dataset analysis results
255
+ variables: Identified variables
256
+ """
257
+ running_variable = variables.get("running_variable")
258
+ cutoff_value = variables.get("cutoff_value")
259
+
260
+ if running_variable is None:
261
+ validation_result["valid"] = False
262
+ validation_result["concerns"].append(
263
+ "No running variable identified, which is required for regression discontinuity"
264
+ )
265
+ validation_result["alternative_suggestions"].append("propensity_score_matching")
266
+
267
+ if cutoff_value is None:
268
+ validation_result["valid"] = False
269
+ validation_result["concerns"].append(
270
+ "No cutoff value identified, which is required for regression discontinuity"
271
+ )
272
+ validation_result["alternative_suggestions"].append("propensity_score_matching")
273
+
274
+ # Check for discontinuity at threshold
275
+ discontinuities = dataset_analysis.get("discontinuities", {})
276
+ has_discontinuity = discontinuities.get("has_discontinuities", False)
277
+
278
+ if not has_discontinuity:
279
+ validation_result["valid"] = False
280
+ validation_result["concerns"].append(
281
+ "No clear discontinuity detected at the threshold, which is necessary for this method"
282
+ )
283
+ validation_result["alternative_suggestions"].append("regression_adjustment")
284
+
285
+ def validate_backdoor_adjustment(validation_result: Dict[str, Any],
286
+ dataset_analysis: Dict[str, Any],
287
+ variables: Dict[str, Any]) -> None:
288
+ """
289
+ Validate backdoor adjustment method requirements.
290
+
291
+ Args:
292
+ validation_result: Current validation result to update
293
+ dataset_analysis: Dataset analysis results
294
+ variables: Identified variables
295
+ """
296
+ covariates = variables.get("covariates", [])
297
+
298
+ if len(covariates) == 0:
299
+ validation_result["valid"] = False
300
+ validation_result["concerns"].append(
301
+ "No covariates identified for backdoor adjustment"
302
+ )
303
+ validation_result["alternative_suggestions"].append("regression_adjustment")
304
+
305
+
306
+ def recommend_alternative(method: str, concerns: List[str], alternatives: List[str]) -> str:
307
+ """
308
+ Recommend an alternative method if the current one has issues.
309
+
310
+ Args:
311
+ method: Current method
312
+ concerns: List of concerns with the current method
313
+ alternatives: List of alternative methods suggested by the decision tree
314
+
315
+ Returns:
316
+ String with the recommended method
317
+ """
318
+ # If there are alternatives, recommend the first one
319
+ if alternatives:
320
+ return alternatives[0]
321
+
322
+ # If no alternatives, use regression adjustment as a fallback
323
+ if method != "regression_adjustment":
324
+ return "regression_adjustment"
325
+
326
+ # If regression adjustment is also problematic, use propensity score matching
327
+ return "propensity_score_matching"
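A quick, illustrative way to exercise the validator above (the dictionary shapes mirror the keys the functions read; the concrete column names and values are invented for the example):

from auto_causal.components.method_validator import validate_method

method_info = {
    "selected_method": "difference_in_differences",
    "method_assumptions": ["parallel trends", "no anticipation"],
    "alternatives": ["propensity_score_matching"],
}
dataset_analysis = {
    "column_categories": {"treated": "binary"},
    "temporal_structure": {"parallel_trends": False},
}
variables = {
    "treatment_variable": "treated",
    "outcome_variable": "revenue",
    "covariates": ["age", "region"],
    "time_variable": None,      # deliberately missing to trigger a concern
    "group_variable": "store_id",
}

result = validate_method(method_info, dataset_analysis, variables)
# result["valid"] is False (no time variable), and recommend_alternative() falls back to the
# first listed alternative, so result["recommended_method"] == "propensity_score_matching".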
auto_causal/components/output_formatter.py ADDED
@@ -0,0 +1,138 @@
1
+ """
2
+ Output formatter component for causal inference results.
3
+
4
+ This module formats the results of causal analysis into a clear,
5
+ structured output for presentation to the user.
6
+ """
7
+
8
+ from typing import Dict, List, Any, Optional
9
+ import json # Add this import at the top of the file
10
+
11
+ # Import the new model
12
+ from auto_causal.models import FormattedOutput
13
+
14
+ # Add this module-level variable, typically near imports or at the top
15
+ CURRENT_OUTPUT_LOG_FILE = None
16
+
17
+ # Revert signature and logic to handle results and structured explanation
18
+ def format_output(
19
+ query: str,
20
+ method: str,
21
+ results: Dict[str, Any],
22
+ explanation: Dict[str, Any],
23
+ dataset_analysis: Optional[Dict[str, Any]] = None,
24
+ dataset_description: Optional[str] = None
25
+ ) -> FormattedOutput:
26
+ """
27
+ Format final results including numerical estimates and explanations.
28
+
29
+ Args:
30
+ query: Original user query
31
+ method: Causal inference method used (string name)
32
+ results: Numerical results from method_executor_tool
33
+ explanation: Structured explanation object from explainer_tool
34
+ dataset_analysis: Optional dictionary of dataset analysis results
35
+ dataset_description: Optional string description of the dataset
36
+
37
+ Returns:
38
+ Dict with formatted output fields ready for presentation.
39
+ """
40
+ # Extract numerical results
41
+ effect_estimate = results.get("effect_estimate")
42
+ confidence_interval = results.get("confidence_interval")
43
+ p_value = results.get("p_value")
44
+ effect_se = results.get("standard_error") # Get SE if available
45
+
46
+ # Format method name for readability
47
+ method_name_formatted = _format_method_name(method)
48
+
49
+ # Extract explanation components (assuming explainer returns structured dict again)
50
+ # If explainer returns single string, adjust this
51
+ method_explanation_text = explanation.get("method_explanation", "")
52
+ interpretation_guide = explanation.get("interpretation_guide", "")
53
+ limitations = explanation.get("limitations", [])
54
+ assumptions_discussion = explanation.get("assumptions", "") # Assuming key is 'assumptions'
55
+ practical_implications = explanation.get("practical_implications", "")
56
+ # Add back final_explanation_text if explainer provides it
57
+ # final_explanation_text = explanation.get("final_explanation_text")
58
+
59
+ # Create summary using numerical results
60
+ ci_text = ""
61
+ if confidence_interval and confidence_interval[0] is not None and confidence_interval[1] is not None:
62
+ ci_text = f" (95% CI: [{confidence_interval[0]:.4f}, {confidence_interval[1]:.4f}])"
63
+
64
+ p_value_text = f", p={p_value:.4f}" if p_value is not None else ""
65
+ effect_text = f"{effect_estimate:.4f}" if effect_estimate is not None else "N/A"
66
+
67
+ summary = (
68
+ f"Based on {method_name_formatted}, the estimated causal effect is {effect_text}"
69
+ f"{ci_text}{p_value_text}. {_create_effect_interpretation(effect_estimate, p_value)}"
70
+ f" See details below regarding assumptions and limitations."
71
+ )
72
+
73
+ # Assemble formatted output dictionary
74
+ results_dict = {
75
+ "query": query,
76
+ "method_used": method_name_formatted,
77
+ "causal_effect": effect_estimate,
78
+ "standard_error": effect_se,
79
+ "confidence_interval": confidence_interval,
80
+ "p_value": p_value,
81
+ "summary": summary,
82
+ "method_explanation": method_explanation_text,
83
+ "interpretation_guide": interpretation_guide,
84
+ "limitations": limitations,
85
+ "assumptions": assumptions_discussion,
86
+ "practical_implications": practical_implications,
87
+ # "full_explanation_text": final_explanation_text # Optionally include combined text
88
+ }
89
+ final_results_dict = {key: results_dict[key] for key in ("query", "method_used", "causal_effect", "standard_error", "confidence_interval")}
90
+ # print(final_results_dict)
91
+
92
+ # Validate and instantiate the Pydantic model
93
+ try:
94
+ formatted_output_model = FormattedOutput(**results_dict)
95
+ except Exception as e: # Catch validation errors specifically if needed
96
+ # Handle validation error - perhaps log and return a default or raise
97
+ print(f"Error creating FormattedOutput model: {e}") # Or use logger
98
+ # Decide on error handling: raise, return None, return default?
99
+ # For now, re-raising might be simplest if the structure is expected
100
+ raise ValueError(f"Failed to create FormattedOutput from results: {e}")
101
+
102
+ return formatted_output_model # Return the Pydantic model instance
103
+
104
+
105
+ def _format_method_name(method: str) -> str:
106
+ """Format method name for readability."""
107
+ method_names = {
108
+ "propensity_score_matching": "Propensity Score Matching",
109
+ "regression_adjustment": "Regression Adjustment",
110
+ "instrumental_variable": "Instrumental Variable Analysis",
111
+ "difference_in_differences": "Difference-in-Differences",
112
+ "regression_discontinuity": "Regression Discontinuity Design",
113
+ "backdoor_adjustment": "Backdoor Adjustment",
114
+ "propensity_score_weighting": "Propensity Score Weighting"
115
+ }
116
+ return method_names.get(method, method.replace("_", " ").title())
117
+
118
+ # Reinstate helper function for interpretation
119
+ def _create_effect_interpretation(effect: Optional[float], p_value: Optional[float] = None) -> str:
120
+ """Create a basic interpretation of the effect."""
121
+ if effect is None:
122
+ return "Effect estimate not available."
123
+
124
+ significance = ""
125
+ if p_value is not None:
126
+ significance = "statistically significant" if p_value < 0.05 else "not statistically significant"
127
+
128
+ magnitude = ""
129
+ if abs(effect) < 0.01:
130
+ magnitude = "no practical effect"
131
+ elif abs(effect) < 0.1:
132
+ magnitude = "a small effect"
133
+ elif abs(effect) < 0.5:
134
+ magnitude = "a moderate effect"
135
+ else:
136
+ magnitude = "a substantial effect"
137
+
138
+ return f"This suggests {magnitude}{f' and is {significance}' if significance else ''}."
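The summary string is assembled from `_format_method_name` and `_create_effect_interpretation`; both are pure helpers, so they can be sanity-checked on their own (values below are illustrative, assuming the package is importable):

from auto_causal.components.output_formatter import (
    _create_effect_interpretation,
    _format_method_name,
)

print(_format_method_name("difference_in_differences"))
# -> Difference-in-Differences
print(_format_method_name("double_machine_learning"))
# -> Double Machine Learning  (unknown names fall back to title-casing)
print(_create_effect_interpretation(0.03, p_value=0.21))
# -> This suggests a small effect and is not statistically significant.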
auto_causal/components/query_interpreter.py ADDED
@@ -0,0 +1,580 @@
1
+ """
2
+ Query interpreter component for causal inference.
3
+
4
+ This module provides functionality to match query concepts to actual dataset variables,
5
+ identifying treatment, outcome, and covariate variables for causal inference analysis.
6
+ """
7
+
8
+ import re
9
+ from typing import Dict, List, Any, Optional, Union, Tuple
10
+ import pandas as pd
11
+ import logging
12
+ import numpy as np
13
+ from auto_causal.config import get_llm_client
14
+ # Import LLM and message types
15
+ from langchain_core.language_models import BaseChatModel
16
+ from langchain_core.messages import HumanMessage
17
+ from langchain_core.exceptions import OutputParserException
18
+ # Import base Pydantic models needed directly
19
+ from pydantic import BaseModel, ValidationError
20
+ from dowhy import CausalModel
21
+ import json
22
+
23
+ # Import shared Pydantic models from the central location
24
+ from auto_causal.models import (
25
+ LLMSelectedVariable,
26
+ LLMSelectedCovariates,
27
+ LLMIVars,
28
+ LLMRDDVars,
29
+ LLMRCTCheck,
30
+ LLMTreatmentReferenceLevel,
31
+ LLMInteractionSuggestion,
32
+ LLMEstimand,
33
+ # LLMDIDCheck,
34
+ # LLMDiDTemporalVars,
35
+ # LLMDiDGroupVars,
36
+ # LLMRDDCheck,
37
+ # LLMRDDVarsExtended
38
+ )
39
+
40
+ # Import the new prompt templates
41
+ from auto_causal.prompts.method_identification_prompts import (
42
+ IV_IDENTIFICATION_PROMPT_TEMPLATE,
43
+ RDD_IDENTIFICATION_PROMPT_TEMPLATE,
44
+ RCT_IDENTIFICATION_PROMPT_TEMPLATE,
45
+ TREATMENT_REFERENCE_IDENTIFICATION_PROMPT_TEMPLATE,
46
+ INTERACTION_TERM_IDENTIFICATION_PROMPT_TEMPLATE,
47
+ TREATMENT_VAR_IDENTIFICATION_PROMPT_TEMPLATE,
48
+ OUTCOME_VAR_IDENTIFICATION_PROMPT_TEMPLATE,
49
+ COVARIATES_IDENTIFICATION_PROMPT_TEMPLATE,
50
+ ESTIMAND_PROMPT_TEMPLATE,
51
+ CONFOUNDER_IDENTIFICATION_PROMPT_TEMPLATE,
52
+ DID_TERM_IDENTIFICATION_PROMPT_TEMPLATE)
53
+
54
+
55
+ # Assume central models are defined elsewhere or keep local definitions for now
56
+ # from ..models import ...
57
+
58
+ # --- Pydantic models for LLM structured output ---
59
+ # REMOVED - Now defined in causalscientist/auto_causal/models.py
60
+ # class LLMSelectedVariable(BaseModel): ...
61
+ # class LLMSelectedCovariates(BaseModel): ...
62
+ # class LLMIVars(BaseModel): ...
63
+ # class LLMRDDVars(BaseModel): ...
64
+ # class LLMRCTCheck(BaseModel): ...
65
+
66
+
67
+ logger = logging.getLogger(__name__)
68
+
69
+ def infer_treatment_variable_type(treatment_variable: str, column_categories: Dict[str, str],
70
+ dataset_analysis: Dict[str, Any]) -> str:
71
+ """
72
+ Determine treatment variable type from column category and unique value count
73
+ Args:
74
+ treatment_variable: name of the treatment variable
75
+ column_categories: mapping of column names to their categories
76
+ dataset_analysis: exploratory analysis results
77
+
78
+ Returns:
79
+ str: type of the treatment variable (e.g., "binary", "continuous", "discrete_multi_value")
80
+ """
81
+
82
+ treatment_variable_type = "unknown"
83
+ if treatment_variable and treatment_variable in column_categories:
84
+ category = column_categories[treatment_variable]
85
+ logger.info(f"Category for treatment '{treatment_variable}' is '{category}'.")
86
+
87
+ if category == "continuous_numeric":
88
+ treatment_variable_type = "continuous"
89
+
90
+ elif category == "discrete_numeric":
91
+ num_unique = dataset_analysis.get("column_nunique_counts", {}).get(treatment_variable, -1)
92
+ if num_unique > 10:
93
+ logger.info(f"'{treatment_variable}' has {num_unique} unique values, treating as continuous.")
94
+ treatment_variable_type = "continuous"
95
+ elif num_unique == 2:
96
+ logger.info(f"'{treatment_variable}' has 2 unique values, treating as binary.")
97
+ treatment_variable_type = "binary"
98
+ elif num_unique > 0:
99
+ logger.info(f"'{treatment_variable}' has {num_unique} unique values, treating as discrete_multi_value.")
100
+ treatment_variable_type = "discrete_multi_value"
101
+ else:
102
+ logger.info(f"'{treatment_variable}' unique value count unknown or too few.")
103
+ treatment_variable_type = "discrete_numeric_unknown_cardinality"
104
+
105
+ elif category in ["binary", "binary_categorical"]:
106
+ treatment_variable_type = "binary"
107
+
108
+ elif category in ["categorical", "categorical_numeric"]:
109
+ num_unique = dataset_analysis.get("column_nunique_counts", {}).get(treatment_variable, -1)
110
+ if num_unique == 2:
111
+ treatment_variable_type = "binary"
112
+ elif num_unique > 0:
113
+ treatment_variable_type = "categorical_multi_value"
114
+ else:
115
+ treatment_variable_type = "categorical_unknown_cardinality"
116
+
117
+ else:
118
+ logger.warning(f"Unmapped category '{category}' for '{treatment_variable}', setting as 'other'.")
119
+ treatment_variable_type = "other"
120
+
121
+ elif treatment_variable:
122
+ logger.warning(f"'{treatment_variable}' not found in column_categories.")
123
+ else:
124
+ logger.info("No treatment variable identified.")
125
+
126
+ logger.info(f"Final Determined Treatment Variable Type: {treatment_variable_type}")
127
+ return treatment_variable_type
128
+
129
+ def determine_treatment_reference_level(is_rct: Optional[bool], llm: Optional[BaseChatModel], treatment_variable: Optional[str],
130
+ query_text: str, dataset_description: Optional[str], file_path: Optional[str],
131
+ columns: List[str]) -> Optional[str]:
132
+ """
133
+ Determines the treatment reference level
134
+ """
135
+
136
+ # If LLM didn't explicitly say RCT, default to False or keep None?
137
+ # Let's default to False if LLM didn't provide a boolean value.
138
+ if is_rct is None: is_rct = False
139
+ treatment_reference_level = None
140
+
141
+ if llm and treatment_variable and treatment_variable in columns:
142
+ treatment_values_sample = []
143
+ if file_path:
144
+ try:
145
+ df = pd.read_csv(file_path)
146
+ if treatment_variable in df.columns:
147
+ unique_vals = df[treatment_variable].unique()
148
+ treatment_values_sample = [item.item() if hasattr(item, 'item') else item for item in unique_vals][:10]
149
+ if treatment_values_sample:
150
+ logger.info(f"Successfully read treatment values sample from dataset at '{file_path}' for variable '{treatment_variable}'.")
151
+ else:
152
+ logger.info(f"'{treatment_variable}' in '{file_path}' has no unique values or is empty.")
153
+ else:
154
+ logger.warning(f"'{treatment_variable}' not found in dataset columns at '{file_path}'.")
155
+ except FileNotFoundError:
156
+ logger.warning(f"File not found at: {file_path}")
157
+ except pd.errors.EmptyDataError:
158
+ logger.warning(f"Empty file at: {file_path}")
159
+ except Exception as e:
160
+ logger.warning(f"Error reading dataset at '{file_path}' for '{treatment_variable}': {e}")
161
+
162
+ if not treatment_values_sample:
163
+ logger.warning(f"No unique values found for treatment '{treatment_variable}'. LLM prompt will receive empty list.")
164
+ else:
165
+ logger.info(f"Final treatment values sample: {treatment_values_sample}")
166
+
167
+ try:
168
+ prompt = TREATMENT_REFERENCE_IDENTIFICATION_PROMPT_TEMPLATE.format(query=query_text, description=dataset_description or 'N/A', treatment_variable=treatment_variable, treatment_variable_values=treatment_values_sample)
169
+ ref_result = _call_llm_for_var(llm, prompt, LLMTreatmentReferenceLevel)
170
+ if ref_result and ref_result.reference_level:
171
+ if treatment_values_sample and ref_result.reference_level not in treatment_values_sample:
172
+ logger.warning(f"LLM reference level '{ref_result.reference_level}' not in sampled values for '{treatment_variable}'.")
173
+ treatment_reference_level = ref_result.reference_level
174
+ logger.info(f"LLM identified reference level: {treatment_reference_level} (Reason: {ref_result.reasoning})")
175
+ elif ref_result:
176
+ logger.info(f"LLM returned no reference level (Reason: {ref_result.reasoning})")
177
+ except Exception as e:
178
+ logger.error(f"LLM error for treatment reference level: {e}")
179
+
180
+ return treatment_reference_level
181
+
182
+ def identify_interaction_term(llm: Optional[BaseChatModel], treatment_variable: Optional[str], covariates: List[str],
183
+ column_categories: Dict[str, str], query_text: str,
184
+ dataset_description: Optional[str]) -> Tuple[bool, Optional[str]]:
185
+ """
186
+ Identifies the interaction term based on the query and the dataset information
187
+ """
188
+
189
+ interaction_term_suggested, interaction_variable_candidate = False, None
190
+
191
+ if llm and treatment_variable and covariates:
192
+ try:
193
+ covariates_list_str = "\n".join([f"- {cov}: {column_categories.get(cov, 'Unknown')}" for cov in covariates]) or "No covariates identified or available."
194
+ prompt = INTERACTION_TERM_IDENTIFICATION_PROMPT_TEMPLATE.format(query=query_text, description=dataset_description or 'N/A', treatment_variable=treatment_variable, covariates_list_with_types=covariates_list_str)
195
+ result = _call_llm_for_var(llm, prompt, LLMInteractionSuggestion)
196
+ if result:
197
+ interaction_term_suggested = result.interaction_needed if result.interaction_needed is not None else False
198
+ if interaction_term_suggested and result.interaction_variable:
199
+ if result.interaction_variable in covariates:
200
+ interaction_variable_candidate = result.interaction_variable
201
+ logger.info(f"LLM suggested interaction: needed={interaction_term_suggested}, variable='{interaction_variable_candidate}' (Reason: {result.reasoning})")
202
+ else:
203
+ logger.warning(f"LLM suggested variable '{result.interaction_variable}' not in covariates {covariates}. Ignoring.")
204
+ interaction_term_suggested = False
205
+ elif interaction_term_suggested:
206
+ logger.info(f"LLM suggested interaction is needed but no variable provided (Reason: {result.reasoning})")
207
+ else:
208
+ logger.info(f"LLM suggested no interaction is needed (Reason: {result.reasoning})")
209
+ else:
210
+ logger.warning("LLM returned no result for interaction term suggestion.")
211
+ except Exception as e:
212
+ logger.error(f"LLM error during interaction term check: {e}")
213
+
214
+ return interaction_term_suggested, interaction_variable_candidate
215
+
216
+
217
+ def interpret_query(query_info: Dict[str, Any], dataset_analysis: Dict[str, Any],
218
+ dataset_description: Optional[str] = None) -> Dict[str, Any]:
219
+ """
220
+ Interpret query using hybrid heuristic/LLM approach to identify variables.
221
+
222
+ Args:
223
+ query_info: Information extracted from the user's query (text, hints).
224
+ dataset_analysis: Information about the dataset structure (columns, types, etc.).
225
+ dataset_description: Optional textual description of the dataset.
226
+ Note: the LLM client is obtained internally via get_llm_client(); it is not passed as an argument.
227
+
228
+ Returns:
229
+ Dict containing identified variables (treatment, outcome, covariates, etc., and is_rct).
230
+ """
231
+
232
+ logger.info("Interpreting query with hybrid approach...")
233
+ llm = get_llm_client()
234
+
235
+ query_text = query_info.get("query_text", "")
236
+ columns = dataset_analysis.get("columns", [])
237
+ column_categories = dataset_analysis.get("column_categories", {})
238
+ file_path = dataset_analysis["dataset_info"]["file_path"]
239
+
240
+
241
+ # --- Identify Treatment ---
242
+ treatment_hints = query_info.get("potential_treatments", [])
243
+ dataset_treatments = dataset_analysis.get("potential_treatments", [])
244
+ treatment_variable = _identify_variable_hybrid(role="treatment", query_hints=treatment_hints,
245
+ dataset_suggestions=dataset_treatments, columns=columns,
246
+ column_categories=column_categories,
247
+ prioritize_types=["binary", "binary_categorical", "discrete_numeric","continuous_numeric"], # Prioritize binary/discrete
248
+ query_text=query_text, dataset_description=dataset_description,llm=llm)
249
+ logger.info(f"Identified Treatment: {treatment_variable}")
250
+ treatment_variable_type = infer_treatment_variable_type(treatment_variable, column_categories, dataset_analysis)
251
+
252
+
253
+ # --- Identify Outcome ---
254
+ outcome_hints = query_info.get("outcome_hints", [])
255
+ dataset_outcomes = dataset_analysis.get("potential_outcomes", [])
256
+ outcome_variable = _identify_variable_hybrid(role="outcome", query_hints=outcome_hints, dataset_suggestions=dataset_outcomes,
257
+ columns=columns, column_categories=column_categories,
258
+ prioritize_types=["continuous_numeric", "discrete_numeric"], # Prioritize numeric
259
+ exclude_vars=[treatment_variable], # Exclude treatment
260
+ query_text=query_text, dataset_description=dataset_description, llm=llm)
261
+ logger.info(f"Identified Outcome: {outcome_variable}")
262
+
263
+ # --- Identify Covariates ---
264
+ covariate_hints = query_info.get("covariates_hints", [])
265
+ covariates = _identify_covariates_hybrid("covars", treatment_variable=treatment_variable, outcome_variable=outcome_variable,
266
+ columns=columns, column_categories=column_categories, query_hints=covariate_hints,
267
+ query_text=query_text, dataset_description=dataset_description, llm=llm)
268
+ logger.info(f"Identified Covariates: {covariates}")
269
+
270
+ # --- Identify Confounders ---
271
+ confounder_hints = query_info.get("covariates_hints", [])
272
+ confounders = _identify_covariates_hybrid("confounders", treatment_variable=treatment_variable, outcome_variable=outcome_variable,
273
+ columns=columns, column_categories=column_categories, query_hints=confounder_hints,
274
+ query_text=query_text, dataset_description=dataset_description, llm=llm)
275
+ logger.info(f"Identified Confounders: {confounders}")
276
+
277
+ # --- Identify Time/Group (from dataset analysis) ---
278
+ time_variable = None
279
+ group_variable = None
280
+ has_temporal = dataset_analysis.get("temporal_structure", {}).get("has_temporal_structure", False)
281
+ temporal_structure = dataset_analysis.get("temporal_structure", {})
282
+ if temporal_structure.get("has_temporal_structure", False):
283
+ time_variable = temporal_structure.get("time_column") or temporal_structure.get("temporal_columns", [None])[0]
284
+ if temporal_structure.get("is_panel_data", False):
285
+ group_variable = temporal_structure.get("id_column")
286
+ logger.info(f"Identified Time Var: {time_variable}, Group Var: {group_variable}, temporal structure: {temporal_structure}")
287
+
288
+ # --- Identify IV/RDD/RCT using LLM ---
289
+ instrument_variable = None
290
+ running_variable = None
291
+ cutoff_value = None
292
+ is_rct = None
293
+ smd_score = None
+ did_term_result = None
294
+
295
+ if llm:
296
+ try:
297
+ # Check for RCT
298
+ prompt_rct = _create_identify_prompt("whether data is from RCT", query_text, dataset_description, columns, column_categories, treatment_variable, outcome_variable)
299
+ rct_result = _call_llm_for_var(llm, prompt_rct, LLMRCTCheck)
300
+ is_rct = rct_result.is_rct if rct_result else None
301
+ logger.info(f"LLM identified RCT: {is_rct}")
302
+
303
+ # Check for IV
304
+ prompt_iv = _create_identify_prompt("instrumental variable", query_text, dataset_description, columns, column_categories, treatment_variable, outcome_variable)
305
+ iv_result = _call_llm_for_var(llm, prompt_iv, LLMIVars)
306
+ instrument_variable = iv_result.instrument_variable if iv_result else None
307
+ if instrument_variable not in columns:
308
+ instrument_variable = None
309
+ logger.info(f"LLM identified IV: {instrument_variable}")
310
+
311
+ # Check for RDD
312
+ prompt_rdd = _create_identify_prompt("regression discontinuity (running variable and cutoff)", query_text, dataset_description, columns, column_categories, treatment_variable, outcome_variable)
313
+ rdd_result = _call_llm_for_var(llm, prompt_rdd, LLMRDDVars)
314
+ if rdd_result:
315
+ running_variable = rdd_result.running_variable
316
+ cutoff_value = rdd_result.cutoff_value
317
+ if running_variable not in columns or cutoff_value is None:
318
+ running_variable = None
319
+ cutoff_value = None
320
+ logger.info(f"LLM identified RDD: Running={running_variable}, Cutoff={cutoff_value}")
321
+
322
+ ## For graph based methods
323
+ exclude_cols = [treatment_variable, outcome_variable]
324
+ potential_covariates = [col for col in columns if col not in exclude_cols and col is not None]
325
+ usable_covariates = [col for col in potential_covariates if column_categories.get(col) not in ["text_or_other"]]
326
+ logger.info(f"Usable covariates for graph: {usable_covariates}")
327
+
328
+ estimand_prompt = ESTIMAND_PROMPT_TEMPLATE.format(query=query_text,dataset_description=dataset_description,
329
+ dataset_columns=usable_covariates,
330
+ treatment=treatment_variable, outcome=outcome_variable)
331
+
332
+ estimand_result = _call_llm_for_var(llm, estimand_prompt, LLMEstimand)
333
+ estimand = "ate" if "ate" in estimand_result.estimand.strip().lower() else "att"
334
+ logger.info(f"LLM identified estimand: {estimand}")
335
+
336
+ ## Did Term
337
+ did_term_prompt = DID_TERM_IDENTIFICATION_PROMPT_TEMPLATE.format(query=query_text, description=dataset_description,
338
+ column_info=columns, time_variable=time_variable,
339
+ group_variable=group_variable, column_types=column_categories)
340
+ did_term_result = _call_llm_for_var(llm, did_term_prompt, LLMRDDVars)
341
+ did_term_result = did_term_result.did_term if (did_term_result and getattr(did_term_result, "did_term", None) in columns) else None
342
+ logger.info(f"LLM identified DiD term: {did_term_result}")
343
+
344
+
345
+
346
+ #smd_score_all = compute_smd(dataset_analysis.get("data", pd.DataFrame()), treatment_variable, usable_covariates)
347
+ #smd_score = smd_score_all.get("ate", 0.0) if smd_score_all else 0.0
348
+ #logger.info(f"Computed SMD score: {smd_score}")
349
+
350
+ #logger.debug(f"Computed SMD score for {estimand}: {smd_score}")
351
+
352
+
353
+ except Exception as e:
354
+ logger.error(f"Error during LLM checks for IV/RDD/RCT: {e}")
355
+
356
+
357
+
358
+ # --- Identify Treatment Reference Level ---
359
+ treatment_reference_level = determine_treatment_reference_level(is_rct=is_rct, llm=llm, treatment_variable=treatment_variable,
360
+ query_text=query_text, dataset_description=dataset_description,
361
+ file_path=file_path, columns=columns)
362
+
363
+ # --- Identify Interaction Term Suggestion ---
364
+ interaction_term_suggested, interaction_variable_candidate = identify_interaction_term(llm=llm, treatment_variable=treatment_variable,
365
+ covariates=covariates,
366
+ column_categories=column_categories, query_text=query_text,
367
+ dataset_description=dataset_description)
368
+
369
+
370
+ # --- Consolidate ---
371
+ return {
372
+ "treatment_variable": treatment_variable,
373
+ "treatment_variable_type": treatment_variable_type,
374
+ "outcome_variable": outcome_variable,
375
+ "covariates": covariates,
376
+ "time_variable": time_variable,
377
+ "group_variable": group_variable,
378
+ "instrument_variable": instrument_variable,
379
+ "running_variable": running_variable,
380
+ "cutoff_value": cutoff_value,
381
+ "is_rct": is_rct,
382
+ "treatment_reference_level": treatment_reference_level,
383
+ "interaction_term_suggested": interaction_term_suggested,
384
+ "interaction_variable_candidate": interaction_variable_candidate,
385
+ "confounders": confounders,
386
+ "did_term": did_term_result
387
+ }
388
+
389
+ def compute_smd(df: pd.DataFrame, treat, covars_list) -> Dict[str, float]:
390
+ """
391
+ Computes the standardized mean differences (SMD) between treated and control groups
392
+ Args:
393
+ df (pd.DataFrame): The dataset.
394
+ treat (str): Name of the binary treatment column (0/1).
395
+ covars_list (List[str]): List of covariate names to consider for SMD calculation
396
+
397
+ Returns:
398
+ Dict[str, float]: average absolute SMD under "ate" (pooled SD) and "att" (treated-group SD) scaling
399
+ """
400
+ logger.info(f"Computing SMD for treatment variable '{treat}' with covariates: {covars_list}")
401
+ df_t = df[df[treat] == 1]
402
+ df_c = df[df[treat] == 0]
403
+
404
+ covariates = covars_list if covars_list else df.columns.tolist()
405
+ smd_ate = np.full(len(covariates), np.nan)
406
+ smd_att = np.full(len(covariates), np.nan)
407
+
408
+ for i, col in enumerate(covariates):
409
+ try:
410
+ m_t, m_c = df_t[col].mean(), df_c[col].mean()
411
+ s_t, s_c = df_t[col].std(ddof=0), df_c[col].std(ddof=0)
412
+ pooled = np.sqrt((s_t**2 + s_c**2) / 2)
413
+
414
+ ate_val = 0.0 if pooled == 0 else (m_t - m_c) / pooled
415
+ att_val = 0.0 if s_t == 0 else (m_t - m_c) / s_t
416
+
417
+ smd_ate[i] = ate_val
418
+ smd_att[i] = att_val
419
+ except Exception as e:
420
+ logger.warning(f"SMD computation failed for column '{col}': {e}")
421
+ continue
422
+
423
+ avg_ate = np.nanmean(np.abs(smd_ate))
424
+ avg_att = np.nanmean(np.abs(smd_att))
425
+
426
+ return {"ate":avg_ate, "att":avg_att}
427
+
428
+
429
+
430
+ # --- Helper Functions for Hybrid Identification ---
431
+ def _identify_variable_hybrid(role: str, query_hints: List[str], dataset_suggestions: List[str],
432
+ columns: List[str], column_categories: Dict[str, str],
433
+ prioritize_types: List[str], query_text: str,
434
+ dataset_description: Optional[str],llm: Optional[BaseChatModel],
435
+ exclude_vars: Optional[List[str]] = None) -> Optional[str]:
436
+ """
437
+ Identifies a variable from the available information by prompting the LLM. In case of failure,
438
+ it falls back to a programmatic selection (heuristics).
439
+
440
+ Args:
441
+ role: variable type (treatment or outcome)
442
+ query_hints: hints from the query for this variable
443
+ dataset_suggestions: dataset-specific suggestions for this variable
444
+ columns: list of available columns in the dataset
445
+ column_categories: mapping of column names to their categories
446
+ prioritize_types: types to prioritize for this variable
447
+ query_text: the original query text
448
+ dataset_description: description of the dataset
449
+ llm: language model
450
+ exclude_vars: list of variables to exclude from selection (e.g., treatment for outcome)
451
+ Returns:
452
+ str: name of the identified variable, or None if not found
453
+ """
454
+
455
+ candidates = set()
456
+ available_columns = [c for c in columns if c not in (exclude_vars or [])]
457
+ if not available_columns: return None
458
+
459
+ # 1. Exact matches from hints
460
+ for hint in query_hints:
461
+ if hint in available_columns:
462
+ candidates.add(hint)
463
+ # 2. Add dataset suggestions
464
+ for sugg in dataset_suggestions:
465
+ if sugg in available_columns:
466
+ candidates.add(sugg)
467
+
468
+ # 3. Programmatic Filtering based on type
469
+ plausible_candidates = [c for c in candidates if column_categories.get(c) in prioritize_types]
470
+
471
+ if llm:
472
+ if role == "treatment":
473
+ prompt_template = TREATMENT_VAR_IDENTIFICATION_PROMPT_TEMPLATE
474
+ elif role == "outcome":
475
+ prompt_template = OUTCOME_VAR_IDENTIFICATION_PROMPT_TEMPLATE
476
+ else:
477
+ raise ValueError(f"Unsupported role for LLM variable identification: {role}")
478
+
479
+ prompt = prompt_template.format(query=query_text, description=dataset_description,
480
+ column_info=available_columns)
481
+ llm_choice = _call_llm_for_var(llm, prompt, LLMSelectedVariable)
482
+
483
+ if llm_choice and llm_choice.variable_name in available_columns:
484
+ logger.info(f"LLM selected {role}: {llm_choice.variable_name}")
485
+ return llm_choice.variable_name
486
+ else:
487
+ fallback = plausible_candidates[0] if plausible_candidates else None
488
+ logger.warning(f"LLM failed to select valid {role}. Falling back to: {fallback}")
489
+ return fallback
490
+
491
+ if plausible_candidates:
492
+ logger.info(f"No LLM provided. Using first plausible {role}: {plausible_candidates[0]}")
493
+ return plausible_candidates[0]
494
+
495
+ logger.warning(f"No plausible candidates for {role}. Cannot identify variable.")
496
+ return None
497
+
498
+
499
+ def _identify_covariates_hybrid(role, treatment_variable: Optional[str], outcome_variable: Optional[str],
500
+ columns: List[str], column_categories: Dict[str, str], query_hints: List[str],
501
+ query_text: str, dataset_description: Optional[str], llm: Optional[BaseChatModel]) -> List[str]:
502
+ """
503
+ Prompts an LLM to identify the covariates
504
+ """
505
+
506
+ # 1. Initial Programmatic Filtering
507
+ exclude_cols = [treatment_variable, outcome_variable]
508
+ potential_covariates = [col for col in columns if col not in exclude_cols and col is not None]
509
+
510
+ # Filter out unusable types
511
+ usable_covariates = [col for col in potential_covariates if column_categories.get(col) not in ["text_or_other"]]
512
+ logger.debug(f"Initial usable covariates: {usable_covariates}")
513
+
514
+ # 2. LLM Refinement (if LLM available)
515
+ if llm:
516
+ logger.info("Using LLM to refine covariate list...")
517
+ prompt = ""
518
+ if role == "covars":
519
+ prompt = COVARIATES_IDENTIFICATION_PROMPT_TEMPLATE.format(query=query_text, description=dataset_description,
520
+ column_info=", ".join(usable_covariates),
521
+ treatment=treatment_variable, outcome=outcome_variable)
522
+ elif role == "confounders":
523
+ prompt = CONFOUNDER_IDENTIFICATION_PROMPT_TEMPLATE.format(query=query_text, description=dataset_description,
524
+ column_info=", ".join(usable_covariates),
525
+ treatment=treatment_variable, outcome=outcome_variable)
526
+ llm_selection = _call_llm_for_var(llm, prompt, LLMSelectedCovariates)
527
+
528
+ if llm_selection and llm_selection.covariates:
529
+ # Validate LLM output against available columns
530
+ valid_llm_covs = [c for c in llm_selection.covariates if c in usable_covariates]
531
+ if len(valid_llm_covs) < len(llm_selection.covariates):
532
+ logger.warning("LLM suggested covariates not found in initial usable list.")
533
+ if valid_llm_covs: # Use LLM selection if it's valid and non-empty
534
+ logger.info(f"LLM refined covariates to: {valid_llm_covs}")
535
+ return valid_llm_covs[:10] # Cap at 10
536
+ else:
537
+ logger.warning("LLM refinement failed or returned empty/invalid list. Falling back.")
538
+ else:
539
+ logger.warning("LLM refinement call failed or returned no covariates. Falling back.")
540
+
541
+ # 3. Fallback to Programmatic List (Capped)
542
+ logger.info(f"Using programmatically determined covariates (capped at 10): {usable_covariates[:10]}")
543
+ return usable_covariates[:10]
544
+
545
+ def _create_identify_prompt(target: str, query: str, description: Optional[str], columns: List[str],
546
+ categories: Dict[str,str], treatment: Optional[str], outcome: Optional[str]) -> str:
547
+ """
548
+ Creates a prompt to ask LLM to identify specific roles like IV, RDD, or RCT by selecting and formatting a specific template
549
+ """
550
+ column_info = "\n".join([f"- '{c}' (Type: {categories.get(c, 'Unknown')})" for c in columns])
551
+
552
+ # Select the appropriate detailed prompt template based on the target
553
+ if "instrumental variable" in target.lower():
554
+ template = IV_IDENTIFICATION_PROMPT_TEMPLATE
555
+ elif "regression discontinuity" in target.lower():
556
+ template = RDD_IDENTIFICATION_PROMPT_TEMPLATE
557
+ elif "rct" in target.lower():
558
+ template = RCT_IDENTIFICATION_PROMPT_TEMPLATE
559
+ else:
560
+ # Fallback or error? For now, let's raise an error if target is unexpected.
561
+ logger.error(f"Unsupported target for _create_identify_prompt: {target}")
562
+ raise ValueError(f"Unsupported target for specific identification prompt: {target}")
563
+
564
+ # Format the selected template with the provided context
565
+ prompt = template.format(query=query, description=description or 'N/A', column_info=column_info,
566
+ treatment=treatment or 'N/A', outcome=outcome or 'N/A')
567
+ return prompt
568
+
569
+ def _call_llm_for_var(llm: BaseChatModel, prompt: str, pydantic_model: BaseModel) -> Optional[BaseModel]:
570
+ """Helper to call LLM with structured output and handle errors."""
571
+ try:
572
+ messages = [HumanMessage(content=prompt)]
573
+ structured_llm = llm.with_structured_output(pydantic_model)
574
+ parsed_result = structured_llm.invoke(messages)
575
+ return parsed_result
576
+ except (OutputParserException, ValidationError) as e:
577
+ logger.error(f"LLM call failed parsing/validation for {pydantic_model.__name__}: {e}")
578
+ except Exception as e:
579
+ logger.error(f"LLM call failed unexpectedly for {pydantic_model.__name__}: {e}", exc_info=True)
580
+ return None
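For reference, the quantity compute_smd averages is the standardized mean difference per covariate; a self-contained toy check of the same formula (pandas/numpy only, synthetic data, independent of the module above):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "treat": rng.integers(0, 2, size=500),
    "age": rng.normal(40, 10, size=500),
})
df.loc[df["treat"] == 1, "age"] += 3   # induce a mild imbalance on age

t = df.loc[df["treat"] == 1, "age"]
c = df.loc[df["treat"] == 0, "age"]
pooled_sd = np.sqrt((t.std(ddof=0) ** 2 + c.std(ddof=0) ** 2) / 2)

smd_ate = (t.mean() - c.mean()) / pooled_sd      # "ate" scaling: pooled SD
smd_att = (t.mean() - c.mean()) / t.std(ddof=0)  # "att" scaling: treated-group SD
print(round(smd_ate, 2), round(smd_att, 2))      # both come out around 0.3 here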
auto_causal/components/state_manager.py ADDED
@@ -0,0 +1,40 @@
1
+ """
2
+ State management utilities for the auto_causal workflow.
3
+
4
+ This module provides utility functions to create standardized state updates
5
+ for passing between tools in the auto_causal agent workflow.
6
+ """
7
+
8
+ from typing import Dict, Any, Optional
9
+
10
+ def create_workflow_state_update(
11
+ current_step: str,
12
+ step_completed_flag: bool,
13
+ next_tool: str,
14
+ next_step_reason: str,
15
+ error: Optional[str] = None
16
+ ) -> Dict[str, Any]:
17
+ """
18
+ Create a standardized workflow state update dictionary.
19
+
20
+ Args:
21
+ current_step: Current step in the workflow (e.g., "input_processing")
22
+ step_completed_flag: Boolean flag indicating whether the current step completed successfully
23
+ next_tool: Name of the next tool to call
24
+ next_step_reason: Reason message for the next step
25
+ error: Optional error message if the step failed
26
+
27
+ Returns:
28
+ Dictionary containing the workflow_state sub-dictionary
29
+ """
30
+ state_update = {
31
+ "workflow_state": {
32
+ "current_step": current_step,
33
+ current_step + "_completed": step_completed_flag,
34
+ "next_tool": next_tool,
35
+ "next_step_reason": next_step_reason
36
+ }
37
+ }
38
+ if error:
39
+ state_update["workflow_state"]["error_message"] = error
40
+ return state_update
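A call such as the following (argument values are illustrative) shows the shape of the returned update, including the dynamically named `<current_step>_completed` key:

from auto_causal.components.state_manager import create_workflow_state_update

update = create_workflow_state_update(
    current_step="input_processing",
    step_completed_flag=True,
    next_tool="dataset_analyzer_tool",
    next_step_reason="Query parsed; dataset needs profiling next",
)
# update == {
#     "workflow_state": {
#         "current_step": "input_processing",
#         "input_processing_completed": True,
#         "next_tool": "dataset_analyzer_tool",
#         "next_step_reason": "Query parsed; dataset needs profiling next",
#     }
# }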
auto_causal/config.py ADDED
@@ -0,0 +1,97 @@
1
+ # auto_causal/config.py
2
+ """Central configuration for AutoCausal, including LLM client setup."""
3
+
4
+ import os
5
+ import logging
6
+ from typing import Optional
7
+
8
+ # Langchain imports
9
+ from langchain_core.language_models import BaseChatModel
10
+ from langchain_openai import ChatOpenAI # Default
11
+ from langchain_anthropic import ChatAnthropic # Example
12
+ from langchain_google_genai import ChatGoogleGenerativeAI
13
+ # Add other providers if needed, e.g.:
14
+ # from langchain_community.chat_models import ChatOllama
15
+ from dotenv import load_dotenv
16
+ from langchain_deepseek import ChatDeepSeek
17
+ # Create a disk-backed SQLite cache:
18
+ # Import Together provider
19
+ from langchain_together import ChatTogether
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # Load .env file when this module is loaded
24
+ load_dotenv()
25
+
26
+ def get_llm_client(provider: Optional[str] = None, model_name: Optional[str] = None, **kwargs) -> BaseChatModel:
27
+ """Initializes and returns the chosen LLM client based on provider.
28
+
29
+ Reads provider, model, and API keys from environment variables if not passed directly.
30
+ Defaults to OpenAI GPT-4o-mini if no provider/model specified.
31
+ """
32
+ # Prioritize arguments, then environment variables, then defaults
33
+ provider = provider or os.getenv("LLM_PROVIDER", "openai")
34
+ provider = provider.lower()
35
+
36
+ # Default model depends on provider
37
+ default_models = {
38
+ "openai": "gpt-4o-mini",
39
+ "anthropic": "claude-3-5-sonnet-latest",
40
+ "together": "deepseek-ai/DeepSeek-V3", # Default Together model
41
+ "gemini" : "gemini-2.5-flash",
42
+ "deepseek" : "deepseek-chat"
43
+ }
44
+
45
+ model_name = model_name or os.getenv("LLM_MODEL", default_models.get(provider, default_models["openai"]))
46
+
47
+ api_key = None
48
+ if model_name not in ['o3-mini', 'o3', 'o4-mini']:
49
+ kwargs.setdefault("temperature", 0) # Default temperature if not provided
50
+
51
+ logger.info(f"Initializing LLM client: Provider='{provider}', Model='{model_name}'")
52
+
53
+ try:
54
+ if provider == "openai":
55
+ api_key = os.getenv("OPENAI_API_KEY")
56
+ if not api_key:
57
+ raise ValueError("OPENAI_API_KEY not found in environment.")
58
+ return ChatOpenAI(model=model_name, api_key=api_key, **kwargs)
59
+
60
+ elif provider == "anthropic":
61
+ api_key = os.getenv("ANTHROPIC_API_KEY")
62
+ if not api_key:
63
+ raise ValueError("ANTHROPIC_API_KEY not found in environment.")
64
+ return ChatAnthropic(model=model_name, api_key=api_key, **kwargs, streaming=False)
65
+
66
+ elif provider == "together":
67
+ api_key = os.getenv("TOGETHER_API_KEY")
68
+ if not api_key:
69
+ raise ValueError("TOGETHER_API_KEY not found in environment.")
70
+ return ChatTogether(model=model_name, api_key=api_key, **kwargs)
71
+
72
+ elif provider == "gemini":
73
+ api_key = os.getenv("GEMINI_API_KEY")
74
+ if not api_key:
75
+ raise ValueError("GEMINI_API_KEY not found in environment.")
76
+ return ChatGoogleGenerativeAI(model=model_name, google_api_key=api_key, **kwargs, function_calling="auto")
77
+
78
+ elif provider == "deepseek":
79
+ api_key = os.getenv("DEEPSEEK_API_KEY")
80
+ if not api_key:
81
+ raise ValueError("DEEPSEEK_API_KEY not found in environment.")
82
+ return ChatDeepSeek(model=model_name, **kwargs)
83
+
84
+ # Example for Ollama (ensure langchain_community is installed)
85
+ # elif provider == "ollama":
86
+ # try:
87
+ # from langchain_community.chat_models import ChatOllama
88
+ # return ChatOllama(model=model_name, **kwargs)
89
+ # except ImportError:
90
+ # raise ValueError("langchain_community needed for Ollama. Run `pip install langchain-community`")
91
+
92
+ else:
93
+ raise ValueError(f"Unsupported LLM provider: {provider}")
94
+
95
+ except Exception as e:
96
+ logger.error(f"Failed to initialize LLM (Provider: {provider}, Model: {model_name}): {e}")
97
+ raise # Re-raise the exception
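A minimal way to exercise the factory, assuming the matching API key is already set in the environment (the provider and model shown are just examples):

import os

from auto_causal.config import get_llm_client

os.environ.setdefault("LLM_PROVIDER", "openai")
os.environ.setdefault("LLM_MODEL", "gpt-4o-mini")
# OPENAI_API_KEY must already be set; otherwise get_llm_client raises ValueError.

llm = get_llm_client()  # reads provider/model from the environment
# llm = get_llm_client(provider="anthropic", model_name="claude-3-5-sonnet-latest")  # explicit override
print(llm.invoke("Reply with the single word OK").content)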
auto_causal/methods/__init__.py ADDED
@@ -0,0 +1,44 @@
1
+ """
2
+ Causal inference methods for the auto_causal module.
3
+
4
+ This package contains implementations of various causal inference methods
5
+ that can be selected and applied by the auto_causal pipeline.
6
+ """
7
+
8
+ from .causal_method import CausalMethod
9
+ from .propensity_score.matching import estimate_effect as psm_estimate_effect
10
+ from .propensity_score.weighting import estimate_effect as psw_estimate_effect
11
+ from .instrumental_variable.estimator import estimate_effect as iv_estimate_effect
12
+ from .difference_in_differences.estimator import estimate_effect as did_estimate_effect
13
+ from .diff_in_means.estimator import estimate_effect as dim_estimate_effect
14
+ from .linear_regression.estimator import estimate_effect as lr_estimate_effect
15
+ from .backdoor_adjustment.estimator import estimate_effect as ba_estimate_effect
16
+ from .regression_discontinuity.estimator import estimate_effect as rdd_estimate_effect
17
+ from .generalized_propensity_score.estimator import estimate_effect_gps
18
+
19
+ # Mapping of method names to their implementation functions
20
+ METHOD_MAPPING = {
21
+ "propensity_score_matching": psm_estimate_effect,
22
+ "propensity_score_weighting": psw_estimate_effect,
23
+ "instrumental_variable": iv_estimate_effect,
24
+ "difference_in_differences": did_estimate_effect,
25
+ "regression_discontinuity_design": rdd_estimate_effect,
26
+ "backdoor_adjustment": ba_estimate_effect,
27
+ "linear_regression": lr_estimate_effect,
28
+ "diff_in_means": dim_estimate_effect,
29
+ "generalized_propensity_score": estimate_effect_gps,
30
+ }
31
+
32
+ __all__ = [
33
+ "CausalMethod",
34
+ "psm_estimate_effect",
35
+ "psw_estimate_effect",
36
+ "iv_estimate_effect",
37
+ "did_estimate_effect",
38
+ "rdd_estimate_effect",
39
+ "dim_estimate_effect",
40
+ "lr_estimate_effect",
41
+ "ba_estimate_effect",
42
+ "METHOD_MAPPING",
43
+ "estimate_effect_gps",
44
+ ]
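METHOD_MAPPING is what lets downstream tools dispatch on a method name; a hedged sketch of that dispatch follows (the run_method wrapper and the example column names are illustrative, and the keyword arguments assume the df/treatment/outcome/covariates signature used by the estimators in this package):

import pandas as pd

from auto_causal.methods import METHOD_MAPPING


def run_method(name: str, df: pd.DataFrame, **kwargs):
    """Look up an estimator by name and run it, failing loudly on unknown names."""
    try:
        estimator = METHOD_MAPPING[name]
    except KeyError:
        raise ValueError(
            f"Unknown causal method: {name}. Available: {sorted(METHOD_MAPPING)}"
        )
    return estimator(df=df, **kwargs)

# e.g. run_method("backdoor_adjustment", df, treatment="treated",
#                 outcome="outcome", covariates=["age", "income"])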
auto_causal/methods/backdoor_adjustment/__init__.py ADDED
File without changes
auto_causal/methods/backdoor_adjustment/diagnostics.py ADDED
@@ -0,0 +1,92 @@
1
+ """
2
+ Diagnostic checks for Backdoor Adjustment models (typically OLS).
3
+ """
4
+
5
+ from typing import Dict, Any, List
6
+ import statsmodels.api as sm
7
+ from statsmodels.stats.diagnostic import het_breuschpagan
8
+ from statsmodels.stats.stattools import jarque_bera, durbin_watson
9
+ from statsmodels.regression.linear_model import RegressionResultsWrapper
10
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
11
+ import pandas as pd
12
+ import numpy as np
13
+ import logging
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ def run_backdoor_diagnostics(results: RegressionResultsWrapper, X: pd.DataFrame) -> Dict[str, Any]:
18
+ """
19
+ Runs diagnostic checks on a fitted OLS model used for backdoor adjustment.
20
+
21
+ Args:
22
+ results: A fitted statsmodels OLS results object.
23
+ X: The design matrix (including constant and all predictors) used.
24
+
25
+ Returns:
26
+ Dictionary containing diagnostic metrics.
27
+ """
28
+ diagnostics = {}
29
+ details = {}
30
+
31
+ try:
32
+ details['r_squared'] = results.rsquared
33
+ details['adj_r_squared'] = results.rsquared_adj
34
+ details['f_statistic'] = results.fvalue
35
+ details['f_p_value'] = results.f_pvalue
36
+ details['n_observations'] = int(results.nobs)
37
+ details['degrees_of_freedom_resid'] = int(results.df_resid)
38
+ details['durbin_watson'] = durbin_watson(results.resid) if results.nobs > 5 else 'N/A (Too few obs)' # Autocorrelation
39
+
40
+ # --- Normality of Residuals (Jarque-Bera) ---
41
+ try:
42
+ if results.nobs >= 2:
43
+ jb_value, jb_p_value, skew, kurtosis = jarque_bera(results.resid)
44
+ details['residuals_normality_jb_stat'] = jb_value
45
+ details['residuals_normality_jb_p_value'] = jb_p_value
46
+ details['residuals_skewness'] = skew
47
+ details['residuals_kurtosis'] = kurtosis
48
+ details['residuals_normality_status'] = "Normal" if jb_p_value > 0.05 else "Non-Normal"
49
+ else:
50
+ details['residuals_normality_status'] = "N/A (Too few obs)"
51
+ except Exception as e:
52
+ logger.warning(f"Could not run Jarque-Bera test: {e}")
53
+ details['residuals_normality_status'] = "Test Failed"
54
+
55
+ # --- Homoscedasticity (Breusch-Pagan) ---
56
+ try:
57
+ if X.shape[0] > X.shape[1]: # Needs more observations than predictors
58
+ lm_stat, lm_p_value, f_stat, f_p_value = het_breuschpagan(results.resid, X)
59
+ details['homoscedasticity_bp_lm_stat'] = lm_stat
60
+ details['homoscedasticity_bp_lm_p_value'] = lm_p_value
61
+ details['homoscedasticity_status'] = "Homoscedastic" if lm_p_value > 0.05 else "Heteroscedastic"
62
+ else:
63
+ details['homoscedasticity_status'] = "N/A (Too few obs or too many predictors)"
64
+ except Exception as e:
65
+ logger.warning(f"Could not run Breusch-Pagan test: {e}")
66
+ details['homoscedasticity_status'] = "Test Failed"
67
+
68
+ # --- Multicollinearity (VIF - Placeholder/Basic) ---
69
+ # Full VIF requires calculating for each predictor vs others.
70
+ # Providing a basic status based on condition number as a proxy.
71
+ try:
72
+ cond_no = np.linalg.cond(results.model.exog)
73
+ details['model_condition_number'] = cond_no
74
+ if cond_no > 30:
75
+ details['multicollinearity_status'] = "High (Cond. No. > 30)"
76
+ elif cond_no > 10:
77
+ details['multicollinearity_status'] = "Moderate (Cond. No. > 10)"
78
+ else:
79
+ details['multicollinearity_status'] = "Low"
80
+ except Exception as e:
81
+ logger.warning(f"Could not calculate condition number: {e}")
82
+ details['multicollinearity_status'] = "Check Failed"
83
+ # details['VIF'] = "Not Fully Implemented"
84
+
85
+ # --- Linearity (Still requires visual inspection) ---
86
+ details['linearity_check'] = "Requires visual inspection (e.g., residual vs fitted plot)"
87
+
88
+ return {"status": "Success", "details": details}
89
+
90
+ except Exception as e:
91
+ logger.error(f"Error running Backdoor Adjustment diagnostics: {e}")
92
+ return {"status": "Failed", "error": str(e), "details": details}
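A rough usage sketch of run_backdoor_diagnostics: it expects a fitted statsmodels OLS result plus the design matrix that produced it. The data and column names below are assumptions for illustration only.

import numpy as np
import pandas as pd
import statsmodels.api as sm
from auto_causal.methods.backdoor_adjustment.diagnostics import run_backdoor_diagnostics

rng = np.random.default_rng(0)
df = pd.DataFrame({"t": rng.integers(0, 2, 200), "x": rng.normal(size=200)})
df["y"] = 2.0 * df["t"] + 0.5 * df["x"] + rng.normal(size=200)

X = sm.add_constant(df[["t", "x"]])           # design matrix with intercept
results = sm.OLS(df["y"], X).fit()
diag = run_backdoor_diagnostics(results, X)   # {"status": ..., "details": {...}}
print(diag["status"], diag["details"]["homoscedasticity_status"])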
auto_causal/methods/backdoor_adjustment/estimator.py ADDED
@@ -0,0 +1,105 @@
1
+ """
2
+ Backdoor Adjustment Estimator using Regression.
3
+
4
+ Estimates the Average Treatment Effect (ATE) by regressing the outcome on the
5
+ treatment and a set of covariates assumed to satisfy the backdoor criterion.
6
+ """
7
+ import pandas as pd
8
+ import numpy as np
9
+ import statsmodels.api as sm
10
+ from typing import Dict, Any, List, Optional
11
+ import logging
12
+ from langchain.chat_models.base import BaseChatModel # For type hinting llm
13
+
14
+ # Import diagnostics and llm assist (placeholders for now)
15
+ from .diagnostics import run_backdoor_diagnostics
16
+ from .llm_assist import interpret_backdoor_results, identify_backdoor_set
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ def estimate_effect(
21
+ df: pd.DataFrame,
22
+ treatment: str,
23
+ outcome: str,
24
+ covariates: List[str], # Backdoor set - Required for this method
25
+ query: Optional[str] = None, # For potential LLM use
26
+ llm: Optional[BaseChatModel] = None, # For potential LLM use
27
+ **kwargs # To capture any other potential arguments
28
+ ) -> Dict[str, Any]:
29
+ """
30
+ Estimates the causal effect using Backdoor Adjustment (via OLS regression).
31
+
32
+ Assumes the provided `covariates` list satisfies the backdoor criterion.
33
+
34
+ Args:
35
+ df: Input DataFrame.
36
+ treatment: Name of the treatment variable column.
37
+ outcome: Name of the outcome variable column.
38
+ covariates: List of covariate names forming the backdoor adjustment set.
39
+ query: Optional user query for context (e.g., for LLM).
40
+ llm: Optional Language Model instance.
41
+ **kwargs: Additional keyword arguments.
42
+
43
+ Returns:
44
+ Dictionary containing estimation results:
45
+ - 'effect_estimate': The estimated coefficient for the treatment variable.
46
+ - 'p_value': The p-value associated with the treatment coefficient.
47
+ - 'confidence_interval': The 95% confidence interval for the effect.
48
+ - 'standard_error': The standard error of the treatment coefficient.
49
+ - 'formula': The regression formula used.
50
+ - 'model_summary': Summary object from statsmodels.
51
+ - 'diagnostics': Placeholder for diagnostic results.
52
+ - 'interpretation': LLM interpretation.
53
+ """
54
+ if not covariates: # Check if the list is empty or None
55
+ raise ValueError("Backdoor Adjustment requires a non-empty list of covariates (adjustment set).")
56
+
57
+ required_cols = [treatment, outcome] + covariates
58
+ missing_cols = [col for col in required_cols if col not in df.columns]
59
+ if missing_cols:
60
+ raise ValueError(f"Missing required columns for Backdoor Adjustment: {missing_cols}")
61
+
62
+ # Prepare data for statsmodels (add constant, handle potential NaNs)
63
+ df_analysis = df[required_cols].dropna()
64
+ if df_analysis.empty:
65
+ raise ValueError("No data remaining after dropping NaNs for required columns.")
66
+
67
+ X = df_analysis[[treatment] + covariates]
68
+ X = sm.add_constant(X) # Add intercept
69
+ y = df_analysis[outcome]
70
+
71
+ # Build the formula string for reporting
72
+ formula = f"{outcome} ~ {treatment} + " + " + ".join(covariates) + " + const"
73
+ logger.info(f"Running Backdoor Adjustment regression: {formula}")
74
+
75
+ try:
76
+ model = sm.OLS(y, X)
77
+ results = model.fit()
78
+
79
+ effect_estimate = results.params[treatment]
80
+ p_value = results.pvalues[treatment]
81
+ conf_int = results.conf_int(alpha=0.05).loc[treatment].tolist()
82
+ std_err = results.bse[treatment]
83
+
84
+ # Run diagnostics (Placeholders)
85
+ # Pass the full design matrix X for potential VIF checks etc.
86
+ diag_results = run_backdoor_diagnostics(results, X)
87
+
88
+ # Get interpretation
89
+ interpretation = interpret_backdoor_results(results, diag_results, treatment, covariates, llm=llm)
90
+
91
+ return {
92
+ 'effect_estimate': effect_estimate,
93
+ 'p_value': p_value,
94
+ 'confidence_interval': conf_int,
95
+ 'standard_error': std_err,
96
+ 'formula': formula,
97
+ 'model_summary': results.summary(),
98
+ 'diagnostics': diag_results,
99
+ 'interpretation': interpretation,
100
+ 'method_used': 'Backdoor Adjustment (OLS)'
101
+ }
102
+
103
+ except Exception as e:
104
+ logger.error(f"Backdoor Adjustment failed: {e}")
105
+ raise
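A hedged end-to-end sketch of calling this estimator on simulated confounded data. The column names and effect size are made up; with llm omitted, the interpretation string falls back to its default.

import numpy as np
import pandas as pd
from auto_causal.methods.backdoor_adjustment.estimator import estimate_effect

rng = np.random.default_rng(1)
n = 500
conf = rng.normal(size=n)                            # single confounder
treat = (conf + rng.normal(size=n) > 0).astype(int)  # treatment depends on the confounder
y = 1.5 * treat + 2.0 * conf + rng.normal(size=n)
df = pd.DataFrame({"treat": treat, "conf": conf, "y": y})

res = estimate_effect(df, treatment="treat", outcome="y", covariates=["conf"])
print(res["effect_estimate"], res["confidence_interval"])   # should recover roughly 1.5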
auto_causal/methods/backdoor_adjustment/llm_assist.py ADDED
@@ -0,0 +1,176 @@
1
+ """
2
+ LLM assistance functions for Backdoor Adjustment analysis.
3
+ """
4
+
5
+ from typing import List, Dict, Any, Optional
6
+ import logging
7
+
8
+ # Imported for type hinting
9
+ from langchain.chat_models.base import BaseChatModel
10
+ from statsmodels.regression.linear_model import RegressionResultsWrapper
11
+
12
+ # Import shared LLM helpers
13
+ from auto_causal.utils.llm_helpers import call_llm_with_json_output
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ def identify_backdoor_set(
18
+ df_cols: List[str],
19
+ treatment: str,
20
+ outcome: str,
21
+ query: Optional[str] = None,
22
+ existing_covariates: Optional[List[str]] = None, # Allow user to provide some
23
+ llm: Optional[BaseChatModel] = None
24
+ ) -> List[str]:
25
+ """
26
+ Use LLM to suggest a potential backdoor adjustment set (confounders).
27
+
28
+ Tries to identify variables that affect both treatment and outcome.
29
+
30
+ Args:
31
+ df_cols: List of available column names in the dataset.
32
+ treatment: Treatment variable name.
33
+ outcome: Outcome variable name.
34
+ query: User's causal query text (provides context).
35
+ existing_covariates: Covariates already considered/provided by user.
36
+ llm: Optional LLM model instance.
37
+
38
+ Returns:
39
+ List of suggested variable names for the backdoor adjustment set.
40
+ """
41
+ if llm is None:
42
+ logger.warning("No LLM provided for backdoor set identification.")
43
+ return existing_covariates or []
44
+
45
+ # Exclude treatment and outcome from potential confounders
46
+ potential_confounders = [c for c in df_cols if c not in [treatment, outcome]]
47
+ if not potential_confounders:
48
+ return existing_covariates or []
49
+
50
+ prompt = f"""
51
+ You are assisting with identifying a backdoor adjustment set for causal inference.
52
+ The goal is to find observed variables that confound the relationship between the treatment and outcome.
53
+ Assume the causal effect of '{treatment}' on '{outcome}' is of interest.
54
+
55
+ User query context (optional): {query}
56
+ Available variables in the dataset (excluding treatment and outcome): {potential_confounders}
57
+ Variables already specified as covariates by user (if any): {existing_covariates}
58
+
59
+ Based *only* on the variable names and the query context, identify which of the available variables are likely to be common causes (confounders) of both '{treatment}' and '{outcome}'.
60
+ These variables should be included in the backdoor adjustment set.
61
+ Consider variables that likely occurred *before* or *at the same time as* the treatment.
62
+
63
+ Return ONLY a valid JSON object with the following structure (no explanations or surrounding text):
64
+ {{
65
+ "suggested_backdoor_set": ["confounder1", "confounder2", ...]
66
+ }}
67
+ Include variables from the user-provided list if they seem appropriate as confounders.
68
+ If no plausible confounders are identified among the available variables, return an empty list.
69
+ """
70
+
71
+ response = call_llm_with_json_output(llm, prompt)
72
+
73
+ suggested_set = []
74
+ if response and "suggested_backdoor_set" in response and isinstance(response["suggested_backdoor_set"], list):
75
+ # Basic validation
76
+ valid_vars = [item for item in response["suggested_backdoor_set"] if isinstance(item, str)]
77
+ if len(valid_vars) != len(response["suggested_backdoor_set"]):
78
+ logger.warning("LLM returned non-string items in suggested_backdoor_set list.")
79
+ suggested_set = valid_vars
80
+ else:
81
+ logger.warning(f"Failed to get valid backdoor set recommendations from LLM. Response: {response}")
82
+
83
+ # Combine with existing covariates, removing duplicates
84
+ final_set = list(dict.fromkeys((existing_covariates or []) + suggested_set))
85
+ return final_set
86
+
87
+ def interpret_backdoor_results(
88
+ results: RegressionResultsWrapper,
89
+ diagnostics: Dict[str, Any],
90
+ treatment_var: str,
91
+ covariates: List[str],
92
+ llm: Optional[BaseChatModel] = None
93
+ ) -> str:
94
+ """
95
+ Use LLM to interpret Backdoor Adjustment results.
96
+
97
+ Args:
98
+ results: Fitted statsmodels OLS results object.
99
+ diagnostics: Dictionary of diagnostic results.
100
+ treatment_var: Name of the treatment variable.
101
+ covariates: List of covariates used in the adjustment set.
102
+ llm: Optional LLM model instance.
103
+
104
+ Returns:
105
+ String containing natural language interpretation.
106
+ """
107
+ default_interpretation = "LLM interpretation not available for Backdoor Adjustment."
108
+ if llm is None:
109
+ logger.info("LLM not provided for Backdoor Adjustment interpretation.")
110
+ return default_interpretation
111
+
112
+ try:
113
+ # --- Prepare summary for LLM ---
114
+ results_summary = {}
115
+ diag_details = diagnostics.get('details', {})
116
+
117
+ effect = results.params.get(treatment_var)
118
+ pval = results.pvalues.get(treatment_var)
119
+
120
+ results_summary['Treatment Effect Estimate'] = f"{effect:.3f}" if isinstance(effect, (int, float)) else str(effect)
121
+ results_summary['P-value'] = f"{pval:.3f}" if isinstance(pval, (int, float)) else str(pval)
122
+ try:
123
+ conf_int = results.conf_int().loc[treatment_var]
124
+ results_summary['95% Confidence Interval'] = f"[{conf_int[0]:.3f}, {conf_int[1]:.3f}]"
125
+ except KeyError:
126
+ results_summary['95% Confidence Interval'] = "Not Found"
127
+ except Exception as ci_e:
128
+ results_summary['95% Confidence Interval'] = f"Error ({ci_e})"
129
+
130
+ results_summary['Adjustment Set (Covariates Used)'] = covariates
131
+ r_sq = diag_details.get('r_squared')
+ results_summary['Model R-squared'] = f"{r_sq:.3f}" if isinstance(r_sq, (int, float)) else "N/A"
132
+
133
+ diag_summary = {}
134
+ if diagnostics.get("status") == "Success":
135
+ diag_summary['Residuals Normality Status'] = diag_details.get('residuals_normality_status', 'N/A')
136
+ diag_summary['Homoscedasticity Status'] = diag_details.get('homoscedasticity_status', 'N/A')
137
+ diag_summary['Multicollinearity Status'] = diag_details.get('multicollinearity_status', 'N/A')
138
+ else:
139
+ diag_summary['Status'] = diagnostics.get("status", "Unknown")
140
+
141
+ # --- Construct Prompt ---
142
+ prompt = f"""
143
+ You are assisting with interpreting Backdoor Adjustment (Regression) results.
144
+ The key assumption is that the specified adjustment set (covariates) blocks all confounding paths between the treatment ('{treatment_var}') and outcome.
145
+
146
+ Results Summary:
147
+ {results_summary}
148
+
149
+ Diagnostics Summary (OLS model checks):
150
+ {diag_summary}
151
+
152
+ Explain these results in 2-4 concise sentences. Focus on:
153
+ 1. The estimated average treatment effect after adjusting for the specified covariates (magnitude, direction, statistical significance based on p-value < 0.05).
154
+ 2. **Crucially, mention that this estimate relies heavily on the assumption that the included covariates ('{str(covariates)[:100]}...') are sufficient to control for confounding (i.e., satisfy the backdoor criterion).**
155
+ 3. Briefly mention any major OLS diagnostic issues noted (e.g., non-normal residuals, heteroscedasticity, high multicollinearity).
156
+
157
+ Return ONLY a valid JSON object with the following structure (no explanations or surrounding text):
158
+ {{
159
+ "interpretation": "<your concise interpretation text>"
160
+ }}
161
+ """
162
+
163
+ # --- Call LLM ---
164
+ response = call_llm_with_json_output(llm, prompt)
165
+
166
+ # --- Process Response ---
167
+ if response and isinstance(response, dict) and \
168
+ "interpretation" in response and isinstance(response["interpretation"], str):
169
+ return response["interpretation"]
170
+ else:
171
+ logger.warning(f"Failed to get valid interpretation from LLM for Backdoor Adj. Response: {response}")
172
+ return default_interpretation
173
+
174
+ except Exception as e:
175
+ logger.error(f"Error during LLM interpretation for Backdoor Adj: {e}")
176
+ return f"Error generating interpretation: {e}"
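Worth noting from the code above: when no LLM instance is supplied, identify_backdoor_set degrades gracefully and simply echoes the user-supplied covariates. A tiny illustration with hypothetical column names:

from auto_causal.methods.backdoor_adjustment.llm_assist import identify_backdoor_set

cols = ["age", "income", "treat", "y"]
suggested = identify_backdoor_set(cols, treatment="treat", outcome="y",
                                  existing_covariates=["age"], llm=None)
print(suggested)   # -> ["age"]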
auto_causal/methods/causal_method.py ADDED
@@ -0,0 +1,88 @@
1
+ """
2
+ Abstract base class for all causal inference methods.
3
+
4
+ This module defines the interface that all causal inference methods
5
+ must implement, ensuring consistent behavior across different methods.
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+ from typing import Dict, List, Any
10
+ import pandas as pd
11
+
12
+
13
+ class CausalMethod(ABC):
14
+ """Base class for all causal inference methods.
15
+
16
+ This abstract class defines the required methods that all causal
17
+ inference implementations must provide. It ensures a consistent
18
+ interface across different methods like propensity score matching,
19
+ instrumental variables, etc.
20
+
21
+ Each implementation should handle the specifics of the causal
22
+ inference method while conforming to this interface.
23
+ """
24
+
25
+ @abstractmethod
26
+ def validate_assumptions(self, df: pd.DataFrame, treatment: str,
27
+ outcome: str, covariates: List[str]) -> Dict[str, Any]:
28
+ """Validate method assumptions against the dataset.
29
+
30
+ Args:
31
+ df: DataFrame containing the dataset
32
+ treatment: Name of the treatment variable column
33
+ outcome: Name of the outcome variable column
34
+ covariates: List of covariate column names
35
+
36
+ Returns:
37
+ Dict containing validation results with keys:
38
+ - assumptions_valid (bool): Whether all assumptions are met
39
+ - failed_assumptions (List[str]): List of failed assumptions
40
+ - warnings (List[str]): List of warnings
41
+ - suggestions (List[str]): Suggestions for addressing issues
42
+ """
43
+ pass
44
+
45
+ @abstractmethod
46
+ def estimate_effect(self, df: pd.DataFrame, treatment: str,
47
+ outcome: str, covariates: List[str]) -> Dict[str, Any]:
48
+ """Estimate causal effect using this method.
49
+
50
+ Args:
51
+ df: DataFrame containing the dataset
52
+ treatment: Name of the treatment variable column
53
+ outcome: Name of the outcome variable column
54
+ covariates: List of covariate column names
55
+
56
+ Returns:
57
+ Dict containing estimation results with keys:
58
+ - effect_estimate (float): Estimated causal effect
59
+ - confidence_interval (tuple): Confidence interval (lower, upper)
60
+ - p_value (float): P-value of the estimate
61
+ - additional_metrics (Dict): Any method-specific metrics
62
+ """
63
+ pass
64
+
65
+ @abstractmethod
66
+ def generate_code(self, dataset_path: str, treatment: str,
67
+ outcome: str, covariates: List[str]) -> str:
68
+ """Generate executable code for this causal method.
69
+
70
+ Args:
71
+ dataset_path: Path to the dataset file
72
+ treatment: Name of the treatment variable column
73
+ outcome: Name of the outcome variable column
74
+ covariates: List of covariate column names
75
+
76
+ Returns:
77
+ String containing executable Python code implementing this method
78
+ """
79
+ pass
80
+
81
+ @abstractmethod
82
+ def explain(self) -> str:
83
+ """Explain this causal method, its assumptions, and when to use it.
84
+
85
+ Returns:
86
+ String with detailed explanation of the method
87
+ """
88
+ pass
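To make the interface concrete, here is a hypothetical minimal subclass, not part of this commit, showing what an implementation of the four abstract methods might look like under the return-value contracts documented above.

from typing import Any, Dict, List
import pandas as pd
from auto_causal.methods.causal_method import CausalMethod

class NaiveDiffInMeans(CausalMethod):
    """Illustrative only: naive difference in group means, no adjustment."""

    def validate_assumptions(self, df: pd.DataFrame, treatment: str,
                             outcome: str, covariates: List[str]) -> Dict[str, Any]:
        is_binary = set(df[treatment].dropna().unique()).issubset({0, 1})
        return {"assumptions_valid": is_binary,
                "failed_assumptions": [] if is_binary else ["treatment is not binary 0/1"],
                "warnings": [], "suggestions": []}

    def estimate_effect(self, df: pd.DataFrame, treatment: str,
                        outcome: str, covariates: List[str]) -> Dict[str, Any]:
        means = df.groupby(treatment)[outcome].mean()
        effect = float(means.get(1, float("nan")) - means.get(0, float("nan")))
        return {"effect_estimate": effect, "confidence_interval": (None, None),
                "p_value": None, "additional_metrics": {}}

    def generate_code(self, dataset_path: str, treatment: str,
                      outcome: str, covariates: List[str]) -> str:
        return (f"import pandas as pd\n"
                f"df = pd.read_csv({dataset_path!r})\n"
                f"print(df.groupby({treatment!r})[{outcome!r}].mean())\n")

    def explain(self) -> str:
        return "Compares mean outcomes between treated and control; valid only absent confounding."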
auto_causal/methods/diff_in_means/__init__.py ADDED
File without changes
auto_causal/methods/diff_in_means/diagnostics.py ADDED
@@ -0,0 +1,60 @@
1
+ """
2
+ Basic descriptive statistics for Difference in Means.
3
+ """
4
+
5
+ from typing import Dict, Any
6
+ import pandas as pd
7
+ import numpy as np
8
+ import logging
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ def run_dim_diagnostics(df: pd.DataFrame, treatment: str, outcome: str) -> Dict[str, Any]:
13
+ """
14
+ Calculates basic descriptive statistics for treatment and control groups.
15
+
16
+ Args:
17
+ df: Input DataFrame (should already be filtered for NaNs in treatment/outcome).
18
+ treatment: Name of the binary treatment variable column.
19
+ outcome: Name of the outcome variable column.
20
+
21
+ Returns:
22
+ Dictionary containing group means, standard deviations, and counts.
23
+ """
24
+ details = {}
25
+ try:
26
+ grouped = df.groupby(treatment)[outcome]
27
+ stats = grouped.agg(['mean', 'std', 'count'])
28
+
29
+ # Ensure both groups (0 and 1) are present if possible
30
+ control_stats = stats.loc[0].to_dict() if 0 in stats.index else {'mean': np.nan, 'std': np.nan, 'count': 0}
31
+ treated_stats = stats.loc[1].to_dict() if 1 in stats.index else {'mean': np.nan, 'std': np.nan, 'count': 0}
32
+
33
+ details['control_group_stats'] = control_stats
34
+ details['treated_group_stats'] = treated_stats
35
+
36
+ if control_stats['count'] == 0 or treated_stats['count'] == 0:
37
+ logger.warning("One or both treatment groups have zero observations.")
38
+ return {"status": "Warning - Empty Group(s)", "details": details}
39
+
40
+ # Simple check for variance difference (Levene's test could be added)
41
+ control_std = control_stats.get('std', 0)
42
+ treated_std = treated_stats.get('std', 0)
43
+ if control_std > 0 and treated_std > 0:
44
+ ratio = (control_std**2) / (treated_std**2)
45
+ details['variance_ratio_control_div_treated'] = ratio
46
+ if ratio > 4 or ratio < 0.25: # Rule of thumb
47
+ details['variance_homogeneity_status'] = "Potentially Unequal (ratio > 4 or < 0.25)"
48
+ else:
49
+ details['variance_homogeneity_status'] = "Likely Similar"
50
+ else:
51
+ details['variance_homogeneity_status'] = "Could not calculate (zero variance in a group)"
52
+
53
+ return {"status": "Success", "details": details}
54
+
55
+ except KeyError as ke:
56
+ logger.error(f"KeyError during diagnostics: {ke}. Treatment levels might not be 0/1.")
57
+ return {"status": "Failed", "error": f"Treatment levels might not be 0/1: {ke}", "details": details}
58
+ except Exception as e:
59
+ logger.error(f"Error running Difference in Means diagnostics: {e}")
60
+ return {"status": "Failed", "error": str(e), "details": details}
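A quick sketch of the diagnostic output shape on simulated data (names and values are illustrative assumptions):

import numpy as np
import pandas as pd
from auto_causal.methods.diff_in_means.diagnostics import run_dim_diagnostics

rng = np.random.default_rng(4)
df = pd.DataFrame({"treat": rng.integers(0, 2, 100)})
df["y"] = 0.5 * df["treat"] + rng.normal(size=100)

report = run_dim_diagnostics(df, "treat", "y")
print(report["status"], report["details"]["treated_group_stats"])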
auto_causal/methods/diff_in_means/estimator.py ADDED
@@ -0,0 +1,107 @@
1
+ """
2
+ Difference in Means / Simple Linear Regression Estimator.
3
+
4
+ Estimates the Average Treatment Effect (ATE) by comparing the mean outcome
5
+ between the treated and control groups. This is equivalent to a simple OLS
6
+ regression of the outcome on the treatment indicator.
7
+
8
+ Assumes no confounding (e.g., suitable for RCT data).
9
+ """
10
+ import pandas as pd
11
+ import statsmodels.api as sm
12
+ import numpy as np
13
+ import warnings
14
+ from typing import Dict, Any, Optional
15
+ import logging
16
+ from langchain.chat_models.base import BaseChatModel # For type hinting llm
17
+
18
+ from .diagnostics import run_dim_diagnostics
19
+ from .llm_assist import interpret_dim_results
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ def estimate_effect(
24
+ df: pd.DataFrame,
25
+ treatment: str,
26
+ outcome: str,
27
+ query: Optional[str] = None, # For potential LLM use
28
+ llm: Optional[BaseChatModel] = None, # For potential LLM use
29
+ **kwargs # To capture any other potential arguments (e.g., covariates - which are ignored)
30
+ ) -> Dict[str, Any]:
31
+ """
32
+ Estimates the causal effect using Difference in Means (via OLS).
33
+
34
+ Ignores any provided covariates.
35
+
36
+ Args:
37
+ df: Input DataFrame.
38
+ treatment: Name of the binary treatment variable column (should be 0 or 1).
39
+ outcome: Name of the outcome variable column.
40
+ query: Optional user query for context.
41
+ llm: Optional Language Model instance.
42
+ **kwargs: Additional keyword arguments (ignored).
43
+
44
+ Returns:
45
+ Dictionary containing estimation results:
46
+ - 'effect_estimate': The difference in means (treatment coefficient).
47
+ - 'p_value': The p-value associated with the difference.
48
+ - 'confidence_interval': The 95% confidence interval for the difference.
49
+ - 'standard_error': The standard error of the difference.
50
+ - 'formula': The regression formula used.
51
+ - 'model_summary': Summary object from statsmodels.
52
+ - 'diagnostics': Basic group statistics.
53
+ - 'interpretation': LLM interpretation.
54
+ """
55
+ required_cols = [treatment, outcome]
56
+ missing_cols = [col for col in required_cols if col not in df.columns]
57
+ if missing_cols:
58
+ raise ValueError(f"Missing required columns: {missing_cols}")
59
+
60
+ # Validate treatment is binary (or close to it)
61
+ treat_vals = df[treatment].dropna().unique()
62
+ if not np.all(np.isin(treat_vals, [0, 1])):
63
+ warnings.warn(f"Treatment column '{treatment}' contains values other than 0 and 1: {treat_vals}. Proceeding, but results may be unreliable.", UserWarning)
64
+ # Optional: could raise ValueError here if strict binary is required
65
+
66
+ # Prepare data for statsmodels (add constant, handle potential NaNs)
67
+ df_analysis = df[required_cols].dropna()
68
+ if df_analysis.empty:
69
+ raise ValueError("No data remaining after dropping NaNs for required columns.")
70
+
71
+ X = df_analysis[[treatment]]
72
+ X = sm.add_constant(X) # Add intercept
73
+ y = df_analysis[outcome]
74
+
75
+ formula = f"{outcome} ~ {treatment} + const"
76
+ logger.info(f"Running Difference in Means regression: {formula}")
77
+
78
+ try:
79
+ model = sm.OLS(y, X)
80
+ results = model.fit()
81
+
82
+ effect_estimate = results.params[treatment]
83
+ p_value = results.pvalues[treatment]
84
+ conf_int = results.conf_int(alpha=0.05).loc[treatment].tolist()
85
+ std_err = results.bse[treatment]
86
+
87
+ # Run basic diagnostics (group means, stds, counts)
88
+ diag_results = run_dim_diagnostics(df_analysis, treatment, outcome)
89
+
90
+ # Get interpretation
91
+ interpretation = interpret_dim_results(results, diag_results, treatment, llm=llm)
92
+
93
+ return {
94
+ 'effect_estimate': effect_estimate,
95
+ 'p_value': p_value,
96
+ 'confidence_interval': conf_int,
97
+ 'standard_error': std_err,
98
+ 'formula': formula,
99
+ 'model_summary': results.summary(),
100
+ 'diagnostics': diag_results,
101
+ 'interpretation': interpretation,
102
+ 'method_used': 'Difference in Means (OLS)'
103
+ }
104
+
105
+ except Exception as e:
106
+ logger.error(f"Difference in Means failed: {e}")
107
+ raise
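A minimal usage sketch on simulated RCT-style data; the column names and the true effect of 0.8 are assumptions chosen for illustration.

import numpy as np
import pandas as pd
from auto_causal.methods.diff_in_means.estimator import estimate_effect

rng = np.random.default_rng(2)
n = 400
treat = rng.integers(0, 2, n)                  # randomized 0/1 assignment
y = 0.8 * treat + rng.normal(size=n)
df = pd.DataFrame({"treat": treat, "y": y})

res = estimate_effect(df, treatment="treat", outcome="y")
print(res["effect_estimate"], res["p_value"])  # estimate should land close to 0.8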
auto_causal/methods/diff_in_means/llm_assist.py ADDED
@@ -0,0 +1,95 @@
1
+ """
2
+ LLM assistance functions for Difference in Means analysis.
3
+ """
4
+
5
+ from typing import Dict, Any, Optional
6
+ import logging
7
+
8
+ # Imported for type hinting
9
+ from langchain.chat_models.base import BaseChatModel
10
+ from statsmodels.regression.linear_model import RegressionResultsWrapper
11
+
12
+ # Import shared LLM helpers
13
+ from auto_causal.utils.llm_helpers import call_llm_with_json_output
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ def interpret_dim_results(
18
+ results: RegressionResultsWrapper,
19
+ diagnostics: Dict[str, Any],
20
+ treatment_var: str,
21
+ llm: Optional[BaseChatModel] = None
22
+ ) -> str:
23
+ """
24
+ Use LLM to interpret Difference in Means results.
25
+
26
+ Args:
27
+ results: Fitted statsmodels OLS results object (from outcome ~ treatment).
28
+ diagnostics: Dictionary of diagnostic results (group stats).
29
+ treatment_var: Name of the treatment variable.
30
+ llm: Optional LLM model instance.
31
+
32
+ Returns:
33
+ String containing natural language interpretation.
34
+ """
35
+ default_interpretation = "LLM interpretation not available for Difference in Means."
36
+ if llm is None:
37
+ logger.info("LLM not provided for Difference in Means interpretation.")
38
+ return default_interpretation
39
+
40
+ try:
41
+ # --- Prepare summary for LLM ---
42
+ results_summary = {}
43
+ diag_details = diagnostics.get('details', {})
44
+ control_stats = diag_details.get('control_group_stats', {})
45
+ treated_stats = diag_details.get('treated_group_stats', {})
46
+
47
+ effect = results.params.get(treatment_var)
48
+ pval = results.pvalues.get(treatment_var)
49
+
50
+ results_summary['Effect Estimate (Difference in Means)'] = f"{effect:.3f}" if isinstance(effect, (int, float)) else str(effect)
51
+ results_summary['P-value'] = f"{pval:.3f}" if isinstance(pval, (int, float)) else str(pval)
52
+ try:
53
+ conf_int = results.conf_int().loc[treatment_var]
54
+ results_summary['95% Confidence Interval'] = f"[{conf_int[0]:.3f}, {conf_int[1]:.3f}]"
55
+ except KeyError:
56
+ results_summary['95% Confidence Interval'] = "Not Found"
57
+ except Exception as ci_e:
58
+ results_summary['95% Confidence Interval'] = f"Error ({ci_e})"
59
+
60
+ results_summary['Control Group Mean Outcome'] = f"{control_stats.get('mean', 'N/A'):.3f}" if isinstance(control_stats.get('mean'), (int, float)) else str(control_stats.get('mean'))
61
+ results_summary['Treated Group Mean Outcome'] = f"{treated_stats.get('mean', 'N/A'):.3f}" if isinstance(treated_stats.get('mean'), (int, float)) else str(treated_stats.get('mean'))
62
+ results_summary['Control Group Size'] = control_stats.get('count', 'N/A')
63
+ results_summary['Treated Group Size'] = treated_stats.get('count', 'N/A')
64
+
65
+ # --- Construct Prompt ---
66
+ prompt = f"""
67
+ You are assisting with interpreting Difference in Means results, likely from an RCT.
68
+
69
+ Results Summary:
70
+ {results_summary}
71
+
72
+ Explain these results in 1-3 concise sentences. Focus on:
73
+ 1. The estimated average treatment effect (magnitude, direction, statistical significance based on p-value < 0.05).
74
+ 2. Compare the mean outcomes between the treated and control groups.
75
+
76
+ Return ONLY a valid JSON object with the following structure (no explanations or surrounding text):
77
+ {{
78
+ "interpretation": "<your concise interpretation text>"
79
+ }}
80
+ """
81
+
82
+ # --- Call LLM ---
83
+ response = call_llm_with_json_output(llm, prompt)
84
+
85
+ # --- Process Response ---
86
+ if response and isinstance(response, dict) and \
87
+ "interpretation" in response and isinstance(response["interpretation"], str):
88
+ return response["interpretation"]
89
+ else:
90
+ logger.warning(f"Failed to get valid interpretation from LLM for Difference in Means. Response: {response}")
91
+ return default_interpretation
92
+
93
+ except Exception as e:
94
+ logger.error(f"Error during LLM interpretation for Difference in Means: {e}")
95
+ return f"Error generating interpretation: {e}"
auto_causal/methods/difference_in_differences/diagnostics.py ADDED
@@ -0,0 +1,345 @@
1
+ """Diagnostic functions for Difference-in-Differences method."""
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ from typing import Dict, Any, Optional, List
6
+ import logging
7
+ import statsmodels.formula.api as smf # Import statsmodels
8
+ from patsy import PatsyError # To catch formula errors
9
+
10
+ # Import helper function from estimator -> Change to utils
11
+ from .utils import create_post_indicator
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ def validate_parallel_trends(df: pd.DataFrame, time_var: str, outcome: str,
16
+ group_indicator_col: str, treatment_period_start: Any,
17
+ dataset_description: Optional[str] = None,
18
+ time_varying_covariates: Optional[List[str]] = None) -> Dict[str, Any]:
19
+ """Validates the parallel trends assumption using pre-treatment data.
20
+
21
+ Regresses the outcome on group-specific time trends before the treatment period.
22
+ Tests if the interaction terms between group and pre-treatment time periods are jointly significant.
23
+
24
+ Args:
25
+ df: DataFrame containing the data.
26
+ time_var: Name of the time variable column.
27
+ outcome: Name of the outcome variable column.
28
+ group_indicator_col: Name of the binary treatment group indicator column (0/1).
29
+ treatment_period_start: The time period value when treatment starts.
30
+ dataset_description: Optional free-text description of the dataset.
31
+ time_varying_covariates: Optional list of time-varying covariates to include.
32
+
33
+ Returns:
34
+ Dictionary with validation results.
35
+ """
36
+ logger.info("Validating parallel trends...")
37
+ validation_result = {"valid": False, "p_value": 1.0, "details": "", "error": None}
38
+
39
+ try:
40
+ # Filter pre-treatment data
41
+ pre_df = df[df[time_var] < treatment_period_start].copy()
42
+
43
+ if len(pre_df) < 20 or pre_df[group_indicator_col].nunique() < 2 or pre_df[time_var].nunique() < 2:
44
+ validation_result["details"] = "Insufficient pre-treatment data or variation to perform test."
45
+ logger.warning(validation_result["details"])
46
+ # Assume valid if cannot test? Or invalid? Let's default to True if we can't test
47
+ validation_result["valid"] = True
48
+ validation_result["details"] += " Defaulting to assuming parallel trends (unable to test)."
49
+ return validation_result
50
+
51
+ # Check if group indicator is binary
52
+ if pre_df[group_indicator_col].nunique() > 2:
53
+ validation_result["details"] = f"Group indicator '{group_indicator_col}' has more than 2 unique values. Using simple visual assessment."
54
+ logger.warning(validation_result["details"])
55
+ # Use visual assessment method instead (check if trends look roughly parallel)
56
+ validation_result = assess_trends_visually(pre_df, time_var, outcome, group_indicator_col)
57
+ # Ensure p_value is set
58
+ if validation_result["p_value"] is None:
59
+ validation_result["p_value"] = 1.0 if validation_result["valid"] else 0.04
60
+ return validation_result
61
+
62
+ # Use a robust approach first - test for pre-trend differences using a simpler model
63
+ try:
64
+ # Create a linear time trend
65
+ pre_df['time_trend'] = pre_df[time_var].astype(float)
66
+
67
+ # Create interaction between trend and group
68
+ pre_df['group_trend'] = pre_df['time_trend'] * pre_df[group_indicator_col].astype(float)
69
+
70
+ # Simple regression with linear trend interaction
71
+ simple_formula = f"Q('{outcome}') ~ Q('{group_indicator_col}') + time_trend + group_trend"
72
+ simple_model = smf.ols(simple_formula, data=pre_df)
73
+ simple_results = simple_model.fit()
74
+
75
+ # Check if trend interaction coefficient is significant
76
+ group_trend_pvalue = simple_results.pvalues['group_trend']
77
+
78
+ # If p > 0.05, trends are not significantly different
79
+ validation_result["valid"] = group_trend_pvalue > 0.05
80
+ validation_result["p_value"] = group_trend_pvalue
81
+ validation_result["details"] = f"Simple linear trend test: p-value for group-trend interaction: {group_trend_pvalue:.4f}. Parallel trends: {validation_result['valid']}."
82
+ logger.info(validation_result["details"])
83
+
84
+ # If we've successfully validated with the simple approach, return
85
+ return validation_result
86
+
87
+ except Exception as e:
88
+ logger.warning(f"Simple trend test failed: {e}. Trying alternative approach.")
89
+ # Continue to more complex method if simple method fails
90
+
91
+ # Try more complex approach with period-specific interactions
92
+ try:
93
+ # Create period dummies to avoid issues with categorical variables
94
+ time_periods = sorted(pre_df[time_var].unique())
95
+
96
+ # Create dummy variables for time periods (except first)
97
+ for period in time_periods[1:]:
98
+ period_col = f'period_{period}'
99
+ pre_df[period_col] = (pre_df[time_var] == period).astype(int)
100
+
101
+ # Create interaction with group
102
+ pre_df[f'group_x_{period_col}'] = pre_df[period_col] * pre_df[group_indicator_col].astype(float)
103
+
104
+ # Construct formula with manual dummies
105
+ interaction_formula = f"Q('{outcome}') ~ Q('{group_indicator_col}')"
106
+
107
+ # Add period dummies except first (reference)
108
+ for period in time_periods[1:]:
109
+ period_col = f'period_{period}'
110
+ interaction_formula += f" + {period_col}"
111
+
112
+ # Add interactions
113
+ interaction_terms = []
114
+ for period in time_periods[1:]:
115
+ interaction_col = f'group_x_period_{period}'
116
+ interaction_formula += f" + {interaction_col}"
117
+ interaction_terms.append(interaction_col)
118
+
119
+ # Add covariates if provided
120
+ if time_varying_covariates:
121
+ for cov in time_varying_covariates:
122
+ interaction_formula += f" + Q('{cov}')"
123
+
124
+ # Fit model
125
+ complex_model = smf.ols(interaction_formula, data=pre_df)
126
+ complex_results = complex_model.fit()
127
+
128
+ # Test joint significance of interaction terms
129
+ if interaction_terms:
130
+ from statsmodels.formula.api import ols
131
+ from statsmodels.stats.anova import anova_lm
132
+
133
+ # Create models with and without interactions
134
+ formula_with = interaction_formula
135
+ formula_without = interaction_formula
136
+ for term in interaction_terms:
137
+ formula_without = formula_without.replace(f" + {term}", "")
138
+
139
+ model_with = smf.ols(formula_with, data=pre_df).fit()
140
+ model_without = smf.ols(formula_without, data=pre_df).fit()
141
+
142
+ # Compare models
143
+ try:
144
+ from scipy import stats
145
+ df_model = len(interaction_terms)
146
+ df_residual = model_with.df_resid
147
+ f_value = ((model_without.ssr - model_with.ssr) / df_model) / (model_with.ssr / df_residual)
148
+ p_value = 1 - stats.f.cdf(f_value, df_model, df_residual)
149
+
150
+ validation_result["valid"] = p_value > 0.05
151
+ validation_result["p_value"] = p_value
152
+ validation_result["details"] = f"Manual F-test for pre-treatment interactions: F({df_model}, {df_residual})={f_value:.4f}, p={p_value:.4f}. Parallel trends: {validation_result['valid']}."
153
+ logger.info(validation_result["details"])
154
+
155
+ except Exception as e:
156
+ logger.warning(f"Manual F-test failed: {e}. Using individual coefficient significance.")
157
+
158
+ # If F-test fails, check individual coefficients
159
+ significant_interactions = 0
160
+ for term in interaction_terms:
161
+ if term in complex_results.pvalues and complex_results.pvalues[term] < 0.05:
162
+ significant_interactions += 1
163
+
164
+ validation_result["valid"] = significant_interactions == 0
165
+ # Set a dummy p-value based on proportion of significant interactions
166
+ if len(interaction_terms) > 0:
167
+ validation_result["p_value"] = 1.0 - (significant_interactions / len(interaction_terms))
168
+ else:
169
+ validation_result["p_value"] = 1.0 # Default to 1.0 if no interaction terms
170
+ validation_result["details"] = f"{significant_interactions} out of {len(interaction_terms)} pre-treatment interactions are significant at p<0.05. Parallel trends: {validation_result['valid']}."
171
+ logger.info(validation_result["details"])
172
+ else:
173
+ validation_result["valid"] = True
174
+ validation_result["p_value"] = 1.0 # Default to 1.0 if no interaction terms
175
+ validation_result["details"] = "No pre-treatment interaction terms could be tested. Defaulting to assuming parallel trends."
176
+ logger.warning(validation_result["details"])
177
+
178
+ except Exception as e:
179
+ logger.warning(f"Complex trend test failed: {e}. Falling back to visual assessment.")
180
+ tmp_result = assess_trends_visually(pre_df, time_var, outcome, group_indicator_col)
181
+ # Copy over values from visual assessment ensuring p_value is set
182
+ validation_result.update(tmp_result)
183
+ # Ensure p_value is set
184
+ if validation_result["p_value"] is None:
185
+ validation_result["p_value"] = 1.0 if validation_result["valid"] else 0.04
186
+
187
+ except Exception as e:
188
+ error_msg = f"Error during parallel trends validation: {e}"
189
+ logger.error(error_msg, exc_info=True)
190
+ validation_result["details"] = error_msg
191
+ validation_result["error"] = str(e)
192
+ # Default to assuming valid if test fails completely
193
+ validation_result["valid"] = True
194
+ validation_result["p_value"] = 1.0 # Default to 1.0 if test fails
195
+ validation_result["details"] += " Defaulting to assuming parallel trends (test failed)."
196
+
197
+ return validation_result
198
+
199
+ def assess_trends_visually(df: pd.DataFrame, time_var: str, outcome: str,
200
+ group_indicator_col: str) -> Dict[str, Any]:
201
+ """Simple visual assessment of parallel trends by comparing group means over time.
202
+
203
+ This is a fallback method when statistical tests fail.
204
+ """
205
+ result = {"valid": False, "p_value": 1.0, "details": "", "error": None}
206
+
207
+ try:
208
+ # Group by time and treatment group, calculate means
209
+ grouped = df.groupby([time_var, group_indicator_col])[outcome].mean().reset_index()
210
+
211
+ # Pivot to get time series for each group
212
+ if df[group_indicator_col].nunique() <= 10: # Only if reasonable number of groups
213
+ pivot = grouped.pivot(index=time_var, columns=group_indicator_col, values=outcome)
214
+
215
+ # Calculate slopes between consecutive periods for each group
216
+ slopes = {}
217
+ time_values = sorted(df[time_var].unique())
218
+
219
+ if len(time_values) >= 3: # Need at least 3 periods to compare slopes
220
+ for group in pivot.columns:
221
+ group_slopes = []
222
+ for i in range(len(time_values) - 1):
223
+ t1, t2 = time_values[i], time_values[i+1]
224
+ if t1 in pivot.index and t2 in pivot.index:
225
+ slope = (pivot.loc[t2, group] - pivot.loc[t1, group]) / (t2 - t1)
226
+ group_slopes.append(slope)
227
+ if group_slopes:
228
+ slopes[group] = group_slopes
229
+
230
+ # Compare slopes between groups
231
+ if len(slopes) >= 2:
232
+ slope_diffs = []
233
+ groups = list(slopes.keys())
234
+ for i in range(len(slopes[groups[0]])):
235
+ if i < len(slopes[groups[1]]):
236
+ slope_diffs.append(abs(slopes[groups[0]][i] - slopes[groups[1]][i]))
237
+
238
+ # If average slope difference is small relative to outcome scale
239
+ outcome_scale = df[outcome].std()
240
+ avg_slope_diff = sum(slope_diffs) / len(slope_diffs) if slope_diffs else 0
241
+ relative_diff = avg_slope_diff / outcome_scale if outcome_scale > 0 else 0
242
+
243
+ result["valid"] = relative_diff < 0.2 # Threshold for "parallel enough"
244
+ # Set p-value based on relative difference
245
+ result["p_value"] = 1.0 - (relative_diff * 5) if relative_diff < 0.2 else 0.04
246
+ result["details"] = f"Visual assessment: relative slope difference = {relative_diff:.4f}. Parallel trends: {result['valid']}."
247
+ else:
248
+ result["valid"] = True
249
+ result["p_value"] = 1.0
250
+ result["details"] = "Visual assessment: insufficient group data for comparison. Defaulting to assuming parallel trends."
251
+ else:
252
+ result["valid"] = True
253
+ result["p_value"] = 1.0
254
+ result["details"] = "Visual assessment: insufficient time periods for comparison. Defaulting to assuming parallel trends."
255
+ else:
256
+ result["valid"] = True
257
+ result["p_value"] = 1.0
258
+ result["details"] = f"Visual assessment: too many groups ({df[group_indicator_col].nunique()}) for visual comparison. Defaulting to assuming parallel trends."
259
+
260
+ except Exception as e:
261
+ result["error"] = str(e)
262
+ result["valid"] = True
263
+ result["p_value"] = 1.0
264
+ result["details"] = f"Visual assessment failed: {e}. Defaulting to assuming parallel trends."
265
+
266
+ logger.info(result["details"])
267
+ return result
268
+
269
+ def run_placebo_test(df: pd.DataFrame, time_var: str, group_var: str, outcome: str,
270
+ treated_unit_indicator: str, covariates: List[str],
271
+ treatment_period_start: Any,
272
+ placebo_period_start: Any) -> Dict[str, Any]:
273
+ """Runs a placebo test for DiD by assigning a fake earlier treatment period.
274
+
275
+ Re-runs the DiD estimation using the placebo period and checks if the effect is non-significant.
276
+
277
+ Args:
278
+ df: Original DataFrame.
279
+ time_var: Name of the time variable column.
280
+ group_var: Name of the unit/group ID column (for clustering SE).
281
+ outcome: Name of the outcome variable column.
282
+ treated_unit_indicator: Name of the binary treatment group indicator column (0/1).
283
+ covariates: List of covariate names.
284
+ treatment_period_start: The actual treatment start period.
285
+ placebo_period_start: The fake treatment start period (must be before actual start).
286
+
287
+ Returns:
288
+ Dictionary with placebo test results.
289
+ """
290
+ logger.info(f"Running placebo test assigning treatment start at {placebo_period_start}...")
291
+ placebo_result = {"passed": False, "effect_estimate": None, "p_value": None, "details": "", "error": None}
292
+
293
+ if placebo_period_start >= treatment_period_start:
294
+ error_msg = "Placebo period must be before the actual treatment period."
295
+ logger.error(error_msg)
296
+ placebo_result["error"] = error_msg
297
+ placebo_result["details"] = error_msg
298
+ return placebo_result
299
+
300
+ try:
301
+ df_placebo = df.copy()
302
+ # Create placebo post and interaction terms
303
+ post_placebo_col = 'post_placebo'
304
+ interaction_placebo_col = 'did_interaction_placebo'
305
+
306
+ df_placebo[post_placebo_col] = create_post_indicator(df_placebo, time_var, placebo_period_start)
307
+ df_placebo[interaction_placebo_col] = df_placebo[treated_unit_indicator] * df_placebo[post_placebo_col]
308
+
309
+ # Construct formula for placebo regression
310
+ formula = f"Q('{outcome}') ~ Q('{treated_unit_indicator}') + Q('{post_placebo_col}') + Q('{interaction_placebo_col}')"  # Q() quoting matches the patsy syntax used elsewhere in this module
311
+ if covariates:
312
+ formula += " + " + " + ".join([f"Q('{c}')" for c in covariates])
313
+ formula += f" + C(Q('{group_var}')) + C(Q('{time_var}'))" # Include unit and time fixed effects
314
+
315
+ logger.debug(f"Placebo test formula: {formula}")
316
+
317
+ # Fit the placebo model with clustered SE
318
+ ols_model = smf.ols(formula=formula, data=df_placebo)
319
+ results = ols_model.fit(cov_type='cluster', cov_kwds={'groups': df_placebo[group_var]})
320
+
321
+ # Check the significance of the placebo interaction term
322
+ placebo_effect = float(results.params[interaction_placebo_col])
323
+ placebo_p_value = float(results.pvalues[interaction_placebo_col])
324
+
325
+ # Test passes if the placebo effect is not statistically significant (e.g., p > 0.1)
326
+ passed_test = placebo_p_value > 0.10
327
+
328
+ placebo_result["passed"] = passed_test
329
+ placebo_result["effect_estimate"] = placebo_effect
330
+ placebo_result["p_value"] = placebo_p_value
331
+ placebo_result["details"] = f"Placebo treatment effect estimated at {placebo_effect:.4f} (p={placebo_p_value:.4f}). Test passed: {passed_test}."
332
+ logger.info(placebo_result["details"])
333
+
334
+ except (KeyError, PatsyError, ValueError, Exception) as e:
335
+ error_msg = f"Error during placebo test execution: {e}"
336
+ logger.error(error_msg, exc_info=True)
337
+ placebo_result["details"] = error_msg
338
+ placebo_result["error"] = str(e)
339
+
340
+ return placebo_result
341
+
342
+ # TODO: Add function for Event Study plot (plot_event_study)
343
+ # This would involve estimating effects for leads and lags around the treatment period.
344
+
345
+ # Add other diagnostic functions as needed (e.g., plot_event_study)
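To illustrate what validate_parallel_trends consumes, a sketch on a toy panel with parallel pre-treatment trends; the unit/time/treated/y column names and the period-4 treatment start are hypothetical.

import numpy as np
import pandas as pd
from auto_causal.methods.difference_in_differences.diagnostics import validate_parallel_trends

rng = np.random.default_rng(3)
rows = []
for unit in range(30):
    treated = int(unit < 15)
    for t in range(6):                      # treatment begins at period 4
        post = int(t >= 4)
        y = 1.0 * t + 0.5 * treated + 2.0 * treated * post + rng.normal()
        rows.append({"unit": unit, "time": t, "treated": treated, "y": y})
panel = pd.DataFrame(rows)

check = validate_parallel_trends(panel, time_var="time", outcome="y",
                                 group_indicator_col="treated",
                                 treatment_period_start=4)
print(check["valid"], round(check["p_value"], 3))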
auto_causal/methods/difference_in_differences/estimator.py ADDED
@@ -0,0 +1,463 @@
1
+ """
2
+ Difference-in-Differences estimator implemented with statsmodels OLS (the DoWhy estimation path is currently disabled).
3
+ """
4
+
5
+ import logging
6
+ import pandas as pd
7
+ import numpy as np
8
+ from typing import Dict, List, Optional, Any, Tuple
9
+ from auto_causal.config import get_llm_client # IMPORT LLM Client Factory
10
+
11
+ # DoWhy imports (Commented out for simplification)
12
+ # from dowhy import CausalModel
13
+ # from dowhy.causal_estimators import CausalEstimator
14
+ # from dowhy.causal_estimator import CausalEstimate
15
+ # Statsmodels import for estimation
16
+ import statsmodels.formula.api as smf
17
+
18
+ # Local imports
19
+ from .llm_assist import (
20
+ identify_time_variable,
21
+ determine_treatment_period,
22
+ identify_treatment_group,
23
+ interpret_did_results
24
+ )
25
+ from .diagnostics import validate_parallel_trends # Import diagnostics
26
+ # Import from the new utils module
27
+ from .utils import create_post_indicator
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+ # --- Helper functions moved from old file ---
32
+ def format_did_results(statsmodels_results: Any, interaction_term_key: str,
33
+ validation_results: Dict[str, Any],
34
+ method_details: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
35
+ '''Formats the DiD results from statsmodels results into a standard dictionary.'''
36
+
37
+ try:
38
+ # Use the interaction_term_key passed directly
39
+ effect = float(statsmodels_results.params[interaction_term_key])
40
+ stderr = float(statsmodels_results.bse[interaction_term_key])
41
+ pval = float(statsmodels_results.pvalues[interaction_term_key])
42
+ ci = statsmodels_results.conf_int().loc[interaction_term_key].values.tolist()
43
+ ci_lower, ci_upper = float(ci[0]), float(ci[1])
44
+ logger.info(f"Extracted effect for '{interaction_term_key}'")
45
+
46
+ except KeyError:
47
+ logger.error(f"Interaction term '{interaction_term_key}' not found in statsmodels results. Available params: {statsmodels_results.params.index.tolist()}")
48
+ # Fallback to NaN if term not found
49
+ effect, stderr, pval, ci_lower, ci_upper = np.nan, np.nan, np.nan, np.nan, np.nan
50
+ except Exception as e:
51
+ logger.error(f"Error extracting results from statsmodels object: {e}")
52
+ effect, stderr, pval, ci_lower, ci_upper = np.nan, np.nan, np.nan, np.nan, np.nan
53
+
54
+ # Create a standardized results dictionary
55
+ results = {
56
+ "effect_estimate": effect,
57
+ "standard_error": stderr,
58
+ "p_value": pval,
59
+ "confidence_interval": [ci_lower, ci_upper],
60
+ "diagnostics": validation_results,
61
+ "parameters": parameters,
62
+ "details": str(statsmodels_results.summary())
63
+ }
64
+
65
+ return results
66
+
67
+ # Comment out unused DoWhy result formatter
68
+ # def format_dowhy_results(estimate: CausalEstimate,
69
+ # validation_results: Dict[str, Any],
70
+ # parameters: Dict[str, Any]) -> Dict[str, Any]:
71
+ # '''Formats the DiD results from DoWhy causal estimate into a standard dictionary.'''
72
+
73
+ # try:
74
+ # # Extract values from DoWhy estimate
75
+ # effect = float(estimate.value)
76
+ # stderr = float(estimate.get_standard_error()) if hasattr(estimate, 'get_standard_error') else np.nan
77
+ # ci_lower, ci_upper = estimate.get_confidence_intervals() if hasattr(estimate, 'get_confidence_intervals') else (np.nan, np.nan)
78
+ # # Extract p-value if available, otherwise use NaN
79
+ # pval = estimate.get_significance_test_results().get('p_value', np.nan) if hasattr(estimate, 'get_significance_test_results') else np.nan
80
+
81
+ # # Get available details from estimate
82
+ # details = str(estimate)
83
+ # if hasattr(estimate, 'summary'):
84
+ # details = str(estimate.summary())
85
+
86
+ # logger.info(f"Extracted effect from DoWhy estimate: {effect}")
87
+
88
+ # except Exception as e:
89
+ # logger.error(f"Error extracting results from DoWhy estimate: {e}")
90
+ # effect, stderr, pval, ci_lower, ci_upper = np.nan, np.nan, np.nan, np.nan, np.nan
91
+ # details = f"Error extracting DoWhy results: {e}"
92
+
93
+ # # Create a standardized results dictionary
94
+ # results = {
95
+ # "effect_estimate": effect,
96
+ # "effect_se": stderr,
97
+ # "p_value": pval,
98
+ # "confidence_interval": [ci_lower, ci_upper],
99
+ # "diagnostics": validation_results,
100
+ # "parameters": parameters,
101
+ # "details": details,
102
+ # "estimator": "dowhy"
103
+ # }
104
+
105
+ # return results
106
+
107
+ # --- Main `estimate_effect` function ---
108
+
109
+ def estimate_effect(df: pd.DataFrame, treatment: str, outcome: str,
110
+ covariates: List[str],
111
+ dataset_description: Optional[str] = None,
112
+ query: Optional[str] = None,
113
+ **kwargs) -> Dict[str, Any]:
114
+ """Difference-in-Differences estimation via statsmodels OLS (the DoWhy path is currently disabled).
115
+
116
+ Args:
117
+ df: Dataset containing causal variables
118
+ treatment: Name of treatment variable (or variable indicating treated group)
119
+ outcome: Name of outcome variable
120
+ covariates: List of covariate names
121
+ dataset_description: Optional free-text description of the dataset
122
+ **kwargs: Method-specific parameters (e.g., time_var, group_var, query, llm instance if needed)
123
+
124
+ Returns:
125
+ Dictionary with effect estimate and diagnostics
126
+ """
127
+ query = kwargs.get('query_str', query)  # prefer an explicit 'query_str' kwarg, otherwise keep the query argument
128
+ # llm_instance = kwargs.get('llm') # Pass llm if helpers need it
129
+ df_processed = df.copy() # Work on a copy
130
+
131
+ logger.info("Starting DiD estimation via statsmodels OLS...")
132
+
133
+ # --- Step 1: Identify Key Variables (using LLM Assist placeholders) ---
134
+ # Pass llm_instance to helpers if they are implemented to use it
135
+ llm_instance = get_llm_client() # Obtain LLM client from config for the identification helpers
136
+ time_var = kwargs.get('time_variable', identify_time_variable(df_processed, query, dataset_description, llm=llm_instance))
137
+ if time_var is None:
138
+ raise ValueError("Time variable could not be identified for DiD.")
139
+ if time_var not in df_processed.columns:
140
+ raise ValueError(f"Identified time variable '{time_var}' not found in DataFrame.")
141
+
142
+ # Determine the variable that identifies the panel unit (for grouping/FE)
143
+ group_var = kwargs.get('group_variable', identify_treatment_group(df_processed, treatment, query, dataset_description, llm=llm_instance))
144
+ if group_var is None:
145
+ raise ValueError("Group/Unit variable could not be identified for DiD.")
146
+ if group_var not in df_processed.columns:
147
+ raise ValueError(f"Identified group/unit variable '{group_var}' not found in DataFrame.")
148
+
149
+ # Check outcome exists before proceeding further
150
+ if outcome not in df_processed.columns:
151
+ raise ValueError(f"Outcome variable '{outcome}' not found in DataFrame.")
152
+
153
+ # Determine treatment period start
154
+ treatment_period = kwargs.get('treatment_period_start', kwargs.get('treatment_period',
155
+ determine_treatment_period(df_processed, time_var, treatment, query, dataset_description, llm=llm_instance)))
156
+
157
+ # --- Identify the TRUE binary treatment group indicator column ---
158
+ treated_group_col_for_formula = None
159
+
160
+ # Priority 1: Check if the 'treatment' argument itself is a valid binary indicator
161
+ if treatment in df_processed.columns and pd.api.types.is_numeric_dtype(df_processed[treatment]):
162
+ unique_treat_vals = set(df_processed[treatment].dropna().unique())
163
+ if unique_treat_vals.issubset({0, 1}):
164
+ treated_group_col_for_formula = treatment
165
+ logger.info(f"Using the provided 'treatment' argument '{treatment}' as binary group indicator.")
166
+
167
+ # Priority 2: Check if a column explicitly named 'group' exists and is binary
168
+ if treated_group_col_for_formula is None and 'group' in df_processed.columns and pd.api.types.is_numeric_dtype(df_processed['group']):
169
+ unique_group_vals = set(df_processed['group'].dropna().unique())
170
+ if unique_group_vals.issubset({0, 1}):
171
+ treated_group_col_for_formula = 'group'
172
+ logger.info(f"Using column 'group' as binary group indicator.")
173
+
174
+ # Priority 3: Fallback - Search other columns (excluding known roles and time-related ones)
175
+ if treated_group_col_for_formula is None:
176
+ logger.warning(f"Provided 'treatment' arg '{treatment}' is not binary 0/1 and no 'group' column found. Searching other columns...")
177
+ potential_group_cols = []
178
+ # Exclude outcome, time var, unit ID var, and common time indicators like 'post'
179
+ excluded_cols = [outcome, time_var, group_var, 'post', 'is_post_treatment', 'did_interaction']
180
+ for col_name in df_processed.columns:
181
+ if col_name in excluded_cols:
182
+ continue
183
+ try:
184
+ col_data = df_processed[col_name]
185
+ # Ensure we are working with a Series
186
+ if isinstance(col_data, pd.DataFrame):
187
+ if col_data.shape[1] == 1:
188
+ col_data = col_data.iloc[:, 0] # Extract the Series
189
+ else:
190
+ logger.warning(f"Skipping multi-column DataFrame slice for '{col_name}'.")
191
+ continue
192
+
193
+ # Check if the Series can be interpreted as binary 0/1
194
+ if not pd.api.types.is_numeric_dtype(col_data) and not pd.api.types.is_bool_dtype(col_data):
195
+ continue # Skip non-numeric/non-boolean columns
196
+
197
+ unique_vals = set(col_data.dropna().unique())
198
+ # Simplified check: directly test if unique values are a subset of {0, 1}
199
+ if unique_vals.issubset({0, 1}):
200
+ logger.info(f" Found potential binary indicator: {col_name}")
201
+ potential_group_cols.append(col_name)
202
+
203
+ except AttributeError as ae:
204
+ # Catch attribute errors likely due to unexpected types
205
+ logger.warning(f"Attribute error checking column '{col_name}': {ae}. Skipping.")
206
+ except Exception as e:
207
+ logger.warning(f"Unexpected error checking column '{col_name}' during group ID search: {e}")
208
+
209
+ if potential_group_cols:
210
+ treated_group_col_for_formula = potential_group_cols[0] # Take the first suitable one found
211
+ logger.info(f"Using column '{treated_group_col_for_formula}' found during search as binary group indicator.")
212
+ else:
213
+ # Final fallback: Use the originally identified group_var, but warn heavily
214
+ treated_group_col_for_formula = group_var
215
+ logger.error(f"CRITICAL WARNING: Could not find suitable binary treatment group indicator. Using '{group_var}', but this is likely incorrect and will produce invalid DiD estimates.")
216
+
217
+ # --- Final Check ---
218
+ if treated_group_col_for_formula not in df_processed.columns:
219
+ # This case should ideally not happen with the logic above but added defensively
220
+ raise ValueError(f"Determined treatment group column '{treated_group_col_for_formula}' not found in DataFrame.")
221
+ if df_processed[treated_group_col_for_formula].nunique(dropna=True) > 2:
222
+ logger.warning(f"Selected treatment group column '{treated_group_col_for_formula}' is not binary (has {df_processed[treated_group_col_for_formula].nunique()} unique values). DiD requires binary treatment group.")
223
+
224
+ # --- Step 2: Create Indicator Variables ---
225
+ post_indicator_col = 'post'
226
+ if post_indicator_col not in df_processed.columns:
227
+ # Create the post indicator if it doesn't exist
228
+ df_processed[post_indicator_col] = create_post_indicator(df_processed, time_var, treatment_period)
229
+
230
+ # Interaction term is treatment group * post
231
+ interaction_term_col = 'did_interaction' # Keep explicit interaction term
232
+ df_processed[interaction_term_col] = df_processed[treated_group_col_for_formula] * df_processed[post_indicator_col]
233
+
234
+ # --- Step 3: Validate Parallel Trends (using the group column) ---
235
+ parallel_trends_validation = validate_parallel_trends(df_processed, time_var, outcome,
236
+ treated_group_col_for_formula, treatment_period, dataset_description)
237
+ # Note: The validation result is currently just a placeholder
238
+ if not parallel_trends_validation.get('valid', False):
239
+ logger.warning("Parallel trends assumption potentially violated (based on placeholder check). Proceeding with estimation, but results may be biased.")
240
+ # Add this info to the final results diagnostics
241
+
242
+ # --- Step 4: Prepare for Statsmodels Estimation ---
243
+ # (DoWhy section commented out for simplicity)
244
+ # all_common_causes = covariates + [time_var, group_var] # group_var is unit ID
245
+ # use_dowhy_estimate = False
246
+ # dowhy_estimate = None
247
+
248
+ # try:
249
+ # # Create DoWhy CausalModel
250
+ # model = CausalModel(
251
+ # data=df_processed,
252
+ # treatment=treated_group_col_for_formula, # Use group indicator here
253
+ # outcome=outcome,
254
+ # common_causes=all_common_causes,
255
+ # )
256
+ # logger.info("DoWhy CausalModel created for DiD estimation.")
257
+
258
+ # # Identify estimand
259
+ # identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
260
+ # logger.info(f"DoWhy identified estimand: {identified_estimand.estimand_type}")
261
+
262
+ # # Try to estimate using DiD estimator if available in DoWhy
263
+ # try:
264
+ # logger.info("Attempting to use DoWhy's DiD estimator...")
265
+
266
+ # # Debug info - print DataFrame info to help diagnose possible issues
267
+ # logger.debug(f"DataFrame shape before DoWhy DiD: {df_processed.shape}")
268
+ # # ... (rest of DoWhy debug logs commented out) ...
269
+
270
+ # # Create params dictionary for DoWhy DiD estimator
271
+ # did_params = {
272
+ # 'time_var': time_var,
273
+ # 'treatment_period': treatment_period,
274
+ # 'unit_var': group_var
275
+ # }
276
+
277
+ # # Add control variables if available
278
+ # if covariates:
279
+ # did_params['control_vars'] = covariates
280
+
281
+ # logger.debug(f"DoWhy DiD params: {did_params}")
282
+
283
+ # # Try to use DiD estimator from DoWhy (requires recent version of DoWhy)
284
+ # if hasattr(model, 'estimate_effect'):
285
+ # try:
286
+ # # First check if difference_in_differences method is available
287
+ # available_methods = model.get_available_effect_estimators() if hasattr(model, 'get_available_effect_estimators') else []
288
+ # logger.debug(f"Available DoWhy estimators: {available_methods}")
289
+
290
+ # if "difference_in_differences" not in str(available_methods):
291
+ # logger.warning("'difference_in_differences' estimator not found in available DoWhy estimators. Falling back to statsmodels.")
292
+ # else:
293
+ # # Try the estimation with more error handling
294
+ # logger.info("Calling DoWhy DiD estimator...")
295
+ # estimate = model.estimate_effect(
296
+ # identified_estimand,
297
+ # method_name="difference_in_differences",
298
+ # method_params=did_params
299
+ # )
300
+
301
+ # if estimate:
302
+ # # Extra check to verify estimate has expected attributes
303
+ # if hasattr(estimate, 'value') and not pd.isna(estimate.value):
304
+ # dowhy_estimate = estimate
305
+ # use_dowhy_estimate = True
306
+ # logger.info(f"Successfully used DoWhy's DiD estimator. Effect estimate: {estimate.value}")
307
+ # else:
308
+ # logger.warning(f"DoWhy's DiD estimator returned invalid estimate: {estimate}. Falling back to statsmodels.")
309
+ # else:
310
+ # logger.warning("DoWhy's DiD estimator returned None. Falling back to statsmodels.")
311
+ # except IndexError as idx_err:
312
+ # # Handle specific IndexError that's occurring
313
+ # logger.error(f"IndexError in DoWhy DiD estimator: {idx_err}. Check input data structure.")
314
+ # # Trace more details about the error
315
+ # import traceback
316
+ # logger.error(f"Error traceback: {traceback.format_exc()}")
317
+ # logger.warning("Falling back to statsmodels due to IndexError in DoWhy.")
318
+ # else:
319
+ # logger.warning("DoWhy model does not have estimate_effect method. Falling back to statsmodels.")
320
+
321
+ # except (ImportError, AttributeError) as e:
322
+ # logger.warning(f"DoWhy DiD estimator not available or not implemented: {e}. Falling back to statsmodels.")
323
+ # except ValueError as ve:
324
+ # logger.error(f"ValueError in DoWhy DiD estimator: {ve}. Likely issue with data formatting. Falling back to statsmodels.")
325
+ # except Exception as e:
326
+ # logger.error(f"Error using DoWhy's DiD estimator: {e}. Falling back to statsmodels.")
327
+ # # Add traceback for better debugging
328
+ # import traceback
329
+ # logger.error(f"Full error traceback: {traceback.format_exc()}")
330
+
331
+ # except Exception as e:
332
+ # logger.error(f"Failed to create DoWhy CausalModel: {e}", exc_info=True)
333
+ # # model = None # Set model to None if creation fails
334
+
335
+ # Create parameters dictionary for formatting results
336
+ parameters = {
337
+ "time_var": time_var,
338
+ "group_var": group_var, # Unit ID
339
+ "treatment_indicator": treated_group_col_for_formula, # Group indicator used in formula basis
340
+ "post_indicator": post_indicator_col,
341
+ "treatment_period_start": treatment_period,
342
+ "covariates": covariates,
343
+ }
344
+
345
+ # Group diagnostics for formatting
346
+ did_diagnostics = {
347
+ "parallel_trends": parallel_trends_validation,
348
+ # "placebo_test": run_placebo_test(...)
349
+ }
350
+
351
+ # If DoWhy estimation was successful, use those results (Section Commented Out)
352
+ # if use_dowhy_estimate and dowhy_estimate:
353
+ # logger.info("Using DoWhy DiD estimation results.")
354
+ # parameters["estimation_method"] = "DoWhy Difference-in-Differences"
355
+
356
+ # # Format the results
357
+ # formatted_results = format_dowhy_results(dowhy_estimate, did_diagnostics, parameters)
358
+ # else:
359
+
360
+ # --- Step 5: Use Statsmodels OLS ---
361
+ logger.info("Determining Statsmodels OLS formula based on number of time periods...")
362
+
363
+ num_time_periods = df_processed[time_var].nunique()
364
+
365
+ interaction_term_key_for_results: str
366
+ method_details_str: str
367
+ formula: str
368
+
369
+ if num_time_periods == 2:
370
+ logger.info(
371
+ f"Number of unique time periods is 2. Using 2x2 DiD formula: "
372
+ f"{outcome} ~ {treated_group_col_for_formula} * {post_indicator_col}"
373
+ )
374
+ # For 2x2 DiD: outcome ~ group * post_indicator
375
+ # The interaction term A:B in statsmodels gives the DiD estimate.
376
+ formula_core = f"{treated_group_col_for_formula} * {post_indicator_col}"
377
+ interaction_term_key_for_results = f"{treated_group_col_for_formula}:{post_indicator_col}"
378
+
379
+ formula_parts = [formula_core]
380
+ main_model_terms = {outcome, treated_group_col_for_formula, post_indicator_col}
381
+
382
+ if covariates:
383
+ filtered_covs = [
384
+ c for c in covariates if c not in main_model_terms
385
+ ]
386
+ if filtered_covs:
387
+ formula_parts.extend(filtered_covs)
388
+
389
+ formula = f"{outcome} ~ {' + '.join(formula_parts)}"
390
+ parameters["estimation_method"] = "Statsmodels OLS for 2x2 DiD (Group * Post interaction)"
391
+ method_details_str = "DiD via Statsmodels 2x2 (Group * Post interaction)"
392
+
393
+ else: # num_time_periods > 2
394
+ logger.info(
395
+ f"Number of unique time periods is {num_time_periods} (>2). "
396
+ f"Using TWFE DiD formula: {outcome} ~ {interaction_term_col} + C({group_var}) + C({time_var})"
397
+ )
398
+ # For TWFE: outcome ~ actual_treatment_variable + UnitFE + TimeFE
399
+ # actual_treatment_variable is interaction_term_col (e.g., treated_group * post_indicator)
400
+ # UnitFE is C(group_var), TimeFE is C(time_var)
401
+ formula_parts = [
402
+ interaction_term_col,
403
+ f"C({group_var})",
404
+ f"C({time_var})"
405
+ ]
406
+ interaction_term_key_for_results = interaction_term_col
407
+ main_model_terms = {outcome, interaction_term_col, group_var, time_var}
408
+
409
+ if covariates:
410
+ filtered_covs = [
411
+ c for c in covariates if c not in main_model_terms
412
+ ]
413
+ if filtered_covs:
414
+ formula_parts.extend(filtered_covs)
415
+
416
+ formula = f"{outcome} ~ {' + '.join(formula_parts)}"
417
+ parameters["estimation_method"] = "Statsmodels OLS with TWFE (C() Notation)"
418
+ method_details_str = "DiD via Statsmodels TWFE (C() Notation)"
419
+
420
+ try:
421
+ logger.info(f"Using formula: {formula}")
422
+ logger.debug(f"Data head for statsmodels:\n{df_processed.head().to_string()}")
423
+ logger.debug(f"Regression DataFrame shape: {df_processed.shape}, Columns: {df_processed.columns.tolist()}")
424
+
425
+ ols_model = smf.ols(formula=formula, data=df_processed)
426
+ if group_var not in df_processed.columns:
427
+ # This check is mainly for clustering but good to ensure group_var exists.
428
+ # For 2x2, group_var (unit ID) might not be in formula but needed for clustering.
429
+ raise ValueError(f"Clustering variable '{group_var}' (panel unit ID) not found in regression data.")
430
+ logger.debug(f"Clustering standard errors by: {group_var}")
431
+ results = ols_model.fit(cov_type='cluster', cov_kwds={'groups': df_processed[group_var]})
432
+
433
+ logger.info("Statsmodels estimation complete.")
434
+ logger.info(f"Statsmodels Results Summary:\n{results.summary()}")
435
+
436
+ logger.debug(f"Extracting results using interaction term key: {interaction_term_key_for_results}")
437
+
438
+ parameters["final_formula"] = formula
439
+ parameters["interaction_term_coefficient_name"] = interaction_term_key_for_results
440
+
441
+ formatted_results = format_did_results(results, interaction_term_key_for_results,
442
+ did_diagnostics,
443
+ method_details=method_details_str,
444
+ parameters=parameters)
445
+ formatted_results["estimator"] = "statsmodels"
446
+
447
+ except Exception as e:
448
+ logger.error(f"Statsmodels OLS estimation failed: {e}", exc_info=True)
449
+ raise ValueError(f"DiD estimation failed (both DoWhy and Statsmodels): {e}")
450
+
451
+
452
+
453
+
454
+ # --- Add Interpretation --- (Now add interpretation to the formatted results)
455
+ try:
456
+ # Use the llm_instance fetched earlier
457
+ interpretation = interpret_did_results(formatted_results, did_diagnostics, dataset_description, llm=llm_instance)
458
+ formatted_results['interpretation'] = interpretation
459
+ except Exception as interp_e:
460
+ logger.error(f"DiD Interpretation failed: {interp_e}")
461
+ formatted_results['interpretation'] = "Interpretation failed."
462
+
463
+ return formatted_results
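For orientation, here is a minimal, self-contained sketch (not part of the committed file) of the two OLS specifications the estimator above chooses between, run on the same toy panel just to show the formula shapes; the column names and numbers are illustrative only.

import pandas as pd
import statsmodels.formula.api as smf

# Toy panel: 4 units over 4 years; units 1-2 form the treated group, treatment starts in 2002.
df = pd.DataFrame({
    "unit_id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4],
    "year": [2000, 2001, 2002, 2003] * 4,
    "group": [1] * 8 + [0] * 8,
    "outcome": [1.0, 1.1, 2.0, 2.2, 0.9, 1.0, 1.9, 2.1,
                1.0, 1.1, 1.2, 1.3, 0.8, 0.9, 1.0, 1.1],
})
df["post"] = (df["year"] >= 2002).astype(int)
df["did_interaction"] = df["group"] * df["post"]

# 2x2 case (exactly two periods): the group:post interaction coefficient is the DiD estimate.
fit_2x2 = smf.ols("outcome ~ group * post", data=df).fit(
    cov_type="cluster", cov_kwds={"groups": df["unit_id"]})

# More than two periods: explicit interaction term plus unit and time fixed effects (TWFE).
fit_twfe = smf.ols("outcome ~ did_interaction + C(unit_id) + C(year)", data=df).fit(
    cov_type="cluster", cov_kwds={"groups": df["unit_id"]})

print(fit_2x2.params["group:post"], fit_twfe.params["did_interaction"])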
auto_causal/methods/difference_in_differences/llm_assist.py ADDED
@@ -0,0 +1,362 @@
1
+ """LLM Assist functions for Difference-in-Differences method."""
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ from typing import Optional, Any, Dict, Union
6
+ import logging
7
+ from pydantic import BaseModel, Field, ValidationError
8
+ from langchain_core.messages import HumanMessage
9
+ from langchain_core.exceptions import OutputParserException
10
+
11
+ # Import shared types if needed
12
+ from langchain_core.language_models import BaseChatModel
13
+
14
+ # Import shared LLM helpers
15
+ from auto_causal.utils.llm_helpers import call_llm_with_json_output
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Placeholder LLM/Helper Functions
20
+
21
+ # --- Pydantic model for LLM time variable extraction ---
22
+ class LLMTimeVar(BaseModel):
23
+ time_variable_name: Optional[str] = Field(None, description="The column name identified as the primary time variable.")
24
+
25
+
26
+ def identify_time_variable(df: pd.DataFrame,
27
+ query: Optional[str] = None,
28
+ dataset_description: Optional[str] = None,
29
+ llm: Optional[BaseChatModel] = None) -> Optional[str]:
30
+ '''Identifies the most likely time variable.
31
+
32
+ Current Implementation: Heuristic based on column names, with LLM fallback.
33
+ Future: Refine LLM prompt and parsing.
34
+ '''
35
+ # 1. Heuristic based on common time-related keywords
36
+ time_patterns = ['time', 'year', 'date', 'period', 'month', 'day']
37
+ columns = df.columns.tolist()
38
+ for col in columns:
39
+ if any(pattern in col.lower() for pattern in time_patterns):
40
+ logger.info(f"Identified '{col}' as time variable (heuristic).")
41
+ return col
42
+
43
+ # 2. LLM Fallback if heuristic fails and LLM is provided
44
+ if llm and query:
45
+ logger.warning("Heuristic failed for time variable. Trying LLM fallback...")
46
+ # --- Example: Add dataset description context ---
47
+ context_str = ""
48
+ if dataset_description:
49
+ # col_types = dataset_description.get('column_types', {}) # Description is now a string
50
+ context_str += f"\nDataset Description: {dataset_description}"
51
+ # Add other relevant info like sample values if available
52
+ # ------------------------------------------------
53
+ prompt = f"""Given the user query and the available data columns, identify the single most likely column representing the primary time dimension (e.g., year, date, period).
54
+
55
+ User Query: "{query}"
56
+ Available Columns: {columns}{context_str}
57
+
58
+ Respond ONLY with a JSON object containing the identified column name using the key 'time_variable_name'. If no suitable time variable is found, return null for the value.
59
+ Example: {{"time_variable_name": "Year"}} or {{"time_variable_name": null}}"""
60
+
61
+ messages = [HumanMessage(content=prompt)]
62
+ structured_llm = llm.with_structured_output(LLMTimeVar)
63
+
64
+ try:
65
+ parsed_result = structured_llm.invoke(messages)
66
+ llm_identified_col = parsed_result.time_variable_name
67
+
68
+ if llm_identified_col and llm_identified_col in columns:
69
+ logger.info(f"Identified '{llm_identified_col}' as time variable (LLM fallback).")
70
+ return llm_identified_col
71
+ elif llm_identified_col:
72
+ logger.warning(f"LLM fallback identified '{llm_identified_col}' but it's not in the columns. Ignoring.")
73
+ else:
74
+ logger.info("LLM fallback did not identify a time variable.")
75
+
76
+ except (OutputParserException, ValidationError) as e:
77
+ logger.error(f"LLM fallback for time variable failed parsing/validation: {e}")
78
+ except Exception as e:
79
+ logger.error(f"LLM fallback for time variable failed unexpectedly: {e}", exc_info=True)
80
+
81
+ logger.warning("Could not identify time variable using heuristics or LLM fallback.")
82
+ return None
83
+
84
+ # --- Pydantic model for LLM treatment period extraction ---
85
+ class LLMTreatmentPeriod(BaseModel):
86
+ treatment_start_period: Optional[Union[str, int, float]] = Field(None, description="The time period value (as string) when treatment is believed to start based on the query.")
87
+
88
+ def determine_treatment_period(df: pd.DataFrame, time_var: str, treatment: str,
89
+ query: Optional[str] = None,
90
+ dataset_description: Optional[str] = None,
91
+ llm: Optional[BaseChatModel] = None) -> Any:
92
+ '''Determines the period when treatment starts.
93
+
94
+ Tries LLM first if available, then falls back to heuristic.
95
+ '''
96
+ if time_var not in df.columns:
97
+ raise ValueError(f"Time variable '{time_var}' not found in DataFrame.")
98
+
99
+ unique_times_sorted = np.sort(df[time_var].dropna().unique())
100
+ if len(unique_times_sorted) < 2:
101
+ raise ValueError("Need at least two time periods for DiD")
102
+
103
+ # --- Try LLM First (if available) ---
104
+ llm_period = None
105
+ if llm and query:
106
+ logger.info("Attempting LLM call to determine treatment period start...")
107
+ # Provide sorted unique times for context
108
+ times_str = ", ".join(map(str, unique_times_sorted)) if len(unique_times_sorted) < 20 else f"{unique_times_sorted[0]}...{unique_times_sorted[-1]}"
109
+ # --- Example: Add dataset description context ---
110
+ context_str = ""
111
+ if dataset_description:
112
+ # Example: Show summary stats for time var if helpful
113
+ # time_stats = dataset_description.get('summary_stats', {}).get(time_var) # Cannot get from string
114
+ context_str += f"\nDataset Description: {dataset_description}"
115
+ # ------------------------------------------------
116
+ prompt = f"""Based on the user query and the observed time periods, determine the specific period value when the treatment ('{treatment}') likely started.
117
+
118
+ User Query: "{query}"
119
+ Time Variable Name: '{time_var}'
120
+ Observed Time Periods (sorted): [{times_str}]{context_str}
121
+
122
+ Respond ONLY with a JSON object containing the identified start period using the key 'treatment_start_period'. The value should be one of the observed periods if possible. If the query doesn't specify a start period, return null.
123
+ Example: {{"treatment_start_period": 2015}} or {{"treatment_start_period": null}}"""
124
+
125
+ messages = [HumanMessage(content=prompt)]
126
+ structured_llm = llm.with_structured_output(LLMTreatmentPeriod)
127
+
128
+ try:
129
+ parsed_result = structured_llm.invoke(messages)
130
+ potential_period = parsed_result.treatment_start_period
131
+
132
+ # Validate if the period exists in the data (might need type conversion)
133
+ if potential_period is not None:
134
+ # Try converting LLM output type to match data type if needed
135
+ try:
136
+ series_dtype = df[time_var].dtype
137
+ converted_period = pd.Series([potential_period]).astype(series_dtype).iloc[0]
138
+ except Exception:
139
+ converted_period = potential_period # Use raw if conversion fails
140
+
141
+ if converted_period in unique_times_sorted:
142
+ llm_period = converted_period
143
+ logger.info(f"LLM identified treatment period start: {llm_period}")
144
+ else:
145
+ logger.warning(f"LLM identified period '{potential_period}' (converted: '{converted_period}'), but it's not in the observed time periods. Ignoring LLM result.")
146
+ else:
147
+ logger.info("LLM did not identify a specific treatment start period from the query.")
148
+
149
+ except (OutputParserException, ValidationError) as e:
150
+ logger.error(f"LLM fallback for treatment period failed parsing/validation: {e}")
151
+ except Exception as e:
152
+ logger.error(f"LLM fallback for treatment period failed unexpectedly: {e}", exc_info=True)
153
+
154
+ if llm_period is not None:
155
+ return llm_period
156
+
157
+ # --- Fallback to Heuristic ---
158
+ logger.warning("Using heuristic (median time) to determine treatment period start.")
159
+ treatment_period_start = None
160
+ try:
161
+ if pd.api.types.is_numeric_dtype(df[time_var]):
162
+ median_time = np.median(unique_times_sorted)
163
+ possible_starts = unique_times_sorted[unique_times_sorted > median_time]
164
+ if len(possible_starts) > 0:
165
+ treatment_period_start = possible_starts[0]
166
+ else:
167
+ treatment_period_start = unique_times_sorted[-1]
168
+ logger.warning(f"Could not determine treatment start > median time. Defaulting to last period: {treatment_period_start}")
169
+ else: # Assume sortable categories or dates
170
+ median_idx = len(unique_times_sorted) // 2
171
+ if median_idx < len(unique_times_sorted):
172
+ treatment_period_start = unique_times_sorted[median_idx]
173
+ else:
174
+ treatment_period_start = unique_times_sorted[0]
175
+
176
+ if treatment_period_start is not None:
177
+ logger.info(f"Determined treatment period start: {treatment_period_start} (heuristic: median time).")
178
+ return treatment_period_start
179
+ else:
180
+ raise ValueError("Could not determine treatment start period using heuristic.")
181
+
182
+ except Exception as e:
183
+ logger.error(f"Error in heuristic for treatment period: {e}")
184
+ raise ValueError(f"Could not determine treatment start period using heuristic: {e}")
185
+
186
+ # --- Pydantic model for LLM group variable extraction ---
187
+ class LLMGroupVar(BaseModel):
188
+ group_variable_name: Optional[str] = Field(None, description="The column name identifying the panel unit (e.g., state, individual, firm).")
189
+
190
+ def identify_treatment_group(df: pd.DataFrame, treatment_var: str,
191
+ query: Optional[str] = None,
192
+ dataset_description: Optional[str] = None,
193
+ llm: Optional[BaseChatModel] = None) -> Optional[str]:
194
+ '''Identifies the variable indicating the treated group/unit ID.
195
+
196
+ Tries heuristic check for non-binary treatment_var first, then LLM,
197
+ then falls back to assuming treatment_var is the group/unit identifier.
198
+ '''
199
+ columns = df.columns.tolist()
200
+ if treatment_var not in columns:
201
+ logger.error(f"Treatment variable '{treatment_var}' provided to identify_treatment_group not found in DataFrame.")
202
+ # Fallback: Look for common ID names if specified treatment is missing
203
+ id_keywords = ['id', 'unit', 'group', 'entity', 'state', 'firm']
204
+ for col in columns:
205
+ if any(keyword in col.lower() for keyword in id_keywords):
206
+ logger.warning(f"Specified treatment '{treatment_var}' not found. Falling back to potential ID column '{col}' as group identifier.")
207
+ return col
208
+ return None # Give up if no likely ID column found
209
+
210
+ # --- Heuristic: Check if treatment_var is non-binary, if so, look for ID columns ---
211
+ is_potentially_binary = False
212
+ if pd.api.types.is_numeric_dtype(df[treatment_var]):
213
+ unique_vals = set(df[treatment_var].dropna().unique())
214
+ if unique_vals.issubset({0, 1}):
215
+ is_potentially_binary = True
216
+
217
+ if not is_potentially_binary:
218
+ logger.info(f"Provided treatment variable '{treatment_var}' is not binary (0/1). Searching for a separate group/unit ID column heuristically.")
219
+ id_keywords = ['id', 'unit', 'group', 'entity', 'state', 'firm']
220
+ # Prioritize 'group' or 'unit' if available
221
+ for keyword in ['group', 'unit']:
222
+ for col in columns:
223
+ if keyword == col.lower():
224
+ logger.info(f"Heuristically identified '{col}' as group/unit ID (treatment '{treatment_var}' was non-binary)." )
225
+ return col
226
+ # Then check other keywords
227
+ for col in columns:
228
+ if col != treatment_var and any(keyword in col.lower() for keyword in id_keywords):
229
+ logger.info(f"Heuristically identified '{col}' as group/unit ID (treatment '{treatment_var}' was non-binary)." )
230
+ return col
231
+ logger.warning("Heuristic search for group/unit ID failed when treatment was non-binary.")
232
+
233
+ # --- LLM Attempt (if heuristic didn't find an alternative or wasn't needed) ---
234
+ # Useful if query context helps disambiguate (e.g., "effect across states")
235
+ if llm and query:
236
+ logger.info("Attempting LLM call to identify group/unit variable...")
237
+ # --- Example: Add dataset description context ---
238
+ context_str = ""
239
+ if dataset_description:
240
+ # col_types = dataset_description.get('column_types', {}) # Description is now a string
241
+ context_str += f"\nDataset Description: {dataset_description}"
242
+ # ------------------------------------------------
243
+ prompt = f"""Given the user query and data columns, identify the single column that most likely represents the unique identifier for the panel units (e.g., state, individual, firm, unit ID), distinct from the treatment status indicator ('{treatment_var}').
244
+
245
+ User Query: "{query}"
246
+ Treatment Variable Mentioned: '{treatment_var}'
247
+ Available Columns: {columns}{context_str}
248
+
249
+ Respond ONLY with a JSON object containing the identified unit identifier column name using the key 'group_variable_name'. If the best identifier seems to be the treatment variable itself or none is suitable, return null.
250
+ Example: {{"group_variable_name": "state_id"}} or {{"group_variable_name": null}}"""
251
+
252
+ messages = [HumanMessage(content=prompt)]
253
+ structured_llm = llm.with_structured_output(LLMGroupVar)
254
+
255
+ try:
256
+ parsed_result = structured_llm.invoke(messages)
257
+ llm_identified_col = parsed_result.group_variable_name
258
+
259
+ if llm_identified_col and llm_identified_col in columns:
260
+ logger.info(f"Identified '{llm_identified_col}' as group/unit variable (LLM).")
261
+ return llm_identified_col
262
+ elif llm_identified_col:
263
+ logger.warning(f"LLM identified '{llm_identified_col}' but it's not in the columns. Ignoring.")
264
+ else:
265
+ logger.info("LLM did not identify a separate group/unit variable.")
266
+
267
+ except (OutputParserException, ValidationError) as e:
268
+ logger.error(f"LLM call for group/unit variable failed parsing/validation: {e}")
269
+ except Exception as e:
270
+ logger.error(f"LLM call for group/unit variable failed unexpectedly: {e}", exc_info=True)
271
+
272
+ # --- Final Fallback ---
273
+ logger.info(f"Defaulting to using provided treatment variable '{treatment_var}' as the group/unit identifier.")
274
+ return treatment_var
275
+
276
+ # --- Add interpret_did_results function ---
277
+
278
+ def interpret_did_results(
279
+ results: Dict[str, Any],
280
+ diagnostics: Optional[Dict[str, Any]],
281
+ dataset_description: Optional[str] = None,
282
+ llm: Optional[BaseChatModel] = None
283
+ ) -> str:
284
+ """Use LLM to interpret Difference-in-Differences results."""
285
+ default_interpretation = "LLM interpretation not available for DiD."
286
+ if llm is None:
287
+ logger.info("LLM not provided for DiD interpretation.")
288
+ return default_interpretation
289
+
290
+ try:
291
+ # --- Prepare summary for LLM ---
292
+ results_summary = {}
293
+ params = results.get('parameters', {})
294
+ diag_details = (diagnostics or {})  # did_diagnostics stores 'parallel_trends' at the top level, not under 'details'
295
+ parallel_trends = diag_details.get('parallel_trends', {})
296
+
297
+ effect = results.get('effect_estimate')
298
+ pval = results.get('p_value')
299
+ ci = results.get('confidence_interval')
300
+
301
+ results_summary['Method Used'] = results.get('method_details', 'Difference-in-Differences')
302
+ results_summary['Effect Estimate'] = f"{effect:.3f}" if isinstance(effect, (int, float)) else str(effect)
303
+ results_summary['P-value'] = f"{pval:.3f}" if isinstance(pval, (int, float)) else str(pval)
304
+ if isinstance(ci, (list, tuple)) and len(ci) == 2:
305
+ results_summary['Confidence Interval'] = f"[{ci[0]:.3f}, {ci[1]:.3f}]"
306
+ else:
307
+ results_summary['Confidence Interval'] = str(ci) if ci is not None else "N/A"
308
+
309
+ results_summary['Time Variable'] = params.get('time_var', 'N/A')
310
+ results_summary['Group/Unit Variable'] = params.get('group_var', 'N/A')
311
+ results_summary['Treatment Indicator Used'] = params.get('treatment_indicator', 'N/A')
312
+ results_summary['Treatment Start Period'] = params.get('treatment_period_start', 'N/A')
313
+ results_summary['Covariates Included'] = params.get('covariates', [])
314
+
315
+ diag_summary = {}
316
+ diag_summary['Parallel Trends Assumption Status'] = "Passed (Placeholder)" if parallel_trends.get('valid', False) else "Failed/Unknown (Placeholder)"
317
+ if not parallel_trends.get('valid', False) and parallel_trends.get('details') != "Placeholder validation":
318
+ diag_summary['Parallel Trends Details'] = parallel_trends.get('details', 'N/A')
319
+
320
+ # --- Example: Add dataset description context ---
321
+ context_str = ""
322
+ if dataset_description:
323
+ # context_str += f"\nDataset Context: {dataset_description.get('summary', 'N/A')}" # Use string directly
324
+ context_str += f"\n\nDataset Context Provided:\n{dataset_description}"
325
+ # ------------------------------------------------
326
+
327
+ # --- Construct Prompt ---
328
+ prompt = f"""
329
+ You are assisting with interpreting Difference-in-Differences (DiD) results.
330
+ {context_str}
331
+
332
+ Estimation Results Summary:
333
+ {results_summary}
334
+
335
+ Diagnostics Summary:
336
+ {diag_summary}
337
+
338
+ Explain these DiD results in 2-4 concise sentences. Focus on:
339
+ 1. The estimated average treatment effect on the treated (magnitude, direction, statistical significance based on p-value < 0.05).
340
+ 2. The status of the parallel trends assumption (mentioning it's a key assumption for DiD).
341
+ 3. Note that the estimation used either a 2x2 group-by-post interaction or unit and time fixed effects (for panels with more than two periods), plus any covariates {results_summary['Covariates Included']}
342
+
343
+ Return ONLY a valid JSON object with the following structure (no explanations or surrounding text):
344
+ {{
345
+ "interpretation": "<your concise interpretation text>"
346
+ }}
347
+ """
348
+
349
+ # --- Call LLM ---
350
+ response = call_llm_with_json_output(llm, prompt)
351
+
352
+ # --- Process Response ---
353
+ if response and isinstance(response, dict) and \
354
+ "interpretation" in response and isinstance(response["interpretation"], str):
355
+ return response["interpretation"]
356
+ else:
357
+ logger.warning(f"Failed to get valid interpretation from LLM for DiD. Response: {response}")
358
+ return default_interpretation
359
+
360
+ except Exception as e:
361
+ logger.error(f"Error during LLM interpretation for DiD: {e}")
362
+ return f"Error generating interpretation: {e}"
auto_causal/methods/difference_in_differences/utils.py ADDED
@@ -0,0 +1,65 @@
1
+ # Utility functions for Difference-in-Differences
2
+ import pandas as pd
3
+ import logging
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+ def create_post_indicator(df: pd.DataFrame, time_var: str, treatment_period_start: any) -> pd.Series:
8
+ """Creates the post-treatment indicator variable.
9
+ Checks if time_var is already a 0/1 indicator; otherwise, compares to treatment_period_start.
10
+ """
11
+ try:
12
+ time_var_series = df[time_var]
13
+ # Ensure numeric for checks and direct comparison
14
+ if pd.api.types.is_bool_dtype(time_var_series):
15
+ time_var_series = time_var_series.astype(int)
16
+
17
+ # Check if it's already a binary 0/1 indicator
18
+ if pd.api.types.is_numeric_dtype(time_var_series):
19
+ unique_vals = set(time_var_series.dropna().unique())
20
+ if unique_vals == {0, 1}:
21
+ logger.info(f"Time variable '{time_var}' is already a binary 0/1 indicator. Using it directly as post indicator.")
22
+ return time_var_series.astype(int)
23
+ else:
24
+ # Numeric, but not 0/1, so compare with treatment_period_start
25
+ logger.info(f"Time variable '{time_var}' is numeric. Comparing with treatment_period_start: {treatment_period_start}")
26
+ return (time_var_series >= treatment_period_start).astype(int)
27
+ else:
28
+ # Non-numeric and not boolean, will likely fall into TypeError for datetime conversion
29
+ # This else block might not be strictly necessary if TypeError is caught below
30
+ # but added for logical completeness before attempting datetime conversion.
31
+ pass # Let it fall through to TypeError if not numeric here
32
+
33
+ # If we reached here, it means it wasn't numeric or bool, try direct comparison which will likely raise TypeError
34
+ # and be caught by the except block for datetime conversion if applicable.
35
+ # This line is kept to ensure non-numeric non-datetime-like strings also trigger the except.
36
+ return (df[time_var] >= treatment_period_start).astype(int)
37
+
38
+ except TypeError:
39
+ # If direct comparison fails (e.g., comparing datetime with int/str, or non-numeric string with number),
40
+ # attempt to convert both to datetime objects for comparison.
41
+ logger.info(f"Direct comparison/numeric check failed for time_var '{time_var}'. Attempting datetime conversion.")
42
+ try:
43
+ time_series_dt = pd.to_datetime(df[time_var], errors='coerce')
44
+ # Try to convert treatment_period_start to datetime if it's not already
45
+ # This handles cases where treatment_period_start might be a date string
46
+ try:
47
+ treatment_start_dt = pd.to_datetime(treatment_period_start)
48
+ except Exception as e_conv:
49
+ logger.error(f"Could not convert treatment_period_start '{treatment_period_start}' to datetime: {e_conv}")
50
+ raise TypeError(f"treatment_period_start '{treatment_period_start}' could not be converted to a comparable datetime format.")
51
+
52
+ if time_series_dt.isna().all(): # if all values are NaT after conversion
53
+ raise ValueError(f"Time variable '{time_var}' could not be converted to datetime (all values NaT).")
54
+ if pd.isna(treatment_start_dt):
55
+ raise ValueError(f"Treatment start period '{treatment_period_start}' converted to NaT.")
56
+
57
+ logger.info(f"Comparing time_var '{time_var}' (as datetime) with treatment_start_dt '{treatment_start_dt}' (as datetime).")
58
+ return (time_series_dt >= treatment_start_dt).astype(int)
59
+ except Exception as e:
60
+ logger.error(f"Failed to compare time variable '{time_var}' with treatment start '{treatment_period_start}' using datetime logic: {e}", exc_info=True)
61
+ raise TypeError(f"Could not compare time variable '{time_var}' with treatment start '{treatment_period_start}'. Ensure they are comparable or convertible to datetime. Error: {e}")
62
+ except Exception as ex:
63
+ # Catch any other unexpected errors during the initial numeric processing
64
+ logger.error(f"Unexpected error processing time_var '{time_var}' for post indicator: {ex}", exc_info=True)
65
+ raise TypeError(f"Unexpected error processing time_var '{time_var}': {ex}")
auto_causal/methods/generalized_propensity_score/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """
2
+ Generalized Propensity Score (GPS) method for continuous treatments.
3
+ """
auto_causal/methods/generalized_propensity_score/diagnostics.py ADDED
@@ -0,0 +1,196 @@
1
+ """
2
+ Diagnostic checks for the Generalized Propensity Score (GPS) method.
3
+ """
4
+ from typing import Dict, List, Any
5
+ import pandas as pd
6
+ import logging
7
+ import numpy as np
8
+ import statsmodels.api as sm
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ def assess_gps_balance(
13
+ df_with_gps: pd.DataFrame,
14
+ treatment_var: str,
15
+ covariate_vars: List[str],
16
+ gps_col_name: str,
17
+ **kwargs: Any
18
+ ) -> Dict[str, Any]:
19
+ """
20
+ Assesses the balance of covariates conditional on the estimated GPS.
21
+
22
+ This function is typically called after GPS estimation to validate the
23
+ assumption that covariates are independent of treatment conditional on GPS.
24
+
25
+ Args:
26
+ df_with_gps: DataFrame containing the original data plus the estimated GPS column.
27
+ treatment_var: The name of the continuous treatment variable column.
28
+ covariate_vars: A list of covariate column names to check for balance.
29
+ gps_col_name: The name of the column containing the estimated GPS values.
30
+ **kwargs: Additional arguments (e.g., number of strata for checking balance).
31
+
32
+ Returns:
33
+ A dictionary containing balance statistics and summaries. For example:
34
+ {
35
+ "overall_balance_metric": 0.05,
36
+ "covariate_balance": {
37
+ "cov1": {"statistic": 0.03, "p_value": 0.5, "balanced": True},
38
+ "cov2": {"statistic": 0.12, "p_value": 0.02, "balanced": False}
39
+ },
40
+ "summary": "Balance assessment complete."
41
+ }
42
+ """
43
+ logger.info(f"Assessing GPS balance for covariates: {covariate_vars}")
44
+
45
+ # Default to 5 strata (quintiles) if not specified
46
+ num_strata = kwargs.get('num_strata', 5)
47
+ if not isinstance(num_strata, int) or num_strata <= 1:
48
+ logger.warning(f"Invalid num_strata ({num_strata}), defaulting to 5.")
49
+ num_strata = 5
50
+
51
+ balance_results = {}
52
+ overall_summary = {
53
+ "num_strata_used": num_strata,
54
+ "covariates_tested": len(covariate_vars),
55
+ "warnings": [],
56
+ "all_strata_coefficients": {cov: [] for cov in covariate_vars},
57
+ "all_strata_p_values": {cov: [] for cov in covariate_vars}
58
+ }
59
+
60
+ if df_with_gps[gps_col_name].isnull().all():
61
+ logger.error(f"All GPS scores in column '{gps_col_name}' are NaN. Cannot perform balance assessment.")
62
+ overall_summary["error"] = "All GPS scores are NaN."
63
+ return {
64
+ "error": "All GPS scores are NaN.",
65
+ "summary": "Balance assessment failed."
66
+ }
67
+
68
+ try:
69
+ # Create GPS strata (e.g., quintiles)
70
+ # Ensure unique bin edges for qcut, duplicates='drop' will handle cases with sparse GPS values
71
+ # but might result in fewer than num_strata if GPS distribution is highly skewed or has few unique values.
72
+ try:
73
+ df_with_gps['gps_stratum'] = pd.qcut(df_with_gps[gps_col_name], num_strata, labels=False, duplicates='drop')
74
+ actual_num_strata = df_with_gps['gps_stratum'].nunique()
75
+ if actual_num_strata < num_strata and actual_num_strata > 0:
76
+ logger.warning(f"Requested {num_strata} strata, but due to GPS distribution, only {actual_num_strata} could be formed.")
77
+ overall_summary["warnings"].append(f"Only {actual_num_strata} strata formed out of {num_strata} requested.")
78
+ overall_summary["actual_num_strata_formed"] = actual_num_strata
79
+ except ValueError as ve:
80
+ logger.error(f"Could not create strata using pd.qcut due to: {ve}. This might happen if GPS has too few unique values.")
81
+ logger.info("Attempting to use unique GPS values as strata if count is low.")
82
+ unique_gps_count = df_with_gps[gps_col_name].nunique()
83
+ if unique_gps_count <= num_strata * 2 and unique_gps_count > 1: # Arbitrary threshold to try unique values as strata
84
+ strata_map = {val: i for i, val in enumerate(df_with_gps[gps_col_name].unique())}
85
+ df_with_gps['gps_stratum'] = df_with_gps[gps_col_name].map(strata_map)
86
+ actual_num_strata = df_with_gps['gps_stratum'].nunique()
87
+ overall_summary["actual_num_strata_formed"] = actual_num_strata
88
+ overall_summary["warnings"].append(f"Used {actual_num_strata} unique GPS values as strata due to qcut error.")
89
+ else:
90
+ overall_summary["error"] = f"Failed to create GPS strata: {ve}. GPS may have too few unique values."
91
+ return {
92
+ "error": overall_summary["error"],
93
+ "summary": "Balance assessment failed due to strata creation issues."
94
+ }
95
+
96
+ if df_with_gps['gps_stratum'].isnull().all():
97
+ logger.error("Stratum assignment resulted in all NaNs.")
98
+ overall_summary["error"] = "Stratum assignment resulted in all NaNs."
99
+ return {"error": overall_summary["error"], "summary": "Balance assessment failed."}
100
+
101
+
102
+ for cov in covariate_vars:
103
+ balance_results[cov] = {
104
+ "strata_details": [],
105
+ "mean_abs_coefficient": None,
106
+ "num_significant_strata_p005": 0,
107
+ "balanced_heuristic": True # Assume balanced until proven otherwise
108
+ }
109
+ coeffs_for_cov = []
110
+ p_values_for_cov = []
111
+
112
+ for stratum_idx in sorted(df_with_gps['gps_stratum'].dropna().unique()):
113
+ stratum_data = df_with_gps[df_with_gps['gps_stratum'] == stratum_idx]
114
+ stratum_detail = {"stratum_index": int(stratum_idx), "n_obs": len(stratum_data)}
115
+
116
+ if len(stratum_data) < 10: # Need a minimum number of observations for stable regression
117
+ stratum_detail["status"] = "Skipped (too few observations)"
118
+ stratum_detail["coefficient_on_treatment"] = np.nan
119
+ stratum_detail["p_value_on_treatment"] = np.nan
120
+ balance_results[cov]["strata_details"].append(stratum_detail)
121
+ continue
122
+
123
+ # Ensure covariate and treatment have variance within the stratum
124
+ if stratum_data[cov].nunique() < 2 or stratum_data[treatment_var].nunique() < 2:
125
+ stratum_detail["status"] = "Skipped (no variance in cov or treatment)"
126
+ stratum_detail["coefficient_on_treatment"] = np.nan
127
+ stratum_detail["p_value_on_treatment"] = np.nan
128
+ balance_results[cov]["strata_details"].append(stratum_detail)
129
+ continue
130
+
131
+ try:
132
+ X_balance = sm.add_constant(stratum_data[[treatment_var]])
133
+ y_balance = stratum_data[cov]
134
+
135
+ # Drop NaNs for this specific regression within stratum
136
+ temp_df = pd.concat([y_balance, X_balance], axis=1).dropna()
137
+ if len(temp_df) < X_balance.shape[1] +1: # Check for enough data points after NaNs for regression
138
+ stratum_detail["status"] = "Skipped (too few non-NaN obs for regression)"
139
+ stratum_detail["coefficient_on_treatment"] = np.nan
140
+ stratum_detail["p_value_on_treatment"] = np.nan
141
+ balance_results[cov]["strata_details"].append(stratum_detail)
142
+ continue
143
+
144
+ y_balance_fit = temp_df[cov]
145
+ X_balance_fit = temp_df[[col for col in temp_df.columns if col != cov]]
146
+
147
+ balance_model = sm.OLS(y_balance_fit, X_balance_fit).fit()
148
+ coeff = balance_model.params.get(treatment_var, np.nan)
149
+ p_value = balance_model.pvalues.get(treatment_var, np.nan)
150
+
151
+ coeffs_for_cov.append(coeff)
152
+ p_values_for_cov.append(p_value)
153
+ overall_summary["all_strata_coefficients"][cov].append(coeff)
154
+ overall_summary["all_strata_p_values"][cov].append(p_value)
155
+
156
+ stratum_detail["status"] = "Analyzed"
157
+ stratum_detail["coefficient_on_treatment"] = coeff
158
+ stratum_detail["p_value_on_treatment"] = p_value
159
+ if not pd.isna(p_value) and p_value < 0.05:
160
+ balance_results[cov]["num_significant_strata_p005"] += 1
161
+ balance_results[cov]["balanced_heuristic"] = False # If any stratum is unbalanced
162
+
163
+ except Exception as e_bal:
164
+ logger.debug(f"Balance check regression failed for {cov} in stratum {stratum_idx}: {e_bal}")
165
+ stratum_detail["status"] = f"Error: {str(e_bal)}"
166
+ stratum_detail["coefficient_on_treatment"] = np.nan
167
+ stratum_detail["p_value_on_treatment"] = np.nan
168
+
169
+ balance_results[cov]["strata_details"].append(stratum_detail)
170
+
171
+ if coeffs_for_cov:
172
+ balance_results[cov]["mean_abs_coefficient"] = np.nanmean(np.abs(coeffs_for_cov))
173
+ else:
174
+ balance_results[cov]["mean_abs_coefficient"] = np.nan # No strata were analyzable
175
+
176
+ overall_summary["num_covariates_potentially_imbalanced_p005"] = sum(
177
+ 1 for cov_data in balance_results.values() if not cov_data["balanced_heuristic"]
178
+ )
179
+
180
+ except Exception as e:
181
+ logger.error(f"Error during GPS balance assessment: {e}", exc_info=True)
182
+ overall_summary["error"] = f"Overall assessment error: {str(e)}"
183
+ return {
184
+ "error": str(e),
185
+ "balance_results": balance_results,
186
+ "summary_stats": overall_summary,
187
+ "summary": "Balance assessment failed due to an unexpected error."
188
+ }
189
+
190
+ logger.info("GPS balance assessment complete.")
191
+
192
+ return {
193
+ "balance_results_per_covariate": balance_results,
194
+ "summary_stats": overall_summary,
195
+ "summary": "GPS balance assessment finished. Review strata details and mean absolute coefficients."
196
+ }
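For intuition, a condensed, self-contained sketch of the stratified balance check implemented above (synthetic data with a single covariate; real use goes through assess_gps_balance):

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
n = 500
x = rng.normal(size=n)                                        # covariate
t = 0.8 * x + rng.normal(size=n)                              # continuous treatment
gps = np.exp(-((t - 0.8 * x) ** 2) / 2) / np.sqrt(2 * np.pi)  # conditional density of t given x
df = pd.DataFrame({"x": x, "t": t, "gps": gps})

df["stratum"] = pd.qcut(df["gps"], 5, labels=False, duplicates="drop")
for stratum, grp in df.groupby("stratum"):
    fit = sm.OLS(grp["x"], sm.add_constant(grp[["t"]])).fit()
    # Within a stratum of similar GPS, the check looks for a small, insignificant coefficient on t.
    print(int(stratum), round(fit.params["t"], 3), round(fit.pvalues["t"], 3))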
auto_causal/methods/generalized_propensity_score/estimator.py ADDED
@@ -0,0 +1,386 @@
1
+ """
2
+ Core estimation logic for the Generalized Propensity Score (GPS) method.
3
+ """
4
+ from typing import Dict, List, Any
5
+ import pandas as pd
6
+ import logging
7
+ import numpy as np
8
+ import statsmodels.api as sm
9
+
10
+ from .diagnostics import assess_gps_balance # Import for balance check
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ def estimate_effect_gps(
15
+ df: pd.DataFrame,
16
+ treatment: str,
17
+ outcome: str,
18
+ covariates: List[str],
19
+ **kwargs: Any
20
+ ) -> Dict[str, Any]:
21
+ """
22
+ Estimates the causal effect using the Generalized Propensity Score method
23
+ for continuous treatments.
24
+
25
+ This function will be called by the method_executor_tool.
26
+
27
+ Args:
28
+ df: The input DataFrame.
29
+ treatment: The name of the continuous treatment variable column.
30
+ outcome: The name of the outcome variable column.
31
+ covariates: A list of covariate column names.
32
+ **kwargs: Additional arguments for controlling the estimation, including:
33
+ - gps_model_spec (dict): Specification for the GPS model (T ~ X).
34
+ - outcome_model_spec (dict): Specification for the outcome model (Y ~ T, GPS).
35
+ - t_values_range (list or dict): Specification for treatment levels for ADRF.
36
+ - n_bootstraps (int): Number of bootstrap replications for SEs.
37
+
38
+ Returns:
39
+ A dictionary containing the estimation results, including:
40
+ - "effect_estimate": Typically the ADRF or a specific contrast.
41
+ - "standard_error": Standard error for the primary effect estimate.
42
+ - "confidence_interval": Confidence interval for the primary estimate.
43
+ - "adrf_curve": Data representing the Average Dose-Response Function.
44
+ - "specific_contrasts": Any calculated specific contrasts.
45
+ - "diagnostics": Results from diagnostic checks (e.g., balance).
46
+ - "method_details": Description of the method and models used.
47
+ - "parameters_used": Dictionary of parameters used.
48
+ """
49
+ logger.info(f"Starting GPS estimation for treatment '{treatment}', outcome '{outcome}'.")
50
+
51
+ # --- Parameter Extraction and Defaults ---
52
+ gps_model_spec = kwargs.get('gps_model_spec', {"type": "linear"})
53
+ outcome_model_spec = kwargs.get('outcome_model_spec', {"type": "polynomial", "degree": 2, "interaction": True})
54
+
55
+ # Get t_values for ADRF from llm_assist or kwargs, default to 10 points over observed range
56
+ # For simplicity, we'll use a simple range here. In a full impl, this might call llm_assist.
57
+ t_values_for_adrf = kwargs.get('t_values_for_adrf')
58
+ if t_values_for_adrf is None:
59
+ min_t_obs = df[treatment].min()
60
+ max_t_obs = df[treatment].max()
61
+ if pd.isna(min_t_obs) or pd.isna(max_t_obs) or min_t_obs == max_t_obs:
62
+ logger.warning(f"Cannot determine a valid range for treatment '{treatment}' for ADRF. Using limited points.")
63
+ t_values_for_adrf = sorted(list(df[treatment].dropna().unique()))[:10] # Fallback
64
+ else:
65
+ t_values_for_adrf = np.linspace(min_t_obs, max_t_obs, 10).tolist()
66
+
67
+ n_bootstraps = kwargs.get('n_bootstraps', 0) # Default to 0, meaning no bootstrap for now
68
+
69
+ logger.info(f"Using GPS model spec: {gps_model_spec}")
70
+ logger.info(f"Using outcome model spec: {outcome_model_spec}")
71
+ logger.info(f"Evaluating ADRF at t-values: {t_values_for_adrf}")
72
+
73
+ try:
74
+ # 2. Estimate GPS Values
75
+ df_with_gps, gps_estimation_diagnostics = _estimate_gps_values(
76
+ df.copy(), treatment, covariates, gps_model_spec
77
+ )
78
+ if 'gps_score' not in df_with_gps.columns or df_with_gps['gps_score'].isnull().all():
79
+ logger.error("GPS estimation failed or resulted in all NaNs.")
80
+ return {
81
+ "error": "GPS estimation failed.",
82
+ "diagnostics": gps_estimation_diagnostics,
83
+ "method_details": "GPS (Failed)",
84
+ "parameters_used": kwargs
85
+ }
86
+
87
+ # Drop rows where GPS or outcome or necessary modeling variables are NaN before proceeding
88
+ modeling_cols = [outcome, treatment, 'gps_score'] + covariates
89
+ df_with_gps.dropna(subset=modeling_cols, inplace=True)
90
+ if df_with_gps.empty:
91
+ logger.error("DataFrame is empty after GPS estimation and NaN removal.")
92
+ return {"error": "No data available after GPS estimation and NaN removal.", "method_details": "GPS (Failed)", "parameters_used": kwargs}
93
+
94
+
95
+ # 3. Assess GPS Balance (call diagnostics.assess_gps_balance)
96
+ balance_diagnostics = assess_gps_balance(
97
+ df_with_gps, treatment, covariates, 'gps_score' # kwargs for assess_gps_balance can be passed if needed
98
+ )
99
+
100
+ # 4. Estimate Outcome Model
101
+ fitted_outcome_model = _estimate_outcome_model(
102
+ df_with_gps, outcome, treatment, 'gps_score', outcome_model_spec
103
+ )
104
+
105
+ # 5. Generate Dose-Response Function
106
+ adrf_results = _generate_dose_response_function(
107
+ df_with_gps, fitted_outcome_model, treatment, 'gps_score', outcome_model_spec, t_values_for_adrf
108
+ )
109
+ adrf_curve_data = {"t_levels": t_values_for_adrf, "expected_outcomes": adrf_results}
110
+
111
+ # 6. Calculate specific contrasts if requested (Placeholder)
112
+ specific_contrasts = {"info": "Specific contrasts not implemented in this version."}
113
+
114
+ # 7. Perform bootstrapping for SEs if requested (Placeholder for now)
115
+ standard_error_info = {"info": "Bootstrap SEs not implemented in this version."}
116
+ confidence_interval_info = {"info": "Bootstrap CIs not implemented in this version."}
117
+ if n_bootstraps > 0:
118
+ logger.info(f"Bootstrapping with {n_bootstraps} replications (placeholder).")
119
+ # Actual bootstrapping logic would go here.
120
+ # For now, we'll just note that it's not implemented.
121
+
122
+ logger.info("GPS estimation steps completed.")
123
+
124
+ # Consolidate diagnostics
125
+ all_diagnostics = {
126
+ "gps_estimation_diagnostics": gps_estimation_diagnostics,
127
+ "balance_check": balance_diagnostics, # Now using the actual balance check results
128
+ "outcome_model_summary": str(fitted_outcome_model.summary()) if fitted_outcome_model else "Outcome model not fitted.",
129
+ "warnings": [], # Populate with any warnings during the process
130
+ "summary": "GPS estimation complete."
131
+ }
132
+
133
+ return {
134
+ "effect_estimate": adrf_curve_data, # The ADRF is the primary "effect"
135
+ "standard_error_info": standard_error_info, # Placeholder
136
+ "confidence_interval_info": confidence_interval_info, # Placeholder
137
+ "adrf_curve": adrf_curve_data,
138
+ "specific_contrasts": specific_contrasts, # Placeholder
139
+ "diagnostics": all_diagnostics,
140
+ "method_details": f"Generalized Propensity Score (GPS) with {gps_model_spec.get('type', 'N/A')} GPS model and {outcome_model_spec.get('type', 'N/A')} outcome model.",
141
+ "parameters_used": {
142
+ "treatment_var": treatment,
143
+ "outcome_var": outcome,
144
+ "covariate_vars": covariates,
145
+ "gps_model_spec": gps_model_spec,
146
+ "outcome_model_spec": outcome_model_spec,
147
+ "t_values_for_adrf": t_values_for_adrf,
148
+ "n_bootstraps": n_bootstraps,
149
+ **kwargs
150
+ }
151
+ }
152
+ except Exception as e:
153
+ logger.error(f"Error during GPS estimation pipeline: {e}", exc_info=True)
154
+ return {
155
+ "error": f"Pipeline failed: {str(e)}",
156
+ "method_details": "GPS (Failed)",
157
+ "diagnostics": {"error": f"Pipeline failed during GPS estimation: {str(e)}"}, # Add diagnostics here too
158
+ "parameters_used": kwargs
159
+ }
160
+
161
+
162
+ # Placeholder for internal helper functions
163
+ def _estimate_gps_values(
164
+ df: pd.DataFrame,
165
+ treatment: str,
166
+ covariates: List[str],
167
+ gps_model_spec: Dict
168
+ ) -> tuple[pd.DataFrame, Dict]:
169
+ """
170
+ Estimates Generalized Propensity Scores.
171
+ Assumes T | X ~ N(X*beta, sigma^2), so GPS is the conditional density.
172
+ """
173
+ logger.info(f"Estimating GPS for treatment '{treatment}' using covariates: {covariates}")
174
+ diagnostics = {}
175
+
176
+ if not covariates:
177
+ logger.error("No covariates provided for GPS estimation.")
178
+ diagnostics["error"] = "No covariates provided."
179
+ df['gps_score'] = np.nan # Ensure gps_score column is added
180
+ return df, diagnostics
181
+
182
+ X_df = df[covariates]
183
+ T_series = df[treatment]
184
+
185
+ # Handle potential NaN values in covariates or treatment before modeling
186
+ valid_indices = X_df.dropna().index.intersection(T_series.dropna().index)
187
+ if len(valid_indices) < len(df):
188
+ logger.warning(f"Dropped {len(df) - len(valid_indices)} rows due to NaNs in treatment/covariates before GPS estimation.")
189
+ diagnostics["pre_estimation_nan_rows_dropped"] = len(df) - len(valid_indices)
190
+
191
+ X = X_df.loc[valid_indices]
192
+ T = T_series.loc[valid_indices]
193
+
194
+ if X.empty or T.empty:
195
+ logger.error("Covariate or treatment data is empty after NaN handling.")
196
+ diagnostics["error"] = "Covariate or treatment data is empty after NaN handling."
197
+ return df, diagnostics
198
+
199
+ X_sm = sm.add_constant(X, has_constant='add')
200
+
201
+ try:
202
+ if gps_model_spec.get("type") == 'linear':
203
+ model = sm.OLS(T, X_sm).fit()
204
+ t_hat = model.predict(X_sm)
205
+ residuals = T - t_hat
206
+ # MSE: sum of squared residuals / (n - k) where k is number of regressors (including const)
207
+ if len(T) <= X_sm.shape[1]:
208
+ logger.error("Not enough degrees of freedom to estimate sigma_sq_hat.")
209
+ diagnostics["error"] = "Not enough degrees of freedom for GPS variance."
210
+ df['gps_score'] = np.nan
211
+ return df, diagnostics
212
+
213
+ sigma_sq_hat = np.sum(residuals**2) / (len(T) - X_sm.shape[1])
214
+
215
+ if sigma_sq_hat <= 1e-9: # Check for effectively zero or very small variance
216
+ logger.warning(f"Estimated residual variance (sigma_sq_hat) is very close to zero ({sigma_sq_hat}). GPS will be set to NaN.")
217
+ diagnostics["warning_sigma_sq_hat_near_zero"] = sigma_sq_hat
218
+ df['gps_score'] = np.nan # Set GPS to NaN as density is ill-defined
219
+ if sigma_sq_hat == 0: # if it is exactly zero, add specific error
220
+ diagnostics["error_sigma_sq_hat_is_zero"] = "Residual variance is exactly zero."
221
+ return df, diagnostics
222
+
223
+
224
+ # Calculate GPS: (1 / sqrt(2*pi*sigma_hat^2)) * exp(-(T_i - T_hat_i)^2 / (2*sigma_hat^2))
225
+ # Ensure calculation is done on the original T values (T_series.loc[valid_indices])
226
+ # and corresponding t_hat for those valid_indices
227
+ gps_values_calculated = (1 / np.sqrt(2 * np.pi * sigma_sq_hat)) * np.exp(-((T - t_hat)**2) / (2 * sigma_sq_hat))
228
+
229
+ # Assign back to the original DataFrame using .loc to ensure alignment
230
+ df['gps_score'] = np.nan # Initialize column
231
+ df.loc[valid_indices, 'gps_score'] = gps_values_calculated
232
+
233
+ diagnostics["gps_model_type"] = "linear_ols"
234
+ diagnostics["gps_model_rsquared"] = model.rsquared
235
+ diagnostics["gps_residual_variance_mse"] = sigma_sq_hat
236
+ diagnostics["num_observations_for_gps_model"] = len(T)
237
+
238
+ else:
239
+ logger.error(f"GPS model type '{gps_model_spec.get('type')}' not implemented.")
240
+ diagnostics["error"] = f"GPS model type '{gps_model_spec.get('type')}' not implemented."
241
+ df['gps_score'] = np.nan
242
+
243
+ except Exception as e:
244
+ logger.error(f"Error during GPS model estimation: {e}", exc_info=True)
245
+ diagnostics["error"] = f"Exception during GPS estimation: {str(e)}"
246
+ df['gps_score'] = np.nan
247
+
248
+ # Ensure the gps_score column exists even if an early exit above skipped the assignment
249
+ if 'gps_score' not in df.columns:
250
+ df['gps_score'] = np.nan
251
+
252
+ return df, diagnostics
253
+
254
+ def _estimate_outcome_model(
255
+ df_with_gps: pd.DataFrame,
256
+ outcome: str,
257
+ treatment: str,
258
+ gps_col_name: str,
259
+ outcome_model_spec: Dict
260
+ ) -> Any: # Returns a fitted statsmodels model
261
+ """
262
+ Estimates the outcome model Y ~ f(T, GPS).
263
+ """
264
+ logger.info(f"Estimating outcome model for '{outcome}' using T='{treatment}', GPS='{gps_col_name}'")
265
+
266
+ Y = df_with_gps[outcome]
267
+ T_val = pd.Series(df_with_gps[treatment].values, index=df_with_gps.index)
268
+ GPS_val = pd.Series(df_with_gps[gps_col_name].values, index=df_with_gps.index)
269
+
270
+ X_outcome_dict = {'intercept': np.ones(len(df_with_gps))}
271
+
272
+ model_type = outcome_model_spec.get("type", "polynomial")
273
+ degree = outcome_model_spec.get("degree", 2)
274
+ interaction = outcome_model_spec.get("interaction", True)
275
+
276
+ if model_type == "polynomial":
277
+ X_outcome_dict['T'] = T_val
278
+ X_outcome_dict['GPS'] = GPS_val
279
+ if degree >= 2:
280
+ X_outcome_dict['T_sq'] = T_val**2
281
+ X_outcome_dict['GPS_sq'] = GPS_val**2
282
+ if degree >=3: # Example for higher order, can be made more general
283
+ X_outcome_dict['T_cub'] = T_val**3
284
+ X_outcome_dict['GPS_cub'] = GPS_val**3
285
+ if interaction:
286
+ X_outcome_dict['T_x_GPS'] = T_val * GPS_val
287
+ if degree >=2: # Interaction with squared terms if degree allows
288
+ X_outcome_dict['T_sq_x_GPS'] = (T_val**2) * GPS_val
289
+ X_outcome_dict['T_x_GPS_sq'] = T_val * (GPS_val**2)
290
+
291
+ # Add more model types as needed (e.g., splines)
292
+ else:
293
+ logger.warning(f"Outcome model type '{model_type}' not fully recognized. Defaulting to T + GPS.")
294
+ X_outcome_dict['T'] = T_val
295
+ X_outcome_dict['GPS'] = GPS_val
296
+ # Fallback to linear if spec is unknown or simple
297
+
298
+ X_outcome_df = pd.DataFrame(X_outcome_dict, index=df_with_gps.index)
299
+
300
+ # Drop rows with NaNs that might have been introduced by transformations if T or GPS were NaN
301
+ # (though earlier dropna should handle most of this for input T/GPS)
302
+ valid_outcome_model_indices = Y.dropna().index.intersection(X_outcome_df.dropna().index)
303
+ if len(valid_outcome_model_indices) < len(df_with_gps):
304
+ logger.warning(f"Dropped {len(df_with_gps) - len(valid_outcome_model_indices)} rows due to NaNs before outcome model fitting.")
305
+
306
+ Y_fit = Y.loc[valid_outcome_model_indices]
307
+ X_outcome_df_fit = X_outcome_df.loc[valid_outcome_model_indices]
308
+
309
+ if Y_fit.empty or X_outcome_df_fit.empty:
310
+ logger.error("Not enough data to fit outcome model after NaN handling.")
311
+ raise ValueError("Empty data for outcome model fitting.")
312
+
313
+ try:
314
+ model = sm.OLS(Y_fit, X_outcome_df_fit).fit()
315
+ logger.info("Outcome model estimated successfully.")
316
+ return model
317
+ except Exception as e:
318
+ logger.error(f"Error during outcome model estimation: {e}", exc_info=True)
319
+ raise # Re-raise the exception to be caught by the main try-except block
320
+
321
+ def _generate_dose_response_function(
322
+ df_with_gps: pd.DataFrame,
323
+ fitted_outcome_model: Any,
324
+ treatment: str,
325
+ gps_col_name: str,
326
+ outcome_model_spec: Dict, # To know how to construct X_pred features
327
+ t_values_to_evaluate: List[float]
328
+ ) -> List[float]:
329
+ """
330
+ Calculates the Average Dose-Response Function (ADRF).
331
+ E[Y(t)] = integral over E[Y | T=t, GPS=g] * f(g) dg
332
+ ~= (1/N) * sum_i E[Y | T=t, GPS=g_i] (using observed GPS values)
333
+ """
334
+ logger.info(f"Calculating ADRF for treatment levels: {t_values_to_evaluate}")
335
+ adrf_estimates = []
336
+
337
+ if not t_values_to_evaluate: # Handle empty list case
338
+ logger.warning("t_values_to_evaluate is empty. ADRF calculation will be skipped.")
339
+ return []
340
+
341
+ model_exog_names = fitted_outcome_model.model.exog_names
342
+
343
+ # Original GPS values from the dataframe
344
+ original_gps_values = pd.Series(df_with_gps[gps_col_name].values, index=df_with_gps.index)
345
+
346
+ for t_level in t_values_to_evaluate:
347
+ # Create a new DataFrame for prediction at this t_level
348
+ # Each row corresponds to an original observation's GPS, but with T set to t_level
349
+ X_pred_dict = {'intercept': np.ones(len(df_with_gps))}
350
+
351
+ # Reconstruct features based on outcome_model_spec and model_exog_names
352
+ # This mirrors the construction in _estimate_outcome_model
353
+ degree = outcome_model_spec.get("degree", 2)
354
+ interaction = outcome_model_spec.get("interaction", True)
355
+
356
+ if 'T' in model_exog_names: X_pred_dict['T'] = t_level
357
+ if 'GPS' in model_exog_names: X_pred_dict['GPS'] = original_gps_values
358
+
359
+ if 'T_sq' in model_exog_names: X_pred_dict['T_sq'] = t_level**2
360
+ if 'GPS_sq' in model_exog_names: X_pred_dict['GPS_sq'] = original_gps_values**2
361
+
362
+ if 'T_cub' in model_exog_names: X_pred_dict['T_cub'] = t_level**3 # Example
363
+ if 'GPS_cub' in model_exog_names: X_pred_dict['GPS_cub'] = original_gps_values**3 # Example
364
+
365
+ if 'T_x_GPS' in model_exog_names and interaction:
366
+ X_pred_dict['T_x_GPS'] = t_level * original_gps_values
367
+ if 'T_sq_x_GPS' in model_exog_names and interaction and degree >=2:
368
+ X_pred_dict['T_sq_x_GPS'] = (t_level**2) * original_gps_values
369
+ if 'T_x_GPS_sq' in model_exog_names and interaction and degree >=2:
370
+ X_pred_dict['T_x_GPS_sq'] = t_level * (original_gps_values**2)
371
+
372
+ X_pred_df = pd.DataFrame(X_pred_dict, index=df_with_gps.index)
373
+
374
+ # Ensure all required columns are present and in the correct order
375
+ # Drop any rows that might have NaNs if original_gps_values had NaNs (though they should be filtered before this)
376
+ X_pred_df_fit = X_pred_df[model_exog_names].dropna()
377
+
378
+ if X_pred_df_fit.empty:
379
+ logger.warning(f"Prediction data for t_level={t_level} is empty after NaN drop. Assigning NaN to ADRF point.")
380
+ adrf_estimates.append(np.nan)
381
+ continue
382
+
383
+ predicted_outcomes_at_t = fitted_outcome_model.predict(X_pred_df_fit)
384
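+ # Average over the empirical distribution of observed GPS values: the sample analogue of
+ # integrating E[Y | T=t, GPS] over the GPS density (see the docstring formula above).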
+ adrf_estimates.append(np.mean(predicted_outcomes_at_t))
385
+
386
+ return adrf_estimates
auto_causal/methods/generalized_propensity_score/llm_assist.py ADDED
@@ -0,0 +1,208 @@
1
+ """
2
+ LLM-assisted components for the Generalized Propensity Score (GPS) method.
3
+
4
+ These functions help in suggesting model specifications or parameters
5
+ by leveraging an LLM, providing intelligent defaults when not specified by the user.
6
+ """
7
+ from typing import Dict, List, Any, Optional
8
+ import pandas as pd
+ import numpy as np
9
+ import logging
10
+ from auto_causal.utils.llm_helpers import call_llm_with_json_output # Hypothetical import
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ def suggest_treatment_model_spec(
15
+ df: pd.DataFrame,
16
+ treatment_var: str,
17
+ covariate_vars: List[str],
18
+ query: Optional[str] = None,
19
+ llm_client: Optional[Any] = None
20
+ ) -> Dict[str, Any]:
21
+ """
22
+ Suggests a model specification for the treatment mechanism (T ~ X) in GPS.
23
+
24
+ Args:
25
+ df: The input DataFrame.
26
+ treatment_var: The name of the continuous treatment variable.
27
+ covariate_vars: A list of covariate names.
28
+ query: Optional user query for context.
29
+ llm_client: Optional LLM client for making a call.
30
+
31
+ Returns:
32
+ A dictionary representing the suggested model specification.
33
+ E.g., {"type": "linear", "formula": "T ~ X1 + X2"} or
34
+ {"type": "random_forest", "params": {...}}
35
+ """
36
+ logger.info(f"Suggesting treatment model spec for: {treatment_var}")
37
+
38
+ # Example of constructing a more detailed prompt for an LLM
39
+ prompt_parts = [
40
+ f"You are an expert econometrician. The user wants to estimate a Generalized Propensity Score (GPS) for a continuous treatment variable '{treatment_var}'.",
41
+ f"The available covariates are: {covariate_vars}.",
42
+ f"The user's research query is: '{query if query else 'Not specified'}'.",
43
+ "Based on this information and general best practices for GPS estimation:",
44
+ "1. Suggest a suitable model type for estimating the treatment (T) given covariates (X). Common choices include 'linear' (OLS), or flexible models like 'random_forest' or 'gradient_boosting' if non-linearities are suspected.",
45
+ "2. If suggesting a regression model like OLS, provide a Patsy-style formula string (e.g., 'treatment ~ cov1 + cov2 + cov1*cov2').",
46
+ "3. If suggesting a machine learning model, list key hyperparameters and reasonable starting values (e.g., n_estimators, max_depth).",
47
+ "Return your suggestion as a JSON object with the following structure:",
48
+ '''
49
+ {
50
+ "model_type": "<e.g., linear, random_forest>",
51
+ "formula": "<Patsy formula if model_type is linear/glm, else null>",
52
+ "parameters": { // if applicable for ML models
53
+ "<param1_name>": "<param1_value>",
54
+ "<param2_name>": "<param2_value>"
55
+ },
56
+ "reasoning": "<Brief justification for your suggestion>"
57
+ }
58
+ '''
59
+ ]
60
+ full_prompt = "\n".join(prompt_parts)
61
+
62
+ if llm_client:
63
+ logger.info("LLM client provided. Sending constructed prompt (actual call is hypothetical).")
64
+ logger.debug(f"LLM Prompt for treatment model spec:\n{full_prompt}")
65
+ # In a real implementation:
66
+ # response_json = call_llm_with_json_output(llm_client, full_prompt)
67
+ # if response_json and isinstance(response_json, dict):
68
+ # return response_json
69
+ # else:
70
+ # logger.warning("LLM did not return a valid JSON dict for treatment model spec.")
71
+ pass # Pass for now as it's a hypothetical call
72
+
73
+ # Default suggestion if no LLM or LLM fails
74
+ return {
75
+ "model_type": "linear",
76
+ "formula": f"{treatment_var} ~ {' + '.join(covariate_vars) if covariate_vars else '1'}",
77
+ "parameters": None,
78
+ "reasoning": "Defaulting to a linear model for T ~ X. Consider a more flexible model if non-linearities are expected.",
79
+ "comment": "This is a default suggestion."
80
+ }
81
+
82
+ def suggest_outcome_model_spec(
83
+ df: pd.DataFrame,
84
+ outcome_var: str,
85
+ treatment_var: str,
86
+ gps_col_name: str,
87
+ query: Optional[str] = None,
88
+ llm_client: Optional[Any] = None
89
+ ) -> Dict[str, Any]:
90
+ """
91
+ Suggests a model specification for the outcome mechanism (Y ~ T, GPS) in GPS.
92
+
93
+ Args:
94
+ df: The input DataFrame.
95
+ outcome_var: The name of the outcome variable.
96
+ treatment_var: The name of the continuous treatment variable.
97
+ gps_col_name: The name of the GPS column.
98
+ query: Optional user query for context.
99
+ llm_client: Optional LLM client for making a call.
100
+
101
+ Returns:
102
+ A dictionary representing the suggested model specification.
103
+ E.g., {"type": "polynomial", "degree": 2, "interaction": True,
104
+ "formula": "Y ~ T + T^2 + GPS + GPS^2 + T*GPS"}
105
+ """
106
+ logger.info(f"Suggesting outcome model spec for: {outcome_var}")
107
+
108
+ prompt_parts = [
109
+ f"You are an expert econometrician. For a Generalized Propensity Score (GPS) analysis, the user needs to model the outcome '{outcome_var}' conditional on the continuous treatment '{treatment_var}' and the estimated GPS (column name '{gps_col_name}').",
110
+ "The goal is to flexibly capture the relationship E[Y | T, GPS]. A common approach is to use a polynomial specification for T and GPS, including interaction terms.",
111
+ f"The user's research query is: '{query if query else 'Not specified'}'.",
112
+ "Suggest a specification for this outcome model. Consider:",
113
+ "1. The functional form for T (e.g., linear, quadratic, cubic).",
114
+ "2. The functional form for GPS (e.g., linear, quadratic, cubic).",
115
+ "3. Whether to include interaction terms between T and GPS (e.g., T*GPS, T^2*GPS, T*GPS^2).",
116
+ "Return your suggestion as a JSON object with the following structure:",
117
+ '''
118
+ {
119
+ "model_type": "polynomial", // Or other types like "splines"
120
+ "treatment_terms": ["T", "T_sq"], // e.g., ["T"] for linear, ["T", "T_sq"] for quadratic
121
+ "gps_terms": ["GPS", "GPS_sq"], // e.g., ["GPS"] for linear, ["GPS", "GPS_sq"] for quadratic
122
+ "interaction_terms": ["T_x_GPS", "T_sq_x_GPS", "T_x_GPS_sq"], // Interactions to include, or empty list
123
+ "reasoning": "<Brief justification for your suggestion>"
124
+ }
125
+ '''
126
+ ]
127
+ full_prompt = "\n".join(prompt_parts)
128
+
129
+ if llm_client:
130
+ logger.info("LLM client provided. Sending constructed prompt for outcome model (hypothetical call).")
131
+ logger.debug(f"LLM Prompt for outcome model spec:\n{full_prompt}")
132
+ # In a real implementation:
133
+ # response_json = call_llm_with_json_output(llm_client, full_prompt)
134
+ # if response_json and isinstance(response_json, dict):
135
+ # # Basic validation of expected keys for outcome model could go here
136
+ # return response_json
137
+ # else:
138
+ # logger.warning("LLM did not return a valid JSON dict for outcome model spec.")
139
+ pass # Pass for now
140
+
141
+ # Default suggestion
142
+ return {
143
+ "model_type": "polynomial",
144
+ "treatment_terms": ["T", "T_sq"],
145
+ "gps_terms": ["GPS", "GPS_sq"],
146
+ "interaction_terms": ["T_x_GPS"],
147
+ "reasoning": "Defaulting to a quadratic specification for T and GPS with a simple T*GPS interaction. This is a common starting point.",
148
+ "comment": "This is a default suggestion."
149
+ }
150
+
151
+ def suggest_dose_response_t_values(
152
+ df: pd.DataFrame,
153
+ treatment_var: str,
154
+ num_points: int = 20,
155
+ query: Optional[str] = None,
+ llm_client: Optional[Any] = None
156
+ ) -> List[float]:
157
+ """
158
+ Suggests a relevant range and number of points for estimating the ADRF.
159
+
160
+ Args:
161
+ df: The input DataFrame.
162
+ treatment_var: The name of the continuous treatment variable.
163
+ num_points: Desired number of points for the ADRF curve.
164
+ query: Optional user query for context, referenced in the LLM prompt.
+ llm_client: Optional LLM client for making a call.
165
+
166
+ Returns:
167
+ A list of treatment values at which to evaluate the ADRF.
168
+ """
169
+ logger.info(f"Suggesting dose response t-values for: {treatment_var}")
170
+
171
+ prompt_parts = [
172
+ f"For a Generalized Propensity Score (GPS) analysis with continuous treatment '{treatment_var}', the user needs to estimate an Average Dose-Response Function (ADRF).",
173
+ f"The observed range of '{treatment_var}' is from {df[treatment_var].min():.2f} to {df[treatment_var].max():.2f}.",
174
+ f"The user desires approximately {num_points} points for the ADRF curve.",
175
+ f"The user's research query is: '{query if query else 'Not specified'}'.",
176
+ "Suggest a list of specific treatment values (t_values) at which to evaluate the ADRF. Consider:",
177
+ "1. Covering the observed range of the treatment.",
178
+ "2. Potentially including specific points of policy interest if deducible from the query (though this is advanced).",
179
+ "3. Ensuring a reasonable distribution of points (e.g., equally spaced, or based on quantiles).",
180
+ "Return your suggestion as a JSON object with a single key 't_values' holding a list of floats:",
181
+ '''
182
+ {
183
+ "t_values": [<float>, <float>, ..., <float>],
184
+ "reasoning": "<Brief justification for the choice/distribution of these t_values>"
185
+ }
186
+ '''
187
+ ]
188
+ full_prompt = "\n".join(prompt_parts)
189
+
190
+ if llm_client:
191
+ logger.info("LLM client provided. Sending prompt for t-values (hypothetical call).")
192
+ logger.debug(f"LLM Prompt for t-values:\n{full_prompt}")
193
+ # In a real implementation:
194
+ # response_json = call_llm_with_json_output(llm_client, full_prompt)
195
+ # if response_json and isinstance(response_json, dict) and 't_values' in response_json and isinstance(response_json['t_values'], list):
196
+ # return response_json['t_values'] # Assuming it returns the list directly based on current function signature
197
+ # else:
198
+ # logger.warning("LLM did not return a valid JSON with 't_values' list for ADRF points.")
199
+ pass # Pass for now
200
+
201
+ # Default: Linearly spaced points
202
+ min_t = df[treatment_var].min()
203
+ max_t = df[treatment_var].max()
204
+ if pd.isna(min_t) or pd.isna(max_t) or min_t == max_t:
205
+ logger.warning(f"Could not determine a valid range for treatment '{treatment_var}'. Returning empty list.")
206
+ return []
207
+
208
+ return [float(t) for t in np.linspace(min_t, max_t, num_points)]
auto_causal/methods/instrumental_variable/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .estimator import estimate_effect
auto_causal/methods/instrumental_variable/diagnostics.py ADDED
@@ -0,0 +1,218 @@
1
+ # Placeholder for IV-specific diagnostic functions
2
+ import pandas as pd
3
+ import statsmodels.api as sm
4
+ from statsmodels.regression.linear_model import OLS
5
+ # from statsmodels.sandbox.regression.gmm import IV2SLSResults # Removed problematic import
6
+ from typing import Dict, Any, List, Tuple, Optional
7
+ import logging # Import logging
8
+ import numpy as np # Import numpy for np.zeros
9
+
10
+ # Configure logger
11
+ logger = logging.getLogger(__name__)
12
+
13
+ def calculate_first_stage_f_statistic(df: pd.DataFrame, treatment: str, instruments: List[str], covariates: List[str]) -> Tuple[Optional[float], Optional[float]]:
14
+ """
15
+ Calculates the F-statistic for instrument relevance in the first stage regression.
16
+
17
+ Regresses treatment ~ instruments + covariates.
18
+ Tests the joint significance of the instrument coefficients.
19
+
20
+ Args:
21
+ df: Input DataFrame.
22
+ treatment: Name of the treatment variable.
23
+ instruments: List of instrument variable names.
24
+ covariates: List of covariate names.
25
+
26
+ Returns:
27
+ A tuple containing (F-statistic, p-value). Returns (None, None) on error.
28
+ """
29
+ logger.info("Diagnostics: Calculating First-Stage F-statistic...")
30
+ try:
31
+ df_copy = df.copy()
32
+ df_copy['intercept'] = 1
33
+ exog_vars = ['intercept'] + covariates
34
+ all_first_stage_exog = list(dict.fromkeys(exog_vars + instruments)) # Ensure unique columns
35
+
36
+ endog = df_copy[treatment]
37
+ exog = df_copy[all_first_stage_exog]
38
+
39
+ # Check for perfect multicollinearity before fitting
40
+ if exog.shape[1] > 1:
41
+ corr_matrix = exog.corr()
42
+ # Check if correlation matrix calculation failed (e.g., constant columns) or high correlation
43
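+ # The diagonal alone contributes exog.shape[1] entries of exactly 1, so any additional
+ # near-unit entries indicate an (almost) perfectly collinear pair of regressors.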
+ if corr_matrix.isnull().values.any() or (corr_matrix.abs() > 0.9999).sum().sum() > exog.shape[1]: # Check off-diagonal elements
44
+ logger.warning("High multicollinearity or constant column detected in first stage exogenous variables.")
45
+ # Note: statsmodels OLS might handle perfect collinearity by dropping columns, but F-test might be unreliable.
46
+
47
+ first_stage_model = OLS(endog, exog).fit()
48
+
49
+ # Construct the restriction matrix (R) to test H0: instrument coeffs = 0
50
+ num_instruments = len(instruments)
51
+ if num_instruments == 0:
52
+ logger.warning("No instruments provided for F-statistic calculation.")
53
+ return None, None
54
+ num_exog_total = len(all_first_stage_exog)
55
+
56
+ # Ensure instruments are actually in the fitted model's exog names (in case statsmodels dropped some)
57
+ fitted_exog_names = first_stage_model.model.exog_names
58
+ valid_instruments = [inst for inst in instruments if inst in fitted_exog_names]
59
+ if not valid_instruments:
60
+ logger.error("None of the provided instruments were included in the first-stage regression model (possibly due to collinearity).")
61
+ return None, None
62
+ if len(valid_instruments) < len(instruments):
63
+ logger.warning(f"Instruments dropped by OLS: {set(instruments) - set(valid_instruments)}")
64
+
65
+ instrument_indices = [fitted_exog_names.index(inst) for inst in valid_instruments]
66
+
67
+ # Need to adjust R matrix size based on fitted model's exog
68
+ R = np.zeros((len(valid_instruments), len(fitted_exog_names)))
69
+ for i, idx in enumerate(instrument_indices):
70
+ R[i, idx] = 1
71
+
72
+ # Perform F-test
73
+ f_test_result = first_stage_model.f_test(R)
74
+
75
+ f_statistic = float(f_test_result.fvalue)
76
+ p_value = float(f_test_result.pvalue)
77
+
78
+ logger.info(f" F-statistic: {f_statistic:.4f}, p-value: {p_value:.4f}")
79
+ return f_statistic, p_value
80
+
81
+ except Exception as e:
82
+ logger.error(f"Error calculating first-stage F-statistic: {e}", exc_info=True)
83
+ return None, None
84
+
85
+ def run_overidentification_test(sm_results: Optional[Any], df: pd.DataFrame, treatment: str, outcome: str, instruments: List[str], covariates: List[str]) -> Tuple[Optional[float], Optional[float], Optional[str]]:
86
+ """
87
+ Runs an overidentification test (Sargan-Hansen) if applicable.
88
+
89
+ This test is only valid if the number of instruments exceeds the number
90
+ of endogenous regressors (typically 1, the treatment variable).
91
+
92
+ Requires results from a statsmodels IV estimation.
93
+
94
+ Args:
95
+ sm_results: The fitted results object from statsmodels IV2SLS.fit().
96
+ df: Input DataFrame.
97
+ treatment: Name of the treatment variable.
98
+ outcome: Name of the outcome variable.
99
+ instruments: List of instrument variable names.
100
+ covariates: List of covariate names.
101
+
102
+ Returns:
103
+ Tuple: (test_statistic, p_value, status_message) or (None, None, error_message)
104
+ """
105
+ logger.info("Diagnostics: Running Overidentification Test...")
106
+ num_instruments = len(instruments)
107
+ num_endog = 1 # Assuming only one treatment variable is endogenous
108
+
109
+ if num_instruments <= num_endog:
110
+ logger.info(" Over-ID test not applicable (model is exactly identified or underidentified).")
111
+ return None, None, "Test not applicable (Need more instruments than endogenous regressors)"
112
+
113
+ if sm_results is None or not hasattr(sm_results, 'resid'):
114
+ logger.warning(" Over-ID test requires valid statsmodels results object with residuals.")
115
+ return None, None, "Statsmodels results object not available or invalid for test."
116
+
117
+ try:
118
+ # Statsmodels IV2SLSResults does not seem to have a direct method for this test (as of common versions).
119
+ # We need to calculate it manually using residuals and instruments.
120
+ # Formula: N * R^2 from regressing residuals (u_hat) on all exogenous variables (instruments + covariates).
121
+ # Degrees of freedom = num_instruments - num_endogenous_vars
122
+
123
+ residuals = sm_results.resid
124
+ df_copy = df.copy()
125
+ df_copy['intercept'] = 1
126
+ exog_vars = ['intercept'] + covariates
127
+ all_exog_instruments = list(dict.fromkeys(exog_vars + instruments))
128
+
129
+ # Ensure columns exist in the dataframe before selecting
130
+ missing_cols = [col for col in all_exog_instruments if col not in df_copy.columns]
131
+ if missing_cols:
132
+ raise ValueError(f"Missing columns required for Over-ID test: {missing_cols}")
133
+
134
+ exog_for_test = df_copy[all_exog_instruments]
135
+
136
+ # Check shapes match after potential NA handling in main estimator
137
+ if len(residuals) != exog_for_test.shape[0]:
138
+ # Attempt to align based on index if lengths differ (might happen if NAs were dropped)
139
+ logger.warning(f"Residual length ({len(residuals)}) differs from exog_for_test rows ({exog_for_test.shape[0]}). Trying to align indices.")
140
+ common_index = residuals.index.intersection(exog_for_test.index)
141
+ if len(common_index) == 0:
142
+ raise ValueError("Cannot align residuals and exogenous variables for Over-ID test after NA handling.")
143
+ residuals = residuals.loc[common_index]
144
+ exog_for_test = exog_for_test.loc[common_index]
145
+ logger.warning(f"Aligned to {len(common_index)} common observations.")
146
+
147
+
148
+ # Regress residuals on all exogenous instruments
149
+ aux_model = OLS(residuals, exog_for_test).fit()
150
+ r_squared = aux_model.rsquared
151
+ n_obs = len(residuals) # Use length of residuals after potential alignment
152
+
153
+ test_statistic = n_obs * r_squared
154
+
155
+ # Calculate p-value from Chi-squared distribution
156
+ from scipy.stats import chi2
157
+ degrees_of_freedom = num_instruments - num_endog
158
+ if degrees_of_freedom < 0:
159
+ # This shouldn't happen if the initial check passed, but as a safeguard
160
+ raise ValueError("Degrees of freedom for Sargan test are negative.")
161
+ elif degrees_of_freedom == 0:
162
+ # R-squared should be 0 if exactly identified, but handle edge case
163
+ p_value = 1.0 if np.isclose(test_statistic, 0) else 0.0
164
+ else:
165
+ p_value = chi2.sf(test_statistic, degrees_of_freedom)
166
+
167
+ logger.info(f" Sargan Test Statistic: {test_statistic:.4f}, p-value: {p_value:.4f}, df: {degrees_of_freedom}")
168
+ return test_statistic, p_value, "Test successful"
169
+
170
+ except Exception as e:
171
+ logger.error(f"Error running overidentification test: {e}", exc_info=True)
172
+ return None, None, f"Error during test: {e}"
173
+
174
+ def run_iv_diagnostics(df: pd.DataFrame, treatment: str, outcome: str, instruments: List[str], covariates: List[str], sm_results: Optional[Any] = None, dw_results: Optional[Any] = None) -> Dict[str, Any]:
175
+ """
176
+ Runs standard IV diagnostic checks.
177
+
178
+ Args:
179
+ df: Input DataFrame.
180
+ treatment: Name of the treatment variable.
181
+ outcome: Name of the outcome variable.
182
+ instruments: List of instrument variable names.
183
+ covariates: List of covariate names.
184
+ sm_results: Optional fitted results object from statsmodels IV2SLS.fit().
185
+ dw_results: Optional results object from DoWhy (structure may vary).
186
+
187
+ Returns:
188
+ Dictionary containing diagnostic results.
189
+ """
190
+ diagnostics = {}
191
+
192
+ # 1. Instrument Relevance / Weak Instrument Test (First-Stage F-statistic)
193
+ f_stat, f_p_val = calculate_first_stage_f_statistic(df, treatment, instruments, covariates)
194
+ diagnostics['first_stage_f_statistic'] = f_stat
195
+ diagnostics['first_stage_p_value'] = f_p_val
196
+ diagnostics['is_instrument_weak'] = (f_stat < 10) if f_stat is not None else None # Common rule of thumb
197
+ if f_stat is None:
198
+ diagnostics['weak_instrument_test_status'] = "Error during calculation"
199
+ elif diagnostics['is_instrument_weak']:
200
+ diagnostics['weak_instrument_test_status'] = "Warning: Instrument(s) may be weak (F < 10)"
201
+ else:
202
+ diagnostics['weak_instrument_test_status'] = "Instrument(s) appear sufficiently strong (F >= 10)"
203
+
204
+
205
+ # 2. Overidentification Test (e.g., Sargan-Hansen)
206
+ overid_stat, overid_p_val, overid_status = run_overidentification_test(sm_results, df, treatment, outcome, instruments, covariates)
207
+ diagnostics['overid_test_statistic'] = overid_stat
208
+ diagnostics['overid_test_p_value'] = overid_p_val
209
+ diagnostics['overid_test_status'] = overid_status
210
+ diagnostics['overid_test_applicable'] = bool(overid_status) and "not applicable" not in overid_status.lower()
211
+
212
+ # 3. Exogeneity/Exclusion Restriction (Conceptual Check)
213
+ diagnostics['exclusion_restriction_assumption'] = "Assumed based on graph/input; cannot be statistically tested directly. Qualitative LLM check recommended."
214
+
215
+ # Potential future additions:
216
+ # - Endogeneity tests (e.g., Hausman test - requires comparing OLS and IV estimates)
217
+
218
+ return diagnostics
auto_causal/methods/instrumental_variable/estimator.py ADDED
@@ -0,0 +1,370 @@
1
+ import pandas as pd
2
+ import statsmodels.api as sm
3
+ from statsmodels.sandbox.regression.gmm import IV2SLS
4
+ from dowhy import CausalModel # Primary path
5
+ from typing import Dict, Any, List, Union, Optional
6
+ import logging
7
+ from langchain.chat_models.base import BaseChatModel
8
+
9
+ from .diagnostics import run_iv_diagnostics
10
+ from .llm_assist import identify_instrument_variable, validate_instrument_assumptions_qualitative, interpret_iv_results
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ def build_iv_graph_gml(treatment: str, outcome: str, instruments: List[str], covariates: List[str]) -> str:
15
+ """
16
+ Constructs a GML string representing the causal graph for IV.
17
+
18
+ Assumptions:
19
+ - Instruments cause Treatment
20
+ - Covariates cause Treatment and Outcome
21
+ - Treatment causes Outcome
22
+ - Instruments do NOT directly cause Outcome (Exclusion)
23
+ - Instruments are NOT caused by Covariates (can be relaxed if needed)
24
+ - Unobserved Confounder (U) affects Treatment and Outcome
25
+
26
+ Args:
27
+ treatment: Name of the treatment variable.
28
+ outcome: Name of the outcome variable.
29
+ instruments: List of instrument variable names.
30
+ covariates: List of covariate names.
31
+
32
+ Returns:
33
+ A GML graph string.
34
+ """
35
+ nodes = []
36
+ edges = []
37
+
38
+ # Define nodes - ensure no duplicates if a variable is both instrument and covariate (SHOULD NOT HAPPEN)
39
+ # Use a set to ensure unique variable names
40
+ all_vars_set = set([treatment, outcome] + instruments + covariates + ['U'])
41
+ all_vars = list(all_vars_set)
42
+
43
+ for var in all_vars:
44
+ nodes.append(f'node [ id "{var}" label "{var}" ]')
45
+
46
+ # Define edges
47
+ # Instruments -> Treatment
48
+ for inst in instruments:
49
+ edges.append(f'edge [ source "{inst}" target "{treatment}" ]')
50
+
51
+ # Covariates -> Treatment
52
+ for cov in covariates:
53
+ # Ensure we don't add self-loops or duplicate edges if cov == treatment (shouldn't happen)
54
+ if cov != treatment:
55
+ edges.append(f'edge [ source "{cov}" target "{treatment}" ]')
56
+
57
+ # Covariates -> Outcome
58
+ for cov in covariates:
59
+ if cov != outcome:
60
+ edges.append(f'edge [ source "{cov}" target "{outcome}" ]')
61
+
62
+ # Treatment -> Outcome
63
+ edges.append(f'edge [ source "{treatment}" target "{outcome}" ]')
64
+
65
+ # Unobserved Confounder -> Treatment and Outcome
66
+ edges.append(f'edge [ source "U" target "{treatment}" ]')
67
+ edges.append(f'edge [ source "U" target "{outcome}" ]')
68
+
69
+ # Core IV Assumption: Instruments are NOT caused by U (implicitly handled by not adding edge)
70
+ # Core IV Assumption: Instruments do NOT directly cause Outcome (handled by not adding edge)
71
+
72
+ # Format nodes and edges with indentation before inserting into f-string
73
+ formatted_nodes = '\n '.join(nodes)
74
+ formatted_edges = '\n '.join(edges)
75
+
76
+ gml_string = f"""
77
+ graph [
78
+ directed 1
79
+ {formatted_nodes}
80
+ {formatted_edges}
81
+ ]
82
+ """
83
+ # Convert print to logger
84
+ logger.debug("\n--- Generated GML Graph ---")
85
+ logger.debug(gml_string)
86
+ logger.debug("-------------------------\n")
87
+ return gml_string
88
+
89
+ def format_iv_results(estimate: Optional[float], raw_results: Dict, diagnostics: Dict, treatment: str, outcome: str, instrument: List[str], method_used: str, llm: Optional[BaseChatModel] = None) -> Dict[str, Any]:
90
+ """
91
+ Formats the results from IV estimation into a standardized dictionary.
92
+
93
+ Args:
94
+ estimate: The point estimate of the causal effect.
95
+ raw_results: Dictionary containing raw outputs from DoWhy/statsmodels.
96
+ diagnostics: Dictionary containing diagnostic results.
97
+ treatment: Name of the treatment variable.
98
+ outcome: Name of the outcome variable.
99
+ instrument: List of instrument variable names.
100
+ method_used: 'dowhy' or 'statsmodels'.
101
+ llm: Optional LLM instance for interpretation.
102
+
103
+ Returns:
104
+ Standardized results dictionary.
105
+ """
106
+ formatted = {
107
+ "effect_estimate": estimate,
108
+ "treatment_variable": treatment,
109
+ "outcome_variable": outcome,
110
+ "instrument_variables": instrument,
111
+ "method_used": method_used,
112
+ "diagnostics": diagnostics,
113
+ "raw_results": {k: str(v) for k, v in raw_results.items() if "object" not in k}, # Avoid serializing large objects
114
+ "confidence_interval": None,
115
+ "standard_error": None,
116
+ "p_value": None,
117
+ "interpretation": "Placeholder"
118
+ }
119
+
120
+ # Extract details from statsmodels results if available
121
+ sm_results = raw_results.get('statsmodels_results_object')
122
+ if method_used == 'statsmodels' and sm_results:
123
+ try:
124
+ # Use .bse for standard error in statsmodels results
125
+ formatted["standard_error"] = float(sm_results.bse[treatment])
126
+ formatted["p_value"] = float(sm_results.pvalues[treatment])
127
+ conf_int = sm_results.conf_int().loc[treatment].tolist()
128
+ formatted["confidence_interval"] = [float(ci) for ci in conf_int]
129
+ except AttributeError as e:
130
+ logger.warning(f"Could not extract all details from statsmodels results object (likely missing attribute): {e}")
131
+ except Exception as e:
132
+ logger.warning(f"Error extracting details from statsmodels results: {e}")
133
+
134
+ # Extract details from DoWhy results if available
135
+ # Note: DoWhy's CausalEstimate object structure needs inspection
136
+ dw_results = raw_results.get('dowhy_results_object')
137
+ if method_used == 'dowhy' and dw_results:
138
+ try:
139
+ # Attempt common attributes, may need adjustment based on DoWhy version/output
140
+ if hasattr(dw_results, 'stderr'):
141
+ formatted["standard_error"] = float(dw_results.stderr)
142
+ if hasattr(dw_results, 'p_value'):
143
+ formatted["p_value"] = float(dw_results.p_value)
144
+ if hasattr(dw_results, 'conf_intervals'):
145
+ # Assuming it's stored similarly to statsmodels, might need adjustment
146
+ ci = dw_results.conf_intervals().loc[treatment].tolist() # Fictional attribute/method - check DoWhy docs!
147
+ formatted["confidence_interval"] = [float(c) for c in ci]
148
+ elif hasattr(dw_results, 'get_confidence_intervals'):
149
+ ci = dw_results.get_confidence_intervals() # Check DoWhy docs for format
150
+ # Check format of ci before converting
151
+ if isinstance(ci, (list, tuple)) and len(ci) == 2:
152
+ formatted["confidence_interval"] = [float(c) for c in ci] # Adapt parsing
153
+ else:
154
+ logger.warning(f"Could not parse confidence intervals from DoWhy object: {ci}")
155
+
156
+ except Exception as e:
157
+ logger.warning(f"Could not extract all details from DoWhy results: {e}. Structure might be different.", exc_info=True)
158
+ # Avoid printing dir in production code, use logger.debug if needed for dev
159
+ # logger.debug(f"DoWhy result object dir(): {dir(dw_results)}")
160
+
161
+ # Generate LLM interpretation - pass llm object
162
+ if estimate is not None:
163
+ formatted["interpretation"] = interpret_iv_results(formatted, diagnostics, llm=llm)
164
+ else:
165
+ formatted["interpretation"] = "Estimation failed, cannot interpret results."
166
+
167
+
168
+ return formatted
169
+
170
+ def estimate_effect(
171
+ df: pd.DataFrame,
172
+ treatment: str,
173
+ outcome: str,
174
+ covariates: List[str],
175
+ query: Optional[str] = None,
176
+ dataset_description: Optional[str] = None,
177
+ llm: Optional[BaseChatModel] = None,
178
+ **kwargs
179
+ ) -> Dict[str, Any]:
180
+
181
+ instrument = kwargs.get('instrument_variable')
182
+ if not instrument:
183
+ return {"error": "Instrument variable ('instrument_variable') not found in kwargs.", "method_used": "none", "diagnostics": {}}
184
+
185
+ instrument_list = [instrument] if isinstance(instrument, str) else instrument
186
+ valid_instruments = [inst for inst in instrument_list if isinstance(inst, str)]
187
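+ # Drop any instrument that was also passed as a covariate: keeping it in the outcome equation
+ # would contradict the exclusion restriction assumed by the IV design.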
+ clean_covariates = [cov for cov in covariates if cov not in valid_instruments]
188
+
189
+ logger.info(f"\n--- Starting Instrumental Variable Estimation ---")
190
+ logger.info(f"Treatment: {treatment}, Outcome: {outcome}, Instrument(s): {valid_instruments}, Original Covariates: {covariates}, Cleaned Covariates: {clean_covariates}")
191
+ results = {}
192
+ method_used = "none"
193
+ sm_results_obj = None
194
+ dw_results_obj = None
195
+ identified_estimand = None # Initialize
196
+ model = None # Initialize
197
+ refutation_results = {} # Initialize
198
+
199
+ # --- Input Validation ---
200
+ required_cols = [treatment, outcome] + valid_instruments + clean_covariates
201
+ missing_cols = [col for col in required_cols if col not in df.columns]
202
+ if missing_cols:
203
+ return {"error": f"Missing required columns in DataFrame: {missing_cols}", "method_used": method_used, "diagnostics": {}}
204
+ if not valid_instruments:
205
+ return {"error": "Instrument variable(s) must be provided and valid.", "method_used": method_used, "diagnostics": {}}
206
+
207
+ # --- LLM Pre-Checks ---
208
+ if query and llm:
209
+ qualitative_check = validate_instrument_assumptions_qualitative(treatment, outcome, valid_instruments, clean_covariates, query, llm=llm)
210
+ results['llm_assumption_check'] = qualitative_check
211
+ logger.info(f"LLM Qualitative Assumption Check: {qualitative_check}")
212
+
213
+ # --- Build Graph and Instantiate CausalModel (Do this before estimation attempts) ---
214
+ # This allows using identify_effect and refute_estimate even if DoWhy estimation fails
215
+ try:
216
+ graph = build_iv_graph_gml(treatment, outcome, valid_instruments, clean_covariates)
217
+ if not graph:
218
+ raise ValueError("Failed to build GML graph for DoWhy.")
219
+
220
+ model = CausalModel(data=df, treatment=treatment, outcome=outcome, graph=graph)
221
+
222
+ # Identify Effect (essential for refutation later)
223
+ identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
224
+ logger.debug("\nDoWhy Identified Estimand:")
225
+ logger.debug(identified_estimand)
226
+ if not identified_estimand:
227
+ raise ValueError("DoWhy could not identify a valid estimand.")
228
+
229
+ except Exception as model_init_e:
230
+ logger.error(f"Failed to initialize CausalModel or identify effect: {model_init_e}", exc_info=True)
231
+ # Cannot proceed without model/estimand for DoWhy or refutation
232
+ results['error'] = f"Failed to initialize CausalModel: {model_init_e}"
233
+ # Attempt statsmodels anyway? Or return error? Let's try statsmodels.
234
+ pass # Allow falling through to statsmodels if desired
235
+
236
+ # --- Primary Path: DoWhy Estimation ---
237
+ if model and identified_estimand and not kwargs.get('force_statsmodels', False):
238
+ logger.info("\nAttempting estimation with DoWhy...")
239
+ try:
240
+ dw_results_obj = model.estimate_effect(
241
+ identified_estimand,
242
+ method_name="iv.instrumental_variable",
243
+ method_params={'iv_instrument_name': valid_instruments}
244
+ )
245
+ logger.debug("\nDoWhy Estimation Result:")
246
+ logger.debug(dw_results_obj)
247
+ results['dowhy_estimate'] = dw_results_obj.value
248
+ results['dowhy_results_object'] = dw_results_obj
249
+ method_used = 'dowhy'
250
+ logger.info("DoWhy estimation successful.")
251
+ except Exception as e:
252
+ logger.error(f"DoWhy IV estimation failed: {e}", exc_info=True)
253
+ results['dowhy_error'] = str(e)
254
+ if not kwargs.get('allow_fallback', True):
255
+ logger.warning("Fallback to statsmodels disabled. Estimation failed.")
256
+ method_used = "dowhy_failed"
257
+ # Still run diagnostics and format output
258
+ else:
259
+ logger.info("Proceeding to statsmodels fallback.")
260
+ elif not model or not identified_estimand:
261
+ logger.warning("Skipping DoWhy estimation due to CausalModel initialization/identification failure.")
262
+ # Ensure we proceed to statsmodels if fallback is allowed
263
+ if not kwargs.get('allow_fallback', True):
264
+ logger.error("Cannot estimate effect: CausalModel failed and fallback disabled.")
265
+ method_used = "dowhy_failed"
266
+ else:
267
+ logger.info("Proceeding to statsmodels fallback.")
268
+
269
+ # --- Fallback Path: statsmodels IV2SLS ---
270
+ if method_used not in ['dowhy', 'dowhy_failed']:
271
+ logger.info("\nAttempting estimation with statsmodels IV2SLS...")
272
+ try:
273
+ df_copy = df.copy().dropna(subset=required_cols)
274
+ if df_copy.empty:
275
+ raise ValueError("DataFrame becomes empty after dropping NAs in required columns.")
276
+ df_copy['intercept'] = 1
277
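+ # Layout expected by statsmodels' sandbox IV2SLS: `exog` holds the included exogenous regressors
+ # plus the endogenous treatment, while `instrument` holds the included exogenous regressors plus
+ # the excluded instruments.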
+ exog_regressors = ['intercept'] + clean_covariates
278
+ endog_var = treatment
279
+ all_instruments_for_sm = list(dict.fromkeys(exog_regressors + valid_instruments))
280
+ endog_data = df_copy[outcome]
281
+ exog_data_sm_cols = list(dict.fromkeys(exog_regressors + [endog_var]))
282
+ exog_data_sm = df_copy[exog_data_sm_cols]
283
+ instrument_data_sm = df_copy[all_instruments_for_sm]
284
+ num_endog = 1
285
+ num_external_iv = len(valid_instruments)
286
+ if num_endog > num_external_iv:
287
+ raise ValueError(f"Model underidentified: More endogenous regressors ({num_endog}) than unique external instruments ({num_external_iv}).")
288
+ iv_model = IV2SLS(endog=endog_data, exog=exog_data_sm, instrument=instrument_data_sm)
289
+ sm_results_obj = iv_model.fit()
290
+ logger.info("\nStatsmodels Estimation Summary:")
291
+ logger.info(f" Estimate for {treatment}: {sm_results_obj.params[treatment]}")
292
+ logger.info(f" Std Error: {sm_results_obj.bse[treatment]}")
293
+ logger.info(f" P-value: {sm_results_obj.pvalues[treatment]}")
294
+ results['statsmodels_estimate'] = sm_results_obj.params[treatment]
295
+ results['statsmodels_results_object'] = sm_results_obj
296
+ method_used = 'statsmodels'
297
+ logger.info("Statsmodels estimation successful.")
298
+ except Exception as sm_e:
299
+ logger.error(f"Statsmodels IV estimation also failed: {sm_e}", exc_info=True)
300
+ results['statsmodels_error'] = str(sm_e)
301
+ method_used = 'statsmodels_failed' if method_used == "none" else "dowhy_failed_sm_failed"
302
+
303
+ # --- Diagnostics ---
304
+ logger.info("\nRunning diagnostics...")
305
+ diagnostics = run_iv_diagnostics(df, treatment, outcome, valid_instruments, clean_covariates, sm_results_obj, dw_results_obj)
306
+ results['diagnostics'] = diagnostics
307
+
308
+ # --- Refutation Step ---
309
+ final_estimate_value = results.get('dowhy_estimate') if method_used == 'dowhy' else results.get('statsmodels_estimate')
310
+
311
+ # Only run permute refuter if estimate is valid AND came from DoWhy
312
+ if method_used == 'dowhy' and dw_results_obj and final_estimate_value is not None:
313
+ logger.info("\nRunning refutation test (Placebo Treatment - Permute - requires DoWhy estimate object)...")
314
+ try:
315
+ # Pass the actual DoWhy estimate object
316
+ refuter_result = model.refute_estimate(
317
+ identified_estimand,
318
+ dw_results_obj, # Pass the original DoWhy result object
319
+ method_name="placebo_treatment_refuter",
320
+ placebo_type="permute" # Necessary for IV according to docs/examples
321
+ )
322
+ logger.info("Refutation test completed.")
323
+ logger.debug(f"Refuter Result:\n{refuter_result}")
324
+ # Store relevant info from refuter_result (check its structure)
325
+ refutation_results = {
326
+ "refuter": "placebo_treatment_refuter",
327
+ "new_effect": getattr(refuter_result, 'new_effect', 'N/A'),
328
+ "p_value": getattr(refuter_result, 'refutation_result', {}).get('p_value', 'N/A') if hasattr(refuter_result, 'refutation_result') else 'N/A',
329
+ # Passed if p-value > 0.05 (or not statistically significant)
330
+ "passed": getattr(refuter_result, 'refutation_result', {}).get('is_statistically_significant', None) == False if hasattr(refuter_result, 'refutation_result') else None
331
+ }
332
+ except Exception as refute_e:
333
+ logger.error(f"Refutation test failed: {refute_e}", exc_info=True)
334
+ refutation_results = {"error": f"Refutation failed: {refute_e}"}
335
+
336
+ elif final_estimate_value is not None and method_used == 'statsmodels':
337
+ logger.warning("Skipping placebo permutation refuter: Estimate was generated by statsmodels, not DoWhy's IV estimator.")
338
+ refutation_results = {"status": "skipped_wrong_estimator_for_permute"}
339
+
340
+ elif final_estimate_value is None:
341
+ logger.warning("Skipping refutation test because estimation failed.")
342
+ refutation_results = {"status": "skipped_due_to_failed_estimation"}
343
+
344
+ else: # Model or estimand failed earlier, or unknown method_used
345
+ logger.warning(f"Skipping refutation test due to earlier failure (method_used: {method_used}).")
346
+ refutation_results = {"status": "skipped_due_to_model_failure_or_unknown"}
347
+
348
+ results['refutation_results'] = refutation_results # Add to main results
349
+
350
+ # --- Formatting Results ---
351
+ if final_estimate_value is None and method_used not in ['dowhy', 'statsmodels']:
352
+ logger.error("ERROR: Both estimation methods failed.")
353
+ # Ensure error key exists if not set earlier
354
+ if 'error' not in results:
355
+ results['error'] = "Both DoWhy and statsmodels IV estimation failed."
356
+
357
+ logger.info("\n--- Formatting Final Results ---")
358
+ formatted_results = format_iv_results(
359
+ final_estimate_value, # Pass the numeric value
360
+ results, # Pass the dict containing estimate objects and refutation results
361
+ diagnostics,
362
+ treatment,
363
+ outcome,
364
+ valid_instruments,
365
+ method_used,
366
+ llm=llm
367
+ )
368
+
369
+ logger.info("--- Instrumental Variable Estimation Complete ---\n")
370
+ return formatted_results
auto_causal/methods/instrumental_variable/llm_assist.py ADDED
@@ -0,0 +1,240 @@
1
+ """
2
+ LLM assistance functions for Instrumental Variable (IV) analysis.
3
+
4
+ This module provides functions for LLM-based assistance in instrumental variable analysis,
5
+ including identifying potential instruments, validating IV assumptions, and interpreting results.
6
+ """
7
+
8
+ from typing import List, Dict, Any, Optional
9
+ import logging
10
+
11
+ # Imported for type hinting
12
+ from langchain.chat_models.base import BaseChatModel
13
+
14
+ # Import shared LLM helpers
15
+ from auto_causal.utils.llm_helpers import call_llm_with_json_output
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ def identify_instrument_variable(
20
+ df_cols: List[str],
21
+ query: str,
22
+ llm: Optional[BaseChatModel] = None
23
+ ) -> List[str]:
24
+ """
25
+ Use LLM to identify potential instrumental variables from available columns.
26
+
27
+ Args:
28
+ df_cols: List of column names from the dataset
29
+ query: User's causal query text
30
+ llm: Optional LLM model instance
31
+
32
+ Returns:
33
+ List of column names identified as potential instruments
34
+ """
35
+ if llm is None:
36
+ logger.warning("No LLM provided for instrument identification")
37
+ return []
38
+
39
+ prompt = f"""
40
+ You are assisting with an instrumental variable analysis.
41
+
42
+ Available columns in the dataset: {df_cols}
43
+ User query: {query}
44
+
45
+ Identify potential instrumental variable(s) from the available columns based on the query.
46
+ The treatment and outcome should NOT be included as instruments.
47
+
48
+ Return ONLY a valid JSON object with the following structure (no explanations or surrounding text):
49
+ {{
50
+ "potential_instruments": ["column_name1", "column_name2", ...]
51
+ }}
52
+ """
53
+
54
+ response = call_llm_with_json_output(llm, prompt)
55
+
56
+ if response and "potential_instruments" in response and isinstance(response["potential_instruments"], list):
57
+ # Basic validation: ensure items are strings (column names)
58
+ valid_instruments = [item for item in response["potential_instruments"] if isinstance(item, str)]
59
+ if len(valid_instruments) != len(response["potential_instruments"]):
60
+ logger.warning("LLM returned non-string items in potential_instruments list.")
61
+ return valid_instruments
62
+
63
+ logger.warning(f"Failed to get valid instrument recommendations from LLM. Response: {response}")
64
+ return []
65
+
66
+ def validate_instrument_assumptions_qualitative(
67
+ treatment: str,
68
+ outcome: str,
69
+ instrument: List[str],
70
+ covariates: List[str],
71
+ query: str,
72
+ llm: Optional[BaseChatModel] = None
73
+ ) -> Dict[str, str]:
74
+ """
75
+ Use LLM to provide qualitative assessment of IV assumptions.
76
+
77
+ Args:
78
+ treatment: Treatment variable name
79
+ outcome: Outcome variable name
80
+ instrument: List of instrumental variable names
81
+ covariates: List of covariate variable names
82
+ query: User's causal query text
83
+ llm: Optional LLM model instance
84
+
85
+ Returns:
86
+ Dictionary with qualitative assessments of exclusion and exogeneity assumptions
87
+ """
88
+ default_fail = {
89
+ "exclusion_assessment": "LLM Check Failed",
90
+ "exogeneity_assessment": "LLM Check Failed"
91
+ }
92
+
93
+ if llm is None:
94
+ return {
95
+ "exclusion_assessment": "LLM Not Provided",
96
+ "exogeneity_assessment": "LLM Not Provided"
97
+ }
98
+
99
+ prompt = f"""
100
+ You are assisting with assessing the validity of instrumental variable assumptions.
101
+
102
+ Treatment variable: {treatment}
103
+ Outcome variable: {outcome}
104
+ Instrumental variable(s): {instrument}
105
+ Covariates: {covariates}
106
+ User query: {query}
107
+
108
+ Assess the core Instrumental Variable (IV) assumptions based *only* on the provided variable names and query context:
109
+ 1. Exclusion restriction: Plausibility that the instrument(s) affect the outcome ONLY through the treatment.
110
+ 2. Exogeneity (also called Independence): Plausibility that the instrument(s) are not correlated with unobserved confounders that also affect the outcome.
111
+
112
+ Provide a brief, qualitative assessment (e.g., 'Plausible', 'Unlikely', 'Requires Domain Knowledge', 'Potentially Violated').
113
+
114
+ Return ONLY a valid JSON object with the following structure (no explanations or surrounding text):
115
+ {{
116
+ "exclusion_assessment": "<brief assessment of exclusion restriction>",
117
+ "exogeneity_assessment": "<brief assessment of exogeneity assumption>"
118
+ }}
119
+ """
120
+
121
+ response = call_llm_with_json_output(llm, prompt)
122
+
123
+ if response and isinstance(response, dict) and \
124
+ "exclusion_assessment" in response and isinstance(response["exclusion_assessment"], str) and \
125
+ "exogeneity_assessment" in response and isinstance(response["exogeneity_assessment"], str):
126
+ return response
127
+
128
+ logger.warning(f"Failed to get valid assumption assessment from LLM. Response: {response}")
129
+ return default_fail
130
+
131
+ def interpret_iv_results(
132
+ results: Dict[str, Any],
133
+ diagnostics: Dict[str, Any],
134
+ llm: Optional[BaseChatModel] = None
135
+ ) -> str:
136
+ """
137
+ Use LLM to interpret IV results in natural language.
138
+
139
+ Args:
140
+ results: Dictionary of estimation results (e.g., effect_estimate, p_value, confidence_interval)
141
+ diagnostics: Dictionary of diagnostic test results (e.g., first_stage_f_statistic, overid_test)
142
+ llm: Optional LLM model instance
143
+
144
+ Returns:
145
+ String containing natural language interpretation of results
146
+ """
147
+ if llm is None:
148
+ return "LLM was not available to provide interpretation. Please review the numeric results manually."
149
+
150
+ # Construct a concise summary of inputs for the prompt
151
+ results_summary = {}
152
+
153
+ effect = results.get('effect_estimate')
154
+ if effect is not None:
155
+ try:
156
+ results_summary['Effect Estimate'] = f"{float(effect):.3f}"
157
+ except (ValueError, TypeError):
158
+ results_summary['Effect Estimate'] = 'N/A (Invalid Format)'
159
+ else:
160
+ results_summary['Effect Estimate'] = 'N/A'
161
+
162
+ p_value = results.get('p_value')
163
+ if p_value is not None:
164
+ try:
165
+ results_summary['P-value'] = f"{float(p_value):.3f}"
166
+ except (ValueError, TypeError):
167
+ results_summary['P-value'] = 'N/A (Invalid Format)'
168
+ else:
169
+ results_summary['P-value'] = 'N/A'
170
+
171
+ ci = results.get('confidence_interval')
172
+ if ci is not None and isinstance(ci, (list, tuple)) and len(ci) == 2:
173
+ try:
174
+ results_summary['Confidence Interval'] = f"[{float(ci[0]):.3f}, {float(ci[1]):.3f}]"
175
+ except (ValueError, TypeError):
176
+ results_summary['Confidence Interval'] = 'N/A (Invalid Format)'
177
+ else:
178
+ # Handle cases where CI is None or not a 2-element list/tuple
179
+ results_summary['Confidence Interval'] = str(ci) if ci is not None else 'N/A'
180
+
181
+ if 'treatment_variable' in results:
182
+ results_summary['Treatment'] = results['treatment_variable']
183
+ if 'outcome_variable' in results:
184
+ results_summary['Outcome'] = results['outcome_variable']
185
+
186
+ diagnostics_summary = {}
187
+ f_stat = diagnostics.get('first_stage_f_statistic')
188
+ if f_stat is not None:
189
+ try:
190
+ diagnostics_summary['First-Stage F-statistic'] = f"{float(f_stat):.2f}"
191
+ except (ValueError, TypeError):
192
+ diagnostics_summary['First-Stage F-statistic'] = 'N/A (Invalid Format)'
193
+ else:
194
+ diagnostics_summary['First-Stage F-statistic'] = 'N/A'
195
+
196
+ if 'weak_instrument_test_status' in diagnostics:
197
+ diagnostics_summary['Weak Instrument Test'] = diagnostics['weak_instrument_test_status']
198
+
199
+ overid_p = diagnostics.get('overid_test_p_value')
200
+ if overid_p is not None:
201
+ try:
202
+ diagnostics_summary['Overidentification Test P-value'] = f"{float(overid_p):.3f}"
203
+ diagnostics_summary['Overidentification Test Applicable'] = diagnostics.get('overid_test_applicable', 'N/A')
204
+ except (ValueError, TypeError):
205
+ diagnostics_summary['Overidentification Test P-value'] = 'N/A (Invalid Format)'
206
+ diagnostics_summary['Overidentification Test Applicable'] = diagnostics.get('overid_test_applicable', 'N/A')
207
+ else:
208
+ # Explicitly state if not applicable or not available
209
+ if diagnostics.get('overid_test_applicable') == False:
210
+ diagnostics_summary['Overidentification Test'] = 'Not Applicable'
211
+ else:
212
+ diagnostics_summary['Overidentification Test P-value'] = 'N/A'
213
+ diagnostics_summary['Overidentification Test Applicable'] = diagnostics.get('overid_test_applicable', 'N/A')
214
+
215
+ prompt = f"""
216
+ You are assisting with interpreting instrumental variable (IV) analysis results.
217
+
218
+ Estimation results summary: {results_summary}
219
+ Diagnostic test results summary: {diagnostics_summary}
220
+
221
+ Explain these Instrumental Variable (IV) results in clear, concise language (2-4 sentences).
222
+ Focus on:
223
+ 1. The estimated causal effect (magnitude, direction, statistical significance based on p-value < 0.05).
224
+ 2. The strength of the instrument(s) (based on F-statistic, typically > 10 indicates strength).
225
+ 3. Any implications from other diagnostic tests (e.g., overidentification test suggesting instrument validity issues if p < 0.05).
226
+
227
+ Return ONLY a valid JSON object with the following structure (no explanations or surrounding text):
228
+ {{
229
+ "interpretation": "<your concise interpretation text>"
230
+ }}
231
+ """
232
+
233
+ response = call_llm_with_json_output(llm, prompt)
234
+
235
+ if response and isinstance(response, dict) and \
236
+ "interpretation" in response and isinstance(response["interpretation"], str):
237
+ return response["interpretation"]
238
+
239
+ logger.warning(f"Failed to get valid interpretation from LLM. Response: {response}")
240
+ return "LLM interpretation could not be generated. Please review the numeric results manually."
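Note: the interpretation prompt above leans on the first-stage F-statistic (rule of thumb: > 10 indicates a strong instrument). As a rough, self-contained sketch of how that statistic can be computed with statsmodels (the helper name and column names are illustrative assumptions, not part of this module):

```python
# Hypothetical sketch: first-stage F-statistic for instrument strength.
# Regress the treatment on the instrument(s) plus exogenous covariates and
# jointly test that the instrument coefficients are zero.
import numpy as np
import pandas as pd
import statsmodels.api as sm

def first_stage_f_statistic(df: pd.DataFrame, treatment: str,
                            instruments: list, exog: list) -> float:
    X = sm.add_constant(df[instruments + exog])
    first_stage = sm.OLS(df[treatment], X).fit()
    hypotheses = " = 0, ".join(instruments) + " = 0"  # e.g. "z1 = 0, z2 = 0"
    res = first_stage.f_test(hypotheses)
    return float(np.asarray(res.fvalue).reshape(-1)[0])
```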
auto_causal/methods/linear_regression/__init__.py ADDED
File without changes
auto_causal/methods/linear_regression/diagnostics.py ADDED
@@ -0,0 +1,76 @@
1
+ """
2
+ Diagnostic checks for Linear Regression models.
3
+ """
4
+
5
+ from typing import Dict, Any
6
+ import statsmodels.api as sm
7
+ from statsmodels.stats.diagnostic import het_breuschpagan, normal_ad
8
+ from statsmodels.stats.stattools import jarque_bera
9
+ from statsmodels.regression.linear_model import RegressionResultsWrapper
10
+ import pandas as pd
11
+ import logging
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ def run_lr_diagnostics(results: RegressionResultsWrapper, X: pd.DataFrame) -> Dict[str, Any]:
16
+ """
17
+ Runs diagnostic checks on a fitted OLS model.
18
+
19
+ Args:
20
+ results: A fitted statsmodels OLS results object.
21
+ X: The design matrix (including constant) used for the regression.
22
+ Needed for heteroskedasticity tests.
23
+
24
+ Returns:
25
+ Dictionary containing diagnostic metrics.
26
+ """
27
+
28
+ diagnostics = {}
29
+
30
+ try:
31
+ diagnostics['r_squared'] = results.rsquared
32
+ diagnostics['adj_r_squared'] = results.rsquared_adj
33
+ diagnostics['f_statistic'] = results.fvalue
34
+ diagnostics['f_p_value'] = results.f_pvalue
35
+ diagnostics['n_observations'] = int(results.nobs)
36
+ diagnostics['degrees_of_freedom_resid'] = int(results.df_resid)
37
+
38
+ # --- Normality of Residuals (Jarque-Bera) ---
39
+ try:
40
+ jb_value, jb_p_value, skew, kurtosis = jarque_bera(results.resid)
41
+ diagnostics['residuals_normality_jb_stat'] = jb_value
42
+ diagnostics['residuals_normality_jb_p_value'] = jb_p_value
43
+ diagnostics['residuals_skewness'] = skew
44
+ diagnostics['residuals_kurtosis'] = kurtosis
45
+ diagnostics['residuals_normality_status'] = "Normal" if jb_p_value > 0.05 else "Non-Normal"
46
+ except Exception as e:
47
+ logger.warning(f"Could not run Jarque-Bera test: {e}")
48
+ diagnostics['residuals_normality_status'] = "Test Failed"
49
+
50
+ # --- Homoscedasticity (Breusch-Pagan) ---
51
+ # Requires the design matrix X used in the model fitting
52
+ try:
53
+ lm_stat, lm_p_value, f_stat, f_p_value = het_breuschpagan(results.resid, X)
54
+ diagnostics['homoscedasticity_bp_lm_stat'] = lm_stat
55
+ diagnostics['homoscedasticity_bp_lm_p_value'] = lm_p_value
56
+ diagnostics['homoscedasticity_bp_f_stat'] = f_stat
57
+ diagnostics['homoscedasticity_bp_f_p_value'] = f_p_value
58
+ diagnostics['homoscedasticity_status'] = "Homoscedastic" if lm_p_value > 0.05 else "Heteroscedastic"
59
+ except Exception as e:
60
+ logger.warning(f"Could not run Breusch-Pagan test: {e}")
61
+ diagnostics['homoscedasticity_status'] = "Test Failed"
62
+
63
+ # --- Linearity (Basic check - often requires visual inspection) ---
64
+ # No standard quantitative test implemented here. Usually assessed via residual plots.
65
+ diagnostics['linearity_check'] = "Requires visual inspection (e.g., residual vs fitted plot)"
66
+
67
+ # --- Multicollinearity (Placeholder - requires VIF calculation) ---
68
+ # VIF requires iterating through predictors, more involved
69
+ diagnostics['multicollinearity_check'] = "Not Implemented (Requires VIF)"
70
+
71
+ return {"status": "Success", "details": diagnostics}
72
+
73
+ except Exception as e:
74
+ logger.error(f"Error running LR diagnostics: {e}")
75
+ return {"status": "Failed", "error": str(e), "details": diagnostics} # Return partial results if possible
76
+
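The multicollinearity check above is deliberately left unimplemented. A minimal sketch of what a VIF-based check could look like is shown below; the helper name `compute_vifs` and the threshold of 10 are illustrative assumptions, not part of this module.

```python
# Hypothetical VIF sketch for the "Not Implemented (Requires VIF)" placeholder above.
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

def compute_vifs(X: pd.DataFrame) -> dict:
    """Variance inflation factor for each column of the design matrix (constant included)."""
    return {col: float(variance_inflation_factor(X.values, i))
            for i, col in enumerate(X.columns)}

# Possible wiring inside run_lr_diagnostics (assumed, not implemented above):
# vifs = compute_vifs(X)
# diagnostics['multicollinearity_vif'] = vifs
# diagnostics['multicollinearity_status'] = (
#     "High" if any(v > 10 for c, v in vifs.items() if c != 'const') else "OK"
# )
```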
auto_causal/methods/linear_regression/estimator.py ADDED
@@ -0,0 +1,355 @@
1
+ """
2
+ Linear Regression Estimator for Causal Inference.
3
+
4
+ Uses Ordinary Least Squares (OLS) to estimate the treatment effect, potentially
5
+ adjusting for covariates.
6
+ """
7
+ import pandas as pd
8
+ import statsmodels.api as sm
9
+ import statsmodels.formula.api as smf
10
+ from typing import Dict, Any, List, Optional, Union
11
+ import logging
12
+ from langchain.chat_models.base import BaseChatModel
13
+ import re
14
+ import json
15
+ from pydantic import BaseModel, ValidationError
16
+ from langchain_core.messages import HumanMessage
17
+ from langchain_core.exceptions import OutputParserException
18
+
19
+
20
+ from auto_causal.models import LLMIdentifiedRelevantParams
21
+ from auto_causal.prompts.regression_prompts import STATSMODELS_PARAMS_IDENTIFICATION_PROMPT_TEMPLATE
22
+ from auto_causal.config import get_llm_client
23
+
24
+ # Placeholder for potential future LLM assistance integration
25
+ # from .llm_assist import interpret_lr_results, suggest_lr_covariates
26
+ # Placeholder for potential future diagnostics integration
27
+ # from .diagnostics import run_lr_diagnostics
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+ def _call_llm_for_var(llm: BaseChatModel, prompt: str, pydantic_model: BaseModel) -> Optional[BaseModel]:
32
+ """Helper to call LLM with structured output and handle errors."""
33
+ try:
34
+ messages = [HumanMessage(content=prompt)]
35
+ structured_llm = llm.with_structured_output(pydantic_model)
36
+ parsed_result = structured_llm.invoke(messages)
37
+ return parsed_result
38
+ except (OutputParserException, ValidationError) as e:
39
+ logger.error(f"LLM call failed parsing/validation for {pydantic_model.__name__}: {e}")
40
+ except Exception as e:
41
+ logger.error(f"LLM call failed unexpectedly for {pydantic_model.__name__}: {e}", exc_info=True)
42
+ return None
43
+
44
+ # Define module-level helper function
45
+ def _clean_variable_name_for_patsy_local(name: str) -> str:
46
+ if not isinstance(name, str):
47
+ name = str(name)
48
+ name = re.sub(r'[^a-zA-Z0-9_]', '_', name)
49
+ if not re.match(r'^[a-zA-Z_]', name):
50
+ name = 'var_' + name
51
+ return name
52
+
53
+
54
+ def estimate_effect(
55
+ df: pd.DataFrame,
56
+ treatment: str,
57
+ outcome: str,
58
+ covariates: Optional[List[str]] = None,
59
+ query_str: Optional[str] = None, # For potential LLM use
60
+ llm: Optional[BaseChatModel] = None, # For potential LLM use
61
+ **kwargs # To capture any other potential arguments
62
+ ) -> Dict[str, Any]:
63
+ """
64
+ Estimates the causal effect using Linear Regression (OLS).
65
+
66
+ Args:
67
+ df: Input DataFrame.
68
+ treatment: Name of the treatment variable column.
69
+ outcome: Name of the outcome variable column.
70
+ covariates: Optional list of covariate names.
71
+ query_str: Optional user query for context (e.g., for LLM).
72
+ llm: Optional Language Model instance.
73
+ **kwargs: Additional keyword arguments.
74
+
75
+ Returns:
76
+ Dictionary containing estimation results:
77
+ - 'effect_estimate': The estimated coefficient for the treatment variable.
78
+ - 'p_value': The p-value associated with the treatment coefficient.
79
+ - 'confidence_interval': The 95% confidence interval for the effect.
80
+ - 'standard_error': The standard error of the treatment coefficient.
81
+ - 'formula': The regression formula used.
82
+ - 'model_summary': Summary object from statsmodels.
83
+ - 'diagnostics': Placeholder for diagnostic results.
84
+ - 'interpretation': Placeholder for LLM interpretation.
85
+ """
86
+ if covariates is None:
87
+ covariates = []
88
+
89
+ # Retrieve additional args from kwargs
90
+ interaction_term_suggested = kwargs.get('interaction_term_suggested', False)
91
+ # interaction_variable_candidate is the *original* name from query_interpreter
92
+ interaction_variable_candidate_orig_name = kwargs.get('interaction_variable_candidate')
93
+ treatment_reference_level = kwargs.get('treatment_reference_level')
94
+ column_mappings = kwargs.get('column_mappings', {})
95
+
96
+ required_cols = [treatment, outcome] + covariates
97
+ # If interaction variable is suggested, ensure it (or its processed form) is in df for analysis
98
+ # This check is complex here as interaction_variable_candidate_orig_name needs mapping to processed column(s)
99
+ # We'll rely on df_analysis.dropna() and formula construction to handle missing interaction var columns later
100
+
101
+ missing_cols = [col for col in required_cols if col not in df.columns]
102
+ if missing_cols:
103
+ raise ValueError(f"Missing required columns: {missing_cols}")
104
+
105
+ # Prepare data for statsmodels (add constant, handle potential NaNs)
106
+ df_analysis = df[required_cols].dropna()
107
+ if df_analysis.empty:
108
+ raise ValueError("No data remaining after dropping NaNs for required columns.")
109
+
110
+ X = df_analysis[[treatment] + covariates]
111
+ X = sm.add_constant(X) # Add intercept
112
+ y = df_analysis[outcome]
113
+
114
+ # --- Formula Construction ---
115
+ outcome_col_name = outcome # Name in processed df
116
+ treatment_col_name = treatment # Name in processed df
117
+ processed_covariate_col_names = covariates # List of names in processed df
118
+
119
+ rhs_terms = []
120
+
121
+ # 1. Treatment Term
122
+ treatment_patsy_term = treatment_col_name # Default
123
+ original_treatment_info = column_mappings.get(treatment_col_name, {}) # Info from preprocess_data
124
+
125
+ is_binary_encoded = original_treatment_info.get('transformed_as') == 'label_encoded_binary'
126
+ is_still_categorical_in_df = df_analysis[treatment_col_name].dtype.name in ['object', 'category']
127
+
128
+ if is_still_categorical_in_df and not is_binary_encoded: # Covers multi-level and binary categoricals not yet numeric
129
+ if treatment_reference_level:
130
+ treatment_patsy_term = f"C({treatment_col_name}, Treatment(reference='{treatment_reference_level}'))"
131
+ logger.info(f"Treating '{treatment_col_name}' as multi-level categorical with reference '{treatment_reference_level}'.")
132
+ else:
133
+ # Default C() wrapping for categoricals if no specific reference is given.
134
+ # This applies to multi-level or binary categoricals that were not label_encoded to 0/1 by preprocess_data.
135
+ treatment_patsy_term = f"C({treatment_col_name})"
136
+ logger.info(f"Treating '{treatment_col_name}' as categorical (Patsy will pick reference).")
137
+ elif is_binary_encoded: # Was binary and explicitly label encoded to 0/1 by preprocess_data
138
+ # Even if it's now numeric 0/1, C() ensures Patsy treats it categorically for parameter naming consistency.
139
+ treatment_patsy_term = f"C({treatment_col_name})"
140
+ logger.info(f"Treating label-encoded binary '{treatment_col_name}' as categorical for Patsy.")
141
+ else: # Assumed to be already numeric (continuous or discrete numeric not needing C() for main effect)
142
+ # treatment_patsy_term remains treatment_col_name (default)
143
+ logger.info(f"Treating '{treatment_col_name}' as numeric for Patsy formula.")
144
+
145
+ rhs_terms.append(treatment_patsy_term)
146
+
147
+ # 2. Covariate Terms
148
+ for cov_col_name in processed_covariate_col_names:
149
+ if cov_col_name == treatment_col_name: # Should not happen if covariates list is clean
150
+ continue
151
+ # Assume covariates are already numeric/dummy. If one was object/category in df_analysis (unlikely), C() it.
152
+ if df_analysis[cov_col_name].dtype.name in ['object', 'category']:
153
+ rhs_terms.append(f"C({cov_col_name})")
154
+ else:
155
+ rhs_terms.append(cov_col_name)
156
+
157
+ # 3. Interaction Term (Simplified: interaction_variable_candidate_orig_name must map to a single column in df_analysis)
158
+ actual_interaction_term_added_to_formula = None
159
+ if interaction_term_suggested and interaction_variable_candidate_orig_name:
160
+ processed_interaction_col_name = None
161
+ interaction_var_info = column_mappings.get(interaction_variable_candidate_orig_name, {})
162
+
163
+ if interaction_var_info.get('transformed_as') == 'one_hot_encoded':
164
+ logger.warning(f"Interaction with one-hot encoded variable '{interaction_variable_candidate_orig_name}' is complex. Currently skipping this interaction for Linear Regression.")
165
+ elif interaction_var_info.get('new_column_name') and interaction_var_info['new_column_name'] in df_analysis.columns:
166
+ processed_interaction_col_name = interaction_var_info['new_column_name']
167
+ elif interaction_variable_candidate_orig_name in df_analysis.columns: # Was not in mappings, or mapping didn't change name (e.g. numeric)
168
+ processed_interaction_col_name = interaction_variable_candidate_orig_name
169
+
170
+ if processed_interaction_col_name:
171
+ interaction_var_patsy_term = processed_interaction_col_name
172
+ # If the processed interaction column itself is categorical (e.g. label encoded binary)
173
+ if df_analysis[processed_interaction_col_name].dtype.name in ['object', 'category', 'bool'] or \
174
+ interaction_var_info.get('original_dtype') in ['bool', 'category']:
175
+ interaction_var_patsy_term = f"C({processed_interaction_col_name})"
176
+
177
+ actual_interaction_term_added_to_formula = f"{treatment_patsy_term}:{interaction_var_patsy_term}"
178
+ rhs_terms.append(actual_interaction_term_added_to_formula)
179
+ logger.info(f"Adding interaction term to formula: {actual_interaction_term_added_to_formula}")
180
+ elif interaction_variable_candidate_orig_name: # Log if it was suggested but couldn't be mapped/found
181
+ logger.warning(f"Could not resolve interaction variable candidate '{interaction_variable_candidate_orig_name}' to a single usable column in processed data. Skipping interaction term.")
182
+
183
+ # Build the formula string for reporting and fitting
184
+ if not rhs_terms: # Should always have at least treatment
185
+ formula = f"{outcome_col_name} ~ 1"
186
+ else:
187
+ formula = f"{outcome_col_name} ~ {' + '.join(rhs_terms)}"
188
+ logger.info(f"Using formula for Linear Regression: {formula}")
189
+
190
+ try:
191
+ model = smf.ols(formula=formula, data=df_analysis)
192
+ results = model.fit()
193
+ logger.info("OLS model fitted successfully.")
194
+ logger.debug(results.summary())  # Debug level keeps default logging less verbose
195
+
196
+ # --- Result Extraction: LLM attempt first, then Regex fallback ---
197
+ effect_estimates_by_level = {}
198
+ all_params_extracted = False # Default to False
199
+ llm_extraction_successful = False
200
+
201
+ # Attempt LLM-based extraction if llm client and query are available
202
+ llm = get_llm_client()
203
+ if llm and query_str:
204
+ logger.info(f"Attempting LLM-based result extraction (informed by query: '{query_str[:50]}...').")
205
+ try:
206
+ param_names_list = results.params.index.tolist()
207
+ param_estimates_list = results.params.tolist()
208
+ param_p_values_list = results.pvalues.tolist()
209
+ param_std_errs_list = results.bse.tolist()
210
+
211
+ conf_int_df = results.conf_int(alpha=0.05)
212
+ param_conf_ints_low_list = []
213
+ param_conf_ints_high_list = []
214
+
215
+ if not conf_int_df.empty and len(conf_int_df.columns) == 2:
216
+ aligned_conf_int_df = conf_int_df.reindex(results.params.index)
217
+ param_conf_ints_low_list = aligned_conf_int_df.iloc[:, 0].fillna(float('nan')).tolist()
218
+ param_conf_ints_high_list = aligned_conf_int_df.iloc[:, 1].fillna(float('nan')).tolist()
219
+ else:
220
+ nan_list_ci = [float('nan')] * len(param_names_list)
221
+ param_conf_ints_low_list = nan_list_ci
222
+ param_conf_ints_high_list = nan_list_ci
223
+
224
+ # Placeholder for the new prompt template tailored for this extraction task
225
+ # MOVED TO causalscientist/auto_causal/prompts/regression_prompts.py
226
+
227
+ is_multilevel_case_for_prompt = bool(treatment_reference_level and is_still_categorical_in_df and not is_binary_encoded)
228
+ reference_level_for_prompt_str = str(treatment_reference_level) if is_multilevel_case_for_prompt else "N/A"
229
+
230
+ indexed_param_names_for_prompt = [f"{idx}: '{name}'" for idx, name in enumerate(param_names_list)]
231
+ indexed_param_names_str_for_prompt = "\n".join(indexed_param_names_for_prompt)
232
+
233
+ prompt_text_for_identification = STATSMODELS_PARAMS_IDENTIFICATION_PROMPT_TEMPLATE.format(
234
+ user_query=query_str,
235
+ treatment_patsy_term=treatment_patsy_term,
236
+ treatment_col_name=treatment_col_name,
237
+ is_multilevel_case=is_multilevel_case_for_prompt,
238
+ reference_level_for_prompt=reference_level_for_prompt_str,
239
+ indexed_param_names_str=indexed_param_names_str_for_prompt, # Pass the indexed list as a string
240
+ llm_response_schema_json=json.dumps(LLMIdentifiedRelevantParams.model_json_schema(), indent=2)
241
+ )
242
+
243
+ llm_identification_response = _call_llm_for_var(llm, prompt_text_for_identification, LLMIdentifiedRelevantParams)
244
+
245
+ if llm_identification_response and llm_identification_response.identified_params:
246
+ logger.info("LLM identified relevant parameters. Proceeding with programmatic extraction.")
247
+ for item in llm_identification_response.identified_params:
248
+ param_idx = item.param_index
249
+ # Validate index against actual list length
250
+ if 0 <= param_idx < len(results.params.index):
251
+ actual_param_name = results.params.index[param_idx]
252
+ # Sanity check if LLM returned name matches actual name at index
253
+ if item.param_name != actual_param_name:
254
+ logger.warning(f"LLM returned param_name '{item.param_name}' but name at index {param_idx} is '{actual_param_name}'. Using actual name from results.")
255
+
256
+ current_effect_stats = {
257
+ 'estimate': results.params.iloc[param_idx],
258
+ 'p_value': results.pvalues.iloc[param_idx],
259
+ 'conf_int': results.conf_int(alpha=0.05).iloc[param_idx].tolist(),
260
+ 'std_err': results.bse.iloc[param_idx]
261
+ }
262
+
263
+ key_for_effect_dict = 'treatment_effect' # Default for single/binary
264
+ if is_multilevel_case_for_prompt: # If it was a multi-level case
265
+ match = re.search(r'\[T\.([^]]+)]', actual_param_name) # Use actual_param_name
266
+ if match:
267
+ level = match.group(1)
268
+ if level != reference_level_for_prompt_str: # Ensure it's not the ref level itself
269
+ key_for_effect_dict = level
270
+ else:
271
+ logger.warning(f"Could not parse level from LLM-identified param: {actual_param_name}. Storing with raw name.")
272
+ key_for_effect_dict = actual_param_name # Fallback key
273
+
274
+ effect_estimates_by_level[key_for_effect_dict] = current_effect_stats
275
+ else:
276
+ logger.warning(f"LLM returned an invalid parameter index: {param_idx}. Skipping.")
277
+
278
+ if effect_estimates_by_level: # If any effects were successfully processed
279
+ all_params_extracted = llm_identification_response.all_parameters_successfully_identified
280
+ llm_extraction_successful = True
281
+ logger.info(f"Successfully processed LLM-identified parameters. all_parameters_successfully_identified={all_params_extracted}")
282
+ logger.debug(f"effect_estimates_by_level: {effect_estimates_by_level}")
283
+ else:
284
+ logger.warning("LLM identified parameters, but none could be processed into effect_estimates_by_level. Falling back to regex.")
285
+ else:
286
+ logger.warning("LLM parameter identification did not yield usable parameters. Falling back to regex.")
287
+
288
+ except Exception as e_llm:
289
+ logger.warning(f"LLM-based result extraction failed: {e_llm}. Falling back to regex.", exc_info=True)
290
+
291
+
292
+ # --- End of Existing Regex Logic Block ---
293
+
294
+ # Primary effect_estimate for simple reporting (e.g. first level or the only one)
295
+ # For multi-level, this is ambiguous. For now, let's report None or the first one.
296
+ # The full details are in effect_estimates_by_level.
297
+ main_effect_estimate = None
298
+ main_p_value = None
299
+ main_conf_int = [None, None] # Default for single or if no effects
300
+ main_std_err = None
301
+
302
+ if effect_estimates_by_level:
303
+ if 'treatment_effect' in effect_estimates_by_level: # Single effect case
304
+ single_effect_data = effect_estimates_by_level['treatment_effect']
305
+ main_effect_estimate = single_effect_data['estimate']
306
+ main_p_value = single_effect_data['p_value']
307
+ main_conf_int = single_effect_data['conf_int']
308
+ main_std_err = single_effect_data['std_err']
309
+ else: # Multi-level case
310
+ logger.info("Multi-level treatment effects extracted. Populating dicts for main estimate fields.")
311
+ effect_estimate_dict = {}
312
+ p_value_dict = {}
313
+ conf_int_dict = {}
314
+ std_err_dict = {}
315
+ for level, stats in effect_estimates_by_level.items():
316
+ effect_estimate_dict[level] = stats.get('estimate')
317
+ p_value_dict[level] = stats.get('p_value')
318
+ conf_int_dict[level] = stats.get('conf_int') # This is already a list [low, high]
319
+ std_err_dict[level] = stats.get('std_err')
320
+
321
+ main_effect_estimate = effect_estimate_dict
322
+ main_p_value = p_value_dict
323
+ main_conf_int = conf_int_dict
324
+ main_std_err = std_err_dict
325
+
326
+ interpretation_details = {}
327
+ if actual_interaction_term_added_to_formula and actual_interaction_term_added_to_formula in results.params.index:
328
+ interpretation_details['interaction_term_coefficient'] = results.params[actual_interaction_term_added_to_formula]
329
+ interpretation_details['interaction_term_p_value'] = results.pvalues[actual_interaction_term_added_to_formula]
330
+ logger.info(f"Interaction term '{actual_interaction_term_added_to_formula}' coeff: {interpretation_details['interaction_term_coefficient']}")
331
+
332
+ diag_results = {}
333
+ interpretation = "Interpretation not available."
334
+
335
+ output_dict = {
336
+ 'effect_estimate': main_effect_estimate,
337
+ 'p_value': main_p_value,
338
+ 'confidence_interval': main_conf_int,
339
+ 'standard_error': main_std_err,
340
+ 'estimated_effects_by_level': effect_estimates_by_level if (treatment_reference_level and is_still_categorical_in_df and not is_binary_encoded and effect_estimates_by_level) else None,
341
+ 'reference_level_used': treatment_reference_level if (treatment_reference_level and is_still_categorical_in_df and not is_binary_encoded) else None,
342
+ 'formula': formula,
343
+ 'model_summary_text': results.summary().as_text(), # Store as text for easier serialization
344
+ 'diagnostics': diag_results,
345
+ 'interpretation_details': interpretation_details, # Added interaction details
346
+ 'interpretation': interpretation,
347
+ 'method_used': 'Linear Regression (OLS)'
348
+ }
349
+ if not all_params_extracted:
350
+ output_dict['warnings'] = ["Could not reliably extract all requested parameters from model results. Please check model_summary_text."]
351
+ return output_dict
352
+
353
+ except Exception as e:
354
+ logger.error(f"Linear Regression failed: {e}")
355
+ raise # Re-raise the exception after logging
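For orientation, a minimal call to `estimate_effect` might look like the sketch below. The data and column names are invented, and the result-extraction step above depends on an LLM client being configured via `get_llm_client`; without one, the point estimates may come back empty and the `warnings` field populated.

```python
# Illustrative usage of estimate_effect (synthetic data; assumes an LLM client is configured).
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 500
df = pd.DataFrame({"treated": rng.integers(0, 2, n), "age": rng.normal(40.0, 10.0, n)})
df["income"] = 2.0 * df["treated"] + 0.5 * df["age"] + rng.normal(0.0, 1.0, n)

result = estimate_effect(df, treatment="treated", outcome="income",
                         covariates=["age"], query_str="Effect of treatment on income?")
print(result["formula"])                                   # e.g. "income ~ treated + age"
print(result.get("effect_estimate"), result.get("warnings"))
```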
auto_causal/methods/linear_regression/llm_assist.py ADDED
@@ -0,0 +1,146 @@
1
+ """
2
+ LLM assistance functions for Linear Regression analysis.
3
+ """
4
+
5
+ from typing import List, Dict, Any, Optional
6
+ import logging
7
+
8
+ # Imported for type hinting
9
+ from langchain.chat_models.base import BaseChatModel
10
+ from statsmodels.regression.linear_model import RegressionResultsWrapper
11
+
12
+ # Import shared LLM helpers
13
+ from auto_causal.utils.llm_helpers import call_llm_with_json_output
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ def suggest_lr_covariates(
18
+ df_cols: List[str],
19
+ treatment: str,
20
+ outcome: str,
21
+ query: str,
22
+ llm: Optional[BaseChatModel] = None
23
+ ) -> List[str]:
24
+ """
25
+ (Placeholder) Use LLM to suggest relevant covariates for linear regression.
26
+
27
+ Args:
28
+ df_cols: List of available column names.
29
+ treatment: Treatment variable name.
30
+ outcome: Outcome variable name.
31
+ query: User's causal query text.
32
+ llm: Optional LLM model instance.
33
+
34
+ Returns:
35
+ List of suggested covariate names.
36
+ """
37
+ logger.info("LLM covariate suggestion for LR is not implemented yet.")
38
+ if llm:
39
+ # Placeholder: Call LLM here in future
40
+ pass
41
+ return []
42
+
43
+ def interpret_lr_results(
44
+ results: RegressionResultsWrapper,
45
+ diagnostics: Dict[str, Any],
46
+ treatment_var: str, # Need treatment variable name to extract coefficient
47
+ llm: Optional[BaseChatModel] = None
48
+ ) -> str:
49
+ """
50
+ Use LLM to interpret Linear Regression results.
51
+
52
+ Args:
53
+ results: Fitted statsmodels OLS results object.
54
+ diagnostics: Dictionary of diagnostic test results.
55
+ treatment_var: Name of the treatment variable.
56
+ llm: Optional LLM model instance.
57
+
58
+ Returns:
59
+ String containing natural language interpretation.
60
+ """
61
+ default_interpretation = "LLM interpretation not available for Linear Regression."
62
+ if llm is None:
63
+ logger.info("LLM not provided for LR interpretation.")
64
+ return default_interpretation
65
+
66
+ try:
67
+ # --- Prepare summary for LLM ---
68
+ results_summary = {}
69
+ treatment_val = results.params.get(treatment_var)
70
+ pval_val = results.pvalues.get(treatment_var)
71
+
72
+ if treatment_val is not None:
73
+ results_summary['Treatment Effect Estimate'] = f"{treatment_val:.3f}"
74
+ else:
75
+ logger.warning(f"Treatment variable '{treatment_var}' not found in regression parameters.")
76
+ results_summary['Treatment Effect Estimate'] = "Not Found"
77
+
78
+ if pval_val is not None:
79
+ results_summary['Treatment P-value'] = f"{pval_val:.3f}"
80
+ else:
81
+ logger.warning(f"P-value for treatment variable '{treatment_var}' not found in regression results.")
82
+ results_summary['Treatment P-value'] = "Not Found"
83
+
84
+ try:
85
+ conf_int = results.conf_int().loc[treatment_var]
86
+ results_summary['Treatment 95% CI'] = f"[{conf_int[0]:.3f}, {conf_int[1]:.3f}]"
87
+ except KeyError:
88
+ logger.warning(f"Confidence interval for treatment variable '{treatment_var}' not found.")
89
+ results_summary['Treatment 95% CI'] = "Not Found"
90
+ except Exception as ci_e:
91
+ logger.warning(f"Could not extract confidence interval for '{treatment_var}': {ci_e}")
92
+ results_summary['Treatment 95% CI'] = "Error"
93
+
94
+ results_summary['R-squared'] = f"{results.rsquared:.3f}"
95
+ results_summary['Adj. R-squared'] = f"{results.rsquared_adj:.3f}"
96
+
97
+ diag_summary = {}
98
+ if diagnostics.get("status") == "Success":
99
+ diag_details = diagnostics.get("details", {})
100
+ # Format p-values only if they are numbers
101
+ jb_p = diag_details.get('residuals_normality_jb_p_value')
102
+ bp_p = diag_details.get('homoscedasticity_bp_lm_p_value')
103
+ diag_summary['Residuals Normality (Jarque-Bera P-value)'] = f"{jb_p:.3f}" if isinstance(jb_p, (int, float)) else str(jb_p)
104
+ diag_summary['Homoscedasticity (Breusch-Pagan P-value)'] = f"{bp_p:.3f}" if isinstance(bp_p, (int, float)) else str(bp_p)
105
+ diag_summary['Homoscedasticity Status'] = diag_details.get('homoscedasticity_status', 'N/A')
106
+ diag_summary['Residuals Normality Status'] = diag_details.get('residuals_normality_status', 'N/A')
107
+ else:
108
+ diag_summary['Status'] = diagnostics.get("status", "Unknown")
109
+ if "error" in diagnostics:
110
+ diag_summary['Error'] = diagnostics["error"]
111
+
112
+ # --- Construct Prompt ---
113
+ prompt = f"""
114
+ You are assisting with interpreting Linear Regression (OLS) results for causal inference.
115
+
116
+ Model Results Summary:
117
+ {results_summary}
118
+
119
+ Model Diagnostics Summary:
120
+ {diag_summary}
121
+
122
+ Explain these results in 2-4 concise sentences. Focus on:
123
+ 1. The estimated causal effect of the treatment variable '{treatment_var}' (magnitude, direction, statistical significance based on p-value < 0.05).
124
+ 2. Overall model fit (using R-squared as a rough guide).
125
+ 3. Key diagnostic findings (specifically, mention if residuals are non-normal or if heteroscedasticity is detected, as these violate OLS assumptions and can affect inference).
126
+
127
+ Return ONLY a valid JSON object with the following structure (no explanations or surrounding text):
128
+ {{
129
+ "interpretation": "<your concise interpretation text>"
130
+ }}
131
+ """
132
+
133
+ # --- Call LLM ---
134
+ response = call_llm_with_json_output(llm, prompt)
135
+
136
+ # --- Process Response ---
137
+ if response and isinstance(response, dict) and \
138
+ "interpretation" in response and isinstance(response["interpretation"], str):
139
+ return response["interpretation"]
140
+ else:
141
+ logger.warning(f"Failed to get valid interpretation from LLM. Response: {response}")
142
+ return default_interpretation
143
+
144
+ except Exception as e:
145
+ logger.error(f"Error during LLM interpretation for LR: {e}")
146
+ return f"Error generating interpretation: {e}"
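A compact sketch of how this interpreter could be wired up after an OLS fit and `run_lr_diagnostics`; the data are synthetic and `get_llm_client` is assumed to return a configured chat model.

```python
# Illustrative wiring of interpret_lr_results (synthetic data; LLM client assumed configured).
import numpy as np
import pandas as pd
import statsmodels.api as sm
from auto_causal.methods.linear_regression.diagnostics import run_lr_diagnostics
from auto_causal.config import get_llm_client

rng = np.random.default_rng(1)
df = pd.DataFrame({"treated": rng.integers(0, 2, 200), "age": rng.normal(40.0, 10.0, 200)})
df["income"] = 1.5 * df["treated"] + 0.3 * df["age"] + rng.normal(0.0, 1.0, 200)

X = sm.add_constant(df[["treated", "age"]])
ols_results = sm.OLS(df["income"], X).fit()
diagnostics = run_lr_diagnostics(ols_results, X)

print(interpret_lr_results(ols_results, diagnostics, treatment_var="treated", llm=get_llm_client()))
```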
auto_causal/methods/propensity_score/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ from .base import estimate_propensity_scores
2
+ from .matching import estimate_effect as estimate_matching_effect
3
+ from .weighting import estimate_effect as estimate_weighting_effect
4
+ from .diagnostics import assess_balance, plot_overlap, plot_balance
5
+
6
+ __all__ = [
7
+ "estimate_propensity_scores",
8
+ "estimate_matching_effect",
9
+ "estimate_weighting_effect",
10
+ "assess_balance",
11
+ "plot_overlap",
12
+ "plot_balance"
13
+ ]
auto_causal/methods/propensity_score/base.py ADDED
@@ -0,0 +1,80 @@
1
+ # Base functionality for Propensity Score methods
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.linear_model import LogisticRegression
5
+ from sklearn.preprocessing import StandardScaler
6
+ from typing import List, Optional, Dict, Any
7
+
8
+ # Placeholder for LLM interaction to select model type
9
+ def select_propensity_model(df: pd.DataFrame, treatment: str, covariates: List[str],
10
+ query: Optional[str] = None) -> str:
11
+ '''Selects the appropriate propensity score model type (e.g., logistic, GBM).
12
+
13
+ Placeholder: Currently defaults to Logistic Regression.
14
+ '''
15
+ # TODO: Implement LLM call or heuristic to select model based on data characteristics
16
+ return "logistic"
17
+
18
+ def estimate_propensity_scores(df: pd.DataFrame, treatment: str,
19
+ covariates: List[str], model_type: str = 'logistic',
20
+ **kwargs) -> np.ndarray:
21
+ '''Estimate propensity scores using a specified model.
22
+
23
+ Args:
24
+ df: DataFrame containing the data
25
+ treatment: Name of the treatment variable
26
+ covariates: List of covariate variable names
27
+ model_type: Type of model to use ('logistic' supported for now)
28
+ **kwargs: Additional arguments for the model
29
+
30
+ Returns:
31
+ Array of propensity scores
32
+ '''
33
+
34
+ X = df[covariates]
35
+ y = df[treatment]
36
+
37
+ # Standardize covariates for logistic regression
38
+ scaler = StandardScaler()
39
+ X_scaled = scaler.fit_transform(X)
40
+
41
+ if model_type.lower() == 'logistic':
42
+ # Fit logistic regression
43
+ model = LogisticRegression(max_iter=kwargs.get('max_iter', 1000),
44
+ solver=kwargs.get('solver', 'liblinear'), # Use liblinear for L1/L2
45
+ C=kwargs.get('C', 1.0),
46
+ penalty=kwargs.get('penalty', 'l2'))
47
+ model.fit(X_scaled, y)
48
+
49
+ # Predict probabilities
50
+ propensity_scores = model.predict_proba(X_scaled)[:, 1]
51
+ # TODO: Add other model types like Gradient Boosting, etc.
52
+ # elif model_type.lower() == 'gbm':
53
+ # from sklearn.ensemble import GradientBoostingClassifier
54
+ # model = GradientBoostingClassifier(...)
55
+ # model.fit(X, y)
56
+ # propensity_scores = model.predict_proba(X)[:, 1]
57
+ else:
58
+ raise ValueError(f"Unsupported propensity score model type: {model_type}")
59
+
60
+ # Clip scores to avoid extremes which can cause issues in weighting/matching
61
+ propensity_scores = np.clip(propensity_scores, 0.01, 0.99)
62
+
63
+ return propensity_scores
64
+
65
+ # Common formatting function (can be expanded)
66
+ def format_ps_results(effect_estimate: float, effect_se: float,
67
+ diagnostics: Dict[str, Any], method_details: str,
68
+ parameters: Dict[str, Any]) -> Dict[str, Any]:
69
+ '''Standard formatter for PS method results.'''
70
+ ci_lower = effect_estimate - 1.96 * effect_se
71
+ ci_upper = effect_estimate + 1.96 * effect_se
72
+ return {
73
+ "effect_estimate": float(effect_estimate),
74
+ "effect_se": float(effect_se),
75
+ "confidence_interval": [float(ci_lower), float(ci_upper)],
76
+ "diagnostics": diagnostics,
77
+ "method_details": method_details,
78
+ "parameters": parameters
79
+ # Add p-value if needed (can be calculated from estimate and SE)
80
+ }
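A small usage sketch for the estimator above; the data and the selection mechanism are invented purely for illustration. Note the scores come back clipped to [0.01, 0.99] by the estimator.

```python
# Illustrative call to estimate_propensity_scores on synthetic data.
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
n = 1_000
df = pd.DataFrame({"age": rng.normal(40.0, 10.0, n), "income": rng.normal(50.0, 15.0, n)})
# Treatment probability rises with age (simple logistic selection, invented for the example).
p_treat = 1.0 / (1.0 + np.exp(-(df["age"] - 40.0) / 10.0))
df["treated"] = (rng.random(n) < p_treat).astype(int)

scores = estimate_propensity_scores(df, treatment="treated", covariates=["age", "income"])
print(scores.min(), scores.max())  # always within [0.01, 0.99] due to the clipping above
```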
auto_causal/methods/propensity_score/diagnostics.py ADDED
@@ -0,0 +1,74 @@
1
+ # Balance and sensitivity analysis diagnostics for Propensity Score methods
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ from typing import Dict, List, Optional, Any
6
+
7
+ # Import necessary plotting libraries if visualizations are needed
8
+ # import matplotlib.pyplot as plt
9
+ # import seaborn as sns
10
+
11
+ # Import utility for standardized differences if needed
12
+ from auto_causal.methods.utils import calculate_standardized_differences
13
+
14
+ def assess_balance(df_original: pd.DataFrame, df_matched_or_weighted: pd.DataFrame,
15
+ treatment: str, covariates: List[str],
16
+ method: str,
17
+ propensity_scores_original: Optional[np.ndarray] = None,
18
+ propensity_scores_matched: Optional[np.ndarray] = None,
19
+ weights: Optional[np.ndarray] = None) -> Dict[str, Any]:
20
+ '''Assesses covariate balance before and after matching/weighting.
21
+
22
+ Placeholder: Returns dummy diagnostic data.
23
+ '''
24
+ print(f"Assessing balance for {method}...")
25
+ # TODO: Implement actual balance checking using standardized differences,
26
+ # variance ratios, KS tests, etc.
27
+ # Example using standardized differences (needs calculate_standardized_differences):
28
+ # std_diff_before = calculate_standardized_differences(df_original, treatment, covariates)
29
+ # std_diff_after = calculate_standardized_differences(df_matched_or_weighted, treatment, covariates, weights=weights)
30
+
31
+ dummy_balance_metric = {cov: np.random.rand() * 0.1 for cov in covariates} # Simulate good balance
32
+
33
+ return {
34
+ "balance_metrics": dummy_balance_metric,
35
+ "balance_achieved": True, # Placeholder
36
+ "problematic_covariates": [], # Placeholder
37
+ # Add plots or paths to plots if generated
38
+ "plots": {
39
+ "balance_plot": "balance_plot.png",
40
+ "overlap_plot": "overlap_plot.png"
41
+ }
42
+ }
43
+
44
+ def assess_weight_distribution(weights: np.ndarray, treatment_indicator: pd.Series) -> Dict[str, Any]:
45
+ '''Assesses the distribution of IPW weights.
46
+
47
+ Placeholder: Returns dummy diagnostic data.
48
+ '''
49
+ print("Assessing weight distribution...")
50
+ # TODO: Implement checks for extreme weights, effective sample size, etc.
51
+ return {
52
+ "min_weight": float(np.min(weights)),
53
+ "max_weight": float(np.max(weights)),
54
+ "mean_weight": float(np.mean(weights)),
55
+ "std_dev_weight": float(np.std(weights)),
56
+ "effective_sample_size": len(weights) / (1 + np.std(weights)**2 / np.mean(weights)**2), # Kish's ESS approx
57
+ "potential_issues": np.max(weights) > 20 # Example check
58
+ }
59
+
60
+ def plot_overlap(df: pd.DataFrame, treatment: str, propensity_scores: np.ndarray, save_path: str = 'overlap_plot.png'):
61
+ '''Generates plot showing propensity score overlap.
62
+ Placeholder: Does nothing.
63
+ '''
64
+ print(f"Generating overlap plot (placeholder) -> {save_path}")
65
+ # TODO: Implement actual plotting (e.g., using seaborn histplot or kdeplot)
66
+ pass
67
+
68
+ def plot_balance(balance_metrics_before: Dict[str, float], balance_metrics_after: Dict[str, float], save_path: str = 'balance_plot.png'):
69
+ '''Generates plot showing covariate balance before/after.
70
+ Placeholder: Does nothing.
71
+ '''
72
+ print(f"Generating balance plot (placeholder) -> {save_path}")
73
+ # TODO: Implement actual plotting (e.g., Love plot)
74
+ pass
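`assess_balance` above currently returns simulated metrics; the standardized-difference computation it references (imported from `auto_causal.methods.utils`, not shown here) could look roughly like the unweighted sketch below. The function name and the 0.1 imbalance threshold are illustrative assumptions.

```python
# Hypothetical sketch of an (unweighted) standardized mean difference computation.
import numpy as np
import pandas as pd
from typing import Dict, List

def standardized_differences(df: pd.DataFrame, treatment: str, covariates: List[str]) -> Dict[str, float]:
    """Absolute standardized mean difference per covariate, using a pooled-SD denominator."""
    treated, control = df[df[treatment] == 1], df[df[treatment] == 0]
    diffs = {}
    for cov in covariates:
        pooled_sd = np.sqrt((treated[cov].var() + control[cov].var()) / 2.0)
        diffs[cov] = 0.0 if pooled_sd == 0 else abs(treated[cov].mean() - control[cov].mean()) / pooled_sd
    return diffs

# A common convention flags covariates with SMD > 0.1 as imbalanced after matching/weighting.
```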
auto_causal/methods/propensity_score/llm_assist.py ADDED
@@ -0,0 +1,45 @@
1
+ # LLM Integration points for Propensity Score methods
2
+ import pandas as pd
3
+ from typing import List, Optional, Dict, Any
4
+
5
+ def determine_optimal_caliper(df: pd.DataFrame, treatment: str,
6
+ covariates: List[str],
7
+ query: Optional[str] = None) -> float:
8
+ '''Determines optimal caliper for PSM using data or LLM.
9
+
10
+ Placeholder: Returns a default value.
11
+ '''
12
+ # TODO: Implement data-driven (e.g., based on PS distribution) or LLM-assisted caliper selection.
13
+ # Common rule of thumb is 0.2 * std dev of logit(PS), but that requires calculating PS first.
14
+ return 0.2
15
+
16
+ def determine_optimal_weight_type(df: pd.DataFrame, treatment: str,
17
+ query: Optional[str] = None) -> str:
18
+ '''Determines the optimal type of IPW weights (ATE, ATT, etc.).
19
+
20
+ Placeholder: Defaults to ATE.
21
+ '''
22
+ # TODO: Implement LLM or rule-based selection.
23
+ return "ATE"
24
+
25
+ def determine_optimal_trim_threshold(df: pd.DataFrame, treatment: str,
26
+ propensity_scores: Optional[pd.Series] = None,
27
+ query: Optional[str] = None) -> Optional[float]:
28
+ '''Determines optimal threshold for trimming extreme propensity scores.
29
+
30
+ Placeholder: Defaults to no trimming (None).
31
+ '''
32
+ # TODO: Implement data-driven or LLM-assisted threshold selection (e.g., based on score distribution).
33
+ return None # Corresponds to no trimming by default
34
+
35
+ # Placeholder for calling LLM to get parameters (can use the one in utils if general enough)
36
+ def get_llm_parameters(df: pd.DataFrame, query: str, method: str) -> Dict[str, Any]:
37
+ '''Placeholder to get parameters via LLM based on dataset and query.'''
38
+ # In reality, call something like analyze_dataset_for_method from utils.llm_helpers
39
+ print(f"Simulating LLM call to get parameters for {method}...")
40
+ if method == "PS.Matching":
41
+ return {"parameters": {"caliper": 0.15}, "validation": {"check_balance": True}}
42
+ elif method == "PS.Weighting":
43
+ return {"parameters": {"weight_type": "ATE", "trim_threshold": 0.05}, "validation": {"check_weights": True}}
44
+ else:
45
+ return {"parameters": {}, "validation": {}}
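The caliper placeholder above mentions the common 0.2 × SD(logit(PS)) rule of thumb, which `matching.py` below applies inline. A standalone sketch of that heuristic (the helper name is an assumption):

```python
# Sketch of the 0.2 * SD(logit(PS)) caliper rule of thumb referenced in the docstring above.
import numpy as np

def caliper_from_propensity_scores(pscores: np.ndarray, multiplier: float = 0.2) -> float:
    """Caliper implied by the common 0.2 * SD(logit(PS)) rule of thumb."""
    eps = 1e-7
    ps = np.clip(pscores, eps, 1.0 - eps)   # avoid log(0) / log(1)
    logit = np.log(ps / (1.0 - ps))
    return multiplier * float(np.nanstd(logit))
```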
auto_causal/methods/propensity_score/matching.py ADDED
@@ -0,0 +1,341 @@
1
+ # Propensity Score Matching Implementation
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.neighbors import NearestNeighbors
5
+ import statsmodels.api as sm # For bias adjustment regression
6
+ import logging # For logging fallback
7
+ from typing import Dict, List, Optional, Any
8
+
9
+ # Import DoWhy
10
+ from dowhy import CausalModel
11
+
12
+ from .base import estimate_propensity_scores, format_ps_results, select_propensity_model
13
+ from .diagnostics import assess_balance #, plot_overlap, plot_balance # Import diagnostic functions
14
+ # Remove determine_optimal_caliper, it will be replaced by a heuristic
15
+ from .llm_assist import get_llm_parameters # Import LLM helpers
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ def _calculate_logit(pscore):
20
+ """Calculate logit of propensity score, clipping to avoid inf."""
21
+ # Clip pscore to prevent log(0) or log(1) issues which lead to inf
22
+ epsilon = 1e-7
23
+ pscore_clipped = np.clip(pscore, epsilon, 1 - epsilon)
24
+ return np.log(pscore_clipped / (1 - pscore_clipped))
25
+
26
+ def _perform_matching_and_get_att(
27
+ df_sample: pd.DataFrame,
28
+ treatment: str,
29
+ outcome: str,
30
+ covariates: List[str],
31
+ propensity_model_type: str,
32
+ n_neighbors: int,
33
+ caliper: float,
34
+ perform_bias_adjustment: bool,
35
+ **kwargs
36
+ ) -> float:
37
+ """
38
+ Helper to perform Custom KNN PSM and calculate ATT, potentially with bias adjustment.
39
+ Returns the ATT estimate.
40
+ """
41
+ df_ps = df_sample.copy()
42
+ try:
43
+ propensity_scores = estimate_propensity_scores(
44
+ df_ps, treatment, covariates, model_type=propensity_model_type, **kwargs
45
+ )
46
+ except Exception as e:
47
+ logger.warning(f"Propensity score estimation failed in helper: {e}")
48
+ return np.nan # Cannot proceed without propensity scores
49
+
50
+ df_ps['propensity_score'] = propensity_scores
51
+
52
+ treated = df_ps[df_ps[treatment] == 1]
53
+ control = df_ps[df_ps[treatment] == 0]
54
+
55
+ if treated.empty or control.empty:
56
+ return np.nan
57
+
58
+ nn = NearestNeighbors(n_neighbors=n_neighbors, radius=caliper if caliper is not None else np.inf, metric='minkowski', p=2)
59
+ try:
60
+ # Ensure control PS are valid before fitting
61
+ control_ps_values = control[['propensity_score']].values
62
+ if np.isnan(control_ps_values).any():
63
+ logger.warning("NaN values found in control propensity scores before NN fitting.")
64
+ return np.nan
65
+ nn.fit(control_ps_values)
66
+
67
+ # Ensure treated PS are valid before querying
68
+ treated_ps_values = treated[['propensity_score']].values
69
+ if np.isnan(treated_ps_values).any():
70
+ logger.warning("NaN values found in treated propensity scores before NN query.")
71
+ return np.nan
72
+ distances, indices = nn.kneighbors(treated_ps_values)
73
+
74
+ except ValueError as e:
75
+ # Handles case where control group might be too small or have NaN PS scores
76
+ logger.warning(f"NearestNeighbors fitting/query failed: {e}")
77
+ return np.nan
78
+
79
+ matched_outcomes_treated = []
80
+ matched_outcomes_control_means = []
81
+ propensity_diffs = []
82
+
83
+ for i in range(len(treated)):
84
+ treated_unit = treated.iloc[[i]]
85
+ valid_neighbors_mask = distances[i] <= (caliper if caliper is not None else np.inf)
86
+ valid_neighbors_idx = indices[i][valid_neighbors_mask]
87
+
88
+ if len(valid_neighbors_idx) > 0:
89
+ matched_controls_for_this_treated = control.iloc[valid_neighbors_idx]
90
+ if matched_controls_for_this_treated.empty:
91
+ continue # Should not happen with valid_neighbors_idx check, but safety
92
+
93
+ matched_outcomes_treated.append(treated_unit[outcome].values[0])
94
+ matched_outcomes_control_means.append(matched_controls_for_this_treated[outcome].mean())
95
+
96
+ if perform_bias_adjustment:
97
+ # Ensure PS scores are valid before calculating difference
98
+ treated_ps = treated_unit['propensity_score'].values[0]
99
+ control_ps_mean = matched_controls_for_this_treated['propensity_score'].mean()
100
+ if np.isnan(treated_ps) or np.isnan(control_ps_mean):
101
+ logger.warning("NaN propensity score encountered during bias adjustment calculation.")
102
+ # Cannot perform bias adjustment for this unit, potentially skip or handle
103
+ # For now, let's skip adding to propensity_diffs if NaN found
104
+ continue
105
+ propensity_diff = treated_ps - control_ps_mean
106
+ propensity_diffs.append(propensity_diff)
107
+
108
+ if not matched_outcomes_treated:
109
+ return np.nan
110
+
111
+ raw_att_components = np.array(matched_outcomes_treated) - np.array(matched_outcomes_control_means)
112
+
113
+ if perform_bias_adjustment:
114
+ # Ensure lengths match *after* potential skips due to NaNs
115
+ if not propensity_diffs or len(raw_att_components) != len(propensity_diffs):
116
+ logger.warning("Bias adjustment skipped due to inconsistent data lengths after NaN checks.")
117
+ return np.mean(raw_att_components)
118
+
119
+ try:
120
+ X_bias_adj = sm.add_constant(np.array(propensity_diffs))
121
+ y_bias_adj = raw_att_components
122
+ # Add check for NaNs/Infs in inputs to OLS
123
+ if np.isnan(X_bias_adj).any() or np.isnan(y_bias_adj).any() or \
124
+ np.isinf(X_bias_adj).any() or np.isinf(y_bias_adj).any():
125
+ logger.warning("NaN/Inf values detected in OLS inputs for bias adjustment. Falling back.")
126
+ return np.mean(raw_att_components)
127
+
128
+ bias_model = sm.OLS(y_bias_adj, X_bias_adj).fit()
129
+ bias_adjusted_att = bias_model.params[0]
130
+ return bias_adjusted_att
131
+ except Exception as e:
132
+ logger.warning(f"OLS for bias adjustment failed: {e}. Falling back to raw ATT.")
133
+ return np.mean(raw_att_components)
134
+ else:
135
+ return np.mean(raw_att_components)
136
+
137
+ def estimate_effect(df: pd.DataFrame, treatment: str, outcome: str,
138
+ covariates: List[str], **kwargs) -> Dict[str, Any]:
139
+ '''Estimate ATT using Propensity Score Matching.
140
+ Tries DoWhy's PSM first, falls back to custom implementation if DoWhy fails.
141
+ Uses bootstrap SE based on the custom implementation regardless.
142
+ '''
143
+ query = kwargs.get('query')
144
+ n_bootstraps = kwargs.get('n_bootstraps', 100)
145
+
146
+ # --- Parameter Setup (as before) ---
147
+ llm_params = get_llm_parameters(df, query, "PS.Matching")
148
+ llm_suggested_params = llm_params.get("parameters", {})
149
+
150
+ caliper = kwargs.get('caliper', llm_suggested_params.get('caliper'))
151
+ temp_propensity_scores_for_caliper = None
152
+ try:
153
+ temp_propensity_scores_for_caliper = estimate_propensity_scores(
154
+ df, treatment, covariates,
155
+ model_type=llm_suggested_params.get('propensity_model_type', 'logistic'),
156
+ **kwargs
157
+ )
158
+ if caliper is None and temp_propensity_scores_for_caliper is not None:
159
+ logit_ps = _calculate_logit(temp_propensity_scores_for_caliper)
160
+ if not np.isnan(logit_ps).all(): # Check if logit calculation was successful
161
+ caliper = 0.2 * np.nanstd(logit_ps) # Use nanstd for robustness
162
+ else:
163
+ logger.warning("Logit of propensity scores resulted in NaNs, cannot calculate heuristic caliper.")
164
+ caliper = None
165
+ elif caliper is None:
166
+ logger.warning("Could not estimate propensity scores for caliper heuristic.")
167
+ caliper = None
168
+
169
+ except Exception as e:
170
+ logger.warning(f"Failed to estimate initial propensity scores for caliper heuristic: {e}. Caliper set to None.")
171
+ caliper = None # Proceed without caliper if heuristic fails
172
+
173
+ n_neighbors = kwargs.get('n_neighbors', llm_suggested_params.get('n_neighbors', 1))
174
+ propensity_model_type = kwargs.get('propensity_model_type',
175
+ llm_suggested_params.get('propensity_model_type',
176
+ select_propensity_model(df, treatment, covariates, query)))
177
+
178
+ # --- Attempt DoWhy PSM for Point Estimate ---
179
+ att_estimate = np.nan
180
+ method_used_for_att = "Fallback Custom PSM"
181
+ dowhy_model = None
182
+ identified_estimand = None
183
+
184
+ try:
185
+ logger.info("Attempting estimation using DoWhy Propensity Score Matching...")
186
+ dowhy_model = CausalModel(
187
+ data=df,
188
+ treatment=treatment,
189
+ outcome=outcome,
190
+ common_causes=covariates,
191
+ estimand_type='nonparametric-ate' # Provide list of names directly
192
+ )
193
+ # Identify estimand (optional step, but good practice)
194
+ identified_estimand = dowhy_model.identify_effect(proceed_when_unidentifiable=True)
195
+ logger.info(f"DoWhy identified estimand: {identified_estimand}")
196
+
197
+ # Estimate effect using DoWhy's PSM
198
+ estimate = dowhy_model.estimate_effect(
199
+ identified_estimand,
200
+ method_name="backdoor.propensity_score_matching",
201
+ target_units="att",
202
+ method_params={}
203
+ )
204
+ att_estimate = estimate.value
205
+ method_used_for_att = "DoWhy PSM"
206
+ logger.info(f"DoWhy PSM successful. ATT Estimate: {att_estimate}")
207
+
208
+ except Exception as e:
209
+ logger.warning(f"DoWhy PSM failed: {e}. Falling back to custom PSM implementation.")
210
+ # Fallback is triggered implicitly if att_estimate remains NaN
211
+
212
+ # --- Fallback or if DoWhy failed ---
213
+ if np.isnan(att_estimate):
214
+ logger.info("Calculating ATT estimate using fallback custom PSM...")
215
+ att_estimate = _perform_matching_and_get_att(
216
+ df, treatment, outcome, covariates,
217
+ propensity_model_type, n_neighbors, caliper,
218
+ perform_bias_adjustment=True, **kwargs # Bias adjust the fallback
219
+ )
220
+ method_used_for_att = "Fallback Custom PSM" # Confirm it's fallback
221
+ if np.isnan(att_estimate):
222
+ raise ValueError("Fallback custom PSM estimation also failed. Cannot proceed.")
223
+ logger.info(f"Fallback Custom PSM successful. ATT Estimate: {att_estimate}")
224
+
225
+ # --- Bootstrap SE (using custom helper for consistency) ---
226
+ logger.info(f"Calculating Bootstrap SE using custom helper ({n_bootstraps} iterations)...")
227
+ bootstrap_atts = []
228
+ for i in range(n_bootstraps):
229
+ try:
230
+ # Ensure bootstrap samples are drawn correctly
231
+ df_boot = df.sample(n=len(df), replace=True, random_state=np.random.randint(1000000) + i)
232
+ # Bias adjustment in bootstrap can be slow, optionally disable it
233
+ boot_att = _perform_matching_and_get_att(
234
+ df_boot, treatment, outcome, covariates,
235
+ propensity_model_type, n_neighbors, caliper,
236
+ perform_bias_adjustment=False, **kwargs # Set bias adjustment to False for speed in bootstrap
237
+ )
238
+ if not np.isnan(boot_att):
239
+ bootstrap_atts.append(boot_att)
240
+ except Exception as boot_e:
241
+ logger.warning(f"Bootstrap iteration {i+1} failed: {boot_e}")
242
+ continue # Skip failed bootstrap iteration
243
+
244
+ att_se = np.nanstd(bootstrap_atts) if bootstrap_atts else np.nan # Use nanstd
245
+ actual_bootstrap_iterations = len(bootstrap_atts)
246
+ logger.info(f"Bootstrap SE calculated: {att_se} from {actual_bootstrap_iterations} successful iterations.")
247
+
248
+ # --- Diagnostics (using custom matching logic for consistency) ---
249
+ logger.info("Performing diagnostic checks using custom matching logic...")
250
+ diagnostics = {"error": "Diagnostics failed to run."}
251
+ propensity_scores_orig = temp_propensity_scores_for_caliper # Reuse if available and not None
252
+
253
+ if propensity_scores_orig is None:
254
+ try:
255
+ propensity_scores_orig = estimate_propensity_scores(
256
+ df, treatment, covariates, model_type=propensity_model_type, **kwargs
257
+ )
258
+ except Exception as e:
259
+ logger.error(f"Failed to estimate propensity scores for diagnostics: {e}")
260
+ propensity_scores_orig = None
261
+
262
+ if propensity_scores_orig is not None and not np.isnan(propensity_scores_orig).all():
263
+ df_ps_orig = df.copy()
264
+ df_ps_orig['propensity_score'] = propensity_scores_orig
265
+ treated_orig = df_ps_orig[df_ps_orig[treatment] == 1]
266
+ control_orig = df_ps_orig[df_ps_orig[treatment] == 0]
267
+ unmatched_treated_count = 0
268
+
269
+ # Drop rows with NaN propensity scores before diagnostics
270
+ treated_orig = treated_orig.dropna(subset=['propensity_score'])
271
+ control_orig = control_orig.dropna(subset=['propensity_score'])
272
+
273
+ if not treated_orig.empty and not control_orig.empty:
274
+ try:
275
+ nn_diag = NearestNeighbors(n_neighbors=n_neighbors, radius=caliper if caliper is not None else np.inf, metric='minkowski', p=2)
276
+ nn_diag.fit(control_orig[['propensity_score']].values)
277
+ distances_diag, indices_diag = nn_diag.kneighbors(treated_orig[['propensity_score']].values)
278
+
279
+ matched_treated_indices_diag = []
280
+ matched_control_indices_diag = []
281
+
282
+ for i in range(len(treated_orig)):
283
+ valid_neighbors_mask_diag = distances_diag[i] <= (caliper if caliper is not None else np.inf)
284
+ valid_neighbors_idx_diag = indices_diag[i][valid_neighbors_mask_diag]
285
+ if len(valid_neighbors_idx_diag) > 0:
286
+ # Get original DataFrame indices from control_orig based on iloc indices
287
+ selected_control_original_indices = control_orig.index[valid_neighbors_idx_diag]
288
+ matched_treated_indices_diag.extend([treated_orig.index[i]] * len(selected_control_original_indices))
289
+ matched_control_indices_diag.extend(selected_control_original_indices)
290
+ else:
291
+ unmatched_treated_count += 1
292
+
293
+ if matched_control_indices_diag:
294
+ # Use unique indices for creating the diagnostic dataframe
295
+ unique_matched_control_indices = list(set(matched_control_indices_diag))
296
+ unique_matched_treated_indices = list(set(matched_treated_indices_diag))
297
+
298
+ matched_control_df_diag = df.loc[unique_matched_control_indices]
299
+ matched_treated_df_for_diag = df.loc[unique_matched_treated_indices]
300
+ matched_df_diag = pd.concat([matched_treated_df_for_diag, matched_control_df_diag]).drop_duplicates()
301
+
302
+ # Retrieve propensity scores for the specific units in matched_df_diag
303
+ ps_matched_for_diag = df_ps_orig.loc[matched_df_diag.index, 'propensity_score']  # propensity_scores_orig is a NumPy array, so look up scores via the DataFrame copy
304
+
305
+ diagnostics = assess_balance(df, matched_df_diag, treatment, covariates,
306
+ method="PSM",
307
+ propensity_scores_original=propensity_scores_orig,
308
+ propensity_scores_matched=ps_matched_for_diag)
309
+ else:
310
+ diagnostics = {"message": "No units could be matched for diagnostic assessment."}
311
+ # If no controls were matched, all treated were unmatched
312
+ unmatched_treated_count = len(treated_orig) if not treated_orig.empty else 0
313
+ except Exception as diag_e:
314
+ logger.error(f"Error during diagnostic matching/balance assessment: {diag_e}")
315
+ diagnostics = {"error": f"Diagnostics failed: {diag_e}"}
316
+ else:
317
+ diagnostics = {"message": "Treatment or control group empty after dropping NaN PS, diagnostics skipped."}
318
+ unmatched_treated_count = len(treated_orig) if not treated_orig.empty else 0
319
+
320
+ # Ensure unmatched count calculation is safe
321
+ if 'unmatched_treated_count' not in locals():
322
+ unmatched_treated_count = 0 # Initialize if loop didn't run
323
+ diagnostics["unmatched_treated_count"] = unmatched_treated_count
324
+ diagnostics["percent_treated_matched"] = (len(treated_orig) - unmatched_treated_count) / len(treated_orig) * 100 if len(treated_orig) > 0 else 0
325
+ else:
326
+ diagnostics = {"error": "Propensity scores could not be estimated for diagnostics."}
327
+
328
+ # Add final details to diagnostics
329
+ diagnostics["att_estimation_method"] = method_used_for_att
330
+ diagnostics["propensity_score_model"] = propensity_model_type
331
+ diagnostics["bootstrap_iterations_for_se"] = actual_bootstrap_iterations
332
+ diagnostics["final_caliper_used"] = caliper
333
+
334
+ # --- Format and return results ---
335
+ logger.info(f"Formatting results. ATT Estimate: {att_estimate}, SE: {att_se}, Method: {method_used_for_att}")
336
+ return format_ps_results(att_estimate, att_se, diagnostics,
337
+ method_details=f"PSM ({method_used_for_att})",
338
+ parameters={"caliper": caliper,
339
+ "n_neighbors": n_neighbors, # n_neighbors used in fallback/bootstrap/diag
340
+ "propensity_model": propensity_model_type,
341
+ "n_bootstraps_config": n_bootstraps})
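To close the loop, a minimal call to this PSM estimator could look like the following. Data and column names are synthetic, `n_bootstraps` is lowered to keep the example fast, and DoWhy must be installed for the first-choice path (the custom fallback runs otherwise).

```python
# Illustrative call to the PSM estimate_effect above (synthetic data, small bootstrap).
import numpy as np
import pandas as pd

rng = np.random.default_rng(7)
n = 800
df = pd.DataFrame({"age": rng.normal(40.0, 10.0, n), "income": rng.normal(50.0, 15.0, n)})
p_treat = 1.0 / (1.0 + np.exp(-(df["age"] - 40.0) / 10.0))   # selection on age (invented)
df["treated"] = (rng.random(n) < p_treat).astype(int)
df["outcome"] = 2.0 * df["treated"] + 0.1 * df["age"] + rng.normal(0.0, 1.0, n)

result = estimate_effect(df, treatment="treated", outcome="outcome",
                         covariates=["age", "income"], n_bootstraps=20)
print(result["effect_estimate"], result["confidence_interval"])
print(result["diagnostics"]["att_estimation_method"])
```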