# Base image: NVIDIA CUDA 11.8.0 runtime with cuDNN 8 on Ubuntu 22.04.
# The cu118 PyTorch wheels installed later in this file target the same CUDA release.
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04

# Build-time only: stops apt/dpkg from prompting during `docker build`.
# Declared as ARG (not ENV) so it is visible to the RUN steps of this build
# but does not leak into the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive

# Runtime environment variables
#   PYTHONUNBUFFERED        - write Python stdout/stderr unbuffered so logs appear immediately
#   HF_HOME                 - root directory for Hugging Face caches
#   TRANSFORMERS_CACHE      - cache directory used by the transformers library
#   MPLCONFIGDIR            - writable config/cache directory for Matplotlib
#   PYTORCH_CUDA_ALLOC_CONF - tunes PyTorch's CUDA caching allocator: blocks larger
#                             than 128 MB are never split, which limits memory
#                             fragmentation (it does not select a communication backend)
ENV PYTHONUNBUFFERED=1 \
    HF_HOME=/app/.cache/huggingface \
    TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers \
    MPLCONFIGDIR=/tmp/matplotlib \
    PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128

# Pre-create the cache/scratch directories the app writes to and open them up
# (world-writable) so the container also works when started under a non-root UID.
RUN mkdir -p \
        /app/.cache/huggingface/transformers \
        /tmp/matplotlib \
        /app/gradio_cached_examples \
 && chmod -R 777 \
        /app \
        /tmp/matplotlib

# OS-level packages: compiler toolchain, git/curl with CA certificates, and the
# system Python 3 with pip, headers and setuptools. The apt package index is
# removed in the same layer so it does not end up in the image.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        git \
        python3-dev \
        python3-pip \
        python3-setuptools \
 && rm -rf /var/lib/apt/lists/*

# Use /app as the working directory: every relative path in the COPY, RUN and
# CMD instructions that follow is resolved against it.
WORKDIR /app

# Generate /entrypoint.sh, a small wrapper that reports GPU status at container
# start and then hands control to the real command.
#
# The script is emitted with printf '%s\n' (one argument per script line) rather
# than `echo '...\n...'`: whether echo interprets "\n" depends on which shell
# runs the RUN step, whereas printf behaves the same everywhere. It also avoids
# trailing whitespace on the shebang and the other script lines.
#
# Each line is single-quoted, so ${CUDA_VISIBLE_DEVICES}, ${NVIDIA_VISIBLE_DEVICES}
# and "$@" are written literally and only expanded when the container starts.
# The final `exec "$@"` replaces the wrapper with the container command so that
# command receives signals directly.
RUN printf '%s\n' \
        '#!/bin/bash' \
        'echo "Checking NVIDIA GPU status..."' \
        'if ! command -v nvidia-smi &> /dev/null; then' \
        '    echo "WARNING: nvidia-smi command not found. NVIDIA driver might not be installed."' \
        'else' \
        '    echo "NVIDIA driver found. Running nvidia-smi:"' \
        '    nvidia-smi' \
        'fi' \
        'echo "Environment variables for GPU:"' \
        'echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}"' \
        'echo "NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES}"' \
        'exec "$@"' \
        > /entrypoint.sh \
 && chmod +x /entrypoint.sh

# Copy the requirements file into the working directory ahead of the rest of
# the source.
# NOTE(review): the pip install step below lists its packages explicitly and
# never reads requirements.txt - confirm whether this file is still needed here
# or whether the install step should consume it.
COPY requirements.txt .

# Python dependencies. They are installed in a fixed sequence of separate pip
# invocations (rather than one combined install) to avoid version conflicts;
# every package is pinned and pip's download cache is disabled to keep the
# layer small.
RUN pip3 install --no-cache-dir --upgrade pip \
    # PyTorch + torchvision: CUDA 11.8 builds from the PyTorch wheel index
 && pip3 install --no-cache-dir \
        --extra-index-url https://download.pytorch.org/whl/cu118 \
        torch==2.0.1+cu118 \
        torchvision==0.15.2+cu118 \
    # Numerical / HTTP basics
 && pip3 install --no-cache-dir \
        numpy==1.24.3 \
        scipy==1.11.3 \
        requests==2.31.0 \
    # Pin typing-extensions before the packages that depend on it
 && pip3 install --no-cache-dir \
        typing-extensions==4.10.0 \
    # Hugging Face stack
 && pip3 install --no-cache-dir \
        transformers==4.37.2 \
        safetensors==0.4.1 \
        huggingface_hub==0.19.4 \
    # Vision model library
 && pip3 install --no-cache-dir \
        timm==0.9.11 \
    # Nested event loop support
 && pip3 install --no-cache-dir \
        nest-asyncio==1.5.8 \
    # accelerate first, then lmdeploy
 && pip3 install --no-cache-dir \
        accelerate==0.30.0 \
 && pip3 install --no-cache-dir \
        lmdeploy==0.5.3 \
    # Additional acceleration library
 && pip3 install --no-cache-dir \
        bitsandbytes==0.41.3 \
    # Web UI
 && pip3 install --no-cache-dir \
        gradio==3.38.0 \
    # Remaining utilities
 && pip3 install --no-cache-dir \
        packaging==23.2 \
        pyyaml==6.0.1 \
        tqdm==4.66.1 \
        openai==1.6.1

# Copy the entire build context into the working directory. This comes after
# the dependency installation so that source-only changes do not invalidate the
# (slow) pip layer above.
COPY . .

# Re-assert the writable runtime directories after the application copy:
# (re)create them relative to the working directory and make them world-writable.
RUN mkdir -p \
        gradio_cached_examples \
        .cache/huggingface/transformers \
 && chmod -R 777 \
        gradio_cached_examples \
        .cache

# Document port 7860 as the port the app listens on. EXPOSE is metadata only;
# the port must still be published (e.g. `docker run -p 7860:7860`) to be
# reachable from outside the container.
EXPOSE 7860

# Every container start goes through /entrypoint.sh, which prints GPU
# diagnostics and then exec's the arguments it was given.
ENTRYPOINT ["/entrypoint.sh"]

# Default arguments passed to the entrypoint: launch the application with
# python3. Both instructions use exec (JSON array) form, so no extra shell
# wraps the process; arguments given to `docker run` replace this CMD.
CMD ["python3", "app_internvl2.py"]