nalin0503 committed on
Commit
b5e1a79
·
1 Parent(s): 0ce9aad

try dockerfile, last

Browse files
Files changed (1) hide show
  1. Dockerfile +35 -55
Dockerfile CHANGED
@@ -1,4 +1,5 @@
1
- FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
 
2
 
3
  # Set environment variables
4
  ENV DEBIAN_FRONTEND=noninteractive
@@ -8,54 +9,51 @@ ENV TF_FORCE_GPU_ALLOW_GROWTH=true
8
 
9
  # Install system dependencies
10
  RUN apt-get update && apt-get install -y --no-install-recommends \
11
- git \
12
- wget \
13
- curl \
14
- ca-certificates \
15
- python3 \
16
- python3-pip \
17
- python3-dev \
18
- ffmpeg \
19
- libsm6 \
20
- libxext6 \
21
- libgl1-mesa-glx \
22
- && apt-get clean \
23
- && rm -rf /var/lib/apt/lists/*
 
 
 
 
 
24
 
25
  # Set working directory
26
  WORKDIR /app
27
 
28
- # Copy requirements.txt
29
  COPY requirements.txt /app/
30
-
31
- # Install Python dependencies with specific compatible versions
32
  RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel
33
- # Install TensorFlow with GPU support (compatible with CUDA 11.8)
34
- RUN pip3 install --no-cache-dir tensorflow==2.12.0
35
- # Install other dependencies but skip tensorflow (already installed)
36
  RUN pip3 install --no-cache-dir --no-deps -r requirements.txt
37
  RUN pip3 install --no-cache-dir tensorflow-hub==0.14.0
38
  RUN pip3 install --no-cache-dir opencv-python-headless opencv-contrib-python-headless
39
 
40
- # Copy application code
41
  COPY . /app/
42
 
43
- # Create a robust CPU fallback implementation
44
  RUN echo 'import tensorflow as tf\n\
45
  import os\n\
46
- \n\
47
- # Set TensorFlow logging level\n\
48
  os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"\n\
49
- \n\
50
- # Function to setup GPU with memory growth or fallback to CPU\n\
51
- def setup_tensorflow():\n\
52
  try:\n\
53
- # List physical devices\n\
54
  physical_devices = tf.config.list_physical_devices("GPU")\n\
55
- if len(physical_devices) > 0:\n\
56
  print(f"Found {len(physical_devices)} GPU(s)")\n\
57
  for device in physical_devices:\n\
58
- # Allow memory growth to avoid allocating all GPU memory at once\n\
59
  tf.config.experimental.set_memory_growth(device, True)\n\
60
  print(f"Enabled memory growth for {device}")\n\
61
  else:\n\
@@ -63,56 +61,38 @@ def setup_tensorflow():\n\
63
  except Exception as e:\n\
64
  print(f"Error setting up TensorFlow: {e}")\n\
65
  print("Disabling GPU and falling back to CPU")\n\
66
- # Force CPU usage if there was an error with GPU setup\n\
67
  os.environ["CUDA_VISIBLE_DEVICES"] = "-1"\n\
68
  \n\
69
- # Call the setup function\n\
70
- setup_tensorflow()\n\
71
- ' > /app/tf_setup.py
72
 
73
- # Modify FILM.py to properly handle CPU fallback
74
  RUN if [ -f "/app/FILM.py" ]; then \
75
- # Import our setup at the top of the file\
76
  sed -i '1s/^/import tensorflow as tf\nfrom tf_setup import setup_tensorflow\n/' /app/FILM.py && \
77
- # Add GPU check and CPU fallback in __init__\
78
  sed -i '/def __init__/a\ # Check if GPU is disabled and use CPU if needed\n if "CUDA_VISIBLE_DEVICES" in os.environ and os.environ["CUDA_VISIBLE_DEVICES"] == "-1":\n print("GPU is disabled, using CPU for FILM")\n self._device = "/cpu:0"\n else:\n self._device = "/gpu:0"\n print(f"FILM will use device: {self._device}")' /app/FILM.py && \
79
- # Add device context to __call__\
80
  sed -i '/def __call__/a\ with tf.device(self._device):' /app/FILM.py && \
81
- # Fix the model call indentation after adding the with statement\
82
  sed -i 's/ result = self._model/ try:\n result = self._model/g' /app/FILM.py && \
83
- sed -i '/result = self._model/a\ except Exception as e:\n print(f"Error during model inference: {e}, trying CPU fallback")\n with tf.device("/cpu:0"):\n result = self._model(inputs, training=False)' /app/FILM.py; \
84
- # Make sure os is imported if not already\
85
  sed -i '1s/^/import os\n/' /app/FILM.py; \
86
  fi
87
 
88
- # Set environment variables for GPU compatibility
89
- ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}
90
- ENV PATH=/usr/local/cuda/bin:${PATH}
91
- ENV CUDA_VISIBLE_DEVICES=0
92
- ENV TF_FORCE_GPU_ALLOW_GROWTH=true
93
-
94
- # Create a startup script with proper error handling
95
  RUN echo '#!/bin/bash\n\
96
  set -e\n\
97
  \n\
98
- # Check CUDA and cuDNN status\n\
99
  echo "CUDA libraries:"\n\
100
  ldconfig -p | grep cuda\n\
101
  echo "cuDNN libraries:"\n\
102
  ldconfig -p | grep cudnn\n\
103
  \n\
104
- # Test TensorFlow GPU\n\
105
  python3 -c "import tensorflow as tf; print(\\"Num GPUs Available: \\", len(tf.config.list_physical_devices(\\"GPU\\")))" || {\n\
106
  echo "TensorFlow GPU test failed, falling back to CPU"\n\
107
  export CUDA_VISIBLE_DEVICES=-1\n\
108
  }\n\
109
  \n\
110
- # Run the app with proper error handling\n\
111
- exec streamlit run app.py --server.port=8501 --server.address=0.0.0.0\n\
112
- ' > /app/start.sh && chmod +x /app/start.sh
113
 
114
- # Expose port for Streamlit
115
  EXPOSE 8501
116
 
117
- # Use the startup script
118
  CMD ["/app/start.sh"]
 
1
+ # Use a CUDA base image without preinstalled cuDNN to avoid conflicts
2
+ FROM nvidia/cuda:12.3.2-devel-ubuntu22.04
3
 
4
  # Set environment variables
5
  ENV DEBIAN_FRONTEND=noninteractive
 
9
 
10
  # Install system dependencies
11
  RUN apt-get update && apt-get install -y --no-install-recommends \
12
+ git wget curl ca-certificates \
13
+ python3 python3-pip python3-dev \
14
+ ffmpeg libsm6 libxext6 libgl1-mesa-glx && \
15
+ apt-get clean && rm -rf /var/lib/apt/lists/*
16
+
17
# --- Download and install cuDNN 9.3.0 ---
# NOTE: the cuDNN "redist" tarballs ship include/ and lib/ at the archive's
# top level (the legacy cuda/include + cuda/lib64 layout was dropped in the
# redist repackaging), so copy from those paths — the old cuda/* paths do not
# exist in this archive and the cp commands would fail the build.
# ldconfig at the end refreshes the dynamic-linker cache so the freshly
# copied libcudnn* are resolvable (the start script greps `ldconfig -p`).
RUN wget -O /tmp/cudnn-linux-x86_64-9.3.0.75_cuda12-archive.tar.xz https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-9.3.0.75_cuda12-archive.tar.xz && \
    tar -xJf /tmp/cudnn-linux-x86_64-9.3.0.75_cuda12-archive.tar.xz -C /tmp && \
    cp -P /tmp/cudnn-linux-x86_64-9.3.0.75_cuda12-archive/include/cudnn*.h /usr/local/cuda/include && \
    cp -P /tmp/cudnn-linux-x86_64-9.3.0.75_cuda12-archive/lib/libcudnn* /usr/local/cuda/lib64 && \
    chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn* && \
    ldconfig && \
    rm -rf /tmp/cudnn-linux-x86_64-9.3.0.75_cuda12-archive.tar.xz /tmp/cudnn-linux-x86_64-9.3.0.75_cuda12-archive
25
+
26
+ # Set environment variables for CUDA/cuDNN libraries
27
+ ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}
28
+ ENV PATH=/usr/local/cuda/bin:${PATH}
29
+ ENV CUDA_VISIBLE_DEVICES=0
30
 
31
  # Set working directory
32
  WORKDIR /app
33
 
34
+ # Copy requirements and install Python dependencies
35
  COPY requirements.txt /app/
 
 
36
  RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel
37
+ # Install TensorFlow GPU support (using version 2.15.0 here for compatibility)
+ # NOTE(review): TF 2.15 wheels are built against cuDNN 8.9, but this image
+ # installs cuDNN 9.3 above — confirm the GPU is actually detected at runtime
+ # (the start script's fallback will otherwise silently drop to CPU).
38
+ RUN pip3 install --no-cache-dir tensorflow==2.15.0
39
+ # Install the remaining packages from requirements.txt (skip dependency resolution)
+ # NOTE(review): --no-deps assumes requirements.txt pins every transitive
+ # dependency; anything unpinned there will simply be missing — verify.
40
  RUN pip3 install --no-cache-dir --no-deps -r requirements.txt
41
  RUN pip3 install --no-cache-dir tensorflow-hub==0.14.0
42
  RUN pip3 install --no-cache-dir opencv-python-headless opencv-contrib-python-headless
43
 
44
+ # Copy the application code
45
  COPY . /app/
46
 
47
+ # Create a CPU fallback setup for TensorFlow
48
  RUN echo 'import tensorflow as tf\n\
49
  import os\n\
 
 
50
  os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"\n\
51
+ \ndef setup_tensorflow():\n\
 
 
52
  try:\n\
 
53
  physical_devices = tf.config.list_physical_devices("GPU")\n\
54
+ if physical_devices:\n\
55
  print(f"Found {len(physical_devices)} GPU(s)")\n\
56
  for device in physical_devices:\n\
 
57
  tf.config.experimental.set_memory_growth(device, True)\n\
58
  print(f"Enabled memory growth for {device}")\n\
59
  else:\n\
 
61
  except Exception as e:\n\
62
  print(f"Error setting up TensorFlow: {e}")\n\
63
  print("Disabling GPU and falling back to CPU")\n\
 
64
  os.environ["CUDA_VISIBLE_DEVICES"] = "-1"\n\
65
  \n\
66
+ setup_tensorflow()\n' > /app/tf_setup.py
 
 
67
 
68
+ # Patch FILM.py to ensure proper GPU/CPU fallback, if the file exists
+ # (sed 1: prepend the tf + tf_setup imports; sed 2: in __init__, pick
+ # "/cpu:0" when CUDA_VISIBLE_DEVICES=="-1", else "/gpu:0"; sed 3: wrap
+ # __call__ in tf.device(self._device); seds 4+5: turn the model call into a
+ # try/except that retries once on "/cpu:0"; sed 6: prepend "import os".
+ # NOTE(review): the injected snippets hard-code their indentation — verify
+ # it matches FILM.py's actual layout, or the patched file won't parse.)
69
  RUN if [ -f "/app/FILM.py" ]; then \

70
  sed -i '1s/^/import tensorflow as tf\nfrom tf_setup import setup_tensorflow\n/' /app/FILM.py && \

71
  sed -i '/def __init__/a\ # Check if GPU is disabled and use CPU if needed\n if "CUDA_VISIBLE_DEVICES" in os.environ and os.environ["CUDA_VISIBLE_DEVICES"] == "-1":\n print("GPU is disabled, using CPU for FILM")\n self._device = "/cpu:0"\n else:\n self._device = "/gpu:0"\n print(f"FILM will use device: {self._device}")' /app/FILM.py && \

72
  sed -i '/def __call__/a\ with tf.device(self._device):' /app/FILM.py && \

73
  sed -i 's/ result = self._model/ try:\n result = self._model/g' /app/FILM.py && \
74
+ sed -i '/result = self._model/a\ except Exception as e:\n print(f"Error during model inference: {e}, trying CPU fallback")\n with tf.device("/cpu:0"):\n result = self._model(inputs, training=False)' /app/FILM.py && \

75
  sed -i '1s/^/import os\n/' /app/FILM.py; \
76
  fi
77
 
78
# Create a startup script that checks CUDA/cuDNN status and launches Streamlit.
# The library listings are informational only: under "set -e" a pipeline takes
# grep's exit status, so a non-matching grep would kill the container before
# the CPU-fallback logic could run — hence the "|| true" guards.
RUN echo '#!/bin/bash\n\
set -e\n\
\n\
echo "CUDA libraries:"\n\
ldconfig -p | grep cuda || true\n\
echo "cuDNN libraries:"\n\
ldconfig -p | grep cudnn || true\n\
\n\
python3 -c "import tensorflow as tf; print(\\"Num GPUs Available: \\", len(tf.config.list_physical_devices(\\"GPU\\")))" || {\n\
echo "TensorFlow GPU test failed, falling back to CPU"\n\
export CUDA_VISIBLE_DEVICES=-1\n\
}\n\
\n\
exec streamlit run app.py --server.port=8501 --server.address=0.0.0.0\n' > /app/start.sh && chmod +x /app/start.sh
 
 
93
 
94
+ # Expose the port for Streamlit
95
  EXPOSE 8501
96
 
97
+ # Use the startup script as the container's entrypoint
98
  CMD ["/app/start.sh"]