File size: 7,041 Bytes
69ad385 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 |
#!/usr/bin/env bash
#GPU驱动安装需要先将原有的显示关闭, 重启机器, 再进行安装.
#参考链接:
#https://blog.csdn.net/kingschan/article/details/19033595
#https://blog.csdn.net/HaixWang/article/details/90408538
#
#>>> yum install -y pciutils
#查看 linux 机器上是否有 GPU
#lspci |grep -i nvidia
#
#>>> lspci |grep -i nvidia
#00:08.0 3D controller: NVIDIA Corporation TU104GL [Tesla T4] (rev a1)
#
#
#NVIDIA 驱动程序下载
#先在 pytorch 上查看应该用什么 cuda 版本, 再安装对应的 cuda-toolkit cuda.
#再根据 gpu 版本下载安装对应的 nvidia 驱动
#
## pytorch 版本
#https://pytorch.org/get-started/locally/
#
## CUDA 下载 (好像不需要这个)
#https://developer.nvidia.com/cuda-toolkit-archive
#
## nvidia 驱动
#https://www.nvidia.cn/Download/index.aspx?lang=cn
#http://www.nvidia.com/Download/index.aspx
#
#在下方的下拉列表中进行选择,针对您的 NVIDIA 产品确定合适的驱动。
#产品类型:
#Data Center / Tesla
#产品系列:
#T-Series
#产品家族:
#Tesla T4
#操作系统:
#Linux 64-bit
#CUDA Toolkit:
#10.2
#语言:
#Chinese (Simpleified)
#
#
#>>> mkdir -p /data/tianxing
#>>> cd /data/tianxing
#>>> wget https://cn.download.nvidia.com/tesla/440.118.02/NVIDIA-Linux-x86_64-440.118.02.run
#>>> sh NVIDIA-Linux-x86_64-440.118.02.run
#
## 异常:
#ERROR: The Nouveau kernel driver is currently in use by your system. This driver is incompatible with the NVIDIA driver, and must be disabled before proceeding. Please consult the NVIDIA driver README and your
#Linux distribution's documentation for details on how to correctly disable the Nouveau kernel driver.
#[OK]
#
#For some distributions, Nouveau can be disabled by adding a file in the modprobe configuration directory. Would you like nvidia-installer to attempt to create this modprobe file for you?
#[NO]
#
#ERROR: Installation has failed. Please see the file '/var/log/nvidia-installer.log' for details. You may find suggestions on fixing installation problems in the README available on the Linux driver download
#page at www.nvidia.com.
#[OK]
#
## 参考链接:
#https://blog.csdn.net/kingschan/article/details/19033595
#
## 禁用原有的显卡驱动 nouveau
#>>> echo -e "blacklist nouveau\noptions nouveau modeset=0\n" > /etc/modprobe.d/blacklist-nouveau.conf
#>>> sudo dracut --force
## 重启
#>>> reboot
#
#>>> init 3
#>>> sh NVIDIA-Linux-x86_64-440.118.02.run
#
## 异常
#ERROR: Unable to find the kernel source tree for the currently running kernel. Please make sure you have installed the kernel source files for your kernel and that they are properly configured; on Red Hat Linux systems, for example, be sure you have the 'kernel-source' or 'kernel-devel' RPM installed. If you know the correct kernel source files are installed, you may specify the kernel source path with the '--kernel-source-path' command line option.
#[OK]
#ERROR: Installation has failed. Please see the file '/var/log/nvidia-installer.log' for details. You may find suggestions on fixing installation problems in the README available on the Linux driver download
#page at www.nvidia.com.
#[OK]
#
## 参考链接
## https://blog.csdn.net/HaixWang/article/details/90408538
#
#>>> uname -r
#3.10.0-1160.49.1.el7.x86_64
#>>> yum install kernel-devel kernel-headers -y
#>>> yum info kernel-devel kernel-headers
#>>> yum install -y "kernel-devel-uname-r == $(uname -r)"
#>>> yum -y distro-sync
#
#>>> sh NVIDIA-Linux-x86_64-440.118.02.run
#
## 安装成功
#WARNING: nvidia-installer was forced to guess the X library path '/usr/lib64' and X module path '/usr/lib64/xorg/modules'; these paths were not queryable from the system. If X fails to find the NVIDIA X driver
#module, please install the `pkg-config` utility and the X.Org SDK/development package for your distribution and reinstall the driver.
#[OK]
#Install NVIDIA's 32-bit compatibility libraries?
#[YES]
#Installation of the kernel module for the NVIDIA Accelerated Graphics Driver for Linux-x86_64 (version 440.118.02) is now complete.
#[OK]
#
#
## 查看 GPU 使用情况; watch -n 1 -d nvidia-smi 每1秒刷新一次.
#>>> nvidia-smi
#Thu Mar 9 12:00:37 2023
#+-----------------------------------------------------------------------------+
#| NVIDIA-SMI 440.118.02 Driver Version: 440.118.02 CUDA Version: 10.2 |
#|-------------------------------+----------------------+----------------------+
#| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
#| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
#|===============================+======================+======================|
#| 0 Tesla T4 Off | 00000000:00:08.0 Off | Off |
#| N/A 54C P0 22W / 70W | 0MiB / 16127MiB | 0% Default |
#+-------------------------------+----------------------+----------------------+
#
#+-----------------------------------------------------------------------------+
#| Processes: GPU Memory |
#| GPU PID Type Process name Usage |
#|=============================================================================|
#| No running processes found |
#+-----------------------------------------------------------------------------+
#
#
# params
stage=1
nvidia_driver_filename=https://cn.download.nvidia.com/tesla/440.118.02/NVIDIA-Linux-x86_64-440.118.02.run
# parse options
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
--*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
old_value="(eval echo \\$$name)";
if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval "${name}=\"$2\"";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
shift 2;
;;
*) break;
esac
done
echo "stage: ${stage}";
yum -y install wget
yum -y install sudo
if [ ${stage} -eq 0 ]; then
mkdir -p /data/dep
cd /data/dep || echo 1;
wget -P /data/dep ${nvidia_driver_filename}
echo -e "blacklist nouveau\noptions nouveau modeset=0\n" > /etc/modprobe.d/blacklist-nouveau.conf
sudo dracut --force
# 重启
reboot
elif [ ${stage} -eq 1 ]; then
init 3
yum install -y kernel-devel kernel-headers
yum info kernel-devel kernel-headers
yum install -y "kernel-devel-uname-r == $(uname -r)"
yum -y distro-sync
cd /data/dep || echo 1;
# 安装时, 需要回车三下.
sh NVIDIA-Linux-x86_64-440.118.02.run
nvidia-smi
fi
|