Installing the NVIDIA Driver and CUDA on Alibaba Cloud GPU Compute ECS Instances

Notes

  • In principle this also applies to GPU compute instances on other cloud platforms, or to self-built GPU servers
  • Alibaba Cloud instance type: GPU compute gn5
  • GPU model: Nvidia Tesla P100
  • Operating system: CentOS-7.7-1908 64-bit
  • Nvidia driver version: 390.116
  • CUDA Driver version: 9.1.85
  • CUDA Runtime version: 9.0.176
  • cuDNN version: v7.6.0.64
  • For Nvidia GPU compatibility, see this article
  • Docker version: 19.03.5
  • Choose GPU driver, CUDA, and cuDNN versions according to your own project's requirements!

Manual Installation

Environment Preparation

Disable SELinux

sed -i 's,^SELINUX=.*,SELINUX=disabled,g' /etc/selinux/config
setenforce 0

Disable the firewall

systemctl disable --now firewalld.service

Add sysctl parameters

fs parameters

cat > /etc/sysctl.d/99-fs.conf <<EOF
# Maximum number of file handles, system-wide
fs.file-max=1048576
# Maximum number of open file descriptors per process
fs.nr_open=1048576
# Maximum number of concurrent asynchronous I/O requests
fs.aio-max-nr=1048576
# CentOS 7.4 introduced a new parameter controlling kernel behavior:
# /proc/sys/fs/may_detach_mounts defaults to 0.
# Set it to 1 when the system runs containers.
fs.may_detach_mounts=1
EOF

vm parameters

cat > /etc/sysctl.d/99-vm.conf <<EOF
# Only fall back to swap when memory is nearly exhausted
vm.swappiness=10
# On OOM, do not panic; let the OOM killer kill the process with the highest oom_score
vm.panic_on_oom=0
# Allow memory overcommit
vm.overcommit_memory=1
# Maximum number of memory map areas a process may have (default 65536)
vm.max_map_count=262144
EOF

net parameters

cat > /etc/sysctl.d/99-net.conf <<EOF
# Pass bridged (layer-2) traffic through iptables FORWARD rules as well
net.bridge.bridge-nf-call-arptables=1
net.bridge.bridge-nf-call-iptables=1
net.bridge.bridge-nf-call-ip6tables=1
# Disable strict reverse-path filtering of packets (default 1)
net.ipv4.conf.default.rp_filter=0
net.ipv4.conf.all.rp_filter=0
# Datagram queue length for Unix domain sockets (default 100)
net.unix.max_dgram_qlen=512
# Upper limit on conntrack table entries
net.netfilter.nf_conntrack_max=1048576
# Timeout for TIME_WAIT entries in the conntrack table
net.netfilter.nf_conntrack_tcp_timeout_timewait=30
# Timeout for established TCP connections in the conntrack table
net.netfilter.nf_conntrack_tcp_timeout_established=1200
# Maximum listen backlog per port
net.core.somaxconn=21644
# Queue length for packets received by the NIC but not yet processed by the kernel stack
net.core.netdev_max_backlog=262144
# TCP memory thresholds: no pressure / enter pressure mode / maximum, in pages
#net.ipv4.tcp_mem=1541646 2055528 3083292
# TCP socket receive buffer in bytes: min/default/max
net.ipv4.tcp_rmem=4096 65536 8388608
# TCP socket send buffer in bytes: min/default/max
net.ipv4.tcp_wmem=4096 65536 8388608
# Enable automatic receive buffer tuning
net.ipv4.tcp_moderate_rcvbuf=1
# BBR congestion control (built into the Linux kernel since 4.9)
#net.ipv4.tcp_congestion_control=bbr
#net.core.default_qdisc=fq
# Range of local ports used for outgoing TCP connections
net.ipv4.ip_local_port_range=10000 65000
# Enable IPv4 packet forwarding
net.ipv4.ip_forward=1
# Allow applications to bind to addresses not assigned to a local interface
net.ipv4.ip_nonlocal_bind=1
# Maximum number of TCP connections in SYN_RECV state
net.ipv4.tcp_max_syn_backlog=16384
# Maximum number of TIME_WAIT sockets kept by the kernel
net.ipv4.tcp_max_tw_buckets=5000
# Number of SYN/ACK retransmissions
net.ipv4.tcp_synack_retries=2
# Fast recycling of TIME_WAIT sockets (0 = disabled; the option was removed in kernel 4.12+)
net.ipv4.tcp_tw_recycle=0
# Maximum number of orphaned TCP sockets; beyond this they are reset and a warning is logged
net.ipv4.tcp_max_orphans=1024
# Number of FIN retransmissions for orphaned connections
net.ipv4.tcp_orphan_retries=8
# Shorten the time connections spend in FIN_WAIT2
net.ipv4.tcp_fin_timeout=15
# Idle time before TCP keepalive probes start (default 7200)
net.ipv4.tcp_keepalive_time=600
# Interval between TCP keepalive probes
net.ipv4.tcp_keepalive_intvl=30
# Number of keepalive probes before the connection is dropped
net.ipv4.tcp_keepalive_probes=10
# TCP Fast Open
# 0: disabled; 1: enabled as client; 2: enabled as server; 3: enabled as both client and server
net.ipv4.tcp_fastopen=3
# Retransmissions before reporting a suspected problem to the network layer
net.ipv4.tcp_retries1=3
# Retransmissions before giving up and closing the TCP connection
net.ipv4.tcp_retries2=15
EOF
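
The drop-in files above do not take effect until loaded; on the target host that is `sysctl --system` (run as root), which reads every `/etc/sysctl.d/*.conf`. A minimal spot-check sketch, shown against a throwaway copy of one key so it runs without root:

```shell
# Sketch: on the real host run `sysctl --system`, then spot-check a value
# with `sysctl -n net.core.somaxconn`. Here we parse a throwaway copy instead.
tmpconf=$(mktemp)
printf 'net.core.somaxconn=21644\n' > "$tmpconf"
awk -F= '$1 == "net.core.somaxconn" { print $2 }' "$tmpconf"   # prints: 21644
rm -f "$tmpconf"
```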

Modify limits parameters

cat > /etc/security/limits.d/99-centos.conf <<EOF
* - nproc 1048576
* - nofile 1048576
EOF
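
For reference, each line in a limits drop-in has four fields: domain, type, item, value; `*` applies to all users and `-` sets both the soft and the hard limit. A small illustration reading the fields of one of the lines just written:

```shell
# Fields of a limits.conf line: domain type item value
# "*" = all users; "-" = set both soft and hard limits
line="* - nofile 1048576"
echo "$line" | awk '{print "item=" $3, "value=" $4}'   # prints: item=nofile value=1048576
```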

Modify journald settings

sed -e 's,^#Compress=yes,Compress=yes,' \
-e 's,^#SystemMaxUse=,SystemMaxUse=2G,' \
-e 's,^#Seal=yes,Seal=yes,' \
-e 's,^#RateLimitBurst=1000,RateLimitBurst=5000,' \
-i /etc/systemd/journald.conf

Modify history settings

cat > /etc/profile.d/history.sh <<EOF
export HISTSIZE=10000
export HISTFILESIZE=10000
export HISTCONTROL=ignoredups
export HISTTIMEFORMAT="\`whoami\` %F %T "
export HISTIGNORE="ls:pwd:ll:ls -l:ls -a:ll -a"
EOF
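
`HISTTIMEFORMAT` uses strftime specifiers, the same ones GNU `date` accepts, so you can preview what the timestamp will look like (the fixed date below is only for illustration):

```shell
# %F = YYYY-MM-DD and %T = HH:MM:SS, the same specifiers used in HISTTIMEFORMAT above
date -d '2020-01-01 11:08:00' '+%F %T'   # prints: 2020-01-01 11:08:00
```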

Update packages

yum update -y

Install the build environment

yum groups install -y base 'Development Tools'

Install common tools

yum install -y nc \
git \
vim \
ipvsadm \
tree \
dstat \
iotop \
htop \
socat \
ipset \
conntrack \
bash-completion-extras \
tcpdump \
wireshark \
bcc-tools \
perf \
trace-cmd \
systemtap \
nethogs \
lshw

Check the GPU

View PCI devices

lspci | grep -i nvidia

Sample output

00:08.0 3D controller: NVIDIA Corporation GP100GL [Tesla P100 PCIe 16GB] (rev a1)
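
If you only need the PCI address (for example to feed to other tools), the first field of the lspci line is enough. A quick sketch using the sample line above:

```shell
# Extract the PCI address from an lspci line (sample taken from the output above)
line="00:08.0 3D controller: NVIDIA Corporation GP100GL [Tesla P100 PCIe 16GB] (rev a1)"
echo "$line" | awk '{print $1}'   # prints: 00:08.0
```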

View hardware information

lshw -numeric -C display

Sample output

*-display:0
     description: VGA compatible controller
     product: GD 5446 [1013:B8]
     vendor: Cirrus Logic [1013]
     physical id: 2
     bus info: pci@0000:00:02.0
     version: 00
     width: 32 bits
     clock: 33MHz
     capabilities: vga_controller rom
     configuration: driver=cirrus latency=0
     resources: irq:0 memory:fa000000-fbffffff memory:fe850000-fe850fff memory:fe840000-fe84ffff
*-display:1
     description: 3D controller
     product: GP100GL [Tesla P100 PCIe 16GB] [10DE:15F8]
     vendor: NVIDIA Corporation [10DE]
     physical id: 8
     bus info: pci@0000:00:08.0
     logical name: /dev/fb0
     version: a1
     width: 64 bits
     clock: 33MHz
     capabilities: pm msi pciexpress bus_master cap_list fb
     configuration: depth=16 driver=nvidia latency=0 mode=1024x768 visual=truecolor xres=1024 yres=768
     resources: iomemory:100-ff iomemory:140-13f irq:11 memory:fd000000-fdffffff memory:1000000000-13ffffffff memory:1400000000-1401ffffff

Disable the nouveau driver

echo "blacklist nouveau" > /etc/modprobe.d/blacklist-nouveau.conf
echo "options nouveau modeset=0" >> /etc/modprobe.d/blacklist-nouveau.conf
rmmod nouveau
dracut --force

Reboot

reboot

Install the GPU driver

Visit the Nvidia website

Nvidia driver downloads

Download the driver

Via the web page

  1. Product Type: Tesla
  2. Product Series: P-Series
  3. Product Family: Tesla P100
  4. Operating System: Linux 64-bit RHEL7
  5. CUDA Toolkit: 9.1
  6. Click Search to go to the driver download page
  7. Click Download to go to the download confirmation page
  8. Click Download again to save the driver file

Via the command line

wget -t 10 --timeout=10 http://cn.download.nvidia.com/tesla/390.116/nvidia-diag-driver-local-repo-rhel7-390.116-1.0-1.x86_64.rpm

Install the driver

yum install nvidia-diag-driver-local-repo-rhel7-390.116-1.0-1.x86_64.rpm
yum clean all
yum install cuda-drivers-390.116-1.x86_64

Reboot

reboot

Verify the driver

Check that the open-source nouveau driver is not loaded

This command should produce no output!

lsmod | grep nouveau

Check the driver version

cat /proc/driver/nvidia/version

Sample output

NVRM version: NVIDIA UNIX x86_64 Kernel Module  390.116  Sun Jan 27 07:21:36 PST 2019
GCC version: gcc version 4.8.5 20150623 (Red Hat 4.8.5-39) (GCC)

View the GPU summary

nvidia-smi

Sample output

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 390.116 Driver Version: 390.116 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla P100-PCIE... Off | 00000000:00:08.0 Off | 0 |
| N/A 32C P0 27W / 250W | 0MiB / 16280MiB | 3% Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+

View detailed GPU information

nvidia-smi -i 0 -q

Sample output

==============NVSMI LOG==============

Driver Version : 390.116
CUDA Version : 9.1

Attached GPUs : 1
GPU 00000000:00:08.0
Product Name : Tesla P100-PCIE-16GB
Product Brand : Tesla
Display Mode : Enabled
Display Active : Disabled
Persistence Mode : Enabled
Accounting Mode : Disabled
Accounting Mode Buffer Size : 4000
Driver Model
Current : N/A
Pending : N/A
Serial Number : 0322818005606
GPU UUID : GPU-81662b67-e58c-d9b6-763f-8adc65071cba
Minor Number : 0
VBIOS Version : 86.00.4D.00.01
MultiGPU Board : No
Board ID : 0x8
GPU Part Number : 900-2H400-0000-000
Inforom Version
Image Version : H400.0201.00.08
OEM Object : 1.1
ECC Object : 4.1
Power Management Object : N/A
GPU Operation Mode
Current : N/A
Pending : N/A
GPU Virtualization Mode
Virtualization mode : Pass-Through
IBMNPU
Relaxed Ordering Mode : N/A
PCI
Bus : 0x00
Device : 0x08
Domain : 0x0000
Device Id : 0x15F810DE
Bus Id : 00000000:00:08.0
Sub System Id : 0x118F10DE
GPU Link Info
PCIe Generation
Max : 3
Current : 3
Link Width
Max : 16x
Current : 16x
Bridge Chip
Type : N/A
Firmware : N/A
Replays since reset : 0
Tx Throughput : 0 KB/s
Rx Throughput : 0 KB/s
Fan Speed : N/A
Performance State : P0
Clocks Throttle Reasons
Idle : Not Active
Applications Clocks Setting : Not Active
SW Power Cap : Not Active
HW Slowdown : Not Active
HW Thermal Slowdown : Not Active
HW Power Brake Slowdown : Not Active
Sync Boost : Not Active
SW Thermal Slowdown : Not Active
Display Clock Setting : Not Active
FB Memory Usage
Total : 16280 MiB
Used : 8133 MiB
Free : 8147 MiB
BAR1 Memory Usage
Total : 16384 MiB
Used : 2 MiB
Free : 16382 MiB
Compute Mode : Default
Utilization
Gpu : 0 %
Memory : 0 %
Encoder : 0 %
Decoder : 0 %
Encoder Stats
Active Sessions : 0
Average FPS : 0
Average Latency : 0
FBC Stats
Active Sessions : 0
Average FPS : 0
Average Latency : 0
Ecc Mode
Current : Enabled
Pending : Enabled
ECC Errors
Volatile
Single Bit
Device Memory : 0
Register File : 0
L1 Cache : N/A
L2 Cache : 0
Texture Memory : 0
Texture Shared : 0
CBU : N/A
Total : 0
Double Bit
Device Memory : 0
Register File : 0
L1 Cache : N/A
L2 Cache : 0
Texture Memory : 0
Texture Shared : 0
CBU : N/A
Total : 0
Aggregate
Single Bit
Device Memory : 0
Register File : 0
L1 Cache : N/A
L2 Cache : 0
Texture Memory : 0
Texture Shared : 0
CBU : N/A
Total : 0
Double Bit
Device Memory : 0
Register File : 0
L1 Cache : N/A
L2 Cache : 0
Texture Memory : 0
Texture Shared : 0
CBU : N/A
Total : 0
Retired Pages
Single Bit ECC : 0
Double Bit ECC : 0
Pending Page Blacklist : No
Temperature
GPU Current Temp : 34 C
GPU Shutdown Temp : 85 C
GPU Slowdown Temp : 82 C
GPU Max Operating Temp : N/A
Memory Current Temp : N/A
Memory Max Operating Temp : N/A
Power Readings
Power Management : Supported
Power Draw : 31.01 W
Power Limit : 250.00 W
Default Power Limit : 250.00 W
Enforced Power Limit : 250.00 W
Min Power Limit : 125.00 W
Max Power Limit : 250.00 W
Clocks
Graphics : 1189 MHz
SM : 1189 MHz
Memory : 715 MHz
Video : 1063 MHz
Applications Clocks
Graphics : 1189 MHz
Memory : 715 MHz
Default Applications Clocks
Graphics : 1189 MHz
Memory : 715 MHz
Max Clocks
Graphics : 1328 MHz
SM : 1328 MHz
Memory : 715 MHz
Video : 1328 MHz
Max Customer Boost Clocks
Graphics : 1328 MHz
Clock Policy
Auto Boost : N/A
Auto Boost Default : N/A
Processes

Disable the local YUM repo

yum-config-manager --disable nvidia-diag-driver-local-390.116

Install the CUDA environment

Add the CUDA repo

yum-config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo

Point the CUDA repo at the Aliyun mirror

sed -e 's,developer.download.nvidia.cn/compute/cuda/repos,mirrors.aliyun.com/nvidia-cuda,g' \
-e 's,developer.download.nvidia.com/compute/cuda/repos,mirrors.aliyun.com/nvidia-cuda,g' \
-i /etc/yum.repos.d/cuda-rhel7.repo

Install CUDA

yum makecache
yum install cuda-9-0

Set environment variables

echo 'export PATH=/usr/local/cuda/bin:$PATH' >> /etc/profile.d/cuda.sh
echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> /etc/profile.d/cuda.sh
source /etc/profile.d/cuda.sh
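
The profile script simply prepends the CUDA directories, so its `nvcc` and libraries win over any others on the system; the effect on PATH can be illustrated without a CUDA install:

```shell
# Simulate the export from cuda.sh: /usr/local/cuda/bin ends up first on PATH
PATH="/usr/local/cuda/bin:$PATH"
echo "$PATH" | cut -d: -f1   # prints: /usr/local/cuda/bin
```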

Test CUDA

Check the installation path

ls -ld /usr/local/cuda*

Sample output

lrwxrwxrwx  1 root root    8 Jan  1 11:08 /usr/local/cuda -> cuda-9.0
drwxr-xr-x 15 root root 4096 Jan 1 11:08 /usr/local/cuda-9.0

Check the CUDA version

nvcc --version

Sample output

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2017 NVIDIA Corporation
Built on Fri_Sep__1_21:08:03_CDT_2017
Cuda compilation tools, release 9.0, V9.0.176

Run the samples

If both of the following tests report Result = PASS, CUDA is installed correctly.

deviceQuery
Change directory
cd /usr/local/cuda/samples/1_Utilities/deviceQuery
Build and run
make && ./deviceQuery
Sample output
./deviceQuery Starting...

CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "Tesla P100-PCIE-16GB"
CUDA Driver Version / Runtime Version 9.1 / 9.0
CUDA Capability Major/Minor version number: 6.0
Total amount of global memory: 16281 MBytes (17071734784 bytes)
(56) Multiprocessors, ( 64) CUDA Cores/MP: 3584 CUDA Cores
GPU Max Clock rate: 1329 MHz (1.33 GHz)
Memory Clock rate: 715 Mhz
Memory Bus Width: 4096-bit
L2 Cache Size: 4194304 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 2 copy engine(s)
Run time limit on kernels: No
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Enabled
Device supports Unified Addressing (UVA): Yes
Supports Cooperative Kernel Launch: Yes
Supports MultiDevice Co-op Kernel Launch: Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 0 / 8
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 9.1, CUDA Runtime Version = 9.0, NumDevs = 1
Result = PASS
bandwidthTest
Change directory
cd /usr/local/cuda/samples/1_Utilities/bandwidthTest
Build and run
make && ./bandwidthTest
Sample output
[CUDA Bandwidth Test] - Starting...
Running on...

Device 0: Tesla P100-PCIE-16GB
Quick Mode

Host to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(MB/s)
33554432 10731.5

Device to Host Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(MB/s)
33554432 12833.8

Device to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(MB/s)
33554432 499641.4

Result = PASS

NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

Disable the CUDA repo

yum-config-manager --disable cuda

Install cuDNN

Find the cuDNN download URL

Option 1

Visit the cuDNN Archive and download after logging in

Option 2

Visit Nvidia's container project hosted on Gitlab, find the Dockerfile matching your OS version, and extract the cuDNN download URL from it

Download cuDNN

Here we use cuDNN 7.6.0.64, the latest version available for CUDA 9.0

wget -O - https://developer.download.nvidia.cn/compute/redist/cudnn/v7.6.0/cudnn-9.0-linux-x64-v7.6.0.64.tgz | tar xz

On Alibaba Cloud servers you can download directly over the Aliyun internal network

baseurl=$(curl -sSL http://100.100.100.200/latest/meta-data/source-address | head -1)
download_url="${baseurl}/opsx/ecs/linux/binary"
cuda_big_version=9.0
cudnn_version=7.6.0
cudnn_file="cudnn-"${cuda_big_version}"-linux-x64-v"${cudnn_version}".tgz"
wget -O - "${download_url}/nvidia/cudnn/${cuda_big_version}/${cudnn_file}" | tar xz
  • cuda_big_version can be queried with:
curl -sSL "$(curl -sSL http://100.100.100.200/latest/meta-data/source-address | head -1)/opsx/ecs/linux/binary/nvidia/cudnn"
  • cudnn_version can be queried with:
curl -sSL "$(curl -sSL http://100.100.100.200/latest/meta-data/source-address | head -1)/opsx/ecs/linux/binary/nvidia/cudnn/${cuda_big_version}"
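
The variables above just assemble the archive filename; a quick dry run of the composition with the same values used in this document:

```shell
# The download filename is derived from the CUDA major version and the cuDNN version
cuda_big_version=9.0
cudnn_version=7.6.0
cudnn_file="cudnn-${cuda_big_version}-linux-x64-v${cudnn_version}.tgz"
echo "$cudnn_file"   # prints: cudnn-9.0-linux-x64-v7.6.0.tgz
```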

Install cuDNN

cp cuda/include/cudnn.h /usr/local/cuda-9.0/include
cp cuda/lib64/libcudnn.so.7.6.0 cuda/lib64/libcudnn_static.a /usr/local/cuda-9.0/lib64
cd /usr/local/cuda-9.0/lib64
ln -sv libcudnn.so.7.6.0 libcudnn.so
ln -sv libcudnn.so.7.6.0 libcudnn.so.7
ldconfig -v

Verify cuDNN

git clone --depth=1 https://github.com/wilhelmguo/cudnn_samples_v7.git
cd cudnn_samples_v7/mnistCUDNN
make && ./mnistCUDNN

Sample output

cudnnGetVersion() : 7600 , CUDNN_VERSION from cudnn.h : 7600 (7.6.0)
Host compiler version : GCC 4.8.5
There are 1 CUDA capable devices on your machine :
device 0 : sms 56 Capabilities 6.0, SmClock 1328.5 Mhz, MemSize (Mb) 16280, MemClock 715.0 Mhz, Ecc=1, boardGroupID=0
Using device 0

Testing single precision
Loading image data/one_28x28.pgm
Performing forward propagation ...
Testing cudnnGetConvolutionForwardAlgorithm ...
Fastest algorithm is Algo 1
Testing cudnnFindConvolutionForwardAlgorithm ...
^^^^ CUDNN_STATUS_SUCCESS for Algo 0: 0.040256 time requiring 0 memory
^^^^ CUDNN_STATUS_SUCCESS for Algo 1: 0.048864 time requiring 3464 memory
^^^^ CUDNN_STATUS_SUCCESS for Algo 2: 0.066016 time requiring 57600 memory
^^^^ CUDNN_STATUS_SUCCESS for Algo 7: 0.090944 time requiring 2057744 memory
^^^^ CUDNN_STATUS_SUCCESS for Algo 5: 0.115776 time requiring 203008 memory
Resulting weights from Softmax:
0.0000000 0.9999399 0.0000000 0.0000000 0.0000561 0.0000000 0.0000012 0.0000017 0.0000010 0.0000000
Loading image data/three_28x28.pgm
Performing forward propagation ...
Resulting weights from Softmax:
0.0000000 0.0000000 0.0000000 0.9999288 0.0000000 0.0000711 0.0000000 0.0000000 0.0000000 0.0000000
Loading image data/five_28x28.pgm
Performing forward propagation ...
Resulting weights from Softmax:
0.0000000 0.0000008 0.0000000 0.0000002 0.0000000 0.9999820 0.0000154 0.0000000 0.0000012 0.0000006

Result of classification: 1 3 5

Test passed!

Testing half precision (math in single precision)
Loading image data/one_28x28.pgm
Performing forward propagation ...
Testing cudnnGetConvolutionForwardAlgorithm ...
Fastest algorithm is Algo 1
Testing cudnnFindConvolutionForwardAlgorithm ...
^^^^ CUDNN_STATUS_SUCCESS for Algo 0: 0.045184 time requiring 0 memory
^^^^ CUDNN_STATUS_SUCCESS for Algo 1: 0.051328 time requiring 3464 memory
^^^^ CUDNN_STATUS_SUCCESS for Algo 2: 0.067872 time requiring 28800 memory
^^^^ CUDNN_STATUS_SUCCESS for Algo 7: 0.087776 time requiring 2057744 memory
^^^^ CUDNN_STATUS_SUCCESS for Algo 4: 0.112032 time requiring 207360 memory
Resulting weights from Softmax:
0.0000001 1.0000000 0.0000001 0.0000000 0.0000563 0.0000001 0.0000012 0.0000017 0.0000010 0.0000001
Loading image data/three_28x28.pgm
Performing forward propagation ...
Resulting weights from Softmax:
0.0000000 0.0000000 0.0000000 1.0000000 0.0000000 0.0000714 0.0000000 0.0000000 0.0000000 0.0000000
Loading image data/five_28x28.pgm
Performing forward propagation ...
Resulting weights from Softmax:
0.0000000 0.0000008 0.0000000 0.0000002 0.0000000 1.0000000 0.0000154 0.0000000 0.0000012 0.0000006

Result of classification: 1 3 5

Test passed!

Configure the GPU

Manual configuration

Enable Persistence Mode

  • On Linux, Persistence Mode keeps the Nvidia GPU driver loaded even while the GPU is idle.
  • This speeds up workloads that run frequent short jobs by avoiding repeated driver initialization
  • It increases idle power consumption
nvidia-persistenced
nvidia-smi -pm 1

Disable restricted mode

  • Allow CUDA programs run by non-administrators to adjust clock frequencies
nvidia-smi -acp 0

Disable GPU Boost

  • GPU Boost lets the GPU dynamically overclock based on load, thermals, and power headroom
  • Controlling GPU Boost normally requires administrator privileges; here the permission is also opened to non-administrator programs
nvidia-smi --auto-boost-permission=0
nvidia-smi --auto-boost-default=0

Set the GPU clock frequency

  • Query the clock frequencies the GPU supports
  • Set the GPU clock frequency
  • The -i flag selects the GPU by ID
CLOCK=$(nvidia-smi -i 0 --query-supported-clocks=mem,gr --format=csv,noheader| head -n1 | awk 'BEGIN { FS=" " } ; { print $1 "," $3 }')
nvidia-smi -ac $CLOCK -i 0
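
The query returns lines like `715 MHz, 1328 MHz` (memory clock, then graphics clock); the awk program picks fields 1 and 3 and joins them with a comma, which is the `mem,gr` pair `-ac` expects. A dry run against a sample line (values taken from the P100 output earlier in this document):

```shell
# Sample line as produced by --query-supported-clocks=mem,gr --format=csv,noheader
sample="715 MHz, 1328 MHz"
# Fields split on spaces: $1="715", $2="MHz,", $3="1328", $4="MHz"
CLOCK=$(echo "$sample" | head -n1 | awk 'BEGIN { FS=" " } ; { print $1 "," $3 }')
echo "$CLOCK"   # prints: 715,1328
```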

Load the Nvidia kernel modules

  • Adjust as needed
  • This creates the Nvidia device files in a way that is independent of the Linux distribution
  • --create-nvidia-device-file creates the device file for the given GPU number
  • --unified-memory loads the Nvidia Unified Memory module
  • --modeset loads the Nvidia modeset kernel module and creates its device file
nvidia-modprobe --unified-memory --create-nvidia-device-file=0 --modeset

Configure automatically at boot

cat >> /etc/rc.local <<'EOF'
nvidia-smi -pm 1 || true
nvidia-smi -acp 0 || true
nvidia-smi --auto-boost-default=0 || true
nvidia-smi --auto-boost-permission=0 || true
#nvidia-modprobe --unified-memory --create-nvidia-device-file=0 --modeset || true
#nvidia-smi -ac $(nvidia-smi -i 0 --query-supported-clocks=mem,gr --format=csv,noheader| head -n1 | awk 'BEGIN { FS=" " } ; { print $1 "," $3 }') -i 0
EOF
# On CentOS 7, rc-local.service only runs /etc/rc.d/rc.local if it is executable
chmod +x /etc/rc.d/rc.local

Using Alibaba Cloud's automatic installation script

Notes

  • When creating a GPU compute instance on Alibaba Cloud, you can pass an initialization script via metadata to install the Nvidia GPU driver and CUDA automatically
  • See Alibaba Cloud's documentation for the detailed procedure

Run the script

  • Driver version: 390.116
  • CUDA version: 9.0.176
  • cuDNN version: 7.5.0 (Alibaba Cloud does not provide 7.6.0.64; per the documentation, the latest available, 7.5.0, was chosen)
DRIVER_VERSION="390.116"
CUDA_VERSION="9.0.176"
CUDNN_VERSION="7.5.0"
IS_INSTALL_RAPIDS="FALSE"
IS_INSTALL_PERSEUS="TRUE"

INSTALL_DIR="/root/auto_install"
log=${INSTALL_DIR}/nvidia_install.log

mkdir $INSTALL_DIR && cd $INSTALL_DIR
script_download_url=$(curl http://100.100.100.200/latest/meta-data/source-address | head -1)"/opsx/ecs/linux/binary/script/auto_install.sh"
wget -t 10 --timeout=10 $script_download_url
bash "${INSTALL_DIR}/auto_install.sh" $DRIVER_VERSION $CUDA_VERSION $CUDNN_VERSION $IS_INSTALL_PERSEUS $IS_INSTALL_RAPIDS

Nvidia-Docker

Notes

  • Install Docker-CE Stable
  • Install Nvidia-Docker

Install Docker

Using the latest docker-ce release as an example

  • Remove old Docker packages
yum remove -y docker docker-client docker-client-latest docker-common docker-latest docker-latest-logrotate docker-logrotate docker-selinux docker-engine-selinux docker-engine
  • Install Docker dependencies
yum install -y yum-utils device-mapper-persistent-data lvm2
  • Add the Docker-CE repo
yum-config-manager --add-repo http://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo
  • Install Docker-CE
yum install docker-ce -y
  • Configure Docker daemon options
mkdir -p /etc/docker
cat > /etc/docker/daemon.json <<EOF
{
  "exec-opts": [
    "native.cgroupdriver=systemd"
  ],
  "registry-mirrors": [
    "https://pqbap4ya.mirror.aliyuncs.com"
  ],
  "insecure-registries": [],
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "100m",
    "max-file": "3"
  },
  "storage-driver": "overlay2",
  "storage-opts": [
    "overlay2.override_kernel_check=true"
  ],
  "data-root": "/var/lib/docker",
  "max-concurrent-downloads": 10
}
EOF
  • Enable Docker command completion
cp /usr/share/bash-completion/completions/docker /etc/bash_completion.d/
  • Disable the Docker-CE repo
yum-config-manager --disable docker-ce-stable
  • Start Docker
systemctl enable --now docker.service
  • Check Docker info
docker info
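
Before (re)starting Docker it can save a failed start to confirm that daemon.json is valid JSON. A hedged sketch, assuming python3 is available; shown against a throwaway file so it runs anywhere (on the real host you would feed it /etc/docker/daemon.json):

```shell
# Validate JSON syntax before handing the file to dockerd.
# On the real host: python3 -m json.tool < /etc/docker/daemon.json
tmp=$(mktemp)
cat > "$tmp" <<'EOF'
{"storage-driver": "overlay2", "max-concurrent-downloads": 10}
EOF
python3 -m json.tool < "$tmp" > /dev/null && echo "daemon.json: valid JSON"
rm -f "$tmp"
```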

Install Nvidia-Docker

  • Add the YUM repo
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -sSL https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | tee /etc/yum.repos.d/nvidia-docker.repo
  • Install the NVIDIA Container Toolkit
yum install -y nvidia-container-toolkit
  • Restart Docker
systemctl restart docker.service
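
The `distribution` variable used when adding the repo is simply `$ID$VERSION_ID` from /etc/os-release; on the CentOS 7 image used in this document it composes as follows (values assumed for CentOS 7):

```shell
# On CentOS 7, /etc/os-release contains ID="centos" and VERSION_ID="7"
ID=centos
VERSION_ID=7
distribution="$ID$VERSION_ID"
echo "$distribution"   # prints: centos7
```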

Test GPU access from a container

docker run -it --rm --gpus all nvidia/cuda:9.0-base nvidia-smi

Sample output

Unable to find image 'nvidia/cuda:9.0-base' locally
9.0-base: Pulling from nvidia/cuda
976a760c94fc: Pull complete
c58992f3c37b: Pull complete
0ca0e5e7f12e: Pull complete
f2a274cc00ca: Pull complete
708a53113e13: Pull complete
371ddc2ca87b: Pull complete
f81888eb6932: Pull complete
Digest: sha256:56bfa4e0b6d923bf47a71c91b4e00b62ea251a04425598d371a5807d6ac471cb
Status: Downloaded newer image for nvidia/cuda:9.0-base

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 390.116 Driver Version: 390.116 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla P100-PCIE... On | 00000000:00:08.0 Off | 0 |
| N/A 33C P0 25W / 250W | 0MiB / 16280MiB | 0% Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+