<!doctype html><html xmlns="http://www.w3.org/1999/xhtml" lang="en"><head><meta charset="utf-8"><meta name="generator" content="pdf2htmlEX"><meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"><link rel="stylesheet" href="https://csdnimg.cn/release/download_crawler_static/css/base.min.css"><link rel="stylesheet" href="https://csdnimg.cn/release/download_crawler_static/css/fancy.min.css"><link rel="stylesheet" href="https://csdnimg.cn/release/download_crawler_static/12090472/raw.css"><script src="https://csdnimg.cn/release/download_crawler_static/js/compatibility.min.js"></script><script src="https://csdnimg.cn/release/download_crawler_static/js/pdf2htmlEX.min.js"></script><script>try{pdf2htmlEX.defaultViewer = new pdf2htmlEX.Viewer({});}catch(e){}</script><title>Accelerate Training, Inference, and ML Applications on GPUs</title></head><body><div id="sidebar" style="display: none"><div id="outline"></div></div><div id="pf1" class="pf w0 h0" data-page-no="1"><div class="pc pc1 w0 h0"><img class="bi x0 y0 w1 h1" alt="" src="https://csdnimg.cn/release/download_crawler_static/12090472/bg1.jpg"><div class="t m0 x1 h2 y1 ff1 fs0 fc0 sc0 ls0 ws0">Nathan Luehr, Maggie Zhang, Josh Romero, Pooya Davoodi, Davide Onofrio</div><div class="t m0 x2 h3 y2 ff2 fs1 fc0 sc0 ls0 ws0">Accelerate Training, Inference, and ML </div><div class="t m0 x2 h3 y3 ff2 fs1 fc0 sc0 ls0 ws0">Applications on GPUs</div></div><div class="pi" data-data='{"ctm":[1.333333,0.000000,0.000000,1.333333,0.000000,0.000000]}'></div></div>
<div id="pf2" class="pf w0 h0" data-page-no="2"><div class="pc pc2 w0 h0"><img class="bi x0 y0 w1 h1" alt="" src="https://csdnimg.cn/release/download_crawler_static/12090472/bg2.jpg"><div class="t m0 x3 h4 y4 ff1 fs2 fc1 sc0 ls0 ws0">2<span class="ff3 fs3 fc2"> </span></div><div class="t m0 x4 h3 y5 ff2 fs1 fc0 sc0 ls0 ws0">Agenda</div><div class="t m0 x5 h5 y6 ff4 fs4 fc3 sc0 ls0 ws0">●<span class="_ _0"> </span>Automatic Mixed Precision <span class="_ _1"></span>T<span class="_ _2"></span>raining</div><div class="t m0 x5 h5 y7 ff4 fs4 fc3 sc0 ls0 ws0">●<span class="_ _0"> </span>DALI: Fast Data Pipelines for Deep Learning </div><div class="t m0 x5 h5 y8 ff4 fs4 fc3 sc0 ls0 ws0">●<span class="_ _0"> </span>Distributed <span class="_ _1"></span>T<span class="_ _2"></span>raining with Horovod</div><div class="t m0 x5 h5 y9 ff4 fs4 fc3 sc0 ls0 ws0">●<span class="_ _0"> </span>Accelerating Inference using <span class="_ _1"></span>T<span class="_ _3"></span>ensorR<span class="_ _1"></span>T</div><div class="t m0 x5 h5 ya ff4 fs4 fc3 sc0 ls0 ws0">●<span class="_ _0"> </span>NVIDIA<span class="_ _4"></span> Deep Learning Profiler</div></div><div class="pi" data-data='{"ctm":[1.333333,0.000000,0.000000,1.333333,0.000000,0.000000]}'></div></div>
<div id="pf3" class="pf w0 h0" data-page-no="3"><div class="pc pc3 w0 h0"><img class="bi x0 y0 w1 h1" alt="" src="https://csdnimg.cn/release/download_crawler_static/12090472/bg3.jpg"><div class="t m0 x3 h4 y4 ff1 fs2 fc1 sc0 ls0 ws0">3<span class="ff3 fs3 fc2"> </span></div><div class="t m0 x6 h6 yb ff4 fs0 fc4 sc0 ls0 ws0">A<span class="_ _2"></span>WS EC2 VM Instance</div><div class="t m0 x7 h6 yc ff4 fs0 fc4 sc0 ls0 ws0">Instance Name: g4dn.2xlarge</div><div class="t m0 x7 h6 yd ff4 fs0 fc4 sc0 ls0 ws0">GPU: NVIDIA<span class="_ _2"></span> <span class="_ _1"></span>T4 <span class="_ _1"></span>T<span class="_ _5"></span>ensor Core GPUs</div><div class="t m0 x7 h6 ye ff4 fs0 fc4 sc0 ls0 ws0">CPU: 8 vCPUs</div><div class="t m0 x7 h6 yf ff4 fs0 fc4 sc0 ls0 ws0">Memory: 32 GiB</div><div class="t m0 x7 h6 y10 ff4 fs0 fc4 sc0 ls0 ws0">Disk Space: 130 GB</div><div class="t m0 x8 h7 y11 ff1 fs4 fc3 sc0 ls0 ws0">VM Description</div><div class="t m0 x9 h8 y12 ff5 fs1 fc4 sc0 ls0 ws0">Here’<span class="_ _4"></span>s what we’re running on today</div><div class="t m0 xa h9 y13 ff1 fs5 fc4 sc0 ls0 ws0">A special thanks to Amazon for reserving the VMs we used in this session</div></div><div class="pi" data-data='{"ctm":[1.333333,0.000000,0.000000,1.333333,0.000000,0.000000]}'></div></div>
<div id="pf4" class="pf w0 h0" data-page-no="4"><div class="pc pc4 w0 h0"><img class="bi x0 y0 w1 h1" alt="" src="https://csdnimg.cn/release/download_crawler_static/12090472/bg4.jpg"><div class="t m0 x3 h4 y4 ff1 fs2 fc1 sc0 ls0 ws0">4<span class="ff3 fs3 fc2"> </span></div><div class="t m0 xb ha y14 ff6 fs0 fc5 sc0 ls0 ws0">1.<span class="_ _0"> </span><span class="fc4">Get IP address and credentials from instructor</span></div><div class="t m0 xb ha y15 ff6 fs0 fc5 sc0 ls0 ws0">2.<span class="_ _0"> </span><span class="fc4">Connect to machine via ssh: ssh nvidia@&lt;IP ADDRESS&gt;</span></div><div class="t m0 xb hb y16 ff7 fs0 fc5 sc0 ls0 ws0">3.<span class="_ _0"> </span><span class="fc4">Startup NVIDIA 19.10 TF container: </span></div><div class="t m0 xc ha y17 ff6 fs0 fc4 sc0 ls0 ws0">nvidia-docker run --shm-size=1g --ulimit memlock=-1 --ulimit </div><div class="t m0 xc ha y18 ff6 fs0 fc4 sc0 ls0 ws0">stack=67108864 --rm -it -p 8080:8080 -v $PWD/data:/data -v </div><div class="t m0 xc ha y19 ff6 fs0 fc4 sc0 ls0 ws0">$PWD:/workspace nvcr.io/nvidia/tensorflow:19.10-py3</div><div class="t m0 xb hb y1a ff7 fs0 fc5 sc0 ls0 ws0">4.<span class="_ _0"> </span><span class="fc4">Start up JupyterLab:</span></div><div class="t m0 xc ha y1b ff6 fs0 fc4 sc0 ls0 ws0">jupyter lab --ip=0.0.0.0 --port=8080 --allow-root --no-browser </div><div class="t m0 xc ha y1c ff6 fs0 fc4 sc0 ls0 ws0">--NotebookApp.token='' --NotebookApp.password=''</div><div class="t m0 xd h2 y1d ff1 fs0 fc5 sc0 ls0 ws0">5.<span class="_ _0"> </span><span class="ff6 fc4">In your web browser, navigate to http://&lt;IP ADDRESS&gt;:8080</span></div><div class="t m0 xe h5 y11 ff4 fs4 fc3 sc0 ls0 ws0">Setting up JupyterLab Session</div><div class="t m0 x9 h8 y12 ff5 fs1 fc4 sc0 ls0 ws0">Here’<span class="_ _4"></span>s what we’re running on today</div></div><div class="pi" data-data='{"ctm":[1.333333,0.000000,0.000000,1.333333,0.000000,0.000000]}'></div></div>
<div id="pf5" class="pf w0 h0" data-page-no="5"><div class="pc pc5 w0 h0"><img class="bi x0 y0 w1 h1" alt="" src="https://csdnimg.cn/release/download_crawler_static/12090472/bg5.jpg"><div class="t m0 x3 h4 y4 ff1 fs2 fc1 sc0 ls0 ws0"><span class="fc6 sc0">5</span><span class="ff3 fs3 fc2"><span class="fc6 sc0"> </span></span></div><div class="t m0 xf hc y1e ff2 fs6 fc0 sc0 ls0 ws0">Automatic Mixed </div><div class="t m0 x10 hc y1f ff2 fs6 fc0 sc0 ls0 ws0">Precision Training</div></div><div class="pi" data-data='{"ctm":[1.333333,0.000000,0.000000,1.333333,0.000000,0.000000]}'></div></div>