Running causal_conv1d benchmark on cuda with 24 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 411.136us 2127.15% 411.136us 411.136us 1
torch_eager 8.60% 205.173us 99.40% 2.372ms 2.372ms 0.000us 0.00% 21.632us 21.632us 1
aten::to 0.40% 9.649us 83.06% 1.982ms 330.358us 0.000us 0.00% 14.272us 2.379us 6
aten::_to_copy 1.47% 35.141us 82.65% 1.973ms 328.750us 0.000us 0.00% 14.272us 2.379us 6
aten::copy_ 2.42% 57.830us 79.13% 1.889ms 314.753us 11.968us 61.92% 14.272us 2.379us 6
aten::conv1d 0.32% 7.640us 6.22% 148.384us 49.461us 0.000us 0.00% 7.360us 2.453us 3
aten::convolution 0.55% 13.222us 5.90% 140.744us 46.915us 0.000us 0.00% 7.360us 2.453us 3
aten::_convolution 1.23% 29.427us 5.34% 127.522us 42.507us 0.000us 0.00% 7.360us 2.453us 3
aten::_conv_depthwise2d 1.41% 33.690us 3.44% 82.073us 27.358us 7.360us 38.08% 7.360us 2.453us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 38.08% 7.360us 2.453us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 32.62% 6.304us 2.101us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 29.30% 5.664us 1.888us 3
Activity Buffer Request 73.85% 1.762ms 73.85% 1.762ms 1.762ms 2.304us 11.92% 2.304us 2.304us 1
aten::empty_strided 2.05% 48.841us 2.05% 48.841us 8.140us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.88% 92.484us 3.88% 92.484us 10.276us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.94% 22.551us 1.23% 29.352us 3.261us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.46% 10.991us 0.46% 10.991us 0.733us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.53% 12.660us 0.53% 12.660us 4.220us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.49% 11.631us 0.49% 11.631us 3.877us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.27% 6.340us 0.32% 7.570us 2.523us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.386ms
Self CUDA time total: 19.328us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 320.094us 1629.14% 320.094us 320.094us 1
torch_eager 6.61% 147.267us 99.75% 2.222ms 2.222ms 0.000us 0.00% 21.856us 21.856us 1
aten::to 0.28% 6.328us 86.86% 1.935ms 322.525us 0.000us 0.00% 13.888us 2.315us 6
aten::_to_copy 0.99% 22.058us 86.58% 1.929ms 321.470us 0.000us 0.00% 13.888us 2.315us 6
aten::copy_ 2.09% 46.581us 84.13% 1.874ms 312.384us 11.680us 59.45% 13.888us 2.315us 6
aten::conv1d 0.26% 5.880us 5.20% 115.901us 38.634us 0.000us 0.00% 7.968us 2.656us 3
aten::convolution 0.41% 9.201us 4.94% 110.021us 36.674us 0.000us 0.00% 7.968us 2.656us 3
aten::_convolution 0.99% 22.029us 4.53% 100.820us 33.607us 0.000us 0.00% 7.968us 2.656us 3
aten::_conv_depthwise2d 0.98% 21.809us 2.84% 63.210us 21.070us 7.968us 40.55% 7.968us 2.656us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 40.55% 7.968us 2.656us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.112us 31.11% 6.112us 2.037us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.568us 28.34% 5.568us 1.856us 3
Activity Buffer Request 79.89% 1.780ms 79.89% 1.780ms 1.780ms 2.208us 11.24% 2.208us 2.208us 1
aten::empty_strided 1.46% 32.461us 1.46% 32.461us 5.410us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.22% 71.802us 3.22% 71.802us 7.978us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.71% 15.809us 0.93% 20.750us 2.306us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.38% 8.492us 0.38% 8.492us 0.566us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.41% 9.081us 0.41% 9.081us 3.027us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.38% 8.530us 0.38% 8.530us 2.843us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.26% 5.730us 0.32% 7.140us 2.380us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.228ms
Self CUDA time total: 19.648us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 322.750us 1724.09% 322.750us 322.750us 1
torch_eager 6.97% 154.353us 99.74% 2.208ms 2.208ms 0.000us 0.00% 20.736us 20.736us 1
aten::to 0.30% 6.580us 86.44% 1.913ms 318.849us 0.000us 0.00% 13.791us 2.299us 6
aten::_to_copy 1.09% 24.161us 86.14% 1.907ms 317.752us 0.000us 0.00% 13.791us 2.299us 6
aten::copy_ 2.12% 46.909us 83.64% 1.851ms 308.533us 11.775us 62.90% 13.791us 2.299us 6
aten::conv1d 0.30% 6.591us 5.18% 114.662us 38.221us 0.000us 0.00% 6.945us 2.315us 3
aten::convolution 0.40% 8.811us 4.88% 108.071us 36.024us 0.000us 0.00% 6.945us 2.315us 3
aten::_convolution 0.96% 21.188us 4.48% 99.260us 33.087us 0.000us 0.00% 6.945us 2.315us 3
aten::_conv_depthwise2d 0.97% 21.520us 2.82% 62.461us 20.820us 6.945us 37.10% 6.945us 2.315us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.945us 37.10% 6.945us 2.315us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.047us 32.30% 6.047us 2.016us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 30.60% 5.728us 1.909us 3
Activity Buffer Request 79.41% 1.758ms 79.41% 1.758ms 1.758ms 2.016us 10.77% 2.016us 2.016us 1
aten::empty_strided 1.41% 31.151us 1.41% 31.151us 5.192us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.17% 70.153us 3.17% 70.153us 7.795us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.77% 17.060us 1.01% 22.310us 2.479us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.39% 8.641us 0.39% 8.641us 0.576us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.42% 9.380us 0.42% 9.380us 3.127us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.37% 8.090us 0.37% 8.090us 2.697us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.25% 5.450us 0.31% 6.801us 2.267us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.213ms
Self CUDA time total: 18.720us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.254us 1673.31% 328.254us 328.254us 1
torch_eager 6.02% 146.742us 99.79% 2.431ms 2.431ms 0.000us 0.00% 21.729us 21.729us 1
aten::to 0.25% 6.201us 87.89% 2.141ms 356.794us 0.000us 0.00% 14.048us 2.341us 6
aten::_to_copy 0.95% 23.051us 87.64% 2.135ms 355.761us 0.000us 0.00% 14.048us 2.341us 6
aten::copy_ 1.93% 46.899us 85.39% 2.080ms 346.662us 11.936us 60.85% 14.048us 2.341us 6
aten::conv1d 0.28% 6.941us 4.83% 117.552us 39.184us 0.000us 0.00% 7.681us 2.560us 3
aten::convolution 0.38% 9.320us 4.54% 110.611us 36.870us 0.000us 0.00% 7.681us 2.560us 3
aten::_convolution 0.86% 20.861us 4.16% 101.291us 33.764us 0.000us 0.00% 7.681us 2.560us 3
aten::_conv_depthwise2d 0.93% 22.752us 2.67% 64.991us 21.664us 7.681us 39.15% 7.681us 2.560us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.681us 39.15% 7.681us 2.560us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 31.65% 6.208us 2.069us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 29.20% 5.728us 1.909us 3
Activity Buffer Request 75.50% 1.839ms 75.50% 1.839ms 1.839ms 2.112us 10.77% 2.112us 2.112us 1
aten::empty_strided 1.29% 31.540us 1.29% 31.540us 5.257us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.87% 216.103us 8.87% 216.103us 24.011us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.70% 16.989us 0.90% 21.970us 2.441us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.35% 8.601us 0.35% 8.601us 0.573us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.43% 10.359us 0.43% 10.359us 3.453us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.40% 9.840us 0.40% 9.840us 3.280us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.22% 5.410us 0.28% 6.920us 2.307us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.436ms
Self CUDA time total: 19.617us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 325.374us 1318.69% 325.374us 325.374us 1
torch_eager 6.23% 145.210us 99.78% 2.326ms 2.326ms 0.000us 0.00% 26.978us 26.978us 1
aten::to 0.28% 6.471us 87.58% 2.041ms 340.232us 0.000us 0.00% 15.298us 2.550us 6
aten::_to_copy 1.01% 23.559us 87.30% 2.035ms 339.154us 0.000us 0.00% 15.298us 2.550us 6
aten::copy_ 2.04% 47.563us 85.03% 1.982ms 330.320us 12.994us 52.66% 15.298us 2.550us 6
aten::conv1d 0.26% 6.060us 4.91% 114.341us 38.114us 0.000us 0.00% 11.680us 3.893us 3
aten::convolution 0.40% 9.250us 4.65% 108.281us 36.094us 0.000us 0.00% 11.680us 3.893us 3
aten::_convolution 0.89% 20.669us 4.25% 99.031us 33.010us 0.000us 0.00% 11.680us 3.893us 3
aten::_conv_depthwise2d 0.95% 22.039us 2.73% 63.550us 21.183us 11.680us 47.34% 11.680us 3.893us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.680us 47.34% 11.680us 3.893us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.657us 26.98% 6.657us 2.219us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.337us 25.68% 6.337us 2.112us 3
Activity Buffer Request 74.59% 1.739ms 74.59% 1.739ms 1.739ms 2.304us 9.34% 2.304us 2.304us 1
aten::empty_strided 1.26% 29.442us 1.26% 29.442us 4.907us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.39% 218.802us 9.39% 218.802us 24.311us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.69% 16.041us 0.91% 21.173us 2.353us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.37% 8.602us 0.37% 8.602us 0.573us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.40% 9.341us 0.40% 9.341us 3.114us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.39% 8.990us 0.39% 8.990us 2.997us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.23% 5.290us 0.28% 6.580us 2.193us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.331ms
Self CUDA time total: 24.674us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 325.853us 1241.91% 325.853us 325.853us 1
torch_eager 6.02% 142.382us 99.78% 2.359ms 2.359ms 0.000us 0.00% 28.510us 28.510us 1
aten::to 0.27% 6.279us 87.80% 2.076ms 345.959us 0.000us 0.00% 15.262us 2.544us 6
aten::_to_copy 0.97% 22.980us 87.54% 2.069ms 344.912us 0.000us 0.00% 15.262us 2.544us 6
aten::copy_ 2.02% 47.672us 85.33% 2.017ms 336.189us 12.990us 49.51% 15.262us 2.544us 6
aten::conv1d 0.27% 6.391us 4.88% 115.262us 38.421us 0.000us 0.00% 13.248us 4.416us 3
aten::convolution 0.41% 9.629us 4.61% 108.871us 36.290us 0.000us 0.00% 13.248us 4.416us 3
aten::_convolution 0.88% 20.800us 4.20% 99.242us 33.081us 0.000us 0.00% 13.248us 4.416us 3
aten::_conv_depthwise2d 0.93% 21.882us 2.62% 62.041us 20.680us 13.248us 50.49% 13.248us 4.416us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.248us 50.49% 13.248us 4.416us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.622us 25.24% 6.622us 2.207us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 24.27% 6.368us 2.123us 3
Activity Buffer Request 75.21% 1.778ms 75.21% 1.778ms 1.778ms 2.272us 8.66% 2.272us 2.272us 1
aten::empty_strided 1.24% 29.361us 1.24% 29.361us 4.893us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.97% 212.032us 8.97% 212.032us 23.559us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.75% 17.821us 0.98% 23.130us 2.570us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.37% 8.699us 0.37% 8.699us 0.580us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.38% 9.090us 0.38% 9.090us 3.030us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.44% 10.480us 0.44% 10.480us 3.493us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.24% 5.631us 0.30% 7.011us 2.337us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.364ms
Self CUDA time total: 26.238us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 331.328us 858.50% 331.328us 331.328us 1
torch_eager 5.97% 146.471us 99.79% 2.446ms 2.446ms 0.000us 0.00% 41.186us 41.186us 1
aten::conv1d 0.25% 6.210us 4.77% 116.961us 38.987us 0.000us 0.00% 22.849us 7.616us 3
aten::convolution 0.40% 9.740us 4.52% 110.751us 36.917us 0.000us 0.00% 22.849us 7.616us 3
aten::_convolution 0.89% 21.911us 4.12% 101.011us 33.670us 0.000us 0.00% 22.849us 7.616us 3
aten::_conv_depthwise2d 0.92% 22.550us 2.59% 63.530us 21.177us 22.849us 59.20% 22.849us 7.616us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.849us 59.20% 22.849us 7.616us 3
aten::to 0.25% 6.228us 88.01% 2.158ms 359.617us 0.000us 0.00% 18.337us 3.056us 6
aten::_to_copy 1.00% 24.602us 87.76% 2.151ms 358.579us 0.000us 0.00% 18.337us 3.056us 6
aten::copy_ 1.98% 48.619us 85.49% 2.096ms 349.334us 15.745us 40.80% 18.337us 3.056us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.385us 21.73% 8.385us 2.795us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 19.07% 7.360us 2.453us 3
Activity Buffer Request 75.73% 1.857ms 75.73% 1.857ms 1.857ms 2.592us 6.72% 2.592us 2.592us 1
aten::empty_strided 1.26% 30.871us 1.26% 30.871us 5.145us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.69% 213.074us 8.69% 213.074us 23.675us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.69% 16.899us 0.91% 22.302us 2.478us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.35% 8.674us 0.35% 8.674us 0.578us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.39% 9.670us 0.39% 9.670us 3.223us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.37% 9.000us 0.37% 9.000us 3.000us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.23% 5.570us 0.28% 6.790us 2.263us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.452ms
Self CUDA time total: 38.594us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 324.382us 781.00% 324.382us 324.382us 1
torch_eager 6.15% 143.693us 99.76% 2.329ms 2.329ms 0.000us 0.00% 44.158us 44.158us 1
aten::conv1d 0.25% 5.870us 4.90% 114.381us 38.127us 0.000us 0.00% 25.694us 8.565us 3
aten::convolution 0.39% 9.129us 4.65% 108.511us 36.170us 0.000us 0.00% 25.694us 8.565us 3
aten::_convolution 0.92% 21.560us 4.26% 99.382us 33.127us 0.000us 0.00% 25.694us 8.565us 3
aten::_conv_depthwise2d 0.91% 21.251us 2.67% 62.331us 20.777us 25.694us 61.86% 25.694us 8.565us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.694us 61.86% 25.694us 8.565us 3
aten::to 0.26% 6.051us 87.64% 2.046ms 341.007us 0.000us 0.00% 18.464us 3.077us 6
aten::_to_copy 0.99% 23.033us 87.38% 2.040ms 339.999us 0.000us 0.00% 18.464us 3.077us 6
aten::copy_ 2.09% 48.709us 85.05% 1.985ms 330.910us 15.840us 38.14% 18.464us 3.077us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.448us 20.34% 8.448us 2.816us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.392us 17.80% 7.392us 2.464us 3
Activity Buffer Request 74.80% 1.746ms 74.80% 1.746ms 1.746ms 2.624us 6.32% 2.624us 2.624us 1
aten::empty_strided 1.35% 31.498us 1.35% 31.498us 5.250us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.10% 212.334us 9.10% 212.334us 23.593us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.70% 16.311us 0.92% 21.550us 2.394us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.38% 8.780us 0.38% 8.780us 0.585us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.39% 9.170us 0.39% 9.170us 3.057us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.44% 10.170us 0.44% 10.170us 3.390us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.24% 5.530us 0.30% 6.891us 2.297us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.335ms
Self CUDA time total: 41.534us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 319.038us 307.34% 319.038us 319.038us 1
torch_eager 4.95% 115.620us 99.75% 2.329ms 2.329ms 0.000us 0.00% 109.886us 109.886us 1
aten::conv1d 0.24% 5.500us 4.79% 111.722us 37.241us 0.000us 0.00% 71.360us 23.787us 3
aten::convolution 0.38% 8.820us 4.55% 106.222us 35.407us 0.000us 0.00% 71.360us 23.787us 3
aten::_convolution 0.86% 20.169us 4.17% 97.402us 32.467us 0.000us 0.00% 71.360us 23.787us 3
aten::_conv_depthwise2d 0.88% 20.499us 2.70% 62.992us 20.997us 71.360us 68.74% 71.360us 23.787us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 71.360us 68.74% 71.360us 23.787us 3
aten::to 0.25% 5.942us 88.99% 2.078ms 346.257us 0.000us 0.00% 38.526us 6.421us 6
aten::_to_copy 0.97% 22.531us 88.74% 2.072ms 345.267us 0.000us 0.00% 38.526us 6.421us 6
aten::copy_ 1.95% 45.459us 86.50% 2.019ms 336.557us 32.447us 31.26% 38.526us 6.421us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.791us 17.14% 17.791us 5.930us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.656us 14.12% 14.656us 4.885us 3
Activity Buffer Request 76.44% 1.784ms 76.44% 1.784ms 1.784ms 6.079us 5.86% 6.079us 6.079us 1
aten::empty_strided 1.27% 29.730us 1.27% 29.730us 4.955us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.13% 213.066us 9.13% 213.066us 23.674us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.66% 15.410us 0.85% 19.870us 2.208us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.33% 7.790us 0.33% 7.790us 0.519us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.44% 10.351us 0.44% 10.351us 3.450us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.36% 8.461us 0.36% 8.461us 2.820us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.23% 5.401us 0.29% 6.691us 2.230us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.335ms
Self CUDA time total: 103.807us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 320.032us 281.56% 320.032us 320.032us 1
torch_eager 4.89% 112.502us 99.77% 2.297ms 2.297ms 0.000us 0.00% 119.649us 119.649us 1
aten::conv1d 0.24% 5.540us 4.86% 111.980us 37.327us 0.000us 0.00% 81.407us 27.136us 3
aten::convolution 0.38% 8.839us 4.62% 106.440us 35.480us 0.000us 0.00% 81.407us 27.136us 3
aten::_convolution 0.90% 20.821us 4.24% 97.601us 32.534us 0.000us 0.00% 81.407us 27.136us 3
aten::_conv_depthwise2d 0.94% 21.639us 2.69% 61.990us 20.663us 81.407us 71.62% 81.407us 27.136us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 81.407us 71.62% 81.407us 27.136us 3
aten::to 0.26% 5.912us 88.93% 2.047ms 341.211us 0.000us 0.00% 38.242us 6.374us 6
aten::_to_copy 0.96% 22.099us 88.68% 2.041ms 340.225us 0.000us 0.00% 38.242us 6.374us 6
aten::copy_ 2.13% 49.062us 86.51% 1.991ms 331.902us 32.257us 28.38% 38.242us 6.374us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.665us 15.54% 17.665us 5.888us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.592us 12.84% 14.592us 4.864us 3
Activity Buffer Request 76.05% 1.751ms 76.05% 1.751ms 1.751ms 5.985us 5.27% 5.985us 5.985us 1
aten::empty_strided 1.21% 27.841us 1.21% 27.841us 4.640us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.26% 213.213us 9.26% 213.213us 23.690us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.70% 16.150us 0.91% 21.061us 2.340us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.36% 8.381us 0.36% 8.381us 0.559us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.40% 9.130us 0.40% 9.130us 3.043us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.42% 9.600us 0.42% 9.600us 3.200us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.24% 5.419us 0.29% 6.669us 2.223us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.302ms
Self CUDA time total: 113.664us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 4.70% 113.641us 96.03% 2.320ms 2.320ms 0.000us 0.00% 464.763us 464.763us 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 453.786us 106.62% 453.786us 453.786us 1
aten::conv1d 0.23% 5.630us 4.62% 111.673us 37.224us 0.000us 0.00% 278.940us 92.980us 3
aten::convolution 0.36% 8.651us 4.39% 106.043us 35.348us 0.000us 0.00% 278.940us 92.980us 3
aten::_convolution 0.86% 20.739us 4.03% 97.392us 32.464us 0.000us 0.00% 278.940us 92.980us 3
aten::_conv_depthwise2d 0.90% 21.710us 2.57% 62.062us 20.687us 278.940us 65.54% 278.940us 92.980us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 278.940us 65.54% 278.940us 92.980us 3
aten::to 0.24% 5.880us 85.69% 2.071ms 345.102us 0.000us 0.00% 185.823us 30.970us 6
aten::_to_copy 0.90% 21.820us 85.45% 2.065ms 344.122us 0.000us 0.00% 185.823us 30.970us 6
aten::copy_ 1.99% 48.071us 83.40% 2.015ms 335.882us 146.655us 34.46% 185.823us 30.970us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 105.919us 24.89% 105.919us 35.306us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.736us 9.57% 40.736us 13.579us 3
Activity Buffer Request 72.26% 1.746ms 72.26% 1.746ms 1.746ms 39.168us 9.20% 39.168us 39.168us 1
aten::empty_strided 1.14% 27.621us 1.14% 27.621us 4.604us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 10.07% 243.344us 10.07% 243.344us 27.038us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.66% 15.908us 0.86% 20.760us 2.307us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.34% 8.262us 0.34% 8.262us 0.551us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.37% 8.921us 0.37% 8.921us 2.974us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.38% 9.260us 0.38% 9.260us 3.087us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.22% 5.361us 0.27% 6.641us 2.214us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.416ms
Self CUDA time total: 425.595us
======================================================================
PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 4.81% 115.230us 95.51% 2.289ms 2.289ms 0.000us 0.00% 473.560us 473.560us 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 466.268us 106.59% 466.268us 466.268us 1
aten::conv1d 0.23% 5.540us 4.63% 111.002us 37.001us 0.000us 0.00% 298.430us 99.477us 3
aten::convolution 0.37% 8.900us 4.40% 105.462us 35.154us 0.000us 0.00% 298.430us 99.477us 3
aten::_convolution 0.85% 20.430us 4.03% 96.562us 32.187us 0.000us 0.00% 298.430us 99.477us 3
aten::_conv_depthwise2d 0.86% 20.562us 2.57% 61.592us 20.531us 298.430us 68.22% 298.430us 99.477us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 298.430us 68.22% 298.430us 99.477us 3
aten::to 0.24% 5.669us 85.05% 2.039ms 339.802us 0.000us 0.00% 175.130us 29.188us 6
aten::_to_copy 0.96% 22.942us 84.82% 2.033ms 338.857us 0.000us 0.00% 175.130us 29.188us 6
aten::copy_ 2.01% 48.190us 82.64% 1.981ms 330.170us 139.003us 31.78% 175.130us 29.188us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 98.430us 22.50% 98.430us 32.810us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.573us 9.28% 40.573us 13.524us 3
Activity Buffer Request 72.81% 1.745ms 72.81% 1.745ms 1.745ms 36.127us 8.26% 36.127us 36.127us 1
aten::empty_strided 1.22% 29.180us 1.22% 29.180us 4.863us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.73% 209.224us 8.73% 209.224us 23.247us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.66% 15.770us 0.87% 20.750us 2.306us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.35% 8.340us 0.35% 8.340us 0.556us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.43% 10.290us 0.43% 10.290us 3.430us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.37% 8.960us 0.37% 8.960us 2.987us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.22% 5.340us 0.28% 6.610us 2.203us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.397ms
Self CUDA time total: 437.433us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 325.149us 1725.02% 325.149us 325.149us 1
torch_eager 4.86% 112.628us 99.78% 2.311ms 2.311ms 0.000us 0.00% 20.769us 20.769us 1
aten::to 0.26% 5.932us 88.67% 2.054ms 342.251us 0.000us 0.00% 13.536us 2.256us 6
aten::_to_copy 1.00% 23.270us 88.41% 2.048ms 341.262us 0.000us 0.00% 13.536us 2.256us 6
aten::copy_ 2.14% 49.511us 86.15% 1.995ms 332.552us 11.616us 61.63% 13.536us 2.256us 6
aten::conv1d 0.24% 5.480us 5.19% 120.221us 40.074us 0.000us 0.00% 7.233us 2.411us 3
aten::convolution 0.37% 8.641us 4.95% 114.741us 38.247us 0.000us 0.00% 7.233us 2.411us 3
aten::_convolution 0.88% 20.361us 4.58% 106.100us 35.367us 0.000us 0.00% 7.233us 2.411us 3
aten::_conv_depthwise2d 0.96% 22.180us 3.05% 70.680us 23.560us 7.233us 38.37% 7.233us 2.411us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.233us 38.37% 7.233us 2.411us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 31.41% 5.920us 1.973us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 30.22% 5.696us 1.899us 3
Activity Buffer Request 75.90% 1.758ms 75.90% 1.758ms 1.758ms 1.920us 10.19% 1.920us 1.920us 1
aten::empty_strided 1.25% 28.990us 1.25% 28.990us 4.832us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.42% 218.162us 9.42% 218.162us 24.240us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.68% 15.833us 0.90% 20.731us 2.303us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.37% 8.468us 0.37% 8.468us 0.565us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.40% 9.220us 0.40% 9.220us 3.073us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.39% 8.980us 0.39% 8.980us 2.993us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.24% 5.550us 0.30% 7.000us 2.333us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.316ms
Self CUDA time total: 18.849us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 320.511us 1636.76% 320.511us 320.511us 1
torch_eager 5.91% 139.372us 99.79% 2.353ms 2.353ms 0.000us 0.00% 21.598us 21.598us 1
aten::to 0.25% 6.010us 87.93% 2.073ms 345.496us 0.000us 0.00% 13.663us 2.277us 6
aten::_to_copy 0.96% 22.549us 87.67% 2.067ms 344.494us 0.000us 0.00% 13.663us 2.277us 6
aten::copy_ 2.09% 49.251us 85.51% 2.016ms 335.977us 11.647us 59.48% 13.663us 2.277us 6
aten::conv1d 0.26% 6.081us 4.89% 115.321us 38.440us 0.000us 0.00% 7.935us 2.645us 3
aten::convolution 0.40% 9.450us 4.63% 109.240us 36.413us 0.000us 0.00% 7.935us 2.645us 3
aten::_convolution 0.90% 21.168us 4.23% 99.790us 33.263us 0.000us 0.00% 7.935us 2.645us 3
aten::_conv_depthwise2d 0.87% 20.610us 2.67% 62.871us 20.957us 7.935us 40.52% 7.935us 2.645us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.935us 40.52% 7.935us 2.645us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.983us 30.55% 5.983us 1.994us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 28.92% 5.664us 1.888us 3
Activity Buffer Request 75.47% 1.779ms 75.47% 1.779ms 1.779ms 2.016us 10.30% 2.016us 2.016us 1
aten::empty_strided 1.21% 28.551us 1.21% 28.551us 4.759us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.91% 210.105us 8.91% 210.105us 23.345us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.72% 16.961us 0.93% 21.872us 2.430us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.36% 8.422us 0.36% 8.422us 0.561us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.46% 10.910us 0.46% 10.910us 3.637us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.37% 8.650us 0.37% 8.650us 2.883us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.24% 5.579us 0.30% 6.970us 2.323us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.358ms
Self CUDA time total: 19.582us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 310.009us 1591.01% 310.009us 310.009us 1
torch_eager 14.85% 113.881us 99.35% 762.102us 762.102us 0.000us 0.00% 21.693us 21.693us 1
aten::to 0.75% 5.742us 67.36% 516.710us 86.118us 0.000us 0.00% 14.398us 2.400us 6
aten::_to_copy 2.84% 21.798us 66.61% 510.968us 85.161us 0.000us 0.00% 14.398us 2.400us 6
aten::copy_ 6.26% 48.021us 59.81% 458.808us 76.468us 12.190us 62.56% 14.398us 2.400us 6
aten::conv1d 0.69% 5.290us 14.07% 107.951us 35.984us 0.000us 0.00% 7.295us 2.432us 3
aten::convolution 1.14% 8.770us 13.38% 102.661us 34.220us 0.000us 0.00% 7.295us 2.432us 3
aten::_convolution 2.56% 19.629us 12.24% 93.891us 31.297us 0.000us 0.00% 7.295us 2.432us 3
aten::_conv_depthwise2d 2.72% 20.851us 7.84% 60.152us 20.051us 7.295us 37.44% 7.295us 2.432us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.295us 37.44% 7.295us 2.432us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.271us 32.18% 6.271us 2.090us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.919us 30.38% 5.919us 1.973us 3
Activity Buffer Request 29.70% 227.833us 29.70% 227.833us 227.833us 2.208us 11.33% 2.208us 2.208us 1
aten::empty_strided 3.96% 30.362us 3.96% 30.362us 5.060us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 26.62% 204.185us 26.62% 204.185us 22.687us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.01% 15.431us 2.57% 19.700us 2.189us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.98% 7.520us 0.98% 7.520us 0.501us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.29% 9.930us 1.29% 9.930us 3.310us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.06% 8.140us 1.06% 8.140us 2.713us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.67% 5.119us 0.83% 6.400us 2.133us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 767.122us
Self CUDA time total: 19.485us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 312.058us 1547.83% 312.058us 312.058us 1
torch_eager 19.84% 167.701us 99.34% 839.603us 839.603us 0.000us 0.00% 22.369us 22.369us 1
aten::to 0.69% 5.791us 63.55% 537.169us 89.528us 0.000us 0.00% 14.400us 2.400us 6
aten::_to_copy 2.59% 21.910us 62.87% 531.378us 88.563us 0.000us 0.00% 14.400us 2.400us 6
aten::copy_ 5.79% 48.970us 56.91% 481.028us 80.171us 12.192us 60.47% 14.400us 2.400us 6
aten::conv1d 0.65% 5.520us 13.10% 110.752us 36.917us 0.000us 0.00% 7.969us 2.656us 3
aten::convolution 1.03% 8.700us 12.45% 105.232us 35.077us 0.000us 0.00% 7.969us 2.656us 3
aten::_convolution 2.40% 20.311us 11.42% 96.532us 32.177us 0.000us 0.00% 7.969us 2.656us 3
aten::_conv_depthwise2d 2.39% 20.240us 7.28% 61.521us 20.507us 7.969us 39.53% 7.969us 2.656us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.969us 39.53% 7.969us 2.656us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 31.11% 6.272us 2.091us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 29.36% 5.920us 1.973us 3
Activity Buffer Request 29.19% 246.714us 29.19% 246.714us 246.714us 2.208us 10.95% 2.208us 2.208us 1
aten::empty_strided 3.36% 28.440us 3.36% 28.440us 4.740us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 24.70% 208.775us 24.70% 208.775us 23.197us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 1.84% 15.580us 2.41% 20.350us 2.261us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.95% 8.049us 0.95% 8.049us 0.537us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.07% 9.050us 1.07% 9.050us 3.017us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.04% 8.800us 1.04% 8.800us 2.933us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.63% 5.361us 0.79% 6.650us 2.217us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 845.213us
Self CUDA time total: 20.161us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 312.867us 859.95% 312.867us 312.867us 1
torch_eager 14.44% 112.752us 99.36% 776.042us 776.042us 0.000us 0.00% 39.006us 39.006us 1
aten::conv1d 0.71% 5.580us 13.99% 109.252us 36.417us 0.000us 0.00% 20.512us 6.837us 3
aten::convolution 1.09% 8.531us 13.27% 103.672us 34.557us 0.000us 0.00% 20.512us 6.837us 3
aten::_convolution 2.62% 20.459us 12.18% 95.141us 31.714us 0.000us 0.00% 20.512us 6.837us 3
aten::_conv_depthwise2d 2.59% 20.222us 7.70% 60.162us 20.054us 20.512us 56.38% 20.512us 6.837us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.512us 56.38% 20.512us 6.837us 3
aten::to 0.75% 5.821us 67.81% 529.608us 88.268us 0.000us 0.00% 18.494us 3.082us 6
aten::_to_copy 2.86% 22.338us 67.06% 523.787us 87.298us 0.000us 0.00% 18.494us 3.082us 6
aten::copy_ 6.02% 47.020us 60.45% 472.148us 78.691us 15.870us 43.62% 18.494us 3.082us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.447us 23.22% 8.447us 2.816us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.423us 20.40% 7.423us 2.474us 3
Activity Buffer Request 30.80% 240.594us 30.80% 240.594us 240.594us 2.624us 7.21% 2.624us 2.624us 1
aten::empty_strided 3.75% 29.301us 3.75% 29.301us 4.884us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 26.46% 206.633us 26.46% 206.633us 22.959us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.01% 15.720us 2.61% 20.410us 2.268us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.02% 7.981us 1.02% 7.981us 0.532us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.13% 8.841us 1.13% 8.841us 2.947us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.15% 9.000us 1.15% 9.000us 3.000us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.68% 5.329us 0.84% 6.560us 2.187us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 781.073us
Self CUDA time total: 36.382us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 353.311us 916.31% 353.311us 353.311us 1
torch_eager 17.31% 144.171us 99.40% 827.943us 827.943us 0.000us 0.00% 41.150us 41.150us 1
aten::conv1d 0.66% 5.470us 14.12% 117.601us 39.200us 0.000us 0.00% 22.624us 7.541us 3
aten::convolution 1.09% 9.120us 13.46% 112.131us 37.377us 0.000us 0.00% 22.624us 7.541us 3
aten::_convolution 2.77% 23.100us 12.37% 103.011us 34.337us 0.000us 0.00% 22.624us 7.541us 3
aten::_conv_depthwise2d 2.63% 21.901us 7.78% 64.791us 21.597us 22.624us 58.68% 22.624us 7.541us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.624us 58.68% 22.624us 7.541us 3
aten::to 0.71% 5.920us 64.88% 540.450us 90.075us 0.000us 0.00% 18.526us 3.088us 6
aten::_to_copy 2.59% 21.613us 64.17% 534.530us 89.088us 0.000us 0.00% 18.526us 3.088us 6
aten::copy_ 5.88% 48.990us 58.06% 483.646us 80.608us 15.934us 41.32% 18.526us 3.088us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.575us 22.24% 8.575us 2.858us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.359us 19.09% 7.359us 2.453us 3
Activity Buffer Request 29.91% 249.164us 29.91% 249.164us 249.164us 2.592us 6.72% 2.592us 2.592us 1
aten::empty_strided 3.51% 29.271us 3.51% 29.271us 4.879us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 25.18% 209.712us 25.18% 209.712us 23.301us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 1.99% 16.542us 2.59% 21.611us 2.401us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.04% 8.638us 1.04% 8.638us 0.576us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.16% 9.650us 1.16% 9.650us 3.217us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.08% 9.020us 1.08% 9.020us 3.007us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.68% 5.681us 0.85% 7.060us 2.353us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 832.973us
Self CUDA time total: 38.558us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 316.829us 488.45% 316.829us 316.829us 1
torch_eager 14.19% 114.002us 99.33% 798.183us 798.183us 0.000us 0.00% 68.991us 68.991us 1
aten::conv1d 0.68% 5.460us 13.80% 110.892us 36.964us 0.000us 0.00% 42.304us 14.101us 3
aten::convolution 1.10% 8.859us 13.12% 105.432us 35.144us 0.000us 0.00% 42.304us 14.101us 3
aten::_convolution 2.59% 20.821us 12.02% 96.573us 32.191us 0.000us 0.00% 42.304us 14.101us 3
aten::_conv_depthwise2d 2.64% 21.190us 7.50% 60.251us 20.084us 42.304us 65.22% 42.304us 14.101us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 42.304us 65.22% 42.304us 14.101us 3
aten::to 0.75% 6.059us 68.35% 549.177us 91.530us 0.000us 0.00% 26.687us 4.448us 6
aten::_to_copy 2.76% 22.169us 67.59% 543.118us 90.520us 0.000us 0.00% 26.687us 4.448us 6
aten::copy_ 6.74% 54.161us 61.27% 492.308us 82.051us 22.560us 34.78% 26.687us 4.448us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.095us 18.65% 12.095us 4.032us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.465us 16.13% 10.465us 3.488us 3
Activity Buffer Request 31.75% 255.134us 31.75% 255.134us 255.134us 4.127us 6.36% 4.127us 4.127us 1
aten::empty_strided 3.56% 28.641us 3.56% 28.641us 4.773us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 25.49% 204.843us 25.49% 204.843us 22.760us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 2.06% 16.521us 2.65% 21.322us 2.369us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 1.02% 8.171us 1.02% 8.171us 0.545us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.14% 9.170us 1.14% 9.170us 3.057us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.00% 8.061us 1.00% 8.061us 2.687us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.66% 5.330us 0.81% 6.520us 2.173us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 803.533us
Self CUDA time total: 64.864us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 328.383us 466.25% 328.383us 328.383us 1
torch_eager 5.82% 138.672us 99.78% 2.376ms 2.376ms 0.000us 0.00% 74.527us 74.527us 1
aten::conv1d 0.24% 5.689us 4.87% 115.970us 38.657us 0.000us 0.00% 47.969us 15.990us 3
aten::convolution 0.43% 10.191us 4.63% 110.281us 36.760us 0.000us 0.00% 47.969us 15.990us 3
aten::_convolution 0.91% 21.579us 4.20% 100.090us 33.363us 0.000us 0.00% 47.969us 15.990us 3
aten::_conv_depthwise2d 0.87% 20.670us 2.63% 62.670us 20.890us 47.969us 68.11% 47.969us 15.990us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.969us 68.11% 47.969us 15.990us 3
aten::to 0.27% 6.430us 88.04% 2.097ms 349.464us 0.000us 0.00% 26.558us 4.426us 6
aten::_to_copy 0.99% 23.642us 87.77% 2.090ms 348.392us 0.000us 0.00% 26.558us 4.426us 6
aten::copy_ 2.06% 49.120us 85.54% 2.037ms 339.525us 22.462us 31.89% 26.558us 4.426us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.999us 17.04% 11.999us 4.000us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.463us 14.86% 10.463us 3.488us 3
Activity Buffer Request 75.66% 1.802ms 75.66% 1.802ms 1.802ms 4.096us 5.82% 4.096us 4.096us 1
aten::empty_strided 1.24% 29.560us 1.24% 29.560us 4.927us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.75% 208.373us 8.75% 208.373us 23.153us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.70% 16.782us 0.92% 21.972us 2.441us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.36% 8.520us 0.36% 8.520us 0.568us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.38% 9.160us 0.38% 9.160us 3.053us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.44% 10.580us 0.44% 10.580us 3.527us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.24% 5.730us 0.29% 7.020us 2.340us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.382ms
Self CUDA time total: 70.431us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 336.351us 179.68% 336.351us 336.351us 1
torch_eager 5.85% 142.571us 99.79% 2.430ms 2.430ms 0.000us 0.00% 197.311us 197.311us 1
aten::conv1d 0.28% 6.741us 4.71% 114.731us 38.244us 0.000us 0.00% 134.368us 44.789us 3
aten::convolution 0.38% 9.350us 4.43% 107.990us 35.997us 0.000us 0.00% 134.368us 44.789us 3
aten::_convolution 0.88% 21.488us 4.05% 98.640us 32.880us 0.000us 0.00% 134.368us 44.789us 3
aten::_conv_depthwise2d 0.83% 20.301us 2.51% 61.091us 20.364us 134.368us 71.78% 134.368us 44.789us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 134.368us 71.78% 134.368us 44.789us 3
aten::to 0.26% 6.379us 88.22% 2.148ms 358.072us 0.000us 0.00% 62.943us 10.491us 6
aten::_to_copy 0.93% 22.632us 87.96% 2.142ms 357.009us 0.000us 0.00% 62.943us 10.491us 6
aten::copy_ 2.03% 49.489us 85.76% 2.089ms 348.110us 52.831us 28.22% 62.943us 10.491us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.727us 15.88% 29.727us 9.909us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.104us 12.34% 23.104us 7.701us 3
Activity Buffer Request 76.11% 1.853ms 76.11% 1.853ms 1.853ms 10.112us 5.40% 10.112us 10.112us 1
aten::empty_strided 1.26% 30.760us 1.26% 30.760us 5.127us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 8.55% 208.274us 8.55% 208.274us 23.142us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.71% 17.184us 0.91% 22.223us 2.469us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.34% 8.338us 0.34% 8.338us 0.556us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.38% 9.180us 0.38% 9.180us 3.060us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.37% 9.020us 0.37% 9.020us 3.007us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.22% 5.460us 0.27% 6.690us 2.230us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.435ms
Self CUDA time total: 187.199us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 335.323us 159.21% 335.323us 335.323us 1
torch_eager 14.44% 115.471us 99.40% 794.842us 794.842us 0.000us 0.00% 223.709us 223.709us 1
aten::conv1d 0.70% 5.561us 13.80% 110.362us 36.787us 0.000us 0.00% 154.845us 51.615us 3
aten::convolution 1.15% 9.189us 13.11% 104.801us 34.934us 0.000us 0.00% 154.845us 51.615us 3
aten::_convolution 2.52% 20.182us 11.96% 95.612us 31.871us 0.000us 0.00% 154.845us 51.615us 3
aten::_conv_depthwise2d 2.51% 20.101us 7.60% 60.741us 20.247us 154.845us 73.52% 154.845us 51.615us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 154.845us 73.52% 154.845us 51.615us 3
aten::to 0.72% 5.750us 68.18% 545.179us 90.863us 0.000us 0.00% 68.864us 11.477us 6
aten::_to_copy 2.77% 22.130us 67.46% 539.429us 89.905us 0.000us 0.00% 68.864us 11.477us 6
aten::copy_ 5.86% 46.830us 60.79% 486.078us 81.013us 55.776us 26.48% 68.864us 11.477us 6
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 32.416us 15.39% 32.416us 10.805us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.360us 11.09% 23.360us 7.787us 3
Activity Buffer Request 31.66% 253.204us 31.66% 253.204us 253.204us 13.088us 6.21% 13.088us 13.088us 1
aten::empty_strided 3.90% 31.221us 3.90% 31.221us 5.203us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 26.02% 208.054us 26.02% 208.054us 23.117us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 1.93% 15.399us 2.47% 19.760us 2.196us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.98% 7.800us 0.98% 7.800us 0.520us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 1.23% 9.810us 1.23% 9.810us 3.270us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 1.10% 8.820us 1.10% 8.820us 2.940us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.69% 5.519us 0.86% 6.899us 2.300us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 799.662us
Self CUDA time total: 210.621us
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 6.62% 120.362us 52.56% 956.135us 956.135us 0.000us 0.00% 1.509ms 1.509ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.411ms 100.41% 1.411ms 1.411ms 1
aten::to 0.34% 6.140us 38.13% 693.750us 115.625us 0.000us 0.00% 815.515us 135.919us 6
aten::_to_copy 1.53% 27.810us 37.80% 687.610us 114.602us 0.000us 0.00% 815.515us 135.919us 6
aten::copy_ 2.83% 51.570us 25.68% 467.247us 77.874us 711.740us 50.66% 815.515us 135.919us 6
aten::conv1d 0.32% 5.781us 6.36% 115.702us 38.567us 0.000us 0.00% 693.278us 231.093us 3
aten::convolution 0.51% 9.289us 6.04% 109.921us 36.640us 0.000us 0.00% 693.278us 231.093us 3
aten::_convolution 1.19% 21.630us 5.53% 100.632us 33.544us 0.000us 0.00% 693.278us 231.093us 3
aten::_conv_depthwise2d 1.16% 21.108us 3.52% 63.951us 21.317us 693.278us 49.34% 693.278us 231.093us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 693.278us 49.34% 693.278us 231.093us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 405.439us 28.86% 405.439us 135.146us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 306.301us 21.80% 306.301us 102.100us 3
Activity Buffer Request 12.14% 220.924us 12.14% 220.924us 220.924us 103.775us 7.39% 103.775us 103.775us 1
aten::empty_strided 1.98% 36.051us 10.58% 192.553us 32.092us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 12.05% 219.204us 12.05% 219.204us 24.356us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.93% 16.940us 1.22% 22.200us 2.467us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.48% 8.651us 0.48% 8.651us 0.577us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.51% 9.201us 0.51% 9.201us 3.067us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.51% 9.191us 0.51% 9.191us 3.064us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.31% 5.621us 0.38% 6.871us 2.290us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.819ms
Self CUDA time total: 1.405ms
======================================================================
PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 6.07% 112.213us 42.26% 781.792us 781.792us 0.000us 0.00% 1.498ms 1.498ms 1
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.428ms 100.39% 1.428ms 1.428ms 1
aten::to 0.33% 6.130us 28.74% 531.749us 88.625us 0.000us 0.00% 757.569us 126.261us 6
aten::_to_copy 1.23% 22.780us 28.41% 525.619us 87.603us 0.000us 0.00% 757.569us 126.261us 6
aten::copy_ 2.64% 48.852us 25.56% 472.969us 78.828us 682.049us 47.95% 757.569us 126.261us 6
aten::conv1d 0.33% 6.130us 6.13% 113.361us 37.787us 0.000us 0.00% 740.449us 246.816us 3
aten::convolution 0.48% 8.889us 5.80% 107.231us 35.744us 0.000us 0.00% 740.449us 246.816us 3
aten::_convolution 1.13% 20.931us 5.32% 98.342us 32.781us 0.000us 0.00% 740.449us 246.816us 3
aten::_conv_depthwise2d 1.15% 21.330us 3.38% 62.491us 20.830us 740.449us 52.05% 740.449us 246.816us 3
void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 740.449us 52.05% 740.449us 246.816us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 397.857us 27.97% 397.857us 132.619us 3
void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 284.192us 19.98% 284.192us 94.731us 3
Activity Buffer Request 12.95% 239.644us 12.95% 239.644us 239.644us 75.520us 5.31% 75.520us 75.520us 1
aten::empty_strided 1.61% 29.870us 1.61% 29.870us 4.978us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 11.17% 206.574us 11.17% 206.574us 22.953us 0.000us 0.00% 0.000us 0.000us 9
aten::unsqueeze 0.85% 15.779us 1.12% 20.809us 2.312us 0.000us 0.00% 0.000us 0.000us 9
aten::as_strided 0.45% 8.409us 0.45% 8.409us 0.561us 0.000us 0.00% 0.000us 0.000us 15
aten::empty 0.49% 9.120us 0.49% 9.120us 3.040us 0.000us 0.00% 0.000us 0.000us 3
aten::resize_ 0.54% 9.940us 0.54% 9.940us 3.313us 0.000us 0.00% 0.000us 0.000us 3
aten::squeeze 0.29% 5.381us 0.36% 6.700us 2.233us 0.000us 0.00% 0.000us 0.000us 3
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.850ms
Self CUDA time total: 1.422ms
impl wl p50(ms) ok
torch_eager cuda_B2_D2048_S128_W2 0.08 True
torch_eager cuda_B2_D2048_S128_W4 0.08 True
torch_eager cuda_B2_D2048_S2048_W2 0.16 True
torch_eager cuda_B2_D2048_S2048_W4 0.16 True
torch_eager cuda_B2_D2048_S512_W2 0.08 True
torch_eager cuda_B2_D2048_S512_W4 0.08 True
torch_eager cuda_B2_D64_S128_W2 0.07 True
torch_eager cuda_B2_D64_S128_W4 0.08 True
torch_eager cuda_B2_D64_S2048_W2 0.08 True
torch_eager cuda_B2_D64_S2048_W4 0.08 True
torch_eager cuda_B2_D64_S512_W2 0.08 True
torch_eager cuda_B2_D64_S512_W4 0.08 True
torch_eager cuda_B4_D2048_S128_W2 0.08 True
torch_eager cuda_B4_D2048_S128_W4 0.08 True
torch_eager cuda_B4_D2048_S2048_W2 0.48 True
torch_eager cuda_B4_D2048_S2048_W4 0.50 True
torch_eager cuda_B4_D2048_S512_W2 0.09 True
torch_eager cuda_B4_D2048_S512_W4 0.10 True
torch_eager cuda_B4_D64_S128_W2 0.08 True
torch_eager cuda_B4_D64_S128_W4 0.08 True
torch_eager cuda_B4_D64_S2048_W2 0.08 True
torch_eager cuda_B4_D64_S2048_W4 0.08 True
torch_eager cuda_B4_D64_S512_W2 0.08 True
torch_eager cuda_B4_D64_S512_W4 0.08 True