% Li, Luo and Yip (2022): journal article in CSIAM Transactions on Applied Mathematics 3(4).
% Names are stored in "Last, First" form; the DOI is stored bare (no resolver prefix).
@Article{CSIAM-AM-3-4,
  author   = {Li, Yuqing and Luo, Tao and Yip, Nung Kwan},
  title    = {Towards an Understanding of Residual Networks Using Neural Tangent Hierarchy ({NTH})},
  journal  = {{CSIAM} Transactions on Applied Mathematics},
  year     = {2022},
  volume   = {3},
  number   = {4},
  pages    = {692--760},
  abstract = {Gradient descent yields zero training loss in polynomial time for deep neural networks despite non-convex nature of the objective function. The behavior of network in the infinite width limit trained by gradient descent can be described by the Neural Tangent Kernel (NTK) introduced in [25]. In this paper, we study dynamics of the NTK for finite width Deep Residual Network (ResNet) using the neural tangent hierarchy (NTH) proposed in [24]. For a ResNet with smooth and Lipschitz activation function, we reduce the requirement on the layer width $m$ with respect to the number of training samples $n$ from quartic to cubic. Our analysis suggests strongly that the particular skip-connection structure of ResNet is the main reason for its triumph over fully-connected network.},
  issn     = {2708-0579},
  doi      = {10.4208/csiam-am.SO-2021-0053},
  url      = {https://global-sci.com/article/82332/towards-an-understanding-of-residual-networks-using-neural-tangent-hierarchy-nth},
}