% Bibliography database: one entry per cited work, ordered alphabetically by
% citation key. Keys follow the pattern Author.Author.Year (with "ea" when more
% authors exist); they are presumably cited elsewhere, so do not rename them.
% Words wrapped in braces inside titles are protected from style recasing.

@InProceedings{Ahmed.Aly.Gonzalez.ea.2012,
  title = {Scalable inference in latent variable models},
  author = {Ahmed, Amr and Aly, Mohamed and Gonzalez, Joseph and Narayanamurthy, Shravan and Smola, Alexander J},
  booktitle = {Proceedings of the fifth ACM international conference on Web search and data mining},
  pages = {123--132},
  year = {2012},
  organization = {ACM}
}

@Article{Aji.McEliece.2000,
  title = {The generalized distributive law},
  author = {Aji, Srinivas M and McEliece, Robert J},
  journal = {IEEE Transactions on Information Theory},
  volume = {46},
  number = {2},
  pages = {325--343},
  year = {2000},
  publisher = {IEEE}
}

@Article{Ba.Kiros.Hinton.2016,
  title = {Layer normalization},
  author = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
  journal = {arXiv preprint arXiv:1607.06450},
  year = {2016}
}

@Article{Bahdanau.Cho.Bengio.2014,
  title = {Neural machine translation by jointly learning to align and translate},
  author = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
  journal = {arXiv preprint arXiv:1409.0473},
  year = {2014}
}

@InProceedings{Bay.Tuytelaars.Van-Gool.2006,
  title = {{SURF}: Speeded up robust features},
  author = {Bay, Herbert and Tuytelaars, Tinne and Van Gool, Luc},
  booktitle = {European conference on computer vision},
  pages = {404--417},
  year = {2006},
  organization = {Springer}
}

@Article{Bengio.Ducharme.Vincent.ea.2003,
  title = {A neural probabilistic language model},
  author = {Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Jauvin, Christian},
  journal = {Journal of Machine Learning Research},
  volume = {3},
  number = {Feb},
  pages = {1137--1155},
  year = {2003}
}

@Article{Bishop.1995,
  title = {Training with noise is equivalent to {Tikhonov} regularization},
  author = {Bishop, Chris M},
  journal = {Neural computation},
  volume = {7},
  number = {1},
  pages = {108--116},
  year = {1995},
  publisher = {MIT Press}
}

@Book{Bishop.2006,
  title = {Pattern recognition and machine learning},
  author = {Bishop, Christopher M},
  year = {2006},
  publisher = {Springer}
}

@InProceedings{Bodla.Singh.Chellappa.ea.2017,
  title = {{Soft-NMS} -- improving object detection with one line of code},
  author = {Bodla, Navaneeth and Singh, Bharat and Chellappa, Rama and Davis, Larry S},
  booktitle = {Proceedings of the IEEE international conference on computer vision},
  pages = {5561--5569},
  year = {2017}
}

@Article{Bojanowski.Grave.Joulin.ea.2017,
  title = {Enriching word vectors with subword information},
  author = {Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
  journal = {Transactions of the Association for Computational Linguistics},
  volume = {5},
  pages = {135--146},
  year = {2017},
  publisher = {MIT Press}
}

@Book{Bollobas.1999,
  title = {Linear analysis},
  author = {Bollob{\'a}s, B{\'e}la},
  year = {1999},
  publisher = {Cambridge University Press},
  address = {Cambridge}
}

@Article{Bowman.Angeli.Potts.ea.2015,
  title = {A large annotated corpus for learning natural language inference},
  author = {Bowman, Samuel R and Angeli, Gabor and Potts, Christopher and Manning, Christopher D},
  journal = {arXiv preprint arXiv:1508.05326},
  year = {2015}
}

@Book{Boyd.Vandenberghe.2004,
  title = {Convex Optimization},
  author = {Boyd, Stephen and Vandenberghe, Lieven},
  year = {2004},
  publisher = {Cambridge University Press},
  address = {Cambridge, England}
}

@InProceedings{Brown.Cocke.Della-Pietra.ea.1988,
  title = {A statistical approach to language translation},
  author = {Brown, Peter F and Cocke, John and Della Pietra, Stephen A and Della Pietra, Vincent J and Jelinek, Frederick and Mercer, Robert L and Roossin, Paul},
  booktitle = {Coling Budapest 1988 Volume 1: International Conference on Computational Linguistics},
  year = {1988}
}

@Article{Brown.Cocke.Della-Pietra.ea.1990,
  title = {A statistical approach to machine translation},
  author = {Brown, Peter F and Cocke, John and Della Pietra, Stephen A and Della Pietra, Vincent J and Jelinek, Frederick and Lafferty, John and Mercer, Robert L and Roossin, Paul S},
  journal = {Computational linguistics},
  volume = {16},
  number = {2},
  pages = {79--85},
  year = {1990}
}

@InProceedings{Brown.Sandholm.2017,
  title = {{Libratus}: The Superhuman {AI} for No-Limit Poker},
  author = {Brown, Noam and Sandholm, Tuomas},
  booktitle = {IJCAI},
  pages = {5226--5228},
  year = {2017}
}

@Article{Campbell.Hoane-Jr.Hsu.2002,
  title = {{Deep Blue}},
  author = {Campbell, Murray and Hoane, Jr, A Joseph and Hsu, Feng-hsiung},
  journal = {Artificial intelligence},
  volume = {134},
  number = {1--2},
  pages = {57--83},
  year = {2002},
  publisher = {Elsevier}
}

@InCollection{Canny.1987,
  title = {A computational approach to edge detection},
  author = {Canny, John},
  booktitle = {Readings in computer vision},
  pages = {184--203},
  year = {1987},
  publisher = {Elsevier}
}

@InProceedings{Cer.Diab.Agirre.ea.2017,
  title = {{SemEval-2017} Task 1: Semantic Textual Similarity Multilingual and Crosslingual Focused Evaluation},
  author = {Cer, Daniel and Diab, Mona and Agirre, Eneko and Lopez-Gazpio, I{\~n}igo and Specia, Lucia},
  booktitle = {Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017)},
  pages = {1--14},
  year = {2017}
}

@InProceedings{Cheng.Dong.Lapata.2016,
  title = {Long Short-Term Memory-Networks for Machine Reading},
  author = {Cheng, Jianpeng and Dong, Li and Lapata, Mirella},
  booktitle = {Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing},
  pages = {551--561},
  year = {2016}
}

@Article{Cho.Van-Merrienboer.Bahdanau.ea.2014,
  title = {On the properties of neural machine translation: Encoder-decoder approaches},
  author = {Cho, Kyunghyun and Van Merri{\"e}nboer, Bart and Bahdanau, Dzmitry and Bengio, Yoshua},
  journal = {arXiv preprint arXiv:1409.1259},
  year = {2014}
}

@Article{Cho.Van-Merrienboer.Gulcehre.ea.2014,
  title = {Learning phrase representations using {RNN} encoder-decoder for statistical machine translation},
  author = {Cho, Kyunghyun and Van Merri{\"e}nboer, Bart and Gulcehre, Caglar and Bahdanau, Dzmitry and Bougares, Fethi and Schwenk, Holger and Bengio, Yoshua},
  journal = {arXiv preprint arXiv:1406.1078},
  year = {2014}
}

@Book{Chowdhury.2010,
  title = {Introduction to modern information retrieval},
  author = {Chowdhury, Gobinda G},
  year = {2010},
  publisher = {Facet Publishing}
}

@Article{Chung.Gulcehre.Cho.ea.2014,
  title = {Empirical evaluation of gated recurrent neural networks on sequence modeling},
  author = {Chung, Junyoung and Gulcehre, Caglar and Cho, Kyunghyun and Bengio, Yoshua},
  journal = {arXiv preprint arXiv:1412.3555},
  year = {2014}
}

@Article{Collobert.Weston.Bottou.ea.2011,
  title = {Natural language processing (almost) from scratch},
  author = {Collobert, Ronan and Weston, Jason and Bottou, L{\'e}on and Karlen, Michael and Kavukcuoglu, Koray and Kuksa, Pavel},
  journal = {Journal of Machine Learning Research},
  volume = {12},
  number = {Aug},
  pages = {2493--2537},
  year = {2011}
}

@Article{Csiszar.2008,
  title = {Axiomatic characterizations of information measures},
  author = {Csisz{\'a}r, Imre},
  journal = {Entropy},
  volume = {10},
  number = {3},
  pages = {261--273},
  year = {2008},
  publisher = {Molecular Diversity Preservation International}
}

@InProceedings{Dalal.Triggs.2005,
  title = {Histograms of oriented gradients for human detection},
  author = {Dalal, Navneet and Triggs, Bill},
  booktitle = {2005 IEEE computer society conference on computer vision and pattern recognition (CVPR'05)},
  volume = {1},
  pages = {886--893},
  year = {2005},
  organization = {IEEE}
}

@Article{De-Cock.2011,
  title = {{Ames, Iowa}: Alternative to the {Boston} housing data as an end of semester regression project},
  author = {De Cock, Dean},
  journal = {Journal of Statistics Education},
  volume = {19},
  number = {3},
  year = {2011},
  publisher = {Taylor \& Francis}
}

@InProceedings{DeCandia.Hastorun.Jampani.ea.2007,
  title = {Dynamo: {Amazon}'s highly available key-value store},
  author = {DeCandia, Giuseppe and Hastorun, Deniz and Jampani, Madan and Kakulapati, Gunavardhan and Lakshman, Avinash and Pilchin, Alex and Sivasubramanian, Swaminathan and Vosshall, Peter and Vogels, Werner},
  booktitle = {ACM SIGOPS operating systems review},
  volume = {41},
  number = {6},
  pages = {205--220},
  year = {2007},
  organization = {ACM}
}

@Article{Devlin.Chang.Lee.ea.2018,
  title = {{BERT}: Pre-training of deep bidirectional transformers for language understanding},
  author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  journal = {arXiv preprint arXiv:1810.04805},
  year = {2018}
}

@InProceedings{Doersch.Gupta.Efros.2015,
  title = {Unsupervised visual representation learning by context prediction},
  author = {Doersch, Carl and Gupta, Abhinav and Efros, Alexei A},
  booktitle = {Proceedings of the IEEE international conference on computer vision},
  pages = {1422--1430},
  year = {2015}
}

@InProceedings{Dosovitskiy.Beyer.Kolesnikov.ea.2021,
  title = {An image is worth 16x16 words: Transformers for image recognition at scale},
  author = {Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and others},
  booktitle = {International Conference on Learning Representations},
  year = {2021}
}

@InCollection{Doucet.De-Freitas.Gordon.2001,
  title = {An introduction to sequential {Monte Carlo} methods},
  author = {Doucet, Arnaud and De Freitas, Nando and Gordon, Neil},
  booktitle = {Sequential Monte Carlo methods in practice},
  pages = {3--14},
  year = {2001},
  publisher = {Springer}
}

@Article{Duchi.Hazan.Singer.2011,
  title = {Adaptive subgradient methods for online learning and stochastic optimization},
  author = {Duchi, John and Hazan, Elad and Singer, Yoram},
  journal = {Journal of Machine Learning Research},
  volume = {12},
  number = {Jul},
  pages = {2121--2159},
  year = {2011}
}

@Article{Dumoulin.Visin.2016,
  title = {A guide to convolution arithmetic for deep learning},
  author = {Dumoulin, Vincent and Visin, Francesco},
  journal = {arXiv preprint arXiv:1603.07285},
  year = {2016}
}

@Article{Edelman.Ostrovsky.Schwarz.2007,
  title = {Internet advertising and the generalized second-price auction: Selling billions of dollars worth of keywords},
  author = {Edelman, Benjamin and Ostrovsky, Michael and Schwarz, Michael},
  journal = {American economic review},
  volume = {97},
  number = {1},
  pages = {242--259},
  year = {2007}
}

@InProceedings{Flammarion.Bach.2015,
  title = {From averaging to acceleration, there is only a step-size},
  author = {Flammarion, Nicolas and Bach, Francis},
  booktitle = {Conference on Learning Theory},
  pages = {658--695},
  year = {2015}
}

@InProceedings{Gatys.Ecker.Bethge.2016,
  title = {Image style transfer using convolutional neural networks},
  author = {Gatys, Leon A and Ecker, Alexander S and Bethge, Matthias},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages = {2414--2423},
  year = {2016}
}

@Article{Ginibre.1965,
  title = {Statistical ensembles of complex, quaternion, and real matrices},
  author = {Ginibre, Jean},
  journal = {Journal of Mathematical Physics},
  volume = {6},
  number = {3},
  pages = {440--449},
  year = {1965},
  publisher = {AIP}
}

@InProceedings{Girshick.2015,
  title = {{Fast R-CNN}},
  author = {Girshick, Ross},
  booktitle = {Proceedings of the IEEE international conference on computer vision},
  pages = {1440--1448},
  year = {2015}
}

@InProceedings{Girshick.Donahue.Darrell.ea.2014,
  title = {Rich feature hierarchies for accurate object detection and semantic segmentation},
  author = {Girshick, Ross and Donahue, Jeff and Darrell, Trevor and Malik, Jitendra},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages = {580--587},
  year = {2014}
}

@InProceedings{Glorot.Bengio.2010,
  title = {Understanding the difficulty of training deep feedforward neural networks},
  author = {Glorot, Xavier and Bengio, Yoshua},
  booktitle = {Proceedings of the thirteenth international conference on artificial intelligence and statistics},
  pages = {249--256},
  year = {2010}
}

@Article{Goh.2017,
  title = {Why Momentum Really Works},
  author = {Goh, Gabriel},
  journal = {Distill},
  year = {2017},
  url = {http://distill.pub/2017/momentum},
  doi = {10.23915/distill.00006}
}

@Article{Goldberg.Nichols.Oki.ea.1992,
  title = {Using collaborative filtering to weave an information tapestry},
  author = {Goldberg, David and Nichols, David and Oki, Brian M and Terry, Douglas},
  journal = {Communications of the ACM},
  volume = {35},
  number = {12},
  pages = {61--71},
  year = {1992},
  publisher = {Association for Computing Machinery, Inc.}
}

@Book{Goodfellow.Bengio.Courville.2016,
  title = {Deep Learning},
  author = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
  publisher = {MIT Press},
  note = {\url{http://www.deeplearningbook.org}},
  year = {2016}
}

@InProceedings{Goodfellow.Pouget-Abadie.Mirza.ea.2014,
  title = {Generative adversarial nets},
  author = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua},
  booktitle = {Advances in neural information processing systems},
  pages = {2672--2680},
  year = {2014}
}

@Article{Gotmare.Keskar.Xiong.ea.2018,
  title = {A Closer Look at Deep Learning Heuristics: Learning rate restarts, Warmup and Distillation},
  author = {Gotmare, Akhilesh and Keskar, Nitish Shirish and Xiong, Caiming and Socher, Richard},
  journal = {arXiv preprint arXiv:1810.13243},
  year = {2018}
}

@Article{Graves.2013,
  title = {Generating sequences with recurrent neural networks},
  author = {Graves, Alex},
  journal = {arXiv preprint arXiv:1308.0850},
  year = {2013}
}

@Article{Graves.Schmidhuber.2005,
  title = {Framewise phoneme classification with bidirectional {LSTM} and other neural network architectures},
  author = {Graves, Alex and Schmidhuber, J{\"u}rgen},
  journal = {Neural networks},
  volume = {18},
  number = {5--6},
  pages = {602--610},
  year = {2005},
  publisher = {Elsevier}
}
@InCollection{Gunawardana.Shani.2015,
  title = {Evaluating recommender systems},
  author = {Gunawardana, Asela and Shani, Guy},
  booktitle = {Recommender systems handbook},
  pages = {265--308},
  year = {2015},
  publisher = {Springer}
}

@InProceedings{Guo.Tang.Ye.ea.2017,
  title = {{DeepFM}: a factorization-machine based neural network for {CTR} prediction},
  author = {Guo, Huifeng and Tang, Ruiming and Ye, Yunming and Li, Zhenguo and He, Xiuqiang},
  booktitle = {Proceedings of the 26th International Joint Conference on Artificial Intelligence},
  pages = {1725--1731},
  year = {2017},
  organization = {AAAI Press}
}

@Article{Hadjis.Zhang.Mitliagkas.ea.2016,
  title = {Omnivore: An optimizer for multi-device deep learning on {CPUs} and {GPUs}},
  author = {Hadjis, Stefan and Zhang, Ce and Mitliagkas, Ioannis and Iter, Dan and R{\'e}, Christopher},
  journal = {arXiv preprint arXiv:1606.04487},
  year = {2016}
}

@InProceedings{Hazan.Rakhlin.Bartlett.2008,
  title = {Adaptive online gradient descent},
  author = {Hazan, Elad and Rakhlin, Alexander and Bartlett, Peter L},
  booktitle = {Advances in Neural Information Processing Systems},
  pages = {65--72},
  year = {2008}
}

@InProceedings{He.Chua.2017,
  title = {Neural factorization machines for sparse predictive analytics},
  author = {He, Xiangnan and Chua, Tat-Seng},
  booktitle = {Proceedings of the 40th International ACM SIGIR conference on Research and Development in Information Retrieval},
  pages = {355--364},
  year = {2017},
  organization = {ACM}
}

@InProceedings{He.Gkioxari.Dollar.ea.2017,
  title = {{Mask R-CNN}},
  author = {He, Kaiming and Gkioxari, Georgia and Doll{\'a}r, Piotr and Girshick, Ross},
  booktitle = {Proceedings of the IEEE international conference on computer vision},
  pages = {2961--2969},
  year = {2017}
}

@InProceedings{He.Liao.Zhang.ea.2017,
  title = {Neural collaborative filtering},
  author = {He, Xiangnan and Liao, Lizi and Zhang, Hanwang and Nie, Liqiang and Hu, Xia and Chua, Tat-Seng},
  booktitle = {Proceedings of the 26th international conference on world wide web},
  pages = {173--182},
  year = {2017},
  organization = {International World Wide Web Conferences Steering Committee}
}

@InProceedings{He.Zhang.Ren.ea.2015,
  title = {Delving deep into rectifiers: Surpassing human-level performance on {ImageNet} classification},
  author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle = {Proceedings of the IEEE international conference on computer vision},
  pages = {1026--1034},
  year = {2015}
}

@InProceedings{He.Zhang.Ren.ea.2016,
  title = {Deep residual learning for image recognition},
  author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages = {770--778},
  year = {2016}
}

@InProceedings{He.Zhang.Ren.ea.2016*1,
  title = {Identity mappings in deep residual networks},
  author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle = {European conference on computer vision},
  pages = {630--645},
  year = {2016},
  organization = {Springer}
}

% NOTE(review): the volume value below looks like an exporter artifact for a
% single-volume monograph -- TODO confirm and drop if so.
@Book{Hebb.Hebb.1949,
  title = {The organization of behavior},
  author = {Hebb, Donald Olding},
  volume = {65},
  year = {1949},
  publisher = {Wiley},
  address = {New York}
}

@Article{Hendrycks.Gimpel.2016,
  title = {Gaussian error linear units ({GELUs})},
  author = {Hendrycks, Dan and Gimpel, Kevin},
  journal = {arXiv preprint arXiv:1606.08415},
  year = {2016}
}

@Book{Hennessy.Patterson.2011,
  title = {Computer architecture: a quantitative approach},
  author = {Hennessy, John L and Patterson, David A},
  year = {2011},
  publisher = {Elsevier}
}

@InProceedings{Herlocker.Konstan.Borchers.ea.1999,
  title = {An algorithmic framework for performing collaborative filtering},
  author = {Herlocker, Jonathan L and Konstan, Joseph A and Borchers, Al and Riedl, John},
  booktitle = {22nd Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, SIGIR 1999},
  pages = {230--237},
  year = {1999},
  organization = {Association for Computing Machinery, Inc}
}

@Article{Hidasi.Karatzoglou.Baltrunas.ea.2015,
  title = {Session-based recommendations with recurrent neural networks},
  author = {Hidasi, Bal{\'a}zs and Karatzoglou, Alexandros and Baltrunas, Linas and Tikk, Domonkos},
  journal = {arXiv preprint arXiv:1511.06939},
  year = {2015}
}

@InCollection{Hochreiter.Bengio.Frasconi.ea.2001,
  title = {Gradient flow in recurrent nets: the difficulty of learning long-term dependencies},
  author = {Hochreiter, Sepp and Bengio, Yoshua and Frasconi, Paolo and Schmidhuber, J{\"u}rgen},
  booktitle = {A field guide to dynamical recurrent neural networks},
  year = {2001},
  publisher = {IEEE Press}
}

@Article{Hochreiter.Schmidhuber.1997,
  title = {Long short-term memory},
  author = {Hochreiter, Sepp and Schmidhuber, J{\"u}rgen},
  journal = {Neural computation},
  volume = {9},
  number = {8},
  pages = {1735--1780},
  year = {1997},
  publisher = {MIT Press}
}

@InProceedings{Hoyer.Janzing.Mooij.ea.2009,
  title = {Nonlinear causal discovery with additive noise models},
  author = {Hoyer, Patrik O and Janzing, Dominik and Mooij, Joris M and Peters, Jonas and Sch{\"o}lkopf, Bernhard},
  booktitle = {Advances in neural information processing systems},
  pages = {689--696},
  year = {2009}
}

@InProceedings{Hu.Koren.Volinsky.2008,
  title = {Collaborative filtering for implicit feedback datasets},
  author = {Hu, Yifan and Koren, Yehuda and Volinsky, Chris},
  booktitle = {2008 Eighth IEEE International Conference on Data Mining},
  pages = {263--272},
  year = {2008},
  organization = {IEEE}
}

@Article{Hu.Lee.Aggarwal.ea.2020,
  title = {Text Style Transfer: A Review and Experimental Evaluation},
  author = {Hu, Zhiqiang and Lee, Roy Ka-Wei and Aggarwal, Charu C and Zhang, Aston},
  journal = {arXiv preprint arXiv:2010.12742},
  year = {2020}
}

@InProceedings{Hu.Shen.Sun.2018,
  title = {Squeeze-and-excitation networks},
  author = {Hu, Jie and Shen, Li and Sun, Gang},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages = {7132--7141},
  year = {2018}
}

@InProceedings{Huang.Liu.Van-Der-Maaten.ea.2017,
  title = {Densely connected convolutional networks},
  author = {Huang, Gao and Liu, Zhuang and Van Der Maaten, Laurens and Weinberger, Kilian Q},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages = {4700--4708},
  year = {2017}
}

@InProceedings{Ioffe.2017,
  title = {Batch renormalization: Towards reducing minibatch dependence in batch-normalized models},
  author = {Ioffe, Sergey},
  booktitle = {Advances in neural information processing systems},
  pages = {1945--1953},
  year = {2017}
}

@Article{Ioffe.Szegedy.2015,
  title = {Batch normalization: Accelerating deep network training by reducing internal covariate shift},
  author = {Ioffe, Sergey and Szegedy, Christian},
  journal = {arXiv preprint arXiv:1502.03167},
  year = {2015}
}

@Article{Izmailov.Podoprikhin.Garipov.ea.2018,
  title = {Averaging weights leads to wider optima and better generalization},
  author = {Izmailov, Pavel and Podoprikhin, Dmitrii and Garipov, Timur and Vetrov, Dmitry and Wilson, Andrew Gordon},
  journal = {arXiv preprint arXiv:1803.05407},
  year = {2018}
}

@Book{Jaeger.2002,
  title = {Tutorial on training recurrent neural networks, covering {BPPT}, {RTRL}, {EKF} and the ``echo state network'' approach},
  author = {Jaeger, Herbert},
  volume = {5},
  year = {2002},
  publisher = {GMD-Forschungszentrum Informationstechnik Bonn}
}

@Book{James.2007,
  title = {The principles of psychology},
  author = {James, William},
  volume = {1},
  year = {2007},
  publisher = {Cosimo, Inc.}
}

@Article{Jia.Song.He.ea.2018,
  title = {Highly scalable deep learning training system with mixed-precision: Training {ImageNet} in four minutes},
  author = {Jia, Xianyan and Song, Shutao and He, Wei and Wang, Yangzihao and Rong, Haidong and Zhou, Feihu and Xie, Liqiang and Guo, Zhenyu and Yang, Yuanzhou and Yu, Liwei and others},
  journal = {arXiv preprint arXiv:1807.11205},
  year = {2018}
}

@InProceedings{Jouppi.Young.Patil.ea.2017,
  title = {In-datacenter performance analysis of a tensor processing unit},
  author = {Jouppi, Norman P and Young, Cliff and Patil, Nishant and Patterson, David and Agrawal, Gaurav and Bajwa, Raminder and Bates, Sarah and Bhatia, Suresh and Boden, Nan and Borchers, Al and others},
  booktitle = {2017 ACM/IEEE 44th Annual International Symposium on Computer Architecture (ISCA)},
  pages = {1--12},
  year = {2017},
  organization = {IEEE}
}

@Article{Karras.Aila.Laine.ea.2017,
  title = {Progressive growing of {GANs} for improved quality, stability, and variation},
  author = {Karras, Tero and Aila, Timo and Laine, Samuli and Lehtinen, Jaakko},
  journal = {arXiv preprint arXiv:1710.10196},
  year = {2017}
}

@Article{Kim.2014,
  title = {Convolutional neural networks for sentence classification},
  author = {Kim, Yoon},
  journal = {arXiv preprint arXiv:1408.5882},
  year = {2014}
}

@Article{Kingma.Ba.2014,
  title = {Adam: A method for stochastic optimization},
  author = {Kingma, Diederik P and Ba, Jimmy},
  journal = {arXiv preprint arXiv:1412.6980},
  year = {2014}
}

@Book{Koller.Friedman.2009,
  title = {Probabilistic graphical models: principles and techniques},
  author = {Koller, Daphne and Friedman, Nir},
  year = {2009},
  publisher = {MIT Press}
}

% NOTE(review): the journal value below is a truncated "Available online" URL
% left by the exporter -- TODO replace with the real location once confirmed.
@Article{Kolter.2008,
  title = {Linear Algebra Review and Reference},
  author = {Kolter, Zico},
  journal = {Available online: http},
  year = {2008}
}

@InProceedings{Koren.2009,
  title = {Collaborative filtering with temporal dynamics},
  author = {Koren, Yehuda},
  booktitle = {Proceedings of the 15th ACM SIGKDD international conference on Knowledge discovery and data mining},
  pages = {447--456},
  year = {2009},
  organization = {ACM}
}

@Article{Koren.Bell.Volinsky.2009,
  title = {Matrix factorization techniques for recommender systems},
  author = {Koren, Yehuda and Bell, Robert and Volinsky, Chris},
  journal = {Computer},
  volume = {42},
  number = {8},
  pages = {30--37},
  year = {2009},
  publisher = {IEEE}
}

@InProceedings{Krizhevsky.Sutskever.Hinton.2012,
  title = {{ImageNet} classification with deep convolutional neural networks},
  author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
  booktitle = {Advances in neural information processing systems},
  pages = {1097--1105},
  year = {2012}
}

@Book{Kung.1988,
  title = {{VLSI} array processors},
  author = {Kung, Sun Yuan},
  year = {1988},
  publisher = {Prentice Hall},
  address = {Englewood Cliffs, NJ}
}

@Article{LeCun.Bottou.Bengio.ea.1998,
  title = {Gradient-based learning applied to document recognition},
  author = {LeCun, Yann and Bottou, L{\'e}on and Bengio, Yoshua and Haffner, Patrick},
  journal = {Proceedings of the IEEE},
  volume = {86},
  number = {11},
  pages = {2278--2324},
  year = {1998},
  publisher = {IEEE}
}

@PhDThesis{Li.2017,
  title = {Scaling Distributed Machine Learning with System and Algorithm Co-design},
  author = {Li, Mu},
  year = {2017},
  school = {Carnegie Mellon University}
}

@InProceedings{Li.Andersen.Park.ea.2014,
  title = {Scaling distributed machine learning with the parameter server},
  author = {Li, Mu and Andersen, David G and Park, Jun Woo and Smola, Alexander J and Ahmed, Amr and Josifovski, Vanja and Long, James and Shekita, Eugene J and Su, Bor-Yiing},
  booktitle = {11th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 14)},
  pages = {583--598},
  year = {2014}
}

@Article{Lin.Chen.Yan.2013,
  title = {Network in network},
  author = {Lin, Min and Chen, Qiang and Yan, Shuicheng},
  journal = {arXiv preprint arXiv:1312.4400},
  year = {2013}
}

@Article{Lin.Feng.Santos.ea.2017,
  title = {A structured self-attentive sentence embedding},
  author = {Lin, Zhouhan and Feng, Minwei and dos Santos, Cicero Nogueira and Yu, Mo and Xiang, Bing and Zhou, Bowen and Bengio, Yoshua},
  journal = {arXiv preprint arXiv:1703.03130},
  year = {2017}
}

@InProceedings{Lin.Goyal.Girshick.ea.2017,
  title = {Focal loss for dense object detection},
  author = {Lin, Tsung-Yi and Goyal, Priya and Girshick, Ross and He, Kaiming and Doll{\'a}r, Piotr},
  booktitle = {Proceedings of the IEEE international conference on computer vision},
  pages = {2980--2988},
  year = {2017}
}

@Article{Lin.Lv.Zhu.ea.2010,
  title = {{ImageNet} classification: fast descriptor coding and large-scale {SVM} training},
  author = {Lin, Yuanqing and Lv, F and Zhu, S and Yang, M and Cour, T and Yu, K and Cao, L and Li, Z and Tsai, MH and Zhou, X and others},
  journal = {Large scale visual recognition challenge},
  year = {2010}
}

@Article{Lipton.Steinhardt.2018,
  title = {Troubling trends in machine learning scholarship},
  author = {Lipton, Zachary C and Steinhardt, Jacob},
  journal = {arXiv preprint arXiv:1807.03341},
  year = {2018}
}

@InProceedings{Liu.Anguelov.Erhan.ea.2016,
  title = {{SSD}: Single shot multibox detector},
  author = {Liu, Wei and Anguelov, Dragomir and Erhan, Dumitru and Szegedy, Christian and Reed, Scott and Fu, Cheng-Yang and Berg, Alexander C},
  booktitle = {European conference on computer vision},
  pages = {21--37},
  year = {2016},
  organization = {Springer}
}

@Article{Liu.Ott.Goyal.ea.2019,
  title = {{RoBERTa}: A robustly optimized {BERT} pretraining approach},
  author = {Liu, Yinhan and Ott, Myle and Goyal, Naman and Du, Jingfei and Joshi, Mandar and Chen, Danqi and Levy, Omer and Lewis, Mike and Zettlemoyer, Luke and Stoyanov, Veselin},
  journal = {arXiv preprint arXiv:1907.11692},
  year = {2019}
}

@InProceedings{Long.Shelhamer.Darrell.2015,
  title = {Fully convolutional networks for semantic segmentation},
  author = {Long, Jonathan and Shelhamer, Evan and Darrell, Trevor},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages = {3431--3440},
  year = {2015}
}

@Article{Loshchilov.Hutter.2016,
  title = {{SGDR}: Stochastic gradient descent with warm restarts},
  author = {Loshchilov, Ilya and Hutter, Frank},
  journal = {arXiv preprint arXiv:1608.03983},
  year = {2016}
}

@Article{Lowe.2004,
  title = {Distinctive image features from scale-invariant keypoints},
  author = {Lowe, David G},
  journal = {International journal of computer vision},
  volume = {60},
  number = {2},
  pages = {91--110},
  year = {2004},
  publisher = {Springer}
}

@Article{Luo.Wang.Shao.ea.2018,
  title = {Towards understanding regularization in batch normalization},
  author = {Luo, Ping and Wang, Xinjiang and Shao, Wenqi and Peng, Zhanglin},
  journal = {arXiv preprint arXiv:1809.00846},
  year = {2018}
}

@InProceedings{Maas.Daly.Pham.ea.2011,
  title = {Learning word vectors for sentiment analysis},
  author = {Maas, Andrew L and Daly, Raymond E and Pham, Peter T and Huang, Dan and Ng, Andrew Y and Potts, Christopher},
  booktitle = {Proceedings of the 49th annual meeting of the association for computational linguistics: Human language technologies-volume 1},
  pages = {142--150},
  year = {2011},
  organization = {Association for Computational Linguistics}
}

@InProceedings{McCann.Bradbury.Xiong.ea.2017,
  title = {Learned in translation: Contextualized word vectors},
  author = {McCann, Bryan and Bradbury, James and Xiong, Caiming and Socher, Richard},
  booktitle = {Advances in Neural Information Processing Systems},
  pages = {6294--6305},
  year = {2017}
}

@Article{McCulloch.Pitts.1943,
  title = {A logical calculus of the ideas immanent in nervous activity},
  author = {McCulloch, Warren S and Pitts, Walter},
  journal = {The bulletin of mathematical biophysics},
  volume = {5},
  number = {4},
  pages = {115--133},
  year = {1943},
  publisher = {Springer}
}

@InProceedings{McMahan.Holt.Sculley.ea.2013,
  title = {Ad click prediction: a view from the trenches},
  author = {McMahan, H Brendan and Holt, Gary and Sculley, David and Young, Michael and Ebner, Dietmar and Grady, Julian and Nie, Lan and Phillips, Todd and Davydov, Eugene and Golovin, Daniel and others},
  booktitle = {Proceedings of the 19th ACM SIGKDD international conference on Knowledge discovery and data mining},
  pages = {1222--1230},
  year = {2013},
  organization = {ACM}
}

@Article{Merity.Xiong.Bradbury.ea.2016,
  title = {Pointer sentinel mixture models},
  author = {Merity, Stephen and Xiong, Caiming and Bradbury, James and Socher, Richard},
  journal = {arXiv preprint arXiv:1609.07843},
  year = {2016}
}

@Article{Mikolov.Chen.Corrado.ea.2013,
  title = {Efficient estimation of word representations in vector space},
  author = {Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
  journal = {arXiv preprint arXiv:1301.3781},
  year = {2013}
}

@InProceedings{Mikolov.Sutskever.Chen.ea.2013,
  title = {Distributed representations of words and phrases and their compositionality},
  author = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S and Dean, Jeff},
  booktitle = {Advances in neural information processing systems},
  pages = {3111--3119},
  year = {2013}
}

@InProceedings{Mirhoseini.Pham.Le.ea.2017,
  title = {Device placement optimization with reinforcement learning},
  author = {Mirhoseini, Azalia and Pham, Hieu and Le, Quoc V and Steiner, Benoit and Larsen, Rasmus and Zhou, Yuefeng and Kumar, Naveen and Norouzi, Mohammad and Bengio, Samy and Dean, Jeff},
  booktitle = {Proceedings of the 34th International Conference on Machine Learning-Volume 70},
  pages = {2430--2439},
  year = {2017},
  organization = {JMLR.org}
}

@InProceedings{Mnih.Heess.Graves.ea.2014,
  title = {Recurrent models of visual attention},
  author = {Mnih, Volodymyr and Heess, Nicolas and Graves, Alex and Kavukcuoglu, Koray},
  booktitle = {Advances in neural information processing systems},
  pages = {2204--2212},
  year = {2014}
}

@Article{Morey.Hoekstra.Rouder.ea.2016,
  title = {The fallacy of placing confidence in confidence intervals},
  author = {Morey, Richard D and Hoekstra, Rink and Rouder, Jeffrey N and Lee, Michael D and Wagenmakers, Eric-Jan},
  journal = {Psychonomic bulletin \& review},
  volume = {23},
  number = {1},
  pages = {103--123},
  year = {2016},
  publisher = {Springer}
}

@Article{Nadaraya.1964,
  title = {On estimating regression},
  author = {Nadaraya, Elizbar A},
  journal = {Theory of Probability \& Its Applications},
  volume = {9},
  number = {1},
  pages = {141--142},
  year = {1964},
  publisher = {SIAM}
}

@Book{Nesterov.2018,
  title = {Lectures on convex optimization},
  author = {Nesterov, Yurii},
  volume = {137},
  year = {2018},
  publisher = {Springer}
}

@Misc{Nesterov.Vial.2000,
  title = {Confidence level solutions for stochastic programming},
  author = {Nesterov, Yurii and Vial, Jean-Philippe},
  howpublished = {Stochastic Programming E-Print Series},
  year = {2000}
}

@Article{Neyman.1937,
  title = {Outline of a theory of statistical estimation based on the classical theory of probability},
  author = {Neyman, Jerzy},
  journal = {Philosophical Transactions of the Royal Society of London. Series A, Mathematical and Physical Sciences},
  volume = {236},
  number = {767},
  pages = {333--380},
  year = {1937},
  publisher = {The Royal Society London}
}

@InProceedings{Papineni.Roukos.Ward.ea.2002,
  title = {{BLEU}: a method for automatic evaluation of machine translation},
  author = {Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing},
  booktitle = {Proceedings of the 40th annual meeting of the Association for Computational Linguistics},
  pages = {311--318},
  year = {2002}
}

@Article{Parikh.Tackstrom.Das.ea.2016,
  title = {A decomposable attention model for natural language inference},
  author = {Parikh, Ankur P and T{\"a}ckstr{\"o}m, Oscar and Das, Dipanjan and Uszkoreit, Jakob},
  journal = {arXiv preprint arXiv:1606.01933},
  year = {2016}
}

@InProceedings{Park.Liu.Wang.ea.2019,
  title = {Semantic image synthesis with spatially-adaptive normalization},
  author = {Park, Taesung and Liu, Ming-Yu and Wang, Ting-Chun and Zhu, Jun-Yan},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages = {2337--2346},
  year = {2019}
}

@Article{Paulus.Xiong.Socher.2017,
  title = {A deep reinforced model for abstractive summarization},
  author = {Paulus, Romain and Xiong, Caiming and Socher, Richard},
  journal = {arXiv preprint arXiv:1705.04304},
  year = {2017}
}

@InProceedings{Pennington.Schoenholz.Ganguli.2017,
  title = {Resurrecting the sigmoid in deep learning through dynamical isometry: theory and practice},
  author = {Pennington, Jeffrey and Schoenholz, Samuel and Ganguli, Surya},
  booktitle = {Advances in neural information processing systems},
  pages = {4785--4795},
  year = {2017}
}

@InProceedings{Pennington.Socher.Manning.2014,
  title = {{GloVe}: Global vectors for word representation},
  author = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher},
  booktitle = {Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP)},
  pages = {1532--1543},
  year = {2014}
}

@InProceedings{Peters.Ammar.Bhagavatula.ea.2017,
  title = {Semi-supervised sequence tagging with bidirectional language models},
  author = {Peters, Matthew and Ammar, Waleed and Bhagavatula, Chandra and Power, Russell},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages = {1756--1765},
  year = {2017}
}

@Book{Peters.Janzing.Scholkopf.2017,
  title = {Elements of causal inference: foundations and learning algorithms},
  author = {Peters, Jonas and Janzing, Dominik and Sch{\"o}lkopf, Bernhard},
  year = {2017},
  publisher = {MIT Press}
}

@InProceedings{Peters.Neumann.Iyyer.ea.2018,
  title = {Deep Contextualized Word Representations},
  author = {Peters, Matthew and Neumann, Mark and Iyyer, Mohit and Gardner, Matt and Clark, Christopher and Lee, Kenton and Zettlemoyer, Luke},
  booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
  pages = {2227--2237},
  year = {2018}
}

% NOTE(review): the journal value below names an institution, and the
% volume/number/pages values look like exporter artifacts -- TODO confirm and
% consider converting this to a report-style entry.
@Article{Petersen.Pedersen.ea.2008,
  title = {The matrix cookbook},
  author = {Petersen, Kaare Brandt and Pedersen, Michael Syskind and others},
  journal = {Technical University of Denmark},
  volume = {7},
  number = {15},
  pages = {510},
  year = {2008}
}

@Article{Polyak.1964,
  title = {Some methods of speeding up the convergence of iteration methods},
  author = {Polyak, Boris T},
  journal = {USSR Computational Mathematics and Mathematical Physics},
  volume = {4},
  number = {5},
  pages = {1--17},
  year = {1964},
  publisher = {Elsevier}
}

@Article{Quadrana.Cremonesi.Jannach.2018,
  title = {Sequence-aware recommender systems},
  author = {Quadrana, Massimo and Cremonesi, Paolo and Jannach, Dietmar},
  journal = {ACM Computing Surveys (CSUR)},
  volume = {51},
  number = {4},
  pages = {66},
  year = {2018},
  publisher = {ACM}
}

@Article{ Radford.Metz.Chintala.2015, title = {Unsupervised representation learning with deep convolutional generative 
adversarial networks}, author = {Radford, Alec and Metz, Luke and Chintala, Soumith}, journal = {arXiv preprint arXiv:1511.06434}, year = {2015} } @Article{ Radford.Narasimhan.Salimans.ea.2018, title = {Improving language understanding by generative pre-training}, author = {Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya}, journal = {OpenAI}, year = {2018} } @Article{ Radford.Wu.Child.ea.2019, title = {Language models are unsupervised multitask learners}, author = {Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya}, journal = {OpenAI Blog}, volume = {1}, number = {8}, pages = {9}, year = {2019} } @Article{ Rajpurkar.Zhang.Lopyrev.ea.2016, title = {Squad: 100,000+ questions for machine comprehension of text}, author = {Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin and Liang, Percy}, journal = {arXiv preprint arXiv:1606.05250}, year = {2016} } @Article{ Reddi.Kale.Kumar.2019, title = {On the convergence of Adam and beyond}, author = {Reddi, Sashank J and Kale, Satyen and Kumar, Sanjiv}, journal = {arXiv preprint arXiv:1904.09237}, year = {2019} } @InProceedings{ Redmon.Divvala.Girshick.ea.2016, title = {You only look once: Unified, real-time object detection}, author = {Redmon, Joseph and Divvala, Santosh and Girshick, Ross and Farhadi, Ali}, booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition}, pages = {779--788}, year = {2016} } @Article{ Reed.De-Freitas.2015, title = {Neural programmer-interpreters}, author = {Reed, Scott and De Freitas, Nando}, journal = {arXiv preprint arXiv:1511.06279}, year = {2015} } @InProceedings{ Ren.He.Girshick.ea.2015, title = {Faster r-cnn: Towards real-time object detection with region proposal networks}, author = {Ren, Shaoqing and He, Kaiming and Girshick, Ross and Sun, Jian}, booktitle = {Advances in neural information processing systems}, pages = {91--99}, year = {2015} } @InProceedings{ Rendle.2010, 
title = {Factorization machines}, author = {Rendle, Steffen}, booktitle = {2010 IEEE International Conference on Data Mining}, pages = {995--1000}, year = {2010}, organization = {IEEE} } @InProceedings{ Rendle.Freudenthaler.Gantner.ea.2009, title = {BPR: Bayesian personalized ranking from implicit feedback}, author = {Rendle, Steffen and Freudenthaler, Christoph and Gantner, Zeno and Schmidt-Thieme, Lars}, booktitle = {Proceedings of the twenty-fifth conference on uncertainty in artificial intelligence}, pages = {452--461}, year = {2009}, organization = {AUAI Press} } @Article{ Rumelhart.Hinton.Williams.ea.1988, title = {Learning representations by back-propagating errors}, author = {Rumelhart, David E and Hinton, Geoffrey E and Williams, Ronald J and others}, journal = {Cognitive modeling}, volume = {5}, number = {3}, pages = {1}, year = {1988} } @Book{ Russell.Norvig.2016, title = {Artificial intelligence: a modern approach}, author = {Russell, Stuart J and Norvig, Peter}, year = {2016}, publisher = {Malaysia; Pearson Education Limited,} } @Article{ Salton.Wong.Yang.1975, title = {A vector space model for automatic indexing}, author = {Salton, Gerard and Wong, Anita and Yang, Chung-Shu}, journal = {Communications of the ACM}, volume = {18}, number = {11}, pages = {613--620}, year = {1975}, publisher = {ACM} } @InProceedings{ Santurkar.Tsipras.Ilyas.ea.2018, title = {How does batch normalization help optimization?}, author = {Santurkar, Shibani and Tsipras, Dimitris and Ilyas, Andrew and Madry, Aleksander}, booktitle = {Advances in Neural Information Processing Systems}, pages = {2483--2493}, year = {2018} } @Article{ Sarwar.Karypis.Konstan.ea.2001, title = {Item-based collaborative filtering recommendation algorithms.}, author = {Sarwar, Badrul Munir and Karypis, George and Konstan, Joseph A and Riedl, John and others}, journal = {Www}, volume = {1}, pages = {285--295}, year = {2001} } @InProceedings{ Schein.Popescul.Ungar.ea.2002, title = {Methods and metrics 
for cold-start recommendations}, author = {Schein, Andrew I and Popescul, Alexandrin and Ungar, Lyle H and Pennock, David M}, booktitle = {Proceedings of the 25th annual international ACM SIGIR conference on Research and development in information retrieval}, pages = {253--260}, year = {2002}, organization = {ACM} } @Article{ Schuster.Paliwal.1997, title = {Bidirectional recurrent neural networks}, author = {Schuster, Mike and Paliwal, Kuldip K}, journal = {IEEE Transactions on Signal Processing}, volume = {45}, number = {11}, pages = {2673--2681}, year = {1997}, publisher = {IEEE} } @InProceedings{ Sedhain.Menon.Sanner.ea.2015, title = {Autorec: Autoencoders meet collaborative filtering}, author = {Sedhain, Suvash and Menon, Aditya Krishna and Sanner, Scott and Xie, Lexing}, booktitle = {Proceedings of the 24th International Conference on World Wide Web}, pages = {111--112}, year = {2015}, organization = {ACM} } @Article{ Sennrich.Haddow.Birch.2015, title = {Neural machine translation of rare words with subword units}, author = {Sennrich, Rico and Haddow, Barry and Birch, Alexandra}, journal = {arXiv preprint arXiv:1508.07909}, year = {2015} } @Article{ Sergeev.Del-Balso.2018, title = {Horovod: fast and easy distributed deep learning in TensorFlow}, author = {Sergeev, Alexander and Del Balso, Mike}, journal = {arXiv preprint arXiv:1802.05799}, year = {2018} } @Article{ Shannon.1948, author = {Shannon, Claude Elwood}, journal = {The Bell System Technical Journal}, month = {7}, number = 3, pages = {379--423}, publisher = {Nokia Bell Labs}, title = {A Mathematical Theory of Communication}, volume = 27, year = 1948 } @InProceedings{ Shao.Yao.Sun.ea.2020, title = {ControlVAE: Controllable Variational Autoencoder}, author = {Shao, Huajie and Yao, Shuochao and Sun, Dachun and Zhang, Aston and Liu, Shengzhong and Liu, Dongxin and Wang, Jun and Abdelzaher, Tarek}, booktitle = {Proceedings of the 37th International Conference on Machine Learning}, year = {2020}, 
organization = {JMLR. org} } @Article{ Silver.Huang.Maddison.ea.2016, title = {Mastering the game of Go with deep neural networks and tree search}, author = {Silver, David and Huang, Aja and Maddison, Chris J and Guez, Arthur and Sifre, Laurent and Van Den Driessche, George and Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam, Veda and Lanctot, Marc and others}, journal = {nature}, volume = {529}, number = {7587}, pages = {484}, year = {2016}, publisher = {Nature Publishing Group} } @Article{ Simonyan.Zisserman.2014, title = {Very deep convolutional networks for large-scale image recognition}, author = {Simonyan, Karen and Zisserman, Andrew}, journal = {arXiv preprint arXiv:1409.1556}, year = {2014} } @Article{ Smola.Narayanamurthy.2010, title = {An architecture for parallel topic models}, author = {Smola, Alexander and Narayanamurthy, Shravan}, journal = {Proceedings of the VLDB Endowment}, volume = {3}, number = {1-2}, pages = {703--710}, year = {2010}, publisher = {VLDB Endowment} } @Article{ Srivastava.Hinton.Krizhevsky.ea.2014, title = {Dropout: a simple way to prevent neural networks from overfitting}, author = {Srivastava, Nitish and Hinton, Geoffrey and Krizhevsky, Alex and Sutskever, Ilya and Salakhutdinov, Ruslan}, journal = {The Journal of Machine Learning Research}, volume = {15}, number = {1}, pages = {1929--1958}, year = {2014}, publisher = {JMLR. 
org} } @Book{ Strang.1993, title = {Introduction to linear algebra}, author = {Strang, Gilbert}, volume = {3}, year = {1993}, publisher = {Wellesley-Cambridge Press Wellesley, MA} } @Article{ Su.Khoshgoftaar.2009, title = {A survey of collaborative filtering techniques}, author = {Su, Xiaoyuan and Khoshgoftaar, Taghi M}, journal = {Advances in artificial intelligence}, volume = {2009}, year = {2009}, publisher = {Hindawi} } @InProceedings{ Sukhbaatar.Weston.Fergus.ea.2015, title = {End-to-end memory networks}, author = {Sukhbaatar, Sainbayar and Weston, Jason and Fergus, Rob and others}, booktitle = {Advances in neural information processing systems}, pages = {2440--2448}, year = {2015} } @InProceedings{ Sutskever.Martens.Dahl.ea.2013, title = {On the importance of initialization and momentum in deep learning}, author = {Sutskever, Ilya and Martens, James and Dahl, George and Hinton, Geoffrey}, booktitle = {International conference on machine learning}, pages = {1139--1147}, year = {2013} } @InProceedings{ Sutskever.Vinyals.Le.2014, title = {Sequence to sequence learning with neural networks}, author = {Sutskever, Ilya and Vinyals, Oriol and Le, Quoc V}, booktitle = {Advances in neural information processing systems}, pages = {3104--3112}, year = {2014} } @InProceedings{ Szegedy.Ioffe.Vanhoucke.ea.2017, title = {Inception-v4, inception-resnet and the impact of residual connections on learning}, author = {Szegedy, Christian and Ioffe, Sergey and Vanhoucke, Vincent and Alemi, Alexander A}, booktitle = {Thirty-First AAAI Conference on Artificial Intelligence}, year = {2017} } @InProceedings{ Szegedy.Liu.Jia.ea.2015, title = {Going deeper with convolutions}, author = {Szegedy, Christian and Liu, Wei and Jia, Yangqing and Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir and Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich, Andrew}, booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition}, pages = {1--9}, year = {2015} } 
% Entries below are in alphabetical order of citation key, one field per line.
% Braces inside a title protect acronyms and proper nouns from the
% sentence-casing that many bibliography styles apply.

@InProceedings{ Szegedy.Vanhoucke.Ioffe.ea.2016,
  title        = {Rethinking the {Inception} architecture for computer vision},
  author       = {Szegedy, Christian and Vanhoucke, Vincent and
                  Ioffe, Sergey and Shlens, Jon and Wojna, Zbigniew},
  booktitle    = {Proceedings of the IEEE conference on computer vision and
                  pattern recognition},
  pages        = {2818--2826},
  year         = {2016}
}

@Article{ Tallec.Ollivier.2017,
  title        = {Unbiasing truncated backpropagation through time},
  author       = {Tallec, Corentin and Ollivier, Yann},
  journal      = {arXiv preprint arXiv:1705.08209},
  year         = {2017}
}

@InProceedings{ Tang.Wang.2018,
  title        = {Personalized top-n sequential recommendation via convolutional
                  sequence embedding},
  author       = {Tang, Jiaxi and Wang, Ke},
  booktitle    = {Proceedings of the Eleventh ACM International Conference on
                  Web Search and Data Mining},
  pages        = {565--573},
  year         = {2018},
  organization = {ACM}
}

@Article{ Tay.Dehghani.Bahri.ea.2020,
  title        = {Efficient transformers: A survey},
  author       = {Tay, Yi and Dehghani, Mostafa and Bahri, Dara and
                  Metzler, Donald},
  journal      = {arXiv preprint arXiv:2009.06732},
  year         = {2020}
}

@Article{ Teye.Azizpour.Smith.2018,
  title        = {Bayesian uncertainty estimation for batch normalized deep
                  networks},
  author       = {Teye, Mattias and Azizpour, Hossein and Smith, Kevin},
  journal      = {arXiv preprint arXiv:1802.06455},
  year         = {2018}
}

@Article{ Tieleman.Hinton.2012,
  title        = {Lecture 6.5-rmsprop: Divide the gradient by a running average
                  of its recent magnitude},
  author       = {Tieleman, Tijmen and Hinton, Geoffrey},
  journal      = {COURSERA: Neural networks for machine learning},
  volume       = {4},
  number       = {2},
  pages        = {26--31},
  year         = {2012}
}

@Article{ Toscher.Jahrer.Bell.2009,
  title        = {The {BigChaos} solution to the {Netflix} grand prize},
  author       = {T{\"o}scher, Andreas and Jahrer, Michael and Bell, Robert M},
  journal      = {Netflix prize documentation},
  pages        = {1--52},
  year         = {2009}
}

@Article{ Treisman.Gelade.1980,
  title        = {A feature-integration theory of attention},
  author       = {Treisman, Anne M and Gelade, Garry},
  journal      = {Cognitive psychology},
  volume       = {12},
  number       = {1},
  pages        = {97--136},
  year         = {1980},
  publisher    = {Elsevier}
}

@Article{ Turing.1950,
  title        = {Computing machinery and intelligence},
  author       = {Turing, Alan},
  journal      = {Mind},
  volume       = {59},
  number       = {236},
  pages        = {433--460},
  year         = {1950}
}

% Initials are written separately so each one is abbreviated on its own.
@Article{ Uijlings.Van-De-Sande.Gevers.ea.2013,
  title        = {Selective search for object recognition},
  author       = {Uijlings, Jasper R. R. and Van De Sande, Koen E. A. and
                  Gevers, Theo and Smeulders, Arnold W. M.},
  journal      = {International journal of computer vision},
  volume       = {104},
  number       = {2},
  pages        = {154--171},
  year         = {2013},
  publisher    = {Springer}
}

@Book{ Van-Loan.Golub.1983,
  title        = {Matrix computations},
  author       = {Van Loan, Charles F and Golub, Gene H},
  year         = {1983},
  publisher    = {Johns Hopkins University Press}
}

@InProceedings{ Vaswani.Shazeer.Parmar.ea.2017,
  title        = {Attention is all you need},
  author       = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and
                  Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and
                  Kaiser, {\L}ukasz and Polosukhin, Illia},
  booktitle    = {Advances in neural information processing systems},
  pages        = {5998--6008},
  year         = {2017}
}

@InProceedings{ Wang.Davidson.Pan.ea.2016,
  title        = {Gunrock: A high-performance graph processing library on the
                  {GPU}},
  author       = {Wang, Yangzihao and Davidson, Andrew and Pan, Yuechao and
                  Wu, Yuduo and Riffel, Andy and Owens, John D},
  booktitle    = {ACM SIGPLAN Notices},
  volume       = {51},
  number       = {8},
  pages        = {11},
  year         = {2016},
  organization = {ACM}
}

@Article{ Wang.Li.Liberty.ea.2018,
  title        = {Optimal Message Scheduling for Aggregation},
  author       = {Wang, Leyuan and Li, Mu and Liberty, Edo and Smola, Alex J},
  journal      = {NETWORKS},
  volume       = {2},
  number       = {3},
  pages        = {2--3},
  year         = {2018}
}

@Article{ Warstadt.Singh.Bowman.2019,
  title        = {Neural network acceptability judgments},
  author       = {Warstadt, Alex and Singh, Amanpreet and Bowman, Samuel R},
  journal      = {Transactions of the Association for Computational Linguistics},
  volume       = {7},
  pages        = {625--641},
  year         = {2019},
  publisher    = {MIT Press}
}

@Book{ Wasserman.2013,
  title        = {All of statistics: a concise course in statistical inference},
  author       = {Wasserman, Larry},
  year         = {2013},
  publisher    = {Springer Science \& Business Media}
}

% Initials are written separately so each one is abbreviated on its own.
@Article{ Watkins.Dayan.1992,
  title        = {Q-learning},
  author       = {Watkins, Christopher J. C. H. and Dayan, Peter},
  journal      = {Machine learning},
  volume       = {8},
  number       = {3--4},
  pages        = {279--292},
  year         = {1992},
  publisher    = {Springer}
}

@Article{ Watson.1964,
  title        = {Smooth regression analysis},
  author       = {Watson, Geoffrey S},
  journal      = {Sankhy{\=a}: The Indian Journal of Statistics, Series A},
  pages        = {359--372},
  year         = {1964},
  publisher    = {JSTOR}
}

@InProceedings{ Welling.Teh.2011,
  title        = {Bayesian learning via stochastic gradient {Langevin} dynamics},
  author       = {Welling, Max and Teh, Yee W},
  booktitle    = {Proceedings of the 28th international conference on machine
                  learning (ICML-11)},
  pages        = {681--688},
  year         = {2011}
}

@Article{ Werbos.1990,
  title        = {Backpropagation through time: what it does and how to do it},
  author       = {Werbos, Paul J},
  journal      = {Proceedings of the IEEE},
  volume       = {78},
  number       = {10},
  pages        = {1550--1560},
  year         = {1990},
  publisher    = {IEEE}
}

% Annals of Mathematics is a journal, so this is an Article entry with the
% full journal name rather than an InProceedings entry with a booktitle.
@Article{ Wigner.1958,
  title        = {On the distribution of the roots of certain symmetric matrices},
  author       = {Wigner, Eugene P.},
  journal      = {Annals of Mathematics},
  volume       = {67},
  number       = {2},
  pages        = {325--327},
  year         = {1958}
}

% Institution name and its location are stored in separate fields.
@TechReport{ Williams.Waterman.Patterson.2009,
  title        = {{Roofline}: An insightful visual performance model for
                  floating-point programs and multicore architectures},
  author       = {Williams, Samuel and Waterman, Andrew and Patterson, David},
  year         = {2009},
  institution  = {Lawrence Berkeley National Laboratory (LBNL)},
  address      = {Berkeley, CA, United States}
}

@Article{ Wood.Gasthaus.Archambeau.ea.2011,
  title        = {The sequence memoizer},
  author       = {Wood, Frank and Gasthaus, Jan and Archambeau, C{\'e}dric and
                  James, Lancelot and Teh, Yee Whye},
  journal      = {Communications of the ACM},
  volume       = {54},
  number       = {2},
  pages        = {91--98},
  year         = {2011},
  publisher    = {ACM}
}

@InProceedings{ Wu.Ahmed.Beutel.ea.2017,
  title        = {Recurrent recommender networks},
  author       = {Wu, Chao-Yuan and Ahmed, Amr and Beutel, Alex and
                  Smola, Alexander J and Jing, How},
  booktitle    = {Proceedings of the tenth ACM international conference on web
                  search and data mining},
  pages        = {495--503},
  year         = {2017},
  organization = {ACM}
}

@Article{ Wu.Schuster.Chen.ea.2016,
  title        = {Google's neural machine translation system: Bridging the gap
                  between human and machine translation},
  author       = {Wu, Yonghui and Schuster, Mike and Chen, Zhifeng and
                  Le, Quoc V and Norouzi, Mohammad and Macherey, Wolfgang and
                  Krikun, Maxim and Cao, Yuan and Gao, Qin and
                  Macherey, Klaus and others},
  journal      = {arXiv preprint arXiv:1609.08144},
  year         = {2016}
}

@InProceedings{ Xiao.Bahri.Sohl-Dickstein.ea.2018,
  title        = {Dynamical Isometry and a Mean Field Theory of {CNNs}: How to
                  Train 10,000-Layer Vanilla Convolutional Neural Networks},
  author       = {Xiao, Lechao and Bahri, Yasaman and Sohl-Dickstein, Jascha and
                  Schoenholz, Samuel and Pennington, Jeffrey},
  booktitle    = {International Conference on Machine Learning},
  pages        = {5393--5402},
  year         = {2018}
}

@Article{ Xiao.Rasul.Vollgraf.2017,
  title        = {{Fashion-MNIST}: a novel image dataset for benchmarking machine
                  learning algorithms},
  author       = {Xiao, Han and Rasul, Kashif and Vollgraf, Roland},
  journal      = {arXiv preprint arXiv:1708.07747},
  year         = {2017}
}

@InProceedings{ Xiong.Wu.Alleva.ea.2018,
  title        = {The {Microsoft} 2017 conversational speech recognition system},
  author       = {Xiong, Wayne and Wu, Lingfeng and Alleva, Fil and
                  Droppo, Jasha and Huang, Xuedong and Stolcke, Andreas},
  booktitle    = {2018 IEEE International Conference on Acoustics, Speech and
                  Signal Processing (ICASSP)},
  pages        = {5934--5938},
  year         = {2018},
  organization = {IEEE}
}

@InProceedings{ Ye.Yin.Lee.ea.2011,
  title        = {Exploiting geographical influence for collaborative
                  point-of-interest recommendation},
  author       = {Ye, Mao and Yin, Peifeng and Lee, Wang-Chien and
                  Lee, Dik-Lun},
  booktitle    = {Proceedings of the 34th international ACM SIGIR conference on
                  Research and development in Information Retrieval},
  pages        = {325--334},
  year         = {2011},
  organization = {ACM}
}

@Article{ You.Gitman.Ginsburg.2017,
  title        = {Large batch training of convolutional networks},
  author       = {You, Yang and Gitman, Igor and Ginsburg, Boris},
  journal      = {arXiv preprint arXiv:1708.03888},
  year         = {2017}
}

@InProceedings{ Zaheer.Reddi.Sachan.ea.2018,
  title        = {Adaptive methods for nonconvex optimization},
  author       = {Zaheer, Manzil and Reddi, Sashank and Sachan, Devendra and
                  Kale, Satyen and Kumar, Sanjiv},
  booktitle    = {Advances in Neural Information Processing Systems},
  pages        = {9793--9803},
  year         = {2018}
}

@Article{ Zeiler.2012,
  title        = {{ADADELTA}: an adaptive learning rate method},
  author       = {Zeiler, Matthew D},
  journal      = {arXiv preprint arXiv:1212.5701},
  year         = {2012}
}

@InProceedings{ Zhang.Tay.Zhang.ea.2021,
  title        = {Beyond Fully-Connected Layers with Quaternions:
                  Parameterization of Hypercomplex Multiplications with 1/n
                  Parameters},
  author       = {Zhang, Aston and Tay, Yi and Zhang, Shuai and Chan, Alvin and
                  Luu, Anh Tuan and Hui, Siu Cheung and Fu, Jie},
  booktitle    = {International Conference on Learning Representations},
  year         = {2021}
}

@Article{ Zhang.Yao.Sun.ea.2019,
  title        = {Deep learning based recommender system: A survey and new
                  perspectives},
  author       = {Zhang, Shuai and Yao, Lina and Sun, Aixin and Tay, Yi},
  journal      = {ACM Computing Surveys (CSUR)},
  volume       = {52},
  number       = {1},
  pages        = {5},
  year         = {2019},
  publisher    = {ACM}
}

@Article{ Zhao.Zheng.Xu.ea.2019,
  title        = {Object detection with deep learning: A review},
  author       = {Zhao, Zhong-Qiu and Zheng, Peng and Xu, Shou-tao and
                  Wu, Xindong},
  journal      = {IEEE transactions on neural networks and learning systems},
  volume       = {30},
  number       = {11},
  pages        = {3212--3232},
  year         = {2019},
  publisher    = {IEEE}
}

@InProceedings{ Zhu.Kiros.Zemel.ea.2015,
  title        = {Aligning books and movies: Towards story-like visual
                  explanations by watching movies and reading books},
  author       = {Zhu, Yukun and Kiros, Ryan and Zemel, Rich and
                  Salakhutdinov, Ruslan and Urtasun, Raquel and
                  Torralba, Antonio and Fidler, Sanja},
  booktitle    = {Proceedings of the IEEE international conference on computer
                  vision},
  pages        = {19--27},
  year         = {2015}
}

@InProceedings{ Zhu.Park.Isola.ea.2017,
  title        = {Unpaired image-to-image translation using cycle-consistent
                  adversarial networks},
  author       = {Zhu, Jun-Yan and Park, Taesung and Isola, Phillip and
                  Efros, Alexei A},
  booktitle    = {Proceedings of the IEEE international conference on computer
                  vision},
  pages        = {2223--2232},
  year         = {2017}
}