@article{devlin2018bert,
  title={{BERT}: Pre-training of deep bidirectional transformers for language understanding},
  author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  journal={arXiv preprint arXiv:1810.04805},
  year={2018}
}

@article{shoeybi2019megatron,
  title={{Megatron-LM}: Training multi-billion parameter language models using model parallelism},
  author={Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan},
  journal={arXiv preprint arXiv:1909.08053},
  year={2019}
}

@inproceedings{maas2011,
  title={Learning Word Vectors for Sentiment Analysis},
  author={Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},
  booktitle={Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month={June},
  year={2011},
  address={Portland, Oregon, USA},
  publisher={Association for Computational Linguistics},
  pages={142--150},
  url={https://aclanthology.org/P11-1015}
}

@inproceedings{socher2013,
  title={Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank},
  author={Socher, Richard and Perelygin, Alex and Wu, Jean and Chuang, Jason and Manning, Christopher D. and Ng, Andrew and Potts, Christopher},
  booktitle={Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing},
  month={October},
  year={2013},
  address={Seattle, Washington, USA},
  publisher={Association for Computational Linguistics},
  pages={1631--1642},
  url={https://aclanthology.org/D13-1170}
}

@article{lim2018chemical,
  title={Chemical--gene relation extraction using recursive neural network},
  author={Lim, Sangrak and Kang, Jaewoo},
  journal={Database},
  volume={2018},
  year={2018},
  publisher={Oxford Academic}
}

@inproceedings{li2007scalable,
  title={Scalable term selection for text categorization},
  author={Li, Jingyang and Sun, Maosong},
  booktitle={Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning ({EMNLP-CoNLL})},
  pages={774--782},
  year={2007}
}

@misc{lee2019biobert,
  title={{BioBERT}: a pre-trained biomedical language representation model for biomedical text mining},
  author={Jinhyuk Lee and Wonjin Yoon and Sungdong Kim and Donghyeon Kim and Sunkyu Kim and Chan Ho So and Jaewoo Kang},
  year={2019},
  eprint={1901.08746},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

@misc{shin2020biomegatron,
  title={{BioMegatron}: Larger Biomedical Domain Language Model},
  author={Hoo-Chang Shin and Yang Zhang and Evelina Bakhturina and Raul Puri and Mostofa Patwary and Mohammad Shoeybi and Raghav Mani},
  year={2020},
  eprint={2010.06060},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

@inproceedings{vaswani2017attention,
  title={Attention is all you need},
  author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
  booktitle={Advances in Neural Information Processing Systems},
  pages={6000--6010},
  year={2017}
}

@article{sennrich2015neural,
  title={Neural machine translation of rare words with subword units},
  author={Sennrich, Rico and Haddow, Barry and Birch, Alexandra},
  journal={arXiv preprint arXiv:1508.07909},
  year={2015}
}

@article{provilkov2019bpe,
  title={{BPE}-dropout: Simple and effective subword regularization},
  author={Provilkov, Ivan and Emelianenko, Dmitrii and Voita, Elena},
  journal={arXiv preprint arXiv:1910.13267},
  year={2019}
}

@article{post2018call,
  title={A call for clarity in reporting {BLEU} scores},
  author={Post, Matt},
  journal={arXiv preprint arXiv:1804.08771},
  year={2018}
}

@misc{zhang2021sgdqa,
  title={{SGD-QA}: Fast Schema-Guided Dialogue State Tracking for Unseen Services},
  author={Yang Zhang and Vahid Noroozi and Evelina Bakhturina and Boris Ginsburg},
  year={2021},
  eprint={2105.08049},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

@article{zhang2019neural,
  title={Neural Models of Text Normalization for Speech Applications},
  author={Hao Zhang and R. Sproat and Axel H. Ng and Felix Stahlberg and Xiaochang Peng and Kyle Gorman and B. Roark},
  journal={Computational Linguistics},
  year={2019},
  pages={293--338}
}

@misc{liu2021selfalignment,
  title={Self-Alignment Pretraining for Biomedical Entity Representations},
  author={Fangyu Liu and Ehsan Shareghi and Zaiqiao Meng and Marco Basaldella and Nigel Collier},
  year={2021},
  eprint={2010.11784},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

@article{gulcehre2015using,
  title={On using monolingual corpora in neural machine translation},
  author={Gulcehre, Caglar and Firat, Orhan and Xu, Kelvin and Cho, Kyunghyun and Barrault, Loic and Lin, Huei-Chi and Bougares, Fethi and Schwenk, Holger and Bengio, Yoshua},
  journal={arXiv preprint arXiv:1503.03535},
  year={2015}
}

@article{yee2019simple,
  title={Simple and effective noisy channel modeling for neural machine translation},
  author={Yee, Kyra and Ng, Nathan and Dauphin, Yann N and Auli, Michael},
  journal={arXiv preprint arXiv:1908.05731},
  year={2019}
}

@inproceedings{koehnetal2007moses,
  title={{M}oses: Open Source Toolkit for Statistical Machine Translation},
  author={Koehn, Philipp and Hoang, Hieu and Birch, Alexandra and Callison-Burch, Chris and Federico, Marcello and Bertoldi, Nicola and Cowan, Brooke and Shen, Wade and Moran, Christine and Zens, Richard and Dyer, Chris and Bojar, Ond{\v{r}}ej and Constantin, Alexandra and Herbst, Evan},
  booktitle={Proceedings of the 45th Annual Meeting of the Association for Computational Linguistics Companion Volume Proceedings of the Demo and Poster Sessions},
  month={June},
  year={2007},
  address={Prague, Czech Republic},
  publisher={Association for Computational Linguistics},
  pages={177--180},
  url={https://aclanthology.org/P07-2045}
}

@inproceedings{sunkara20_interspeech,
  title={Multimodal Semi-Supervised Learning Framework for Punctuation Prediction in Conversational Speech},
  author={Monica Sunkara and Srikanth Ronanki and Dhanush Bekal and Sravan Bodapati and Katrin Kirchhoff},
  booktitle={Proc. Interspeech 2020},
  year={2020},
  pages={4911--4915},
  doi={10.21437/Interspeech.2020-3074}
}

@article{chen2019bert,
  title={{BERT} for joint intent classification and slot filling},
  author={Chen, Qian and Zhuo, Zhu and Wang, Wen},
  journal={arXiv preprint arXiv:1902.10909},
  year={2019}
}

@article{borgeaud2021improving,
  title={Improving language models by retrieving from trillions of tokens},
  author={Borgeaud, Sebastian and Mensch, Arthur and Hoffmann, Jordan and Cai, Trevor and Rutherford, Eliza and Millican, Katie and Driessche, George van den and Lespiau, Jean-Baptiste and Damoc, Bogdan and Clark, Aidan and others},
  journal={arXiv preprint arXiv:2112.04426},
  year={2021}
}

@article{su2021roformer,
  title={{RoFormer}: Enhanced transformer with rotary position embedding},
  author={Su, Jianlin and Lu, Yu and Pan, Shengfeng and Wen, Bo and Liu, Yunfeng},
  journal={arXiv preprint arXiv:2104.09864},
  year={2021}
}

@article{reimers2019sentence,
  title={Sentence-{BERT}: Sentence embeddings using siamese {BERT}-networks},
  author={Reimers, Nils and Gurevych, Iryna},
  journal={arXiv preprint arXiv:1908.10084},
  year={2019}
}

@article{yang2022tensor,
  title={Tensor Programs {V}: Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer},
  author={Yang, Greg and Hu, Edward J and Babuschkin, Igor and Sidor, Szymon and Liu, Xiaodong and Farhi, David and Ryder, Nick and Pachocki, Jakub and Chen, Weizhu and Gao, Jianfeng},
  journal={arXiv preprint arXiv:2203.03466},
  year={2022}
}

@article{jegou2022faiss,
  title={Faiss: Similarity search and clustering of dense vectors library},
  author={J{\'e}gou, Herv{\'e} and Douze, Matthijs and Johnson, Jeff and Hosseini, Lucas and Deng, Chengqi},
  journal={Astrophysics Source Code Library},
  pages={ascl--2210},
  year={2022}
}

@misc{antonova2023spellmapper,
  title={{SpellMapper}: A non-autoregressive neural spellchecker for {ASR} customization with candidate retrieval based on n-gram mappings},
  author={Alexandra Antonova and Evelina Bakhturina and Boris Ginsburg},
  year={2023},
  eprint={2306.02317},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

@misc{dao2022flashattention,
  title={{FlashAttention}: Fast and Memory-Efficient Exact Attention with {IO}-Awareness},
  author={Tri Dao and Daniel Y. Fu and Stefano Ermon and Atri Rudra and Christopher R{\'e}},
  year={2022},
  eprint={2205.14135},
  archivePrefix={arXiv},
  primaryClass={cs.LG}
}

@misc{vaswani2023attention,
  title={Attention Is All You Need},
  author={Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. Gomez and Lukasz Kaiser and Illia Polosukhin},
  year={2023},
  eprint={1706.03762},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

@misc{su2022roformer,
  title={{RoFormer}: Enhanced Transformer with Rotary Position Embedding},
  author={Jianlin Su and Yu Lu and Shengfeng Pan and Ahmed Murtadha and Bo Wen and Yunfeng Liu},
  year={2022},
  eprint={2104.09864},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

@misc{press2022train,
  title={Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation},
  author={Ofir Press and Noah A. Smith and Mike Lewis},
  year={2022},
  eprint={2108.12409},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

@misc{chi2022kerple,
  title={{KERPLE}: Kernelized Relative Positional Embedding for Length Extrapolation},
  author={Ta-Chung Chi and Ting-Han Fan and Peter J. Ramadge and Alexander I. Rudnicky},
  year={2022},
  eprint={2205.09921},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

@misc{sun2022lengthextrapolatable,
  title={A Length-Extrapolatable Transformer},
  author={Yutao Sun and Li Dong and Barun Patra and Shuming Ma and Shaohan Huang and Alon Benhaim and Vishrav Chaudhary and Xia Song and Furu Wei},
  year={2022},
  eprint={2212.10554},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

@misc{chi2023dissecting,
  title={Dissecting Transformer Length Extrapolation via the Lens of Receptive Field Analysis},
  author={Ta-Chung Chi and Ting-Han Fan and Alexander I. Rudnicky and Peter J. Ramadge},
  year={2023},
  eprint={2212.10356},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

@misc{shaw2018selfattention,
  title={Self-Attention with Relative Position Representations},
  author={Peter Shaw and Jakob Uszkoreit and Ashish Vaswani},
  year={2018},
  eprint={1803.02155},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}

@misc{chen2023extending,
  title={Extending Context Window of Large Language Models via Positional Interpolation},
  author={Shouyuan Chen and Sherman Wong and Liangjian Chen and Yuandong Tian},
  year={2023},
  eprint={2306.15595},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}