Citations
BuzzASR (this work)
@misc{buzzasr2026,
  author = {Anonymous},
  title  = {{BuzzASR}: A Swarm of 100+ Monolingual Speech Recognition Models},
  year   = {2026},
  note   = {Anonymous ACL submission},
}
Whisper
@inproceedings{radford2023whisper,
  author    = {Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg
               and McLeavey, Christine and Sutskever, Ilya},
  title     = {Robust Speech Recognition via Large-Scale Weak Supervision},
  booktitle = {Proceedings of the 40th International Conference on Machine Learning},
  series    = {Proceedings of Machine Learning Research},
  volume    = {202},
  pages     = {28492--28518},
  publisher = {PMLR},
  year      = {2023},
}
FLEURS
@inproceedings{conneau2023fleurs,
  author    = {Conneau, Alexis and Ma, Min and Khanuja, Simran and Zhang, Yu
               and Axelrod, Vera and Dalmia, Siddharth and Riesa, Jason
               and Rivera, Clara and Bapna, Ankur},
  title     = {{FLEURS}: Few-shot Learning Evaluation of Universal Representations of Speech},
  booktitle = {2022 {IEEE} Spoken Language Technology Workshop ({SLT})},
  pages     = {798--805},
  publisher = {IEEE},
  year      = {2023},
}
Common Voice
@inproceedings{ardila2020commonvoice,
  author    = {Ardila, Rosana and Branson, Megan and Davis, Kelly and Kohler, Michael
               and Meyer, Josh and Henretty, Michael and Morais, Reuben
               and Saunders, Lindsay and Tyers, Francis and Weber, Gregor},
  title     = {{Common Voice}: A Massively-Multilingual Speech Corpus},
  booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference},
  pages     = {4218--4222},
  publisher = {European Language Resources Association},
  year      = {2020},
}
Goldfish text corpora
@inproceedings{chang2026goldfish,
  author    = {Chang, Tyler A. and Arnett, Catherine and Tu, Zhuowen and Bergen, Benjamin K.},
  title     = {{Goldfish}: Monolingual Language Models for 350 Languages},
  booktitle = {Proceedings of the Language Resources and Evaluation Conference ({LREC})},
  year      = {2026},
}
HuggingFace tokenizers
@misc{huggingface2020tokenizers,
  author = {{HuggingFace}},
  title  = {Tokenizers: Fast State-of-the-Art Tokenizers Optimized for Research and Production},
  year   = {2020},
  url    = {https://github.com/huggingface/tokenizers},
}