% Conference paper describing the Huqariq speech corpus (LREC 2022 proceedings).
% Event metadata is kept in eventtitle/eventdate and the licence line in
% `copyright` instead of `note`, so that it is not printed verbatim in the
% bibliography; classic BibTeX styles ignore fields they do not know.
% NOTE(review): the editor names "Bechet, Frederic" and "Mazo, Helene" carry no
% diacritics here -- confirm the spelling against the published proceedings.
@inproceedings{4de995ad7434481087c87a463270f52b,
  author     = {Zevallos, Rodolfo and Camacho, Luis and Melgarejo, Nelsi},
  title      = {{Huqariq}: A Multilingual Speech Corpus of Native Languages of {Peru} for Speech Recognition},
  booktitle  = {2022 Language Resources and Evaluation Conference, {LREC} 2022},
  editor     = {Calzolari, Nicoletta and Bechet, Frederic and Blache, Philippe and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, Helene and Odijk, Jan and Piperidis, Stelios},
  publisher  = {European Language Resources Association (ELRA)},
  year       = {2022},
  pages      = {5029--5034},
  eventtitle = {13th International Conference on Language Resources and Evaluation, {LREC} 2022},
  eventdate  = {2022-06-20/2022-06-25},
  language   = {English},
  keywords   = {Low-resource Languages, Speech Corpus, Speech Recognition},
  abstract   = {The Huqariq corpus is a multilingual collection of speech from native Peruvian languages. The transcribed corpus is intended for the research and development of speech technologies to preserve endangered languages in Peru. Huqariq is primarily designed for the development of automatic speech recognition, language identification and text-to-speech tools. In order to achieve corpus collection sustainably, we employ the crowdsourcing methodology. Huqariq includes four native languages of Peru, and it is expected that by the end of the year 2022, it can reach up to 20 native languages out of the 48 native languages in Peru. The corpus has 220 hours of transcribed audio recorded by more than 500 volunteers, making it the largest speech corpus for native languages in Peru. In order to verify the quality of the corpus, we present speech recognition experiments using 220 hours of fully transcribed audio.},
  copyright  = {{\textcopyright} European Language Resources Association (ELRA), licensed under CC-BY-NC-4.0},
}