@inproceedings{monsalve-etal-2019-assessing,
title = "Assessing Back-Translation as a Corpus Generation Strategy for non-{E}nglish Tasks: A Study in Reading Comprehension and Word Sense Disambiguation",
author = "Monsalve, Fabricio and
Rivas Rojas, Kervy and
Sobrevilla Cabezudo, Marco Antonio and
Oncevay, Arturo",
editor = "Friedrich, Annemarie and
Zeyrek, Deniz and
Hoek, Jet",
booktitle = "Proceedings of the 13th Linguistic Annotation Workshop",
month = aug,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-4010",
doi = "10.18653/v1/W19-4010",
pages = "81--89",
abstract = "Corpora curated by experts have sustained Natural Language Processing mainly in English, but the expensiveness of corpora creation is a barrier for the development in further languages. Thus, we propose a corpus generation strategy that only requires a machine translation system between English and the target language in both directions, where we filter the best translations by computing automatic translation metrics and the task performance score. By studying Reading Comprehension in Spanish and Word Sense Disambiguation in Portuguese, we identified that a more quality-oriented metric has high potential in the corpora selection without degrading the task performance. We conclude that it is possible to systematise the building of quality corpora using machine translation and automatic metrics, besides some prior effort to clean and process the data.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="monsalve-etal-2019-assessing">
<titleInfo>
<title>Assessing Back-Translation as a Corpus Generation Strategy for non-English Tasks: A Study in Reading Comprehension and Word Sense Disambiguation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fabricio</namePart>
<namePart type="family">Monsalve</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kervy</namePart>
<namePart type="family">Rivas Rojas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="given">Antonio</namePart>
<namePart type="family">Sobrevilla Cabezudo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arturo</namePart>
<namePart type="family">Oncevay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 13th Linguistic Annotation Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Annemarie</namePart>
<namePart type="family">Friedrich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Deniz</namePart>
<namePart type="family">Zeyrek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jet</namePart>
<namePart type="family">Hoek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Florence, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Corpora curated by experts have sustained Natural Language Processing mainly in English, but the expensiveness of corpora creation is a barrier for the development in further languages. Thus, we propose a corpus generation strategy that only requires a machine translation system between English and the target language in both directions, where we filter the best translations by computing automatic translation metrics and the task performance score. By studying Reading Comprehension in Spanish and Word Sense Disambiguation in Portuguese, we identified that a more quality-oriented metric has high potential in the corpora selection without degrading the task performance. We conclude that it is possible to systematise the building of quality corpora using machine translation and automatic metrics, besides some prior effort to clean and process the data.</abstract>
<identifier type="citekey">monsalve-etal-2019-assessing</identifier>
<identifier type="doi">10.18653/v1/W19-4010</identifier>
<location>
<url>https://aclanthology.org/W19-4010</url>
</location>
<part>
<date>2019-08</date>
<extent unit="page">
<start>81</start>
<end>89</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Assessing Back-Translation as a Corpus Generation Strategy for non-English Tasks: A Study in Reading Comprehension and Word Sense Disambiguation
%A Monsalve, Fabricio
%A Rivas Rojas, Kervy
%A Sobrevilla Cabezudo, Marco Antonio
%A Oncevay, Arturo
%Y Friedrich, Annemarie
%Y Zeyrek, Deniz
%Y Hoek, Jet
%S Proceedings of the 13th Linguistic Annotation Workshop
%D 2019
%8 August
%I Association for Computational Linguistics
%C Florence, Italy
%F monsalve-etal-2019-assessing
%X Corpora curated by experts have sustained Natural Language Processing mainly in English, but the expensiveness of corpora creation is a barrier for the development in further languages. Thus, we propose a corpus generation strategy that only requires a machine translation system between English and the target language in both directions, where we filter the best translations by computing automatic translation metrics and the task performance score. By studying Reading Comprehension in Spanish and Word Sense Disambiguation in Portuguese, we identified that a more quality-oriented metric has high potential in the corpora selection without degrading the task performance. We conclude that it is possible to systematise the building of quality corpora using machine translation and automatic metrics, besides some prior effort to clean and process the data.
%R 10.18653/v1/W19-4010
%U https://aclanthology.org/W19-4010
%U https://doi.org/10.18653/v1/W19-4010
%P 81-89
Markdown (Informal)
[Assessing Back-Translation as a Corpus Generation Strategy for non-English Tasks: A Study in Reading Comprehension and Word Sense Disambiguation](https://aclanthology.org/W19-4010) (Monsalve et al., LAW 2019)
ACL