@inproceedings{khan-etal-2021-lone,
title = "Lone Pine at {S}em{E}val-2021 Task 5: Fine-Grained Detection of Hate Speech Using {BERT}oxic",
author = "Khan, Yakoob and
Ma, Weicheng and
Vosoughi, Soroush",
editor = "Palmer, Alexis and
Schneider, Nathan and
Schluter, Natalie and
Emerson, Guy and
Herbelot, Aurelie and
Zhu, Xiaodan",
booktitle = "Proceedings of the 15th International Workshop on Semantic Evaluation (SemEval-2021)",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.semeval-1.132/",
doi = "10.18653/v1/2021.semeval-1.132",
pages = "967--973",
abstract = "This paper describes our approach to the Toxic Spans Detection problem (SemEval-2021 Task 5). We propose BERToxic, a system that fine-tunes a pre-trained BERT model to locate toxic text spans in a given text and utilizes additional post-processing steps to refine the boundaries. The post-processing steps involve (1) labeling character offsets between consecutive toxic tokens as toxic and (2) assigning a toxic label to words that have at least one token labeled as toxic. Through experiments, we show that these two post-processing steps improve the performance of our model by 4.16{\%} on the test set. We also studied the effects of data augmentation and ensemble modeling strategies on our system. Our system significantly outperformed the provided baseline and achieved an F1-score of 0.683, placing Lone Pine in the 17th place out of 91 teams in the competition. Our code is made available at \url{https://github.com/Yakoob-Khan/Toxic-Spans-Detection}"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="khan-etal-2021-lone">
<titleInfo>
<title>Lone Pine at SemEval-2021 Task 5: Fine-Grained Detection of Hate Speech Using BERToxic</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yakoob</namePart>
<namePart type="family">Khan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weicheng</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Soroush</namePart>
<namePart type="family">Vosoughi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Workshop on Semantic Evaluation (SemEval-2021)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alexis</namePart>
<namePart type="family">Palmer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nathan</namePart>
<namePart type="family">Schneider</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Natalie</namePart>
<namePart type="family">Schluter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guy</namePart>
<namePart type="family">Emerson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aurelie</namePart>
<namePart type="family">Herbelot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaodan</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper describes our approach to the Toxic Spans Detection problem (SemEval-2021 Task 5). We propose BERToxic, a system that fine-tunes a pre-trained BERT model to locate toxic text spans in a given text and utilizes additional post-processing steps to refine the boundaries. The post-processing steps involve (1) labeling character offsets between consecutive toxic tokens as toxic and (2) assigning a toxic label to words that have at least one token labeled as toxic. Through experiments, we show that these two post-processing steps improve the performance of our model by 4.16% on the test set. We also studied the effects of data augmentation and ensemble modeling strategies on our system. Our system significantly outperformed the provided baseline and achieved an F1-score of 0.683, placing Lone Pine in the 17th place out of 91 teams in the competition. Our code is made available at https://github.com/Yakoob-Khan/Toxic-Spans-Detection</abstract>
<identifier type="citekey">khan-etal-2021-lone</identifier>
<identifier type="doi">10.18653/v1/2021.semeval-1.132</identifier>
<location>
<url>https://aclanthology.org/2021.semeval-1.132/</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>967</start>
<end>973</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Lone Pine at SemEval-2021 Task 5: Fine-Grained Detection of Hate Speech Using BERToxic
%A Khan, Yakoob
%A Ma, Weicheng
%A Vosoughi, Soroush
%Y Palmer, Alexis
%Y Schneider, Nathan
%Y Schluter, Natalie
%Y Emerson, Guy
%Y Herbelot, Aurelie
%Y Zhu, Xiaodan
%S Proceedings of the 15th International Workshop on Semantic Evaluation (SemEval-2021)
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F khan-etal-2021-lone
%X This paper describes our approach to the Toxic Spans Detection problem (SemEval-2021 Task 5). We propose BERToxic, a system that fine-tunes a pre-trained BERT model to locate toxic text spans in a given text and utilizes additional post-processing steps to refine the boundaries. The post-processing steps involve (1) labeling character offsets between consecutive toxic tokens as toxic and (2) assigning a toxic label to words that have at least one token labeled as toxic. Through experiments, we show that these two post-processing steps improve the performance of our model by 4.16% on the test set. We also studied the effects of data augmentation and ensemble modeling strategies on our system. Our system significantly outperformed the provided baseline and achieved an F1-score of 0.683, placing Lone Pine in the 17th place out of 91 teams in the competition. Our code is made available at https://github.com/Yakoob-Khan/Toxic-Spans-Detection
%R 10.18653/v1/2021.semeval-1.132
%U https://aclanthology.org/2021.semeval-1.132/
%U https://doi.org/10.18653/v1/2021.semeval-1.132
%P 967-973
Markdown (Informal)
[Lone Pine at SemEval-2021 Task 5: Fine-Grained Detection of Hate Speech Using BERToxic](https://aclanthology.org/2021.semeval-1.132/) (Khan et al., SemEval 2021)
ACL