% ACL Anthology record: SMARTAVE paper, Findings of the ACL: EMNLP 2022, pages 263-276.
% The url field holds the canonical Anthology address; doi is stored bare (no resolver prefix).
@inproceedings{wang-etal-2022-smartave,
    title = "{SMARTAVE}: Structured Multimodal Transformer for Product Attribute Value Extraction",
    author = "Wang, Qifan and
      Yang, Li and
      Wang, Jingang and
      Krishnan, Jitin and
      Dai, Bo and
      Wang, Sinong and
      Xu, Zenglin and
      Khabsa, Madian and
      Ma, Hao",
    editor = "Goldberg, Yoav and
      Kozareva, Zornitsa and
      Zhang, Yue",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
    month = dec,
    year = "2022",
    address = "Abu Dhabi, United Arab Emirates",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.findings-emnlp.20",
    doi = "10.18653/v1/2022.findings-emnlp.20",
    pages = "263--276",
    abstract = "Automatic product attribute value extraction refers to the task of identifying values of an attribute from the product information. Product attributes are essential in improving online shopping experience for customers. Most existing methods focus on extracting attribute values from product title and description. However, in many real-world applications, a product is usually represented by multiple modalities beyond title and description, such as product specifications, text and visual information from the product image, etc. In this paper, we propose SMARTAVE, a Structured Multimodal trAnsformeR for producT Attribute Value Extraction, which jointly encodes the structured product information from multiple modalities. Specifically, in SMARTAVE encoder, we introduce hyper-tokens to represent the modality-level information, and local-tokens to represent the original text and visual inputs. Structured attention patterns are designed among the hyper-tokens and local-tokens for learning effective product representation. The attribute values are then extracted based on the learned embeddings. We conduct extensive experiments on two multimodal product datasets. Experimental results demonstrate the superior performance of the proposed approach over several state-of-the-art methods. Ablation studies validate the effectiveness of the structured attentions in modeling the multimodal product information.",
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey wang-etal-2022-smartave.
     The top-level name elements are the paper's authors; the relatedItem of
     type "host" describes the containing proceedings volume and its editors. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="wang-etal-2022-smartave">
    <titleInfo>
      <title>SMARTAVE: Structured Multimodal Transformer for Product Attribute Value Extraction</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Qifan</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Li</namePart>
      <namePart type="family">Yang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jingang</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jitin</namePart>
      <namePart type="family">Krishnan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bo</namePart>
      <namePart type="family">Dai</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sinong</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zenglin</namePart>
      <namePart type="family">Xu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Madian</namePart>
      <namePart type="family">Khabsa</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hao</namePart>
      <namePart type="family">Ma</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yoav</namePart>
        <namePart type="family">Goldberg</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Zornitsa</namePart>
        <namePart type="family">Kozareva</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yue</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Automatic product attribute value extraction refers to the task of identifying values of an attribute from the product information. Product attributes are essential in improving online shopping experience for customers. Most existing methods focus on extracting attribute values from product title and description. However, in many real-world applications, a product is usually represented by multiple modalities beyond title and description, such as product specifications, text and visual information from the product image, etc. In this paper, we propose SMARTAVE, a Structured Multimodal trAnsformeR for producT Attribute Value Extraction, which jointly encodes the structured product information from multiple modalities. Specifically, in SMARTAVE encoder, we introduce hyper-tokens to represent the modality-level information, and local-tokens to represent the original text and visual inputs. Structured attention patterns are designed among the hyper-tokens and local-tokens for learning effective product representation. The attribute values are then extracted based on the learned embeddings. We conduct extensive experiments on two multimodal product datasets. Experimental results demonstrate the superior performance of the proposed approach over several state-of-the-art methods. Ablation studies validate the effectiveness of the structured attentions in modeling the multimodal product information.</abstract>
    <identifier type="citekey">wang-etal-2022-smartave</identifier>
    <identifier type="doi">10.18653/v1/2022.findings-emnlp.20</identifier>
    <location>
      <url>https://aclanthology.org/2022.findings-emnlp.20</url>
    </location>
    <part>
      <date>2022-12</date>
      <extent unit="page">
        <start>263</start>
        <end>276</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T SMARTAVE: Structured Multimodal Transformer for Product Attribute Value Extraction
%A Wang, Qifan
%A Yang, Li
%A Wang, Jingang
%A Krishnan, Jitin
%A Dai, Bo
%A Wang, Sinong
%A Xu, Zenglin
%A Khabsa, Madian
%A Ma, Hao
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F wang-etal-2022-smartave
%X Automatic product attribute value extraction refers to the task of identifying values of an attribute from the product information. Product attributes are essential in improving online shopping experience for customers. Most existing methods focus on extracting attribute values from product title and description. However, in many real-world applications, a product is usually represented by multiple modalities beyond title and description, such as product specifications, text and visual information from the product image, etc. In this paper, we propose SMARTAVE, a Structured Multimodal trAnsformeR for producT Attribute Value Extraction, which jointly encodes the structured product information from multiple modalities. Specifically, in SMARTAVE encoder, we introduce hyper-tokens to represent the modality-level information, and local-tokens to represent the original text and visual inputs. Structured attention patterns are designed among the hyper-tokens and local-tokens for learning effective product representation. The attribute values are then extracted based on the learned embeddings. We conduct extensive experiments on two multimodal product datasets. Experimental results demonstrate the superior performance of the proposed approach over several state-of-the-art methods. Ablation studies validate the effectiveness of the structured attentions in modeling the multimodal product information.
%R 10.18653/v1/2022.findings-emnlp.20
%U https://aclanthology.org/2022.findings-emnlp.20
%U https://doi.org/10.18653/v1/2022.findings-emnlp.20
%P 263-276
Markdown (Informal)
[SMARTAVE: Structured Multimodal Transformer for Product Attribute Value Extraction](https://aclanthology.org/2022.findings-emnlp.20) (Wang et al., Findings 2022)
ACL
- Qifan Wang, Li Yang, Jingang Wang, Jitin Krishnan, Bo Dai, Sinong Wang, Zenglin Xu, Madian Khabsa, and Hao Ma. 2022. SMARTAVE: Structured Multimodal Transformer for Product Attribute Value Extraction. In Findings of the Association for Computational Linguistics: EMNLP 2022, pages 263–276, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.