@inproceedings{078e8ecdee56444fa117c9761f2854db,
title = "VQAttack: Transferable Adversarial Attacks on Visual Question Answering via Pre-trained Models",
abstract = "Visual Question Answering (VQA) is a fundamental task in computer vision and natural language process fields. Although the “pre-training & finetuning” learning paradigm significantly improves the VQA performance, the adversarial robustness of such a learning paradigm has not been explored. In this paper, we delve into a new problem: using a pre-trained multimodal source model to create adversarial image-text pairs and then transferring them to attack the target VQA models. Correspondingly, we propose a novel VQATTACK model, which can iteratively generate both image and text perturbations with the designed modules: the large language model (LLM)-enhanced image attack and the cross-modal joint attack module. At each iteration, the LLM-enhanced image attack module first optimizes the latent representation-based loss to generate feature-level image perturbations. Then it incorporates an LLM to further enhance the image perturbations by optimizing the designed masked answer anti-recovery loss. The cross-modal joint attack module will be triggered at a specific iteration, which updates the image and text perturbations sequentially. Notably, the text perturbation updates are based on both the learned gradients in the word embedding space and word synonym-based substitution. Experimental results on two VQA datasets with five validated models demonstrate the effectiveness of the proposed VQATTACK in the transferable attack setting, compared with state-of-the-art baselines. This work reveals a significant blind spot in the “pre-training & fine-tuning” paradigm on VQA tasks. The source code can be found in the link https://github.com/ericyinyzy/VQAttack.",
author = "Ziyi Yin and Muchao Ye and Tianrong Zhang and Jiaqi Wang and Han Liu and Jinghui Chen and Ting Wang and Fenglong Ma",
note = "Publisher Copyright: Copyright {\textcopyright} 2024, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved.; 38th AAAI Conference on Artificial Intelligence, AAAI 2024 ; Conference date: 20-02-2024 Through 27-02-2024",
year = "2024",
month = mar,
day = "25",
doi = "10.1609/aaai.v38i7.28499",
language = "English (US)",
series = "Proceedings of the AAAI Conference on Artificial Intelligence",
publisher = "Association for the Advancement of Artificial Intelligence",
number = "7",
pages = "6755--6763",
editor = "Michael Wooldridge and Jennifer Dy and Sriraam Natarajan",
booktitle = "Technical Tracks 14",
edition = "7",
}