% LoGAN (WACV 2021) conference paper. Cleaned from an automated export:
%   - series removed: it duplicated booktitle verbatim.
%   - address removed: "United States" is a country, not the publisher's city.
%   - note removed: the conference name and dates it carried now live in
%     eventtitle/eventdate (ignored by classic BibTeX, used by biblatex).
%   - abstract: "520%" restored to the range "5--20\%"; the range hyphen
%     appears to have been lost in export -- TODO verify against the paper.
%   - Citation key kept unchanged so existing citations continue to resolve.
@inproceedings{4a18e4ac5877491ba149034604b842f5,
  author     = {Tan, Reuben and Xu, Huijuan and Saenko, Kate and Plummer, Bryan A.},
  title      = {{LoGAN}: Latent Graph Co-Attention Network for Weakly-Supervised Video Moment Retrieval},
  booktitle  = {Proceedings - 2021 {IEEE} Winter Conference on Applications of Computer Vision, {WACV} 2021},
  eventtitle = {2021 {IEEE} Winter Conference on Applications of Computer Vision, {WACV} 2021},
  eventdate  = {2021-01-05/2021-01-09},
  publisher  = {Institute of Electrical and Electronics Engineers Inc.},
  pages      = {2082--2091},
  year       = {2021},
  month      = jan,
  doi        = {10.1109/WACV48630.2021.00213},
  language   = {english},
  abstract   = {The goal of weakly-supervised video moment retrieval is to localize the video segment most relevant to a description without access to temporal annotations during training. Prior work uses co-attention mechanisms to understand relationships between the vision and language data, but they lack contextual information between video frames that can be useful to determine how well a segment relates to the query. To address this, we propose an efficient Latent Graph Co-Attention Network (LoGAN) that exploits fine-grained frame-by-word interactions to jointly reason about the correspondences between all possible pairs of frames, providing context cues absent in prior work. Experiments on the DiDeMo and Charades-STA datasets demonstrate the effectiveness of our approach, where we improve Recall@1 by 5--20\% over prior weakly-supervised methods, even boasting an 11\% gain over strongly-supervised methods on DiDeMo, while also using significantly fewer model parameters than other co-attention mechanisms.},
}