@inproceedings{7b618f7312b64b24bca7cc42563608cd,
title = "Learning Transferable Human-Object Interaction Detector with Natural Language Supervision",
abstract = "It is difficult to construct a data collection including all possible combinations of human actions and interacting objects due to the combinatorial nature of human-object interactions (HOI). In this work, we aim to develop a transferable HOI detector for unseen interactions. Existing HOI detectors often treat interactions as discrete labels and learn a classifier according to a predetermined category space. This is inherently inapt for detecting unseen interactions which are out of the predefined categories. Conversely, we treat independent HOI labels as the natural language supervision of interactions and embed them into a joint visual-and-text space to capture their correlations. More specifically, we propose a new HOI visual encoder to detect the interacting humans and objects, and map them to a joint feature space to perform interaction recognition. Our visual encoder is instantiated as a Vision Transformer with new learnable HOI tokens and a sequence parser to generate unique HOI predictions. It distills and leverages the transferable knowledge from the pretrained CLIP model to perform the zero-shot interaction detection. Experiments on two datasets, SWIG-HOI and HICO-DET, validate that our proposed method can achieve a notable mAP improvement on detecting both seen and unseen HOIs. Our code is available at https://github.com/scwangdyd/promting\_hoi.",
keywords = "Action and event recognition, categorization, Recognition: detection, retrieval",
author = "Suchen Wang and Yueqi Duan and Henghui Ding and Tan, \{Yap Peng\} and Yap, \{Kim Hui\} and Junsong Yuan",
note = "Publisher Copyright: {\textcopyright} 2022 IEEE.; 2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022 ; Conference date: 19-06-2022 Through 24-06-2022",
year = "2022",
doi = "10.1109/CVPR52688.2022.00101",
language = "English",
series = "Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition",
publisher = "IEEE Computer Society",
pages = "929--938",
booktitle = "Proceedings - 2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022",
address = "United States",
}