@article{chen2023trajectoryformer,abbr={ICCV},bibtex_show={true},title={TrajectoryFormer: 3D Object Tracking Transformer with Predictive Trajectory Hypotheses},author={Chen, Xuesong and Shi, Shaoshuai and Zhang, Chao and Zhu, Benjin and Wang, Qiang and Cheung, Ka Chun and See, Simon and Li, Hongsheng},journal={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},year={2023},code={https://github.com/poodarchu/EFG}}
codebase
EFG: An Efficient, Flexible, and General deep learning framework that retains minimal
@misc{zhu2023efg,abbr={codebase},bibtex_show={true},title={EFG: An Efficient, Flexible, and General deep learning framework that retains minimal},author={Contributors, EFG},howpublished={\url{https://github.com/poodarchu/efg}},year={2023}}
CVPR
ConQueR: Query Contrast Voxel-DETR for 3D Object Detection
Although DETR-based 3D detectors can simplify the detection pipeline and achieve direct sparse predictions, their performance still lags behind dense detectors with post-processing for 3D object detection from point clouds. DETRs usually adopt many more queries than GTs in a scene (e.g., 300 queries vs. 40 objects in Waymo), which inevitably incurs many false positives during inference. In this paper, we propose a simple yet effective sparse 3D detector, named Query Contrast Voxel-DETR (ConQueR), to eliminate the challenging false positives and achieve more accurate and sparser predictions. We observe that most false positives are highly overlapping in local regions, caused by the lack of explicit supervision to discriminate locally similar queries. We thus propose a Query Contrast mechanism to explicitly enhance queries towards their best-matched GTs over all unmatched query predictions. This is achieved by constructing positive and negative GT-query pairs for each GT, and applying a contrastive loss that promotes positive GT-query pairs over negative ones based on feature similarities. ConQueR closes the gap between sparse and dense 3D detectors and reduces false positives by up to 60%. Our single-frame ConQueR achieves a new state-of-the-art (SOTA) 71.6 mAPH/L2 on the challenging Waymo Open Dataset validation set, outperforming previous SOTA methods (e.g., PV-RCNN++) by over 2.0 mAPH/L2.
@article{zhu2023conquer,abbr={CVPR},bibtex_show={true},title={ConQueR: Query Contrast Voxel-DETR for 3D Object Detection},author={Zhu, Benjin and Wang, Zhe and Shi, Shaoshuai and Xu, Hang and Hong, Lanqing and Li, Hongsheng},journal={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},year={2023},html={https://benjin.me/projects/2022_conquer/},code={https://github.com/poodarchu/ConQueR},selected={true}}
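As a rough illustration of the Query Contrast mechanism described above, the sketch below treats each GT as an anchor, its assigned query as the positive, and all remaining queries as negatives in an InfoNCE-style loss over feature similarities. The function name, shapes, cosine similarity, and temperature are illustrative assumptions, not the paper's exact implementation.

```python
# Hypothetical sketch of a GT-query contrastive loss in the spirit of ConQueR's
# Query Contrast; names, shapes, and the cosine similarity are assumptions.
import torch
import torch.nn.functional as F

def query_contrast_loss(query_feats, gt_feats, matched_idx, tau=0.1):
    """query_feats: (Q, D) decoder query embeddings for one scene.
    gt_feats:    (G, D) embeddings of the ground-truth boxes.
    matched_idx: (G,) index of each GT's best-matched (assigned) query.
    """
    q = F.normalize(query_feats, dim=-1)  # unit-normalize for cosine similarity
    g = F.normalize(gt_feats, dim=-1)
    logits = g @ q.t() / tau              # (G, Q) GT-to-query similarities
    # For each GT, the matched query is the positive class and every other
    # (unmatched) query acts as a negative, so the contrastive objective
    # reduces to a cross-entropy over each similarity row.
    return F.cross_entropy(logits, matched_idx)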
2022
ECCV
MPPNet: Multi-Frame Feature Intertwining with Proxy Points for 3D Temporal Object Detection
@article{chen2022mppnet,abbr={ECCV},bibtex_show={true},title={MPPNet: Multi-Frame Feature Intertwining with Proxy Points for 3D Temporal Object Detection},author={Chen, Xuesong and Shi, Shaoshuai and Zhu, Benjin and Cheung, Ka Chun and Xu, Hang and Li, Hongsheng},journal={European Conference on Computer Vision (ECCV)},html={https://github.com/open-mmlab/OpenPCDet},year={2022}}
2020
codebase
cvpods: All-in-one Toolbox for Computer Vision Research
Benjin Zhu, Feng Wang, Jianfeng Wang, Siwei Yang, Jianhu Chen, and Zeming Li
@misc{zhu2020cvpods,abbr={codebase},bibtex_show={true},title={cvpods: All-in-one Toolbox for Computer Vision Research},author={Zhu, Benjin and Wang, Feng and Wang, Jianfeng and Yang, Siwei and Chen, Jianhu and Li, Zeming},html={https://github.com/Megvii-BaseDetection/cvpods},year={2020}}
arXiv
EqCo: Equivalent Rules for Self-supervised Contrastive Learning
In this paper, we propose a method, named EqCo (Equivalent Rules for Contrastive Learning), to make self-supervised learning insensitive to the number of negative samples in InfoNCE-based contrastive learning frameworks. Inspired by the InfoMax principle, we point out that the margin term in the contrastive loss needs to be adaptively scaled according to the number of negative pairs in order to keep the mutual information bound and gradient magnitude steady. EqCo bridges the performance gap across a wide range of negative sample sizes, so that only a few negative pairs (e.g., 16 per query) suffice for self-supervised contrastive training on large-scale vision datasets like ImageNet, with almost no accuracy drop. This stands in sharp contrast to the widely used large-batch training and memory-bank mechanisms in current practice. Equipped with EqCo, our simplified MoCo (SiMo) achieves accuracy comparable to MoCo v2 on ImageNet (linear evaluation protocol) while involving only 4 negative pairs per query instead of 65536, suggesting that large quantities of negative samples might not be a critical factor in the InfoNCE loss.
@article{zhu2020equal,abbr={arXiv},bibtex_show={true},title={EqCo: Equivalent Rules for Self-supervised Contrastive Learning},author={Zhu, Benjin and Huang, Junqiang and Li, Zeming and Zhang, Xiangyu and Sun, Jian},journal={arXiv preprint arXiv:2010.01929},year={2020},html={https://arxiv.org/abs/2010.01929},code={https://github.com/poodarchu/SelfSup},selected={true}}
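To make the adaptive-scaling idea concrete, here is a small sketch of an InfoNCE loss whose negative term is rescaled so it behaves as if a fixed "equivalent" number of negatives alpha were used, regardless of the actual K. The alpha/K form and all names are a reading of the abstract, not code from the paper.

```python
# Hypothetical InfoNCE variant whose denominator is rescaled by alpha / K so the
# mutual information bound and gradient magnitude stay steady as K varies.
# The alpha / K form is an assumption based on the abstract above.
import math
import torch
import torch.nn.functional as F

def eqco_infonce(q, k_pos, k_negs, tau=0.1, alpha=65536):
    """q: (B, D) queries; k_pos: (B, D) positive keys; k_negs: (K, D) shared negatives.
    alpha: the fixed number of negatives the loss should be 'equivalent' to."""
    pos = (q * k_pos).sum(dim=-1, keepdim=True) / tau  # (B, 1) positive logits
    neg = (q @ k_negs.t()) / tau                       # (B, K) negative logits
    K = k_negs.shape[0]
    # Adding log(alpha / K) to each negative logit multiplies its contribution
    # to the softmax denominator by alpha / K, so the K negatives act like alpha.
    logits = torch.cat([pos, neg + math.log(alpha / K)], dim=1)
    labels = torch.zeros(q.shape[0], dtype=torch.long)  # the positive is class 0
    return F.cross_entropy(logits, labels)
```

Under such a rule, shrinking K from 65536 to a handful leaves the loss scale and its gradients roughly unchanged, which is what lets a simplified MoCo train with only a few negatives per query.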
arXiv
AutoAssign: Differentiable Label Assignment for Dense Object Detection
Benjin Zhu, Jianfeng Wang, Zhengkai Jiang, Fuhang Zong, Songtao Liu, Zeming Li, and Jian Sun
Determining positive/negative samples for object detection is known as label assignment. Here we present an anchor-free detector named AutoAssign. It requires little human knowledge and achieves appearance-aware label assignment through a fully differentiable weighting mechanism. During training, to both satisfy the prior distribution of the data and adapt to category characteristics, we present Center Weighting to adjust the category-specific prior distributions. To adapt to object appearances, Confidence Weighting is proposed to adjust the specific assignment strategy of each instance. The two weighting modules are then combined to generate positive and negative weights that adjust each location's confidence. Extensive experiments on MS COCO show that our method steadily surpasses other sampling strategies by large margins with various backbones. Moreover, our best model achieves 52.1% AP, outperforming all existing one-stage detectors. Besides, experiments on other datasets, e.g., PASCAL VOC, Objects365, and WiderFace, demonstrate the broad applicability of AutoAssign.
@article{zhu2020auto,abbr={arXiv},bibtex_show={true},title={AutoAssign: Differentiable Label Assignment for Dense Object Detection},author={Zhu, Benjin and Wang, Jianfeng and Jiang, Zhengkai and Zong, Fuhang and Liu, Songtao and Li, Zeming and Sun, Jian},journal={arXiv preprint arXiv:2007.03496},year={2020},html={https://arxiv.org/abs/2007.03496},code={https://github.com/Megvii-BaseDetection/AutoAssign},selected={true}}
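The weighting described above can be pictured with a short sketch: a learnable, category-specific Gaussian prior over locations (Center Weighting) multiplied by a per-location confidence term (Confidence Weighting), normalized into positive weights for each instance. The shapes, names, and the Gaussian parameterization here are illustrative assumptions rather than the paper's exact formulation.

```python
# Hypothetical sketch of combining a learnable center prior with per-location
# confidence to produce positive weights; not the paper's exact formulation.
import torch
import torch.nn as nn

class CenterWeighting(nn.Module):
    """Learnable category-specific Gaussian prior over offsets to the GT center."""
    def __init__(self, num_classes):
        super().__init__()
        self.mu = nn.Parameter(torch.zeros(num_classes, 2))    # per-class offset mean
        self.sigma = nn.Parameter(torch.ones(num_classes, 2))  # per-class spread

    def forward(self, offsets, cls_idx):
        # offsets: (N, 2) location offsets from a GT's center; cls_idx: its class id.
        d = (offsets - self.mu[cls_idx]) / self.sigma[cls_idx]
        return torch.exp(-0.5 * (d ** 2).sum(dim=-1))          # (N,) prior weights

def positive_weights(center_prior, confidence):
    """confidence: (N,) per-location confidence (e.g., cls * loc score) in the GT box."""
    w = center_prior * confidence
    return w / w.sum().clamp(min=1e-6)  # normalized positive weights for one instance
```

Because both terms are differentiable, the assignment itself is learned end-to-end rather than hand-crafted, which is the key difference from fixed center-sampling or IoU-threshold rules.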
2019
arXiv
Class-balanced Grouping and Sampling for Point Cloud 3D Object Detection
Benjin Zhu, Zhengkai Jiang, Xiangxin Zhou, Zeming Li, and Gang Yu
This report presents our method, which won the nuScenes 3D Detection Challenge [17] held in the Workshop on Autonomous Driving (WAD, CVPR 2019). Generally, we utilize sparse 3D convolution to extract rich semantic features, which are then fed into a class-balanced multi-head network to perform 3D object detection. To handle the severe class imbalance problem inherent in autonomous driving scenarios, we design a class-balanced sampling and augmentation strategy to generate a more balanced data distribution. Furthermore, we propose a balanced grouping head to boost the performance for categories with similar shapes. Based on the Challenge results, our method outperforms the PointPillars [14] baseline by a large margin across all metrics, achieving state-of-the-art detection performance on the nuScenes dataset. Code will be released at CBGS.
@article{zhu2019class,abbr={arXiv},bibtex_show={true},title={Class-balanced Grouping and Sampling for Point Cloud 3D Object Detection},author={Zhu, Benjin and Jiang, Zhengkai and Zhou, Xiangxin and Li, Zeming and Yu, Gang},journal={arXiv preprint arXiv:1908.09492},year={2019},html={https://arxiv.org/abs/1908.09492},code={https://github.com/poodarchu/Det3D}}
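The class-balanced sampling part of the recipe boils down to duplicating training frames that contain rare classes so per-class frequencies even out over an epoch. The sketch below is a loose, hypothetical rendering of that idea; the actual strategy also involves augmentation and the grouped detection head, which it does not cover.

```python
# Hypothetical sketch of class-balanced frame duplication: frames containing
# rare classes are repeated so every class's expected frequency evens out.
import random
from collections import Counter

def class_balanced_indices(frame_classes, target_ratio=1.0):
    """frame_classes: list of sets, the object classes present in each frame.
    Returns frame indices with rare-class frames duplicated."""
    counts = Counter(c for frame in frame_classes for c in frame)
    max_count = max(counts.values())
    indices = []
    for i, frame in enumerate(frame_classes):
        # Duplicate a frame according to the rarest class it contains.
        rarest = min((counts[c] for c in frame), default=max_count)
        repeats = max(1, round(target_ratio * max_count / rarest))
        indices.extend([i] * repeats)
    random.shuffle(indices)
    return indices
```

On nuScenes, where cars outnumber classes like bicycles by orders of magnitude, this kind of resampling keeps the rare-class heads from being starved of examples during training.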
2018
KBS
Improving user recommendation by extracting social topics and interest topics of users in uni-directional social networks
Ke Xu, Xushen Zheng, Yi Cai, Huaqing Min, Zhen Gao, Benjin Zhu, Haoran Xie, and Tak-Lam Wong
@article{xu2018improving,abbr={KBS},bibtex_show={true},title={Improving user recommendation by extracting social topics and interest topics of users in uni-directional social networks},author={Xu, Ke and Zheng, Xushen and Cai, Yi and Min, Huaqing and Gao, Zhen and Zhu, Benjin and Xie, Haoran and Wong, Tak-Lam},journal={Knowledge-Based Systems},volume={140},pages={120--133},year={2018},publisher={Elsevier},html={https://www.sciencedirect.com/science/article/abs/pii/S0950705117305002}}