
Commit 8999946

ppwwyyxx authored and facebook-github-bot committed
retinanet inference speedup
Reviewed By: theschnitz
Differential Revision: D23924255
fbshipit-source-id: ea85df04b0e56cc5ba7eeccb6d7d1f88300c896f
1 parent: 2618f32

File tree

- MODEL_ZOO.md
- detectron2/config/defaults.py
- detectron2/modeling/meta_arch/retinanet.py
- tools/benchmark.py

4 files changed: +20 −18 lines


MODEL_ZOO.md

+3 −3

@@ -219,7 +219,7 @@ All models available for download through this document are licensed under the
 <tr><td align="left"><a href="configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml">R50</a></td>
 <td align="center">1x</td>
 <td align="center">0.205</td>
-<td align="center">0.056</td>
+<td align="center">0.041</td>
 <td align="center">4.1</td>
 <td align="center">37.4</td>
 <td align="center">190397773</td>
@@ -229,7 +229,7 @@ All models available for download through this document are licensed under the
 <tr><td align="left"><a href="configs/COCO-Detection/retinanet_R_50_FPN_3x.yaml">R50</a></td>
 <td align="center">3x</td>
 <td align="center">0.205</td>
-<td align="center">0.056</td>
+<td align="center">0.041</td>
 <td align="center">4.1</td>
 <td align="center">38.7</td>
 <td align="center">190397829</td>
@@ -239,7 +239,7 @@ All models available for download through this document are licensed under the
 <tr><td align="left"><a href="configs/COCO-Detection/retinanet_R_101_FPN_3x.yaml">R101</a></td>
 <td align="center">3x</td>
 <td align="center">0.291</td>
-<td align="center">0.069</td>
+<td align="center">0.054</td>
 <td align="center">5.2</td>
 <td align="center">40.4</td>
 <td align="center">190397697</td>

detectron2/config/defaults.py

+1

@@ -442,6 +442,7 @@
 # Inference cls score threshold, only anchors with score > INFERENCE_TH are
 # considered for inference (to improve speed)
 _C.MODEL.RETINANET.SCORE_THRESH_TEST = 0.05
+# Select topk candidates before NMS
 _C.MODEL.RETINANET.TOPK_CANDIDATES_TEST = 1000
 _C.MODEL.RETINANET.NMS_THRESH_TEST = 0.5
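The added comment documents `TOPK_CANDIDATES_TEST` next to the other RetinaNet test-time knobs. A minimal sketch of adjusting these options on a config object, assuming only detectron2's standard `get_cfg()`; the values shown mirror the defaults and are illustrative, not recommendations.

```python
# Illustrative only: the RetinaNet test-time filtering knobs touched by this commit.
from detectron2.config import get_cfg

cfg = get_cfg()
# Only anchors with score > SCORE_THRESH_TEST are considered at inference.
cfg.MODEL.RETINANET.SCORE_THRESH_TEST = 0.05
# After thresholding, keep at most this many top-scoring candidates per
# feature level before running NMS.
cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST = 1000
# IoU threshold used by NMS on the surviving candidates.
cfg.MODEL.RETINANET.NMS_THRESH_TEST = 0.5
```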

detectron2/modeling/meta_arch/retinanet.py

+11 −10

@@ -341,19 +341,20 @@ def inference_single_image(self, anchors, box_cls, box_delta, image_size):
         # Iterate over every feature level
         for box_cls_i, box_reg_i, anchors_i in zip(box_cls, box_delta, anchors):
             # (HxWxAxK,)
-            box_cls_i = box_cls_i.flatten().sigmoid_()
+            predicted_prob = box_cls_i.flatten().sigmoid_()

-            # Keep top k top scoring indices only.
-            num_topk = min(self.topk_candidates, box_reg_i.size(0))
-            # torch.sort is actually faster than .topk (at least on GPUs)
-            predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
-            predicted_prob = predicted_prob[:num_topk]
-            topk_idxs = topk_idxs[:num_topk]
-
-            # filter out the proposals with low confidence score
+            # Apply two filtering below to make NMS faster.
+            # 1. Keep boxes with confidence score higher than threshold
             keep_idxs = predicted_prob > self.score_threshold
             predicted_prob = predicted_prob[keep_idxs]
-            topk_idxs = topk_idxs[keep_idxs]
+            topk_idxs = torch.nonzero(keep_idxs, as_tuple=True)[0]
+
+            # 2. Keep top k top scoring boxes only
+            num_topk = min(self.topk_candidates, topk_idxs.size(0))
+            # torch.sort is actually faster than .topk (at least on GPUs)
+            predicted_prob, idxs = predicted_prob.sort(descending=True)
+            predicted_prob = predicted_prob[:num_topk]
+            topk_idxs = topk_idxs[idxs[:num_topk]]

             anchor_idxs = topk_idxs // self.num_classes
             classes_idxs = topk_idxs % self.num_classes
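The speedup comes from reordering the two filters: thresholding first means the sort used for the top-k selection runs only over the anchors that clear `SCORE_THRESH_TEST` (typically a small fraction of the HxWxAxK candidates per level), instead of over the full flattened tensor. Below is a standalone sketch of that threshold-then-top-k pattern; the names (`scores`, `score_thresh`, `topk`) are local to the example, not attributes of the model.

```python
# Standalone illustration of the reordered filtering (not the model code):
# threshold first, then sort only the survivors for the top-k selection.
import torch


def filter_predictions(scores: torch.Tensor, score_thresh: float, topk: int):
    """scores: flattened (H*W*A*K,) per-anchor, per-class probabilities."""
    # 1. Keep candidates with confidence score higher than the threshold.
    keep_idxs = scores > score_thresh
    scores = scores[keep_idxs]
    idxs = torch.nonzero(keep_idxs, as_tuple=True)[0]

    # 2. Keep the top-k scoring candidates only; sorting the (much smaller)
    # thresholded tensor is cheaper than sorting everything up front.
    num_topk = min(topk, idxs.numel())
    scores, order = scores.sort(descending=True)
    return scores[:num_topk], idxs[order[:num_topk]]


probs = torch.rand(100_000) ** 8  # skewed so most scores are tiny, as in practice
top_scores, top_idxs = filter_predictions(probs, score_thresh=0.05, topk=1000)
```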

tools/benchmark.py

+5 −5

@@ -125,16 +125,16 @@ def benchmark_eval(args):
     cfg.defrost()
     cfg.DATALOADER.NUM_WORKERS = 0
     data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
-    dummy_data = list(itertools.islice(data_loader, 100))
+    dummy_data = DatasetFromList(list(itertools.islice(data_loader, 100)), copy=False)

     def f():
         while True:
-            yield from DatasetFromList(dummy_data, copy=False)
+            yield from dummy_data

-    for _ in range(5):  # warmup
-        model(dummy_data[0])
+    for k in range(5):  # warmup
+        model(dummy_data[k])

-    max_iter = 400
+    max_iter = 300
     timer = Timer()
     with tqdm.tqdm(total=max_iter) as pbar:
         for idx, d in enumerate(f()):
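The benchmark change builds the `DatasetFromList` wrapper once and reuses it, instead of re-wrapping the cached samples on every pass through the infinite generator, warms up on distinct samples, and lowers `max_iter` to 300. A generic sketch of the same hoisting pattern, with no detectron2 dependency; `CachedList` is a hypothetical stand-in for `DatasetFromList`, not a real API.

```python
# Generic illustration of the pattern used in the benchmark fix: build the
# (possibly expensive) wrapper around the cached samples once, outside the
# infinite generator, instead of reconstructing it on every pass.
# "CachedList" is a stand-in for detectron2's DatasetFromList, not a real API.
import itertools


class CachedList:
    def __init__(self, items, copy=False):
        # Pretend this does something non-trivial (serialization, pinning, ...).
        self._items = list(items) if copy else items

    def __len__(self):
        return len(self._items)

    def __getitem__(self, i):
        return self._items[i]

    def __iter__(self):
        return iter(self._items)


def make_stream(data_loader, n=100):
    # Wrap the first n samples once...
    dummy_data = CachedList(list(itertools.islice(data_loader, n)), copy=False)

    def stream():
        while True:
            # ...and only iterate over the prebuilt wrapper here.
            yield from dummy_data

    return dummy_data, stream


dummy_data, stream = make_stream(iter(range(1000)))
for k in range(5):  # warm up on distinct samples, as the patched benchmark does
    _ = dummy_data[k]
```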
