I get a RuntimeError when running detectron2's DefaultPredictor on CUDA:
"RuntimeError: CUDA error: device-side assert triggered"
On the CPU everything works fine.
Stacktrace:
Traceback (most recent call last):
File "/home/biobit/app/eagles/src/utils/b/test.py", line 78, in <module>
output = detector(frame)
^^^^^^^^^^^^^^^
File "/home/biobit/anaconda3/lib/python3.11/site-packages/detectron2/engine/defaults.py", line 319, in __call__
predictions = self.model([inputs])[0]
^^^^^^^^^^^^^^^^^^^^
File "/home/biobit/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/biobit/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/biobit/anaconda3/lib/python3.11/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward
return self.inference(batched_inputs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/biobit/anaconda3/lib/python3.11/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 208, in inference
proposals, _ = self.proposal_generator(images, features, None)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/biobit/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/biobit/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/biobit/anaconda3/lib/python3.11/site-packages/detectron2/modeling/proposal_generator/rpn.py", line 477, in forward
proposals = self.predict_proposals(
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/biobit/anaconda3/lib/python3.11/site-packages/detectron2/modeling/proposal_generator/rpn.py", line 503, in predict_proposals
return find_top_rpn_proposals(
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/biobit/anaconda3/lib/python3.11/site-packages/detectron2/modeling/proposal_generator/proposal_utils.py", line 106, in find_top_rpn_proposals
if not valid_mask.all():
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
Code:
import json
from detectron2.data import MetadataCatalog
from detectron2.data.catalog import Metadata
from detectron2.config import get_cfg, CfgNode
from detectron2.engine.defaults import DefaultPredictor
import cv2
from pathlib import Path
from time import sleep
import torch
# torch.backends.cudnn.enabled = False # <-- If uncomment, all works fine
# Values written into the detectron2 config by get_config().
IMAGE_SIZE = 900
NMS_THRESH_TEST = 0.1
SCORE_THRESH_TEST = 0.8
# Root of the detection project; a relative path, so it is resolved against
# the current working directory.
DETECTION_PATH = Path('../../detection/')
# Fail fast at import time if the relative path does not resolve to a directory.
assert DETECTION_PATH.is_dir()
def load_meta() -> Metadata:
    """Read the model's ``meta.json`` and store its fields on the ``'test'`` metadata entry.

    The JSON file is expected to provide the keys ``image_root``,
    ``thing_classes`` and ``thing_dataset_id_to_contiguous_id``.

    Returns:
        The object returned by ``MetadataCatalog.get('test')`` after the
        three fields have been set on it.

    Raises:
        AssertionError: if ``meta.json`` is not found under ``DETECTION_PATH``.
        KeyError: if one of the expected keys is missing from the JSON file.
    """
    path = DETECTION_PATH / 'model_storage/faster_rcnn_50_cosine_001/meta.json'
    assert path.is_file()
    with open(path) as f:
        meta_dict = json.load(f)
    data = MetadataCatalog.get('test')
    data.set(
        image_root=meta_dict['image_root'],
        thing_classes=meta_dict['thing_classes'],
        # JSON object keys are always strings, so convert the id mapping
        # back to int -> int.
        thing_dataset_id_to_contiguous_id={
            int(k): int(v) for k, v in meta_dict["thing_dataset_id_to_contiguous_id"].items()
        },
    )
    return data
def get_config(metadata: Metadata) -> CfgNode:
    """Assemble the detectron2 config used to run the trained detector.

    Starts from ``get_cfg()``, merges the project's
    ``faster_rcnn_R_50_FPN_3x.yaml`` and then applies the checkpoint path,
    the class count and the module-level threshold/size constants.

    Args:
        metadata: metadata whose ``thing_classes`` length becomes
            ``MODEL.ROI_HEADS.NUM_CLASSES``.

    Returns:
        The populated config node.

    Raises:
        AssertionError: if the checkpoint file does not exist.
    """
    cfg = get_cfg()

    # IMAGE_SIZE is assigned both before and after the merge; the second
    # assignment guarantees the constant wins over anything the YAML sets.
    # NOTE(review): presumably the first assignment exists so the key is
    # present when the YAML is merged -- confirm against the YAML file.
    cfg.INPUT.IMAGE_SIZE = IMAGE_SIZE
    cfg.merge_from_file(DETECTION_PATH / 'config' / 'faster_rcnn_R_50_FPN_3x.yaml')

    cfg.MODEL.ROI_HEADS.NUM_CLASSES = len(metadata.thing_classes)
    cfg.OUTPUT_DIR = '.'

    model_dir = DETECTION_PATH / 'model_storage' / 'faster_rcnn_50_cosine_001'
    checkpoint = model_dir / 'model_0009999.pth'
    assert checkpoint.is_file()
    cfg.MODEL.WEIGHTS = str(checkpoint)

    cfg.INPUT.IMAGE_SIZE = IMAGE_SIZE
    roi_heads = cfg.MODEL.ROI_HEADS
    roi_heads.NMS_THRESH_TEST = NMS_THRESH_TEST
    roi_heads.SCORE_THRESH_TEST = SCORE_THRESH_TEST
    return cfg
# Build the predictor once, then feed it the video frame by frame.
config = get_config(load_meta())
detector = DefaultPredictor(config)
video = cv2.VideoCapture('/zoo/eagles/error/camera-02/2024.03.13/16-40-05.mp4')
assert video.isOpened()
try:
    while video.isOpened():
        ret, frame = video.read()
        if not ret:
            # read() reports failure once no more frames can be grabbed, but
            # the capture stays "opened"; `continue` here would loop forever.
            break
        output = detector(frame)  # <-- errors here always on 2nd iteration
finally:
    # Release the capture even when the detector raises.
    video.release()
- OS: Ubuntu 22.04
- Torch version: 2.2.1+cu121
- torch.cuda.is_available(): True
- detectron2 version: 0.6
The model was created and trained by my colleagues on another computer.
Does anyone at least have an opinion on whether this is a problem with the CUDA/cuDNN/PyTorch installation or a problem inside the saved model?