CUDA runtime error when running the detectron2 predictor

41 Views Asked by At

I get a RuntimeError when running detectron2's DefaultPredictor on CUDA:

"RuntimeError: CUDA error: device-side assert triggered"

On CPU everything works fine

Stacktrace:

Traceback (most recent call last):
  File "/home/biobit/app/eagles/src/utils/b/test.py", line 78, in <module>
    output = detector(frame)
             ^^^^^^^^^^^^^^^
  File "/home/biobit/anaconda3/lib/python3.11/site-packages/detectron2/engine/defaults.py", line 319, in __call__
    predictions = self.model([inputs])[0]
                  ^^^^^^^^^^^^^^^^^^^^
  File "/home/biobit/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/biobit/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/biobit/anaconda3/lib/python3.11/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward
    return self.inference(batched_inputs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/biobit/anaconda3/lib/python3.11/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 208, in inference
    proposals, _ = self.proposal_generator(images, features, None)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/biobit/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/biobit/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/biobit/anaconda3/lib/python3.11/site-packages/detectron2/modeling/proposal_generator/rpn.py", line 477, in forward
    proposals = self.predict_proposals(
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/biobit/anaconda3/lib/python3.11/site-packages/detectron2/modeling/proposal_generator/rpn.py", line 503, in predict_proposals
    return find_top_rpn_proposals(
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/biobit/anaconda3/lib/python3.11/site-packages/detectron2/modeling/proposal_generator/proposal_utils.py", line 106, in find_top_rpn_proposals
    if not valid_mask.all():
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Code:

import json
from detectron2.data import MetadataCatalog
from detectron2.data.catalog import Metadata
from detectron2.config import get_cfg, CfgNode
from detectron2.engine.defaults import DefaultPredictor

import cv2
from pathlib import Path
from time import sleep

import torch


# torch.backends.cudnn.enabled = False   # <-- If uncomment, all works fine

# Written to the config key INPUT.IMAGE_SIZE in get_config().
IMAGE_SIZE = 900
# Written to the config key MODEL.ROI_HEADS.NMS_THRESH_TEST in get_config().
NMS_THRESH_TEST = 0.1
# Written to the config key MODEL.ROI_HEADS.SCORE_THRESH_TEST in get_config().
SCORE_THRESH_TEST = 0.8
# Directory holding the model config and stored weights/metadata.
# The path is relative, so it resolves against the current working directory.
DETECTION_PATH = Path('../../detection/')
assert DETECTION_PATH.is_dir()


def load_meta() -> Metadata:
    """Load the model's ``meta.json`` and store it in the metadata catalog.

    Reads ``model_storage/faster_rcnn_50_cosine_001/meta.json`` under
    ``DETECTION_PATH`` and sets the image root, the class names and the
    dataset-id -> contiguous-id mapping on the ``'test'`` entry of
    ``MetadataCatalog``.

    Returns:
        The object returned by ``MetadataCatalog.get('test')``, after the
        values above have been set on it.

    Raises:
        AssertionError: If the metadata file does not exist.
        KeyError: If ``meta.json`` lacks one of the expected keys.
    """
    path = DETECTION_PATH / 'model_storage/faster_rcnn_50_cosine_001/meta.json'
    assert path.is_file()
    # Keep the file open only for as long as it takes to parse it.
    with open(path) as f:
        meta_dict = json.load(f)

    data = MetadataCatalog.get('test')
    data.set(
        image_root=meta_dict['image_root'],
        thing_classes=meta_dict['thing_classes'],
        # JSON object keys are always strings, so convert ids back to int.
        thing_dataset_id_to_contiguous_id={
            int(k): int(v)
            for k, v in meta_dict["thing_dataset_id_to_contiguous_id"].items()
        },
    )
    return data


def get_config(metadata: Metadata) -> CfgNode:
    """Assemble the detectron2 config used for inference.

    Starts from ``get_cfg()``, merges ``faster_rcnn_R_50_FPN_3x.yaml`` from
    ``DETECTION_PATH / 'config'``, then overrides the number of classes,
    output directory, weights path, image size and the ROI-head test
    thresholds.

    Args:
        metadata: Metadata whose ``thing_classes`` length becomes
            ``MODEL.ROI_HEADS.NUM_CLASSES``.

    Returns:
        The populated config node.

    Raises:
        AssertionError: If the weights file does not exist.
    """
    cfg = get_cfg()
    # The image size is assigned both before and after the YAML merge;
    # the statement order is kept exactly as in the original script.
    cfg.INPUT.IMAGE_SIZE = IMAGE_SIZE
    yaml_path = DETECTION_PATH / 'config' / 'faster_rcnn_R_50_FPN_3x.yaml'
    cfg.merge_from_file(yaml_path)
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = len(metadata.thing_classes)
    cfg.OUTPUT_DIR = '.'

    model_dir = DETECTION_PATH / 'model_storage' / 'faster_rcnn_50_cosine_001'
    checkpoint = model_dir / 'model_0009999.pth'
    assert checkpoint.is_file()
    cfg.MODEL.WEIGHTS = str(checkpoint)
    cfg.INPUT.IMAGE_SIZE = IMAGE_SIZE

    # Test-time thresholds for the ROI heads.
    roi_heads = cfg.MODEL.ROI_HEADS
    roi_heads.NMS_THRESH_TEST = NMS_THRESH_TEST
    roi_heads.SCORE_THRESH_TEST = SCORE_THRESH_TEST
    return cfg


# Build the predictor from the stored config and weights.
config = get_config(load_meta())
detector = DefaultPredictor(config)

video = cv2.VideoCapture('/zoo/eagles/error/camera-02/2024.03.13/16-40-05.mp4')
assert video.isOpened()
try:
    while video.isOpened():
        ret, frame = video.read()
        if not ret:
            # A failed read means no more frames could be grabbed. The
            # capture stays open afterwards, so `continue` would spin
            # forever; stop instead.
            break
        output = detector(frame)   # <-- errors here always on 2nd iteration
finally:
    # Release the capture even when the predictor raises.
    video.release()
  • OS: Ubuntu 22.04
  • Torch version: 2.2.1+cu121
  • torch.cuda.is_available(): True
  • detectron2 version: 0.6

The model was created and trained by my colleagues on another computer.

Does anyone at least have an opinion on whether this is a problem with the CUDA/cuDNN/PyTorch installation or a problem inside the saved model?

0

There are 0 best solutions below