When I use topi.nn.upsampling, I get an InternalError

18 Views Asked by At
import math
import numpy as np
import tvm
from tvm import te
from tvm import autotvm
from tvm import topi
from vta.top.utils import is_packed_layout
from vta.environment import get_env




@autotvm.register_topi_compute("upsampling_packed.vta")
def upsampling(
    cfg,
    data,
    scale_h,
    scale_w,
    layout="NCHW",
    method="nearest_neighbor",
    align_corners=False,
    output_shape=None,
):
    """Upsampling compute for VTA's packed tensor layout.

    Resizes the two spatial axes of ``data`` by ``scale_h`` / ``scale_w``
    through ``topi.nn.upsampling``, then appends an identity-add stage and
    a cast to ``env.out_dtype``.

    NOTE(review): ``data`` is assumed to be in a 6-axis packed layout
    (outer batch, outer channel, height, width, inner batch, inner
    channel) — its shape is unpacked that way below; confirm against the
    caller.  ``output_shape`` is accepted for interface compatibility but
    unused.
    """
    env = get_env()
    # Six packed axes: outer batch/channel, spatial dims, inner batch/channel.
    b_o, c_o, h_in, w_in, b_i, c_i = data.shape
    h_out = int(h_in.value * scale_h)
    w_out = int(w_in.value * scale_w)
    out_shape = (b_o, c_o, h_out, w_out, b_i, c_i)

    resized = topi.nn.upsampling(
        data,
        scale_h=scale_h,
        scale_w=scale_w,
        layout=layout,
        method=method,
        align_corners=align_corners,
    )

    # Identity add ("+ 0") so the resize result flows through a stage the
    # schedule can map onto the VTA ALU / accumulator scope.
    summed = te.compute(
        out_shape,
        lambda *idx: resized(*idx) + 0,
        name="upsampling_add",
        tag="upsampling_add",
    )

    # Cast down to the device output dtype.  (Original author's note,
    # translated: returning the raw int32 data led to all-zero trailing
    # values; casting the result to out_dtype resolves it.)
    result = te.compute(
        out_shape,
        lambda *idx: summed(*idx).astype(env.out_dtype),
        name="upsampling_res",
        tag="upsampling_res",
    )
    return result

@autotvm.register_topi_schedule("upsampling_packed.vta")
def schedule_upsampling_packed(cfg, outs, layout=None):
    """AutoTVM schedule for the packed upsampling compute above.

    Tiles the output, stages the intermediate resize/add tensors in the
    VTA accumulator scope, and tags axes with DMA-copy / ALU pragmas.
    Expects exactly one integer-typed output whose op chain is
    output -> upsampling_add -> resize (see the asserts and the
    ``input_tensors`` traversal below).
    """
    # assert layout == "NCHW", "Only NCHW layout is supported for upsampling"
    assert len(outs) == 1
    
    env = get_env()
    output = outs[0]# output is the computed tensor to be scheduled/optimized.
    const_ops = []
    ewise_inputs = []
    ewise_ops = []
    upsample_res = []
    # The stage feeding the output must be integer-typed (VTA ALU data).
    assert "int" in output.op.input_tensors[0].dtype


    # _traverse(output.op)    

    # Walk the op tree produced by the compute: output <- add <- resize.
    # data_buff_op = output.op.input_tensors[0].op.input_tensors[0].op.input_tensors[0] #  data_buff_add
    resize_op = output.op.input_tensors[0].op.input_tensors[0]
    upsample_packed = output.op.input_tensors[0]
    # print(f'data_buff_op:{data_buff_op}')
    print(f'resize_op:{resize_op}')
    print(f'upsample_packed:{upsample_packed}')

    
    batch, channels, in_height, in_width, batch_inner, channels_inner  = output.shape
    # # Create schedule
    s = te.create_schedule(output.op)
    # # # # Reorder axes
    bo, co, h, w, bi, ci= s[resize_op].op.axis
    # Tuning knobs: 2-way splits on each axis plus virtual-thread counts.
    cfg.define_split("tile_bo", bo, num_outputs=2)
    cfg.define_split("tile_co", co, num_outputs=2)
    cfg.define_split("tile_h", h, num_outputs=2)
    cfg.define_split("tile_w", w, num_outputs=2) 
    cfg.define_split("title_bi", bi, num_outputs=2)  # NOTE(review): knob name "title_bi" looks like a typo for "tile_bi"; it is defined but never applied, so it is currently inert
    cfg.define_split("tile_ci", ci, num_outputs=2)
    cfg.define_knob("oc_nthread", [1, 2])
    cfg.define_knob("h_nthread", [1, 2])

    # Tile the final output stage on channel/height/width.
    x_bo, x_co, x_h, x_w, x_bi, x_ci = s[output].op.axis
    x_co0, x_co1= cfg["tile_co"].apply(s, output, x_co)
    x_i0, x_i1 = cfg["tile_h"].apply(s, output, x_h)
    x_j0, x_j1 = cfg["tile_w"].apply(s, output, x_w)
    s[output].reorder(x_bo, x_i0, x_co0,  x_j0, x_co1, x_i1, x_j1, x_bi, x_ci)
    # Intermediate stages are computed at this outer-height tile point.
    store_pt = x_i0
    # # x_co0
    # # x_i0
    
    # Stage intermediates in the accumulator scope (on-chip acc buffer).
    # s[data_buff_op].set_scope(env.acc_scope)
    s[resize_op].set_scope(env.acc_scope)
    s[upsample_packed].set_scope(env.acc_scope)
    
    
    # Apply the same channel/height/width tiling to the resize stage.
    x_co0_, x_co1_= cfg["tile_co"].apply(s, resize_op, co)
    x_i0_, x_i1_ = cfg["tile_h"].apply(s, resize_op, h)
    x_j0_, x_j1_ = cfg["tile_w"].apply(s, resize_op, w)      
    s[resize_op].reorder(bo, x_i0_, x_co0_, x_j0_, x_co1_, x_i1_, x_j1_, bi, ci)



    # Anchor both intermediates under the output's outer-height tile.
    # s[data_buff_op].compute_at(s[upsample_packed], x_i0_)
    s[resize_op].compute_at(s[output], store_pt)
    s[upsample_packed].compute_at(s[output], store_pt)
    
    # resize -> DMA load into acc scope; add stage -> VTA ALU op.
    # s[data_buff_op].pragma(s[data_buff_op].op.axis[0], env.dma_copy)
    s[resize_op].pragma(bi, env.dma_copy)
    s[upsample_packed].pragma(s[upsample_packed].op.axis[0], env.alu)


    # # store_out = env.dma_copy
    # # # (Translated) Before copying the result out to DRAM, the computed data must first be staged in output memory (VTA has inp_memory, acc_memory, wgt_memory and output_memory scopes).
    s[output].pragma(x_bi, env.dma_copy)
    
    return s

0: tvm::tir::CopyIntrinInjector::VisitStmt_(tvm::tir::AttrStmtNode const*) at /home/lay/tvm/src/tir/transforms/inject_copy_intrin.cc:51 resize[i3_outer * 32 + i5] = T.Cast("int32", T.Cast("float32", data[T.max(T.min(T.Cast("int32", T.round(T.float32(0.48936170339584351) * T.Cast("float32", i2_outer))), 23), 0) * 768 + T.max(T.min(T.Cast("int32", T.round(T.float32(0.48936170339584351) * T.Cast("float32", i3_outer))), 23), 0) * 32 + i5])) i2_outer = T.int32() data = T.Buffer((18432,), "int32") i3_outer = T.int32() resize = T.Buffer((1536,), "int32", scope="local.acc_buffer", align=32) File "/home/lay/tvm/src/tir/transforms/inject_copy_intrin.cc", line 51 InternalError: Check failed: (MatchCopyPattern(op->body, &ret, &error_info)) is false: Cannot match copy pattern. The error is the 'BufferLoadNode' of body is a nullptr. The body is for i5 in range(32):

When I write my own te.compute instead (below), it works around the error, but it generates a large number of store instructions.

    output = te.compute(
        (batch, channels, out_height, out_width, in_batch, in_channels),
        lambda b_o, c_o, i, j, b_i, c_i: data[b_o, c_o, te.round(te.floordiv(i, scale_h)).astype(env.acc_dtype), te.round(te.floordiv(j, scale_w)).astype(env.acc_dtype), b_i, c_i],
        # .astype(env.out_dtype), # 因为直接展示的int32的数据导致出现了后面数据全部为0的情况,请数据返回类型改为out_dtype就可以解决
        name="upsampling_packed",
        tag="upsampling_packed",
    )
0

There are 0 best solutions below