`timescale 1ns / 1ps
// Description: This device performs convolution operation with a 3x3 kernel.
//////////////////////////////////////////////////////////////////////////////////
module PE#(
bit_width = 3,
out_bit_width = 9
)(
input logic ready,
input logic clock,
input logic signed [bit_width-1:0] weight [2:0][2:0],
input logic unsigned [bit_width-1:0] fmap [2:0][2:0],
output logic signed [out_bit_width-1:0] sum
);
logic signed [2*bit_width:0] intermediate1;
logic signed [2*bit_width:0] intermediate2;
logic signed [2*bit_width:0] intermediate3;
logic signed [2*bit_width:0] intermediate4;
logic signed [2*bit_width+1:0] intermediate5;
logic signed [2*bit_width+1:0] intermediate6;
logic signed [2*bit_width+2:0] intermediate7;
logic signed [2*bit_width-1:0] psum [2:0][2:0];
always_ff @(posedge clock)
begin
if (ready == 1'b1)
begin
psum[0][0] <= weight[0][0] * fmap[0][0];
psum[0][1] <= weight[0][1] * fmap[0][1];
psum[0][2] <= weight[0][2] * fmap[0][2];
psum[1][0] <= weight[1][0] * fmap[1][0];
psum[1][1] <= weight[1][1] * fmap[1][1];
psum[1][2] <= weight[1][2] * fmap[1][2];
psum[2][0] <= weight[2][0] * fmap[2][0];
psum[2][1] <= weight[2][1] * fmap[2][1];
psum[2][2] <= weight[2][2] * fmap[2][2];
intermediate1 <= psum[0][0] + psum[0][1];
intermediate2 <= psum[0][2] + psum[1][0];
intermediate3 <= psum[1][1] + psum[1][2];
intermediate4 <= psum[2][0] + psum[2][1];
intermediate5 <= intermediate1 + intermediate2;
intermediate6 <= intermediate3 + intermediate4;
intermediate7 <= intermediate5 + intermediate6;
sum <= psum[2][2] + intermediate7;
end
end
endmodule //PE
`timescale 1ns / 1ps
//////////////////////////////////////////////////////////////////////////////////
module PE_tb();
logic [2:0] weight [2:0][2:0];
logic [2:0] inputfmap [2:0][2:0];
logic [2:0] weight [2:0][2:0];
logic [8:0] sum;
logic clock;
logic ready;
PE PE1 (ready, clock, weight, inputfmap,sum);
initial begin
weight[0][0] = 3'b1;
weight[0][1] = 3'b1;
weight[0][2] = 3'b1;
weight[1][0] = 3'b1;
weight[1][1] = 3'b1;
weight[1][2] = 3'b1;
weight[2][0] = 3'b1;
weight[2][1] = 3'b1;
weight[2][2] = 3'b1;
inputfmap[0][0] = 3'b0;
inputfmap[0][1] = 3'b0;
inputfmap[0][2] = 3'b0;
inputfmap[1][0] = 3'b0;
inputfmap[1][1] = 3'b0;
inputfmap[1][2] = 3'b0;
inputfmap[2][0] = 3'b0;
inputfmap[2][1] = 3'b0;
inputfmap[2][2] = 3'b0;
clock = 1'b0;
ready = 1'b0;
# 25
weight[0][0] = 3'd1;
weight[0][1] = 3'd1;
weight[0][2] = 3'd1;
weight[1][0] = 3'd1;
weight[1][1] = 3'd1;
weight[1][2] = 3'd1;
weight[2][0] = 3'd1;
weight[2][1] = 3'd1;
weight[2][2] = 3'd1;
inputfmap[0][0] = 3'd1;
inputfmap[0][1] = 3'd1;
inputfmap[0][2] = 3'd1;
inputfmap[1][0] = 3'd1;
inputfmap[1][1] = 3'd1;
inputfmap[1][2] = 3'd1;
inputfmap[2][0] = 3'd1;
inputfmap[2][1] = 3'd1;
inputfmap[2][2] = 3'd1;
# 25
weight[0][0] = 3'd1;
weight[0][1] = 3'd1;
weight[0][2] = 3'd1;
weight[1][0] = 3'd1;
weight[1][1] = 3'd1;
weight[1][2] = 3'd1;
weight[2][0] = 3'd1;
weight[2][1] = 3'd1;
weight[2][2] = 3'd1;
inputfmap[0][0] = 3'd2;
inputfmap[0][1] = 3'd2;
inputfmap[0][2] = 3'd2;
inputfmap[1][0] = 3'd2;
inputfmap[1][1] = 3'd2;
inputfmap[1][2] = 3'd2;
inputfmap[2][0] = 3'd2;
inputfmap[2][1] = 3'd2;
inputfmap[2][2] = 3'd2;
# 25
weight[0][0] = 3'd1;
weight[0][1] = 3'd1;
weight[0][2] = 3'd1;
weight[1][0] = 3'd1;
weight[1][1] = 3'd1;
weight[1][2] = 3'd1;
weight[2][0] = 3'd1;
weight[2][1] = 3'd1;
weight[2][2] = 3'd1;
inputfmap[0][0] = 3'd3;
inputfmap[0][1] = 3'd3;
inputfmap[0][2] = 3'd3;
inputfmap[1][0] = 3'd3;
inputfmap[1][1] = 3'd3;
inputfmap[1][2] = 3'd3;
inputfmap[2][0] = 3'd3;
inputfmap[2][1] = 3'd3;
inputfmap[2][2] = 3'd3;
#30 ready = 1'b1;
#100 $stop;
end
always @* begin
#10 clock = ~clock; // Clock Period = 20
#100 $stop;
end
endmodule
This convolutional engine performs a 3x3 convolution on a 3x3 input feature map with //a 3x3 weight or kernel matrix. This performs element-wise multiplication of each element and then adds up the 9 multiplication results to provide the convolutional sum as output.
Here the convolution is performed on the posedge
clock
when ready
= 1. If ready
= 0, the convolution is not performed.
For some reason, when I run the test bench, the output is XXX instead of a number. I am not sure where I went wrong, but the code isn't working as I had expected.
The simulation did not run long enough because of the
$stop
in thealways
block forclock
. Also, you should not use a sensitivity list (@*
) for that block. When I use the following for the clock, I seesum
become a known value ('h1b
) at time 190ns:If you didn't get any warnings when you compiled your code, you should try different simulators like the ones on edaplayground. When I run your code on VCS, I get this warning inside module
PE_tb
:I also get more compile errors with Cadence.