1 | 2 | 3 | 4 | 5 |
6 | 7 | 8 | 9 | 10 |
11 | 12 | 13 | 14 | 15 |
16 | 17 | 18 | 19 | 20 |
21 | 22 | 23 | 24 | 25 |
1 | 2 | 3 | 4 | 5 |
6 | 7 | 8 | 9 | 10 |
11 | 12 | 13 | 14 | 15 |
16 | 17 | 18 | 19 | 20 |
21 | 22 | 23 | 24 | 25 |
1 | 2 | 3 | 4 | 5 |
6 | 7 | 8 | 9 | 10 |
11 | 12 | 13 | 14 | 15 |
16 | 17 | 18 | 19 | 20 |
21 | 22 | 23 | 24 | 25 |
1 | 2 | 3 | 4 | 5 |
6 | 7 | 8 | 9 | 10 |
11 | 12 | 13 | 14 | 15 |
16 | 17 | 18 | 19 | 20 |
21 | 22 | 23 | 24 | 25 |
a1 | b1 | c1 |
d1 | e1 | f1 |
g1 | h1 | i1 |
a1 | a2 | a3 | a4 |
a1 | a2 | a3 | a4 |
a1 | a2 | a3 | a4 |
a1 | a2 | a3 | a4 |
a1 | b1 | c1 |
d1 | e1 | f1 |
g1 | h1 | i1 |
a1 | b1 | c1 |
d1 | e1 | f1 |
g1 | h1 | i1 |
a1 | b1 | c1 |
d1 | e1 | f1 |
g1 | h1 | i1 |
a2 | b2 | c2 |
d2 | e2 | f2 |
g2 | h2 | i2 |
a2 | b2 | c2 |
d2 | e2 | f2 |
g2 | h2 | i2 |
a2 | b2 | c2 |
d2 | e2 | f2 |
g2 | h2 | i2 |
a2 | b2 | c2 |
d2 | e2 | f2 |
g2 | h2 | i2 |
13 | 12 | 11 | 8 | 7 | 6 | 3 | 2 | 1 |
13 | 12 | 11 | 8 | 7 | 6 | 3 | 2 | 1 |
13 | 12 | 11 | 8 | 7 | 6 | 3 | 2 | 1 |
13 | 12 | 11 | 8 | 7 | 6 | 3 | 2 | 1 |
l1 | m1 | n1 |
p1 | q1 | r1 |
x1 | y1 | z1 |
l2 | m2 | n2 |
p2 | q2 | r2 |
x2 | y2 | z2 |
l3 | m3 | n3 |
p3 | q3 | r3 |
x3 | y3 | z3 |
l4 | m4 | n4 |
p4 | q4 | r4 |
x4 | y4 | z4 |
l1 | m1 | n1 | p1 | q1 | r1 | x1 | y1 | z1 |
---|
l2 | m2 | n2 | p2 | q2 | r2 | x2 | y2 | z2 |
---|
l3 | m3 | n3 | p3 | q3 | r3 | x3 | y3 | z3 |
---|
l4 | m4 | n4 | p4 | q4 | r4 | x4 | y4 | z4 |
---|
Ifmap
Ofmap
Weights
Feeding inputs to array
Collecting outputs from array
Systolic
array
// ALU_params: record of the fields that parameterise one ALU operation.
// Polymorphic in two numeric types: `a` is passed to SRAM_index (used by both
// address fields) and `b` is passed to Pad_bits (the trailing padding field).
// The number ending each field comment is that field's annotated width; the
// listed widths add up to the 120 noted on the opening line.
// NOTE(review): widths are presumably in bits, and the 15 / 23 figures
// presumably reflect one particular choice of `a` / `b` -- confirm against
// the instantiation.
typedef struct { // 120 total (sum of the per-field widths below)
ALU_Opcode alu_opcode; // operation selector // 2
SRAM_index#(a) input_address; // 15
SRAM_index#(a) output_address; // 15
Dim1 output_height; // OH' (output height) // 8
Dim1 output_width; // OW' (output width) // 8
Dim2 window_height; // R (window height) // 4
Dim2 window_width; // S (window width) // 4
Dim1 mem_stride_OW; // S_OW // 8
Dim1 mem_stride_R; // S_R // 8
Dim1 mem_stride_S; // S_S // 8
Dim1 num_active; // Number of filters (M) // 8
Bool use_immediate; // presumably selects immediate_value as an operand -- confirm // 1
Dim1 immediate_value; // 8
Pad_bits#(b) padding; // presumably filler that rounds the record up to 120 -- confirm // 23
} ALU_params#(numeric type a, numeric type b) deriving(Bits, Eq, FShow);
{
Max,
addr_1,
addr_1,
OH,
OW,
1,
1,
1, // doesn't matter
1,
1,
32,
True,
0,
<>
}
// ALU_params: record of the fields that parameterise one ALU operation.
// Polymorphic in two numeric types: `a` is passed to SRAM_index (used by both
// address fields) and `b` is passed to Pad_bits (the trailing padding field).
// The number ending each field comment is that field's annotated width; the
// listed widths add up to the 120 noted on the opening line.
// NOTE(review): widths are presumably in bits, and the 15 / 23 figures
// presumably reflect one particular choice of `a` / `b` -- confirm against
// the instantiation.
typedef struct { // 120 total (sum of the per-field widths below)
ALU_Opcode alu_opcode; // operation selector // 2
SRAM_index#(a) input_address; // 15
SRAM_index#(a) output_address; // 15
Dim1 output_height; // OH' (output height) // 8
Dim1 output_width; // OW' (output width) // 8
Dim2 window_height; // R (window height) // 4
Dim2 window_width; // S (window width) // 4
Dim1 mem_stride_OW; // S_OW // 8
Dim1 mem_stride_R; // S_R // 8
Dim1 mem_stride_S; // S_S // 8
Dim1 num_active; // Number of filters (M) // 8
Bool use_immediate; // presumably selects immediate_value as an operand -- confirm // 1
Dim1 immediate_value; // 8
Pad_bits#(b) padding; // presumably filler that rounds the record up to 120 -- confirm // 23
} ALU_params#(numeric type a, numeric type b) deriving(Bits, Eq, FShow);
{
Max,
addr_1,
addr_2,
OH,
OW,
R,
S,
OW-S+1,
Sy,
Sx*OW,
32,
False,
0,
<>
}
// ALU_params: record of the fields that parameterise one ALU operation.
// Polymorphic in two numeric types: `a` is passed to SRAM_index (used by both
// address fields) and `b` is passed to Pad_bits (the trailing padding field).
// The number ending each field comment is that field's annotated width; the
// listed widths add up to the 120 noted on the opening line.
// NOTE(review): widths are presumably in bits, and the 15 / 23 figures
// presumably reflect one particular choice of `a` / `b` -- confirm against
// the instantiation.
typedef struct { // 120 total (sum of the per-field widths below)
ALU_Opcode alu_opcode; // operation selector // 2
SRAM_index#(a) input_address; // 15
SRAM_index#(a) output_address; // 15
Dim1 output_height; // OH' (output height) // 8
Dim1 output_width; // OW' (output width) // 8
Dim2 window_height; // R (window height) // 4
Dim2 window_width; // S (window width) // 4
Dim1 mem_stride_OW; // S_OW // 8
Dim1 mem_stride_R; // S_R // 8
Dim1 mem_stride_S; // S_S // 8
Dim1 num_active; // Number of filters (M) // 8
Bool use_immediate; // presumably selects immediate_value as an operand -- confirm // 1
Dim1 immediate_value; // 8
Pad_bits#(b) padding; // presumably filler that rounds the record up to 120 -- confirm // 23
} ALU_params#(numeric type a, numeric type b) deriving(Bits, Eq, FShow);
{
Add,
addr_1,
addr_2,
OH,
OW,
OH,
OW,
1,
1,
1, //doesn't matter
32,
False,
0,
<>
}
wgt, 0000
(0,0), 1000
inp, 0001
pop prev, push prev, pop next, push next
(0,1), 0000
(0,2), 0000
(2,2), 0000
(0,0), 1000
relu, 1000
out, 1000
inp, 0001
(0,1), 0000
(0,2), 0000
(2,2), 0001
BN, 0001
wgt, 0000
(0,0), 1000
inp, 0001
pop prev, push prev, pop next, push next
(0,1), 0000
(0,2), 0000
(2,2), 0100
(0,0), 1000
relu, 1000
out, 1000
inp, 0011
(0,1), 0000
(0,2), 0000
(2,2), 0001
BN, 0001
wgt, 0000
(0,0), 1000
relu, 1000
out, 1000
inp, 0001
pop prev, push prev,
pop next, push next
(0,1), 0000
(0,2), 0000
(2,2), 0001
First 32 filters
BN, 0001
(0,0), 0000
(0,1), 0000
(0,2), 0000
(2,2), 0001
Last 32 filters
relu, 1000
BN, 0001
out, 1000
wgt, 0000
(0,0), 1000
relu, 1000
out, 1000
inp, 0001
pop prev, push prev,
pop next, push next
(0,1), 0000
(0,2), 0000
(2,2), 0101
First 32 filters
BN, 0001
(0,0), 1000
(0,1), 0000
(0,2), 0000
(2,2), 0001
Last 32 filters
wgt, 0011
relu, 1000
BN, 0001
out, 1000
wgt, 0000
(0,0), 1000
relu, 1000
out, 1000
inp, 0001
pop prev, push prev,
pop next, push next
(0,1), 0000
(0,2), 0000
(2,2), 0101
First 32 filters
BN, 0001
(0,0), 1000
(0,1), 0000
(0,2), 0000
(2,2), 0001
Last 32 filters
wgt, 0011
relu, 1000
BN, 0001
out, 1000
wgt, 0000
(0,0), 1000
relu, 1000
out, 1000
inp, 0001
pop prev, push prev,
pop next, push next
(0,1), 0000
(0,2), 0000
First 32 filters
BN, 0101
(0,1), 0000
(0,2), 0000
(2,2), 0001
Last 32 filters
relu, 1000
BN, 0001
out, 1000
wgt, 0011
(2,2), 0101
(0,0), 1010
wgt, 0000
(0,0), 1000
relu, 1001
out, 1100
inp, 0001
pop prev, push prev,
pop next, push next
(0,1), 0000
(0,2), 0000
First 32 filters
(0,1), 0000
(0,2), 0000
(2,2), 0001
Last 32 filters
relu, 1000
out, 1000
nop, 0110
wgt, 0011
(2,2), 0101
(0,0), 1010
Consists of three computational axes: row, column, and time
F: 0 - 31 | F: 0 - 31 | F: 32 - 63 | F: 32 - 63 | |
C: 0 - 31 | (0, 0) | (0, 1) | (0, 0) | (0, 1) |
C: 0 - 31 | (1, 0) | (1, 1) | (1, 0) | (1, 1) |
C: 32 - 63 | (0, 0) | (0, 1) | (0, 0) | (0, 1) |
C: 32 - 63 | (1, 0) | (1, 1) | (1, 0) | (1, 1) |
F: 0 - 31 | F: 0 - 31 | F: 32 - 63 | F: 32 - 63 | |
C: 0 - 31 | (0, 0) | (0, 1) | (0, 0) | (0, 1) |
C: 0 - 31 | (1, 0) | (1, 1) | (1, 0) | (1, 1) |
C: 32 - 63 | (0, 0) | (0, 1) | (0, 0) | (0, 1) |
C: 32 - 63 | (1, 0) | (1, 1) | (1, 0) | (1, 1) |
outputs: same as previous fold
inputs: same as previous fold
weights: 32 x 32 new weights
F: 0 - 31 | F: 0 - 31 | F: 32 - 63 | F: 32 - 63 | |
C: 0 - 31 | (0, 0) | (0, 1) | (0, 0) | (0, 1) |
C: 0 - 31 | (1, 0) | (1, 1) | (1, 0) | (1, 1) |
C: 32 - 63 | (0, 0) | (0, 1) | (0, 0) | (0, 1) |
C: 32 - 63 | (1, 0) | (1, 1) | (1, 0) | (1, 1) |
outputs: different from previous fold
inputs: same as previous fold
weights: 32 x 32 new weights
F: 0 - 31 | F: 0 - 31 | F: 32 - 63 | F: 32 - 63 | |
C: 0 - 31 | (0, 0) | (0, 1) | (0, 0) | (0, 1) |
C: 0 - 31 | (1, 0) | (1, 1) | (1, 0) | (1, 1) |
C: 32 - 63 | (0, 0) | (0, 1) | (0, 0) | (0, 1) |
C: 32 - 63 | (1, 0) | (1, 1) | (1, 0) | (1, 1) |
outputs: same as previous fold
inputs: different from previous fold
weights: 32 x 32 new weights
F: 0 - 31 | F: 0 - 31 | F: 0 - 31 | F: 0 - 31 | F: 32 - 63 | F: 32 - 63 | F: 32 - 63 | F: 32 - 63 | |
C: 0 - 31 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 |
C: 0 - 31 | (1, 0), 0-3 | (1,0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 | (1, 0), 0-3 | (1, 0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 |
C: 32 -63 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 |
C: 32 -63 | (1, 0), 0-3 | (1,0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 | (1, 0), 0-3 | (1, 0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 |
F: 0 - 31 | F: 0 - 31 | F: 0 - 31 | F: 0 - 31 | F: 32 - 63 | F: 32 - 63 | F: 32 - 63 | F: 32 - 63 | |
C: 0 - 31 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 |
C: 0 - 31 | (1, 0), 0-3 | (1,0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 | (1, 0), 0-3 | (1, 0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 |
C: 32 -63 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 |
C: 32 -63 | (1, 0), 0-3 | (1,0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 | (1, 0), 0-3 | (1, 0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 |
outputs: same as previous fold
inputs: same as previous fold
weights: 32 x 32 new weights
F: 0 - 31 | F: 0 - 31 | F: 0 - 31 | F: 0 - 31 | F: 32 - 63 | F: 32 - 63 | F: 32 - 63 | F: 32 - 63 | |
C: 0 - 31 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 |
C: 0 - 31 | (1, 0), 0-3 | (1,0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 | (1, 0), 0-3 | (1, 0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 |
C: 32 -63 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 |
C: 32 -63 | (1, 0), 0-3 | (1,0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 | (1, 0), 0-3 | (1, 0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 |
outputs: store prev output, then write
inputs: same as previous fold
weights: same as previous fold
F: 0 - 31 | F: 0 - 31 | F: 0 - 31 | F: 0 - 31 | F: 32 - 63 | F: 32 - 63 | F: 32 - 63 | F: 32 - 63 | |
C: 0 - 31 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 |
C: 0 - 31 | (1, 0), 0-3 | (1,0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 | (1, 0), 0-3 | (1, 0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 |
C: 32 -63 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 |
C: 32 -63 | (1, 0), 0-3 | (1,0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 | (1, 0), 0-3 | (1, 0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 |
outputs: store prev output, then write
inputs: same as previous fold
weights: 32 x 32 new weights
F: 0 - 31 | F: 0 - 31 | F: 0 - 31 | F: 0 - 31 | F: 32 - 63 | F: 32 - 63 | F: 32 - 63 | F: 32 - 63 | |
C: 0 - 31 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 |
C: 0 - 31 | (1, 0), 0-3 | (1,0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 | (1, 0), 0-3 | (1, 0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 |
C: 32 -63 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 | (0, 0), 0-3 | (0,0), 4-7 | (0, 1), 0-3 | (0,1), 4-7 |
C: 32 -63 | (1, 0), 0-3 | (1,0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 | (1, 0), 0-3 | (1, 0), 4-7 | (1, 1), 0-3 | (1,1), 4-7 |
outputs: same as previous fold
inputs: different from previous fold
weights: 32 x 32 new weights
for n = 1 to N
for m = 1 to M
for e = 1 to E
for f = 1 to F
for c = 1 to C
for r = 1 to R
for s = 1 to S
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
_
for n = 1 to N
for m = 1 to M_folds
for mm = 1 to SYS_W
for e = 1 to E
for f = 1 to F
for c = 1 to C_folds
for cc = 1 to SYS_H
for r = 1 to R
for s = 1 to S
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
Loop Tiling
for n = 1 to N
for m = 1 to M
for e = 1 to E
for f = 1 to F
for c = 1 to C
for r = 1 to R
for s = 1 to S
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
_
for m = 1 to M_folds
for c = 1 to C_folds
for n = 1 to N
for e = 1 to E
for f = 1 to F
for r = 1 to R
for s = 1 to S
for mm = 1 to SYS_W
for cc = 1 to SYS_H
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
Loop Interchange
for n = 1 to N
for m = 1 to M_folds
for mm = 1 to SYS_W
for e = 1 to E
for f = 1 to F
for c = 1 to C_folds
for cc = 1 to SYS_H
for r = 1 to R
for s = 1 to S
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
for m = 1 to M_folds
for c = 1 to C_folds
for ne = 1 to N*E
for f = 1 to F
for r = 1 to R
for s = 1 to S
for mm = 1 to SYS_W
for cc = 1 to SYS_H
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
_
Loop Fusion
for m = 1 to M_folds
for c = 1 to C_folds
for n = 1 to N
for e = 1 to E
for f = 1 to F
for r = 1 to R
for s = 1 to S
for mm = 1 to SYS_W
for cc = 1 to SYS_H
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
for m = 1 to M_folds
for c = 1 to C_folds
for ne = 1 to T_folds
for nene = 1 to T_size
for f = 1 to F
for r = 1 to R
for s = 1 to S
for mm = 1 to SYS_W
for cc = 1 to SYS_H
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
Loop Tiling
for m = 1 to M_folds
for c = 1 to C_folds
for ne = 1 to N*E
for f = 1 to F
for r = 1 to R
for s = 1 to S
for mm = 1 to SYS_W
for cc = 1 to SYS_H
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
_
for m = 1 to M_folds
for c = 1 to C_folds
for ne = 1 to T_folds
for nenef = 1 to T_size*F
for r = 1 to R
for s = 1 to S
for mm = 1 to SYS_W
for cc = 1 to SYS_H
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
_
Loop Fusion
for m = 1 to M_folds
for c = 1 to C_folds
for ne = 1 to T_folds
for nene = 1 to T_size
for f = 1 to F
for r = 1 to R
for s = 1 to S
for mm = 1 to SYS_W
for cc = 1 to SYS_H
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
for m = 1 to M_folds
for c = 1 to C_folds
for ne = 1 to T_folds
for r = 1 to R
for s = 1 to S
for nenef = 1 to T_size*F
for mm = 1 to SYS_W
for cc = 1 to SYS_H
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
_
Loop Interchange
for m = 1 to M_folds
for c = 1 to C_folds
for ne = 1 to T_folds
for nenef = 1 to T_size*F
for r = 1 to R
for s = 1 to S
for mm = 1 to SYS_W
for cc = 1 to SYS_H
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
_
for m = 1 to M_folds
for c = 1 to C_folds
for ne = 1 to T_folds
for r = 1 to R
for s = 1 to S
for nenef = 1 to T_size*F
for mm = 1 to SYS_W
for cc = 1 to SYS_H
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
for n = 1 to N
for m = 1 to M
for e = 1 to E
for f = 1 to F
for c = 1 to C
for r = 1 to R
for s = 1 to S
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
Final loop nest
Initial loop nest
for m = 1 to M_folds
for c = 1 to C_folds
for ne = 1 to T_folds
for r = 1 to R
for s = 1 to S
for nenef = 1 to T_size*F
for mm = 1 to SYS_W
for cc = 1 to SYS_H
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
for n = 1 to N
for m = 1 to M
for e = 1 to E
for f = 1 to F
for c = 1 to C
for r = 1 to R
for s = 1 to S
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
Innermost: One systolic fold
NHWC - Input
RSCM - Weight
NEFM - Output
for m = 1 to M_folds
for c = 1 to C_folds
for ne = 1 to NE_folds
for mm = 1 to SYS_W
for nenef = 1 to F*NE_Size # F*NE_Size <= SYS_H
for ccrs = 1 to C_Size*R*S # buffer constraint
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
for n = 1 to N
for m = 1 to M
for e = 1 to E
for f = 1 to F
for c = 1 to C
for r = 1 to R
for s = 1 to S
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
Innermost: One systolic fold
CNHW - Input
RSCM - Weight
NEFM - Output
for ne = 1 to NE_folds
for c = 1 to C_folds
for m = 1 to M_folds
for rsmm = 1 to M_Size*R*S # constrained by buffer size
for nenef = 1 to F*NE_Size # F*NE_Size <= SYS_W
for cc = 1 to SYS_H
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
for n = 1 to N
for m = 1 to M
for e = 1 to E
for f = 1 to F
for c = 1 to C
for r = 1 to R
for s = 1 to S
output[n][m][e][f] += input[n][e][f][c] * weight[r][s][c][m]
Innermost: One systolic fold
NHWC - Input
RSMC - Weight
NEFM - Output
Model
H/W Config
Mapper
ISA trace
array size, buffer capacity
ISA trace
Simulator
Operand values at time step t
ISA trace
Simulator
Overall latency
Layerwise results
Trace Log
HW Utilisation
LOAD WGT
GEMM
ReLU
STORE
LOAD INP
LOAD WGT
LOAD INP
GEMM
ReLU
STORE
PyTorch model
TensorFlow model
TVM Compiler
Relay graph
Optimizer
Relay graph
(optimized)
Codegen
shared library(.so)
Runtime
Hardware
PyTorch model
TensorFlow model
TVM Compiler
Relay graph
Optimizer
Relay graph
(optimized)
Codegen
shared library(.so)
Runtime
Hardware
PyTorch model
TensorFlow model
TVM Compiler
Relay graph
Optimizer
Relay graph
(optimized)
Codegen
shared library(.so)
Runtime
Hardware
TVM AutoTuner - Simulated annealing
Reinforcement Learning based optimizer
cost-model based on simulator
PyTorch model
TensorFlow model
TVM Compiler
Relay graph
Optimizer
Relay graph
(optimized)
Codegen
shared library(.so)
Runtime
Hardware
Offload computation to accelerator
Execute unsupported operations on CPU
PyTorch model
TensorFlow model
TVM Compiler
Relay graph
Optimizer
Relay graph
(optimized)
Codegen
shared library(.so)
Runtime
Hardware
TVM Runtime schedules operations to CPU or accelerator
Accelerator runtime configures the accelerator to perform a certain operation
%1 = nn.conv2d(%data, %weight, ...)
%2 = add(%1, %bias)
%3 = nn.relu(%2)