Valgrind
register allocator overhaul
Ivo Raisr
FOSDEM 2018
Ivo Raisr
39.6
GNU Toolchain
Valgrind master
Why?
If-Then-Else support into IR
VEX register allocator v3
VEX operation
assembly
IR
IR
toIR
optimize
instrument
vcode
isel
rcode
allocate
registers
assembly
emit
0x4001CA3: movq %rdx,(%rsi,%rax,8)
------ IMark(0x4001CA3, 4, 0) ------
t0 = Add64(GET:I64(64),Shl64(GET:I64(16),0x3:I8))
STle(t0) = GET:I64(32)
PUT(184) = 0x4001CA7:I64
------ IMark(0x4001CA3, 4, 0) ------
t12 = GET:I64(32)
STle(Add64(GET:I64(64),Shl64(GET:I64(16),0x3:I8))) = t12
-- t12 = GET:I64(32)
movq 0x20(%rbp),%vR12
-- STle(Add64(GET:I64(64),Shl64(GET:I64(16),0x3:I8))) = t12
movq 0x40(%rbp),%vR24
movq 0x10(%rbp),%vR25
movq %vR12,0x0(%vR24,%vR25,8)
movq 0x20(%rbp),%r10
movq 0x40(%rbp),%r9
movq 0x10(%rbp),%r8
movq %r10,0x0(%r9,%r8,8)
VEX register allocator
0 (evCheck) decl 0x8(%rbp); jns nofail; jmp *(%rbp); nofail:
1 movq 0x40(%rbp),%r10
2 movq 0x10(%rbp),%r9
3 leaq 0x0(%r10,%r9,8),%rbx
4 movq 0x3C0(%rbp),%r15
5 movq 0x20(%rbp),%r14
6 movq 0x3E0(%rbp),%r10
7 movq 0x3B0(%rbp),%r9
8 shlq $3,%r9
9 orq %r9,%r10
10 callnz[0,RLPri_None] 0x58024160
11 movq %rbx,%rdi
12 movq %r15,%rsi
13 call[2,RLPri_None] 0x58023660
14 movq %r14,(%rbx)
15 movq %r15,%r10
16 notq %r10
17 movq %r14,%r9
...
vcode
0 (evCheck) decl 0x8(%rbp); jns nofail; jmp *(%rbp); nofail:
1 movq 0x40(%rbp),%vR65
2 movq 0x10(%rbp),%vR66
3 leaq 0x0(%vR65,%vR66,8),%vR8
4 movq 0x3C0(%rbp),%vR35
5 movq 0x20(%rbp),%vR12
6 movq 0x3E0(%rbp),%vR67
7 movq 0x3B0(%rbp),%vR69
8 movq %vR69,%vR68
9 shlq $3,%vR68
10 movq %vR67,%vR70
11 orq %vR68,%vR70
12 callnz[0,RLPri_None] 0x58024160
13 movq %vR8,%rdi
14 movq %vR35,%rsi
15 call[2,RLPri_None] 0x58023660
16 movq %vR12,(%vR8)
17 movq %vR35,%vR75
18 notq %vR75
19 movq %vR12,%vR74
...
rcode
RegAlloc Terminology
vcode
0 (evCheck) decl 0x8(%rbp); jns nofail; jmp *(%rbp); nofail:
1 movq 0x40(%rbp),%vR65
2 movq 0x10(%rbp),%vR66
3 leaq 0x0(%vR65,%vR66,8),%vR8
4 movq 0x3C0(%rbp),%vR35
5 movq 0x20(%rbp),%vR12
6 movq 0x3E0(%rbp),%vR67
7 movq 0x3B0(%rbp),%vR69
8 movq %vR69,%vR68
9 shlq $3,%vR68
10 movq %vR67,%vR70
11 orq %vR68,%vR70
12 callnz[0,RLPri_None] 0x58024160
13 movq %vR8,%rdi
14 movq %vR35,%rsi
15 call[2,RLPri_None] 0x58023660
16 movq %vR12,(%vR8)
17 movq %vR35,%vR75
18 notq %vR75
19 movq %vR12,%vR74
...
1 movq 0x40(%rbp), %vR65
2 movq 0x10(%rbp), %vR66
8 movq %vR69, %vR68
...
9 shlq $3, %vR68
10 movq %vR67, %vR70
11 orq %vR68, %vR70
12 callnz[0, RLPri_None] <addr>
13 movq %vR8, %rdi
14 movq %vR35, %rsi
15 call[2, RLPri_None] <addr>
...
RegAlloc v3 Passes
8 movq %vR69, %vR68
...
9 shlq $3, %vR68
10 movq %vR67, %vR70
11 orq %vR68, %vR70
12 callnz[0, RLPri_None] <addr>
13 movq %vR8, %rdi
14 movq %vR35, %rsi
15 call[2, RLPri_None] <addr>
...
1. scan insns
21 movq %vR70, %vR9
%vR69
%rdi
2. coalescing
%vR67 -> %vR70 -> %vR9
3. spill slots
4. process insns
%vR68 ... %rdi
%vR69 ... %rax
%vR70 ... %r9
RegAlloc v3 State
8 movq %vR69, %vR68
...
9 shlq $3, %vR68
10 movq %vR67, %vR70
11 orq %vR68, %vR70
12 callnz[0, RLPri_None] <addr>
13 movq %vR8, %rdi
14 movq %vR35, %rsi
15 call[2, RLPri_None] <addr>
...
vreg state
21 movq %vR70, %vR9
live after
%vR67 -> %vR70 -> %vR9
%vR68 ... [8, 12) ... %rdx ... [12]
%vR69 ... [7, 9) ... --- ... [10]
%vR70 ... [10, 12) ... %r9 ... [5]
dead before
real reg
spill slot
RegAlloc v3 State II.
8 movq %vR69, %vR68
...
9 shlq $3, %vR68
10 movq %vR67, %vR70
11 orq %vR68, %vR70
12 callnz[0, RLPri_None] <addr>
13 movq %vR8, %rdi
14 movq %vR35, %rsi
15 call[2, RLPri_None] <addr>
...
rreg state
21 movq %vR70, %vR9
%rdx ... %vR68
%rcx ... ---
%rdi ... [reserved]
rreg universe
%r12, %r13, %r14, %r15, %rbx,
%rsi, %rdi, %r8, %r9, %r10
HRcInt64
Processing insn (simple cases)
movq 0x40(%rbp), %vR68
movq 0x40(%rbp), %r10
orq %vR68, %vR70
orq %r10, %r9
%vR68 ... %r10
%vR70 ... %r9
%r9 ... %vR70
%r10 ... %vR68
movq %vR70, %rsi
call[2, RLPri_None] <addr>
%vR68 ... %r10
%vR70 ... ---
%r9 ... ---
%r10 ... %vR68
movq %r9, %rsi
%vR68 ... %r10
%vR70 ... %r9
%rsi ... reserved
%r9 ... %vR70
%r10 ... %vR68
vreg state
rreg state
Processing insn (spill)
movq 0x40(%rbp), %vR15
movq 0x40(%rbp), %r9
%vR15 ... ---
%vR68 ... %r10
%vR70 ... %r9
%r9 ... %vR70
%r10 ... %vR68
...
(all assigned)
all rregs are taken, what to do?
movq %r9, 0xC0A(%rbp)
spill slot
Optimizations
1. MOV vregs coalescing
2. reusing spill slots
3. vreg spilling criteria
4. avoid spilling if rreg == spill slot
5. rreg allocation strategy
6. direct reload
5. rreg allocation strategy
%r12
%r13
%r14
%r15
%rbx
%rsi
%rdi
%r8
%r9
%r10
amd64 rreg universe for HRcInt64
caller save
callee save
6. direct reload from a spill slot
addq %vR68, $0x9823, %vR15
%vR68 ... spilled
standard way
addq %r9, $0x9823, %r10
movq 0xC0A(%rbp), %r9
direct reload
addq 0xC0A(%rbp), $0x9823, %r10
Benchmarks
Memcheck on perf/bz2, amd64
total insns
v2
v3
4,170 M
regalloc insns
v2
v3
167 M
4,102 M
148 M
16.0
15.8
ratio
v2
v3
VEX register allocator v3 is now the default.
The old implementation is available with:
--vex-regalloc-version=2
Valgrind - register allocator overhaul
By Ivo Raisr
Valgrind - register allocator overhaul
The register allocator is a key component in Valgrind's VEX subsystem. Superficially, it only translates virtual registers to real ones. But is that really all? What actually happens under the covers, what algorithms are at play here, and what are the constraints under which it operates? In 2017, the Valgrind VEX register allocator underwent a major overhaul, resulting in a new version, v3, which is now used by Valgrind. In addition to a new design, new register allocation algorithms have also been implemented, producing faster and smaller code.
- 1,300