LOW LEVEL Performance
What will be faster #1
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
What will be faster #1
≈ 16X slower
Moore's LAW
Moore's LAW
CACHE
CPU
RAM
CACHE
CACHE
CPU
RAM
CACHE
CACHE
CPU
RAM
L1
L2
L3
CACHE
{
CACHE
CPU
RAM
L1
L2
L3
x = 5;
👀
CACHE
CPU
RAM
L1
L2
L3
x = 5;
👀
CACHE
CPU
RAM
L1
L2
L3
x = 5;
👀
CACHE
CPU
RAM
L1
L2
L3
x = 5;
👀
CACHE
RAM
L1
L2
L3
x = 5;
&x == 0x7ffe19e65d24
0x7f0d09490260
logical address
physical address
Translation lookaside buffer
👀
CACHE
Intel Core i7 920 Bloomfield
(2008)
CACHE
Intel Core i7 920 Bloomfield
(2008)
CACHE
Q
U
E
U
E
CORE
CORE
CORE
CORE
MEMORY CONTROLLER
Q
P
I
0
Q
P
I
1
M
I
S
C
I
O
M
I
S
C
I
O
Intel Core i7 920 Bloomfield
(2008)
Quick
Path
Interconnect
CACHE
Q
U
E
U
E
CORE
CORE
CORE
CORE
MEMORY CONTROLLER
L2
L2
L2
L2
L1
L1
L1
L1
SHARED
L3 CACHE
Q
P
I
0
Q
P
I
1
M
I
S
C
I
O
M
I
S
C
I
O
Intel Core i7 920 Bloomfield
(2008)
CACHE
[word here]
👀
👀
cache
hit
cache
miss
{
cache size [bytes]
L1 cache
L1_D
L1_I
instruction cache
movl -16(%rbp), %eax
cmpl %eax, -4(%rbp)
jge .L9
leaq -24(%rbp), %rax
movq %rax, %rsi
movl $_ZSt3cin, %edi
call _ZNSirsERi
movl -24(%rbp), %eax
subl $1, %eax
movl %eax, -24(%rbp)
movl -24(%rbp), %eax
movslq %eax, %rdx
movl tab(,%rdx,4), %edx
addl $1, %edx
cltq
movl %edx, tab(,%rax,4)
addl $1, -4(%rbp)
jmp .L10
CACHE
where does data in cache come from?
memory
read x;
👀
Translation lookaside buffer
CACHE
CACHE
where does data in cache come from?
memory
read x;
👀
Translation lookaside buffer
CACHE
CACHE
where does data in cache come from?
memory
read x;
👀
Translation lookaside buffer
CACHE
CACHE
where does data in cache come from?
memory
read x;
👀
Translation lookaside buffer
CACHE
CACHE
where does data in cache come from?
memory
read x;
👀
Translation lookaside buffer
CACHE
CACHE
where does data in cache come from?
memory
read x;
👀
Translation lookaside buffer
CACHE
}
cache line
{
cache entry
cache entry tag
CACHE
policies
cache replacement policy
cache write policy
cache placement policy
CACHE
memory
N-way
associative
cache line
cache entry
cache entry
cache entry
cache entry
1 - way associative = direct-mapped cache
∞ - way associative = fully associative cache
( )
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
x cache misses
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CACHE
friendliness
x * y cache misses
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
CACHE
friendliness
cache misses
cache misses
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int i = 0; i < x; ++i)
{
for (int j = 0; j < y; ++j)
{
n += ar[i][j];
}
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];
int n = 0;
for (int j = 0; j < y; ++j)
{
for (int i = 0; i < x; ++i)
{
n += ar[i][j];
}
}
// n == 0
CACHE
friendliness
cache misses
cache misses
CACHE
how to be cache friendly?
Locality of reference
Spatial (data locality)
Temporal
CACHE
how to be cache friendly?
std::array
std::vector
branch friendly code
data oriented design
std::list
std::map
std::set std::unordered_map std::unordered_set
std::binary_search
std::valarray
virtual
function pointers
pointers
false sharing
Cache
friendly
contiguous memory access
Cache
unfriendly
random memory access
(hyperthreading)
std::vector<int> v = CreateVec(100000);
// v == {0, 1, 2, ...}
int n = 0;
for (int i = 0; i < v.size(); ++i)
{
if (v[i] < 50000)
{
++n;
}
}
// n == 50000
std::vector<int> v = CreateVec(100000);
ShuffleVec(v);
int n = 0;
for (int i = 0; i < v.size(); ++i)
{
if (v[i] < 50000)
{
++n;
}
}
// n == 50000
What will be faster #2
What will be faster #2
branching
if (cond)
cond
!cond
bool cond = input();
if (cond)
{
return 13;
}
else
{
return 17;
}
bool cond = input();
return 13;
return 17;
branching
if (cond)
cond
!cond
bool cond = input();
if (cond)
{
return 13;
}
else
{
return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???
branching
if (cond)
cond
!cond
bool cond = input();
if (cond)
{
return 13;
}
else
{
return 17;
}
bool cond = input();
return 13;
return 17;
cond =
true
branching
if (cond)
cond
!cond
bool cond = input();
if (cond)
{
return 13;
}
else
{
return 17;
}
bool cond = input();
return 13;
return 17;
cond =
true
branching
if (cond)
cond
!cond
bool cond = input();
if (cond)
{
return 13;
}
else
{
return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???
branching
if (cond)
cond
!cond
bool cond = input();
if (cond)
{
return 13;
}
else
{
return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???
branching
if (cond)
cond
!cond
bool cond = input();
if (cond)
{
return 13;
}
else
{
return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???
branch prediction
branching
if (cond)
cond
!cond
bool cond = input();
if (cond)
{
return 13;
}
else
{
return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???
branching
if (cond)
cond
!cond
bool cond = input();
if (cond)
{
return 13;
}
else
{
return 17;
}
bool cond = input();
return 13;
return 17;
cond =
true
correct branch prediction
branching
if (cond)
cond
!cond
bool cond = input();
if (cond)
{
return 13;
}
else
{
return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???
branching
if (cond)
cond
!cond
bool cond = input();
if (cond)
{
return 13;
}
else
{
return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???
branching
if (cond)
cond
!cond
bool cond = input();
if (cond)
{
return 13;
}
else
{
return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???
branch prediction
branching
if (cond)
cond
!cond
bool cond = input();
if (cond)
{
return 13;
}
else
{
return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???
branching
if (cond)
cond
!cond
bool cond = input();
if (cond)
{
return 13;
}
else
{
return 17;
}
bool cond = input();
return 13;
return 17;
cond =
false
branch misprediction
branching
if (cond)
cond
!cond
bool cond = input();
if (cond)
{
return 13;
}
else
{
return 17;
}
bool cond = input();
return 13;
return 17;
cond =
false
pipeline flush
branching
if (cond)
cond
!cond
bool cond = input();
if (cond)
{
return 13;
}
else
{
return 17;
}
bool cond = input();
return 13;
return 17;
cond =
false
branch prediction
std::vector<int> v = CreateVec(100000);
// v == {0, 1, 2, ...}
int n = 0;
for (int i = 0; i < v.size(); ++i)
{
if (v[i] < 50000)
{
++n;
}
}
// n == 50000
std::vector<int> v = CreateVec(100000);
ShuffleVec(v);
int n = 0;
for (int i = 0; i < v.size(); ++i)
{
if (v[i] < 50000)
{
++n;
}
}
// n == 50000
low level performance
Low level performance
By Jan Bielak
Low level performance
A presentation about CPU caches and branch prediction. It is presented here: https://www.youtube.com/watch?v=qicj1F88H78
- 585