LOW LEVEL Performance

What will be faster #1

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
O(xy)
O(xy)

What will be faster #1

 ≈ 16X slower

Moore's LAW

Moore's LAW

speed(RAM) << speed(CPU)

CACHE

CPU

RAM

CACHE

CACHE

CPU

RAM

CACHE

CACHE

CPU

RAM

L1

L2

L3

CACHE

{

CACHE

CPU

RAM

L1

L2

L3

x = 5;

👀

CACHE

CPU

RAM

L1

L2

L3

x = 5;

👀

CACHE

CPU

RAM

L1

L2

L3

x = 5;

👀

CACHE

CPU

RAM

L1

L2

L3

x = 5;

👀

CACHE

RAM

L1

L2

L3

x = 5;
&x == 0x7ffe19e65d24
0x7f0d09490260

logical address

physical address

Translation lookaside buffer

👀

CACHE

Intel Core i7 920 Bloomfield

(2008)

CACHE

Intel Core i7 920 Bloomfield

(2008)

CACHE

QUEUE

CORE

CORE

CORE

CORE

MEMORY CONTROLLER

QPI 0

QPI 1

MISC IO

MISC IO

Intel Core i7 920 Bloomfield

(2008)

QuickPath Interconnect (QPI)

CACHE

QUEUE

CORE

CORE

CORE

CORE

MEMORY CONTROLLER

L2

L2

L2

L2

L1

L1

L1

L1

SHARED

L3 CACHE

QPI 0

QPI 1

MISC IO

MISC IO

Intel Core i7 920 Bloomfield

(2008)

CACHE

[word here]

👀

👀

cache

hit

cache

miss

{

cache size [bytes]

L1 cache

L1_D

L1_I

instruction cache

movl -16(%rbp), %eax
cmpl %eax, -4(%rbp)
jge .L9
leaq -24(%rbp), %rax
movq %rax, %rsi
movl $_ZSt3cin, %edi
call _ZNSirsERi
movl -24(%rbp), %eax
subl $1, %eax
movl %eax, -24(%rbp)
movl -24(%rbp), %eax
movslq %eax, %rdx
movl tab(,%rdx,4), %edx
addl $1, %edx
cltq
movl %edx, tab(,%rax,4)
addl $1, -4(%rbp)
jmp .L10

CACHE

where does data in cache come from?

memory

read x;

👀

Translation lookaside buffer

CACHE

CACHE

where does data in cache come from?

memory

read x;

👀

Translation lookaside buffer

CACHE

CACHE

where does data in cache come from?

memory

read x;

👀

Translation lookaside buffer

CACHE

CACHE

where does data in cache come from?

memory

read x;

👀

Translation lookaside buffer

CACHE

CACHE

where does data in cache come from?

memory

read x;

👀

Translation lookaside buffer

CACHE

CACHE

where does data in cache come from?

memory

read x;

👀

Translation lookaside buffer

CACHE

}

cache line

{

cache entry

cache entry tag

CACHE

policies

cache replacement policy

cache write policy

cache placement policy

CACHE

memory

N-way

associative

cache line

cache entry

cache entry

cache entry

cache entry

1 - way associative = direct-mapped cache

∞ - way associative = fully associative cache

(                         )

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

x cache misses

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0

CACHE

friendliness

x * y cache misses

int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0

CACHE

friendliness

cache misses

cache misses

x
xy
x
xy
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int i = 0; i < x; ++i)
{
  for (int j = 0; j < y; ++j)
  {
    n += ar[i][j];
  }
}
// n == 0
int x = 8192, y = 16;
static int ar[x][y];

int n = 0;
for (int j = 0; j < y; ++j)
{
  for (int i = 0; i < x; ++i)
  {
    n += ar[i][j];
  }
}
// n == 0

CACHE

friendliness

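cache misses (row-by-row traversal, i outer / j inner): $x$

cache misses (column-by-column traversal, j outer / i inner): $xy$

$\frac{xy}{x} = y = 16$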

CACHE

how to be cache friendly?

Locality of reference

Spatial (data locality)

Temporal

CACHE

how to be cache friendly?

std::array
std::vector

branch friendly code
data oriented design
std::list
std::map
std::set std::unordered_map std::unordered_set
std::binary_search
std::valarray
virtual
function pointers
pointers
false sharing 

Cache

friendly

contiguous memory access

Cache

unfriendly

random memory access

(hyperthreading)
std::vector<int> v = CreateVec(100000);
// v == {0, 1, 2, ...}
int n = 0;
for (int i = 0; i < v.size(); ++i)
{
  if (v[i] < 50000)
  {
    ++n;
  }
}
// n == 50000
std::vector<int> v = CreateVec(100000);
ShuffleVec(v);
int n = 0;
for (int i = 0; i < v.size(); ++i)
{
  if (v[i] < 50000)
  {
    ++n;
  }
}
// n == 50000

What will be faster #2

What will be faster #2

branching

if (cond)
cond
!cond
bool cond = input();
if (cond)
{
  return 13;
}
else
{
  return 17;
}
bool cond = input();
return 13;
return 17;

branching

if (cond)
cond
!cond
bool cond = input();
if (cond)
{
  return 13;
}
else
{
  return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???

branching

if (cond)
cond
!cond
bool cond = input();
if (cond)
{
  return 13;
}
else
{
  return 17;
}
bool cond = input();
return 13;
return 17;
cond =
true

branching

if (cond)
cond
!cond
bool cond = input();
if (cond)
{
  return 13;
}
else
{
  return 17;
}
bool cond = input();
return 13;
return 17;
cond =
true

branching

if (cond)
cond
!cond
bool cond = input();
if (cond)
{
  return 13;
}
else
{
  return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???

branching

if (cond)
cond
!cond
bool cond = input();
if (cond)
{
  return 13;
}
else
{
  return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???

branching

if (cond)
cond
!cond
bool cond = input();
if (cond)
{
  return 13;
}
else
{
  return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???
branch prediction

branching

if (cond)
cond
!cond
bool cond = input();
if (cond)
{
  return 13;
}
else
{
  return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???

branching

if (cond)
cond
!cond
bool cond = input();
if (cond)
{
  return 13;
}
else
{
  return 17;
}
bool cond = input();
return 13;
return 17;
cond =
true
correct branch prediction

branching

if (cond)
cond
!cond
bool cond = input();
if (cond)
{
  return 13;
}
else
{
  return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???

branching

if (cond)
cond
!cond
bool cond = input();
if (cond)
{
  return 13;
}
else
{
  return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???

branching

if (cond)
cond
!cond
bool cond = input();
if (cond)
{
  return 13;
}
else
{
  return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???
branch prediction

branching

if (cond)
cond
!cond
bool cond = input();
if (cond)
{
  return 13;
}
else
{
  return 17;
}
bool cond = input();
return 13;
return 17;
cond =
???

branching

if (cond)
cond
!cond
bool cond = input();
if (cond)
{
  return 13;
}
else
{
  return 17;
}
bool cond = input();
return 13;
return 17;
cond =
false
branch misprediction

branching

if (cond)
cond
!cond
bool cond = input();
if (cond)
{
  return 13;
}
else
{
  return 17;
}
bool cond = input();
return 13;
return 17;
cond =
false
pipeline flush

branching

if (cond)
cond
!cond
bool cond = input();
if (cond)
{
  return 13;
}
else
{
  return 17;
}
bool cond = input();
return 13;
return 17;
cond =
false

branch prediction

std::vector<int> v = CreateVec(100000);
// v == {0, 1, 2, ...}
int n = 0;
for (int i = 0; i < v.size(); ++i)
{
  if (v[i] < 50000)
  {
    ++n;
  }
}
// n == 50000
std::vector<int> v = CreateVec(100000);
ShuffleVec(v);
int n = 0;
for (int i = 0; i < v.size(); ++i)
{
  if (v[i] < 50000)
  {
    ++n;
  }
}
// n == 50000

low level performance

Low level performance

By Jan Bielak

Low level performance

A presentation about cache and branch prediction. It is presented here: https://www.youtube.com/watch?v=qicj1F88H78

  • 605