Can you make your code 10x faster?
int array[COLS][ROWS];
long sum = 0;
// initialize array...

// Version 1: the inner loop varies the FIRST index,
// so consecutive iterations are ROWS ints apart in memory.
for (int r = 0; r < ROWS; ++r)
    for (int c = 0; c < COLS; ++c)
        sum += array[c][r];
int array[COLS][ROWS];
long sum = 0;
// initialize array...

// Version 2: the inner loop varies the LAST index,
// so consecutive iterations touch adjacent ints.
for (int c = 0; c < COLS; ++c)
    for (int r = 0; r < ROWS; ++r)
        sum += array[c][r];
The same number of instructions,
the same amount of computation,
but
DIFFERENT SPEEDS!!!
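Why? Because C arrays are row-major: the last index varies fastest in memory. A tiny sketch to check the strides (COLS and ROWS are made-up sizes):

#include <cstdio>

enum { COLS = 4, ROWS = 3 };
int array[COLS][ROWS];

int main()
{
    // array[c][r] and array[c][r+1] are adjacent...
    std::printf("r-stride: %td bytes\n",
                (char*)&array[0][1] - (char*)&array[0][0]); // sizeof(int)
    // ...while array[c][r] and array[c+1][r] are a whole row apart.
    std::printf("c-stride: %td bytes\n",
                (char*)&array[1][0] - (char*)&array[0][0]); // ROWS * sizeof(int)
}

So Version 2's inner loop walks memory sequentially, while Version 1 jumps a whole row on every iteration.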
Modern software gives you valuable abstractions, but you might have forgotten that you are talking to a machine.
And it is getting worse and worse!!
[Diagram: the cache hierarchy. Each core has its own L1 and L2 cache; below them sit the shared L3 cache and main memory.]
Data is copied into the cache in contiguous segments, called cache lines.
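On mainstream CPUs a cache line is typically 64 bytes. Since C++17 the standard library exposes this as a portability hint, where the implementation provides it (a minimal sketch):

#include <iostream>
#include <new>

int main()
{
    // Suggested spacing to keep two objects out of the same cache line (C++17).
    std::cout << std::hardware_destructive_interference_size << " bytes\n";
}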
int doWork(const vector<int>& data)
{
    unsigned long sum = 0;
    for (int d : data)       // sequential walk over contiguous memory
        sum += d * d;
    return (int)(sum / data.size());
}
auto data = vector<int>(SIZE);
initData( data );
doWork( data ); //benchmark this
int doWork(const vector<int*>& data)
{
    unsigned long sum = 0;
    for (int* d : data)      // one extra indirection per element
        sum += (*d) * (*d);
    return (int)(sum / data.size());
}
auto data = vector<int>(SIZE);
auto pointers = vector<int*>(data.size());
for (size_t i = 0; i < data.size(); ++i)
    pointers[i] = &data[i]; // pointing to contiguous memory
initData( data );
doWork( pointers );
int doWork(const vector<int*>& data)
{
    unsigned long sum = 0;
    for (int* d : data)      // same indirection as before...
        sum += (*d) * (*d);
    return (int)(sum / data.size());
}
auto pointers = vector<int*>( dataSize );
for (size_t i = 0; i < dataSize; ++i)
    pointers[i] = new int; // ...but each int is allocated separately: "sparse memory"
initData( pointers );
doWork( pointers );
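For reference, a minimal way to time the three variants (a sketch; a serious benchmark would repeat the call and keep the result alive so the compiler cannot drop the work):

#include <chrono>
#include <cstdio>

template <typename F>
long long microsecondsTaken(F&& f)
{
    auto t0 = std::chrono::steady_clock::now();
    f();
    auto t1 = std::chrono::steady_clock::now();
    return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
}

// usage:
//   volatile int sink;
//   std::printf("%lld us\n", microsecondsTaken([&]{ sink = doWork(pointers); }));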
struct Object {
    bool toBeUpdated;
    // ...
    // stuff
    // ...
};

std::vector< Object > objectList;

// main loop
for( size_t i = 0; i < objectList.size(); i++ ) {
    if( objectList[i].toBeUpdated )
        updateObject( objectList[i] );
}
To get the value of the field "toBeUpdated", you are obliged to read a whole chunk of the Object into the cache.
std::vector< bool > toBeUpdated;
std::vector< Object > objectList;

// main loop
for( size_t i = 0; i < objectList.size(); i++ ) {
    if( toBeUpdated[i] )
        updateObject( objectList[i] );
}
You don't need to read the Object from memory unless "toBeUpdated[i]" is true
OOP...
You have been fooled into thinking that "objects are cool".
Sometimes the truth is that they hurt your cache locality.
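The flag vector above is one instance of a general pattern: "structure of arrays" (SoA) instead of "array of structures" (AoS). A sketch with a made-up Particle type:

#include <vector>

// Array of Structures: updating positions also drags velocities and colors through the cache.
struct ParticleAoS {
    float x, y, z;
    float vx, vy, vz;
    unsigned char r, g, b, a;
};
std::vector<ParticleAoS> particlesAoS;

// Structure of Arrays: a loop over positions touches only position data.
struct ParticlesSoA {
    std::vector<float> x, y, z;
    std::vector<float> vx, vy, vz;
    std::vector<unsigned char> r, g, b, a;
};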
int odds = 0;
for( int i = 0; i < DIM; ++i )
    for( int j = 0; j < DIM; ++j )
        if( matrix[i*DIM + j] % 2 != 0 )
            ++odds;
Example: count odds in a matrix
void computeOdds(int *matrix_chunk, int chunk_size, int &partial_count)
{
    for( int i = 0; i < chunk_size; ++i )
        if( matrix_chunk[i] % 2 != 0 ) // odd element
            partial_count++;
}
int computeOddsInParallel( int *matrix, int DIM )
{
    int result[POOL_SIZE] = {}; // zero-initialize: the workers only increment
    // Each of the POOL_SIZE parallel workers processes 1/POOL_SIZE-th of the data.
    // Let's suppose, for the sake of simplicity, that DIM*DIM is a multiple of POOL_SIZE...
    int chunkSize = (DIM * DIM) / POOL_SIZE;
    for( int p = 0; p < POOL_SIZE; ++p )
    {
        // push partial work into a worker
        // (std::ref so the worker writes through the reference to result[p])
        pool.run( std::bind(computeOdds, &matrix[ p*chunkSize ], chunkSize, std::ref(result[p]) ));
    }
    // Wait for the parallel work to complete…
    pool.join();
    // Finally, do the sequential "reduction" step to combine the results
    int odds = 0;
    for( int p = 0; p < POOL_SIZE; ++p )
        odds += result[p];
    return odds;
}
Making it parallel (some pseudocode used)
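The pool object is pseudocode; with plain std::thread the same structure might look like this (a sketch: POOL_SIZE is an assumed constant, computeOdds is the function above):

#include <functional>
#include <thread>
#include <vector>

constexpr int POOL_SIZE = 4; // assumption for this sketch

void computeOdds(int *matrix_chunk, int chunk_size, int &partial_count); // as above

int computeOddsInParallel(int *matrix, int DIM)
{
    int result[POOL_SIZE] = {};
    int chunkSize = (DIM * DIM) / POOL_SIZE;
    std::vector<std::thread> workers;
    for (int p = 0; p < POOL_SIZE; ++p)
        workers.emplace_back(computeOdds, &matrix[p * chunkSize], chunkSize,
                             std::ref(result[p]));
    for (auto &w : workers)
        w.join();
    int odds = 0;
    for (int p = 0; p < POOL_SIZE; ++p)
        odds += result[p];
    return odds;
}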
// The fix: accumulate into a local variable,
// and write to the shared result array only once at the end.
void computeOdds(int *matrix_chunk,
                 int chunk_size,
                 int &partial_count)
{
    int count = 0;
    for( int i = 0; i < chunk_size; ++i )
        if( matrix_chunk[i] % 2 != 0 )
            count++;
    partial_count = count;
}
// The original: every increment writes through the reference,
// straight into the shared result array.
void computeOdds(int *matrix_chunk,
                 int chunk_size,
                 int &partial_count)
{
    for( int i = 0; i < chunk_size; ++i )
        if( matrix_chunk[i] % 2 != 0 )
            partial_count++;
}
int result[POOL_SIZE] sits on a single cache line.
Every write from one worker invalidated that line for all the others: this is "false sharing"!!!
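Besides the local-accumulator fix above, another common remedy is to push each counter onto its own cache line via alignment; a sketch, assuming a C++17 library that provides the interference-size constant:

#include <new>

struct alignas(std::hardware_destructive_interference_size) PaddedCount
{
    int value = 0;
};

// Each slot now occupies its own cache line, so one worker's writes
// no longer invalidate the line the other workers are using.
PaddedCount result[POOL_SIZE];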
http://www.drdobbs.com/parallel/eliminate-false-sharing/217500206
http://gameprogrammingpatterns.com/data-locality.html
http://bitbashing.io/memory-performance.html
https://www.youtube.com/watch?v=WDIkqP4JbkE&index=31&list=PLXIxC-YkNzOKdmwQvTJvfZa-E69AXNY70