Data locality

Data locality#

One of the questions in Activity 53 was about the trade-offs of different data structures, e.g., array vs struct.

How far data is placed from each other can affect performance.

Array of structs vs struct of arrays#

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N 1'000'000

// Array of Structs (AoS)
typedef struct {
  int x;
  int y;
  int z;
} PointAoS;

PointAoS points_aos[N];

// Struct of Arrays (SoA)
typedef struct {
  int x[N];
  int y[N];
  int z[N];
} PointSoA;

PointSoA points_soa;

void initAoS() {
  for (int i = 0; i < N; i++) {
    points_aos[i].x = rand();
    points_aos[i].y = rand();
    points_aos[i].z = rand();
  }
}
void initSoA() {
  for (int i = 0; i < N; i++) {
    points_soa.x[i] = rand();
    points_soa.y[i] = rand();
    points_soa.z[i] = rand();
  }
}

int sum_x_aos() {
  int sum = 0;
  for (int i = 0; i < N; i++) {
    sum += points_aos[i].x; // x's are more far from each other
  }
  return sum;
}

int sum_x_soa() {
  int sum = 0;
  for (int i = 0; i < N; i++) {
    sum += points_soa.x[i]; // x's are more nearby
  }
  return sum;
}

int main() {
  srand(time(NULL));

  initAoS();
  auto start = clock();
  int sum_aos = sum_x_aos();
  auto end = clock();
  printf("AoS sum: %d, time: %f ms\n", sum_aos,
         ((double)(end - start) / CLOCKS_PER_SEC) * 1000);

  initSoA();
  start = clock();
  int sumSoA = sum_x_soa();
  end = clock();
  printf("SoA sum: %d, time: %f ms\n", sumSoA,
         ((double)(end - start) / CLOCKS_PER_SEC) * 1000);
}
AoS sum: -2014180790, time: 2.464000 ms
SoA sum: -2020133871, time: 2.457000 ms
https://upload.wikimedia.org/wikipedia/commons/e/e8/Shared_private.png

Fig. 24 When data is read, it is stored on many layers of cache, which become smaller in size towards the processor cores.
CC BY-SA 4.0. By Avadlam3. Source: Wikimedia Commons
#

Struct of arrays achieve a better performance, because x’s are near each other.

cache

a hardware of software component that stores data so that future requests for that data can be served faster

For example my computer has the following caches:

$ lscpu
...
Caches (sum of all):         
  L1d:                       352 KiB (10 instances)
  L1i:                       576 KiB (10 instances)
  L2:                        6.5 MiB (4 instances)
  L3:                        12 MiB (1 instance)