There is an interesting set of articles at Fluent{C++} discussing array of structures (AoS) vs structure of arrays (SoA) data layouts.
I decided to do my own performance test: to iterate and accumulate data over an array of 10,000,000 structures that describe a person:
1 2 3 4 5 6 |
// One record describing a single person. A contiguous array of these records
// is the array-of-structures (AoS) layout.
struct Person
{
    string name;      // person's name
    uint8_t age{0};   // defaults to 0 so a default-constructed Person holds no indeterminate value
    uint32_t dob{0};  // date of birth; defaults to 0 (the encoding is not specified here)
};
Versus a structure of arrays of the same data:
1 2 3 4 5 6 |
// Structure-of-arrays (SoA) layout of the same data: every field is stored in
// its own contiguous vector instead of being grouped into per-person records.
struct Persons
{
    vector<string> names;   // all names
    vector<uint8_t> ages;   // all ages
    vector<uint32_t> dobs;  // all dates of birth
};
Below are the numbers measured on my 2012 MacBook Pro 2.3 GHz Intel Core i7. The code was compiled with maximum optimizations using the latest GCC, Apple Clang, and LLVM compilers available for macOS:
GCC -Ofast -march=native -lstdc++
AoS duration 108.039 ms
SoA duration 42.228 ms
Apple CLANG -Ofast -march=native -lc++
AoS duration 64.001 ms
SoA duration 24.916 ms
LLVM CLANG -Ofast -march=native -lc++
AoS duration 67.579 ms
SoA duration 22.620 ms
Conclusion: locality of reference matters 🙂
Complete listing:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
#include <chrono>
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <string>
#include <utility>
#include <vector>

using namespace std;
using namespace chrono;

// Number of person records generated for each layout.
constexpr int ELEMS = 10'000'000;

// Array-of-structures (AoS) record: all fields of one person stored together.
struct Person
{
    // Takes the name by value and moves it into place, so callers passing a
    // temporary do not pay for an extra copy.
    Person(string n, uint8_t a, uint32_t d) : name(std::move(n)), age(a), dob(d) {}

    string name;
    uint8_t age;
    uint32_t dob;
};

using VP = vector<Person>;

// Appends p to v, moving it into the vector.
void addPerson(VP& v, Person&& p)
{
    v.push_back(std::move(p));
}

// Returns the integer average of the name lengths in v, or 0 when v is empty.
uint64_t averageNameLen(const VP& v)
{
    if (v.empty())
        return 0;  // avoid dividing by zero

    const uint64_t total = accumulate(begin(v), end(v), uint64_t{0},
        [](uint64_t sum, const Person& p) { return sum + p.name.length(); });
    return total / v.size();
}

// Returns the integer average of the ages in v, or 0 when v is empty.
uint64_t averageAge(const VP& v)
{
    if (v.empty())
        return 0;  // avoid dividing by zero

    const uint64_t total = accumulate(begin(v), end(v), uint64_t{0},
        [](uint64_t sum, const Person& p) { return sum + p.age; });
    return total / v.size();
}

// Returns the integer average of the dob values in v, or 0 when v is empty.
uint64_t averageDob(const VP& v)
{
    if (v.empty())
        return 0;  // avoid dividing by zero

    const uint64_t total = accumulate(begin(v), end(v), uint64_t{0},
        [](uint64_t sum, const Person& p) { return sum + p.dob; });
    return total / v.size();
}

// Structure-of-arrays (SoA) layout: each field lives in its own vector, and
// addPerson() appends to all three so index i refers to the same person.
struct Persons
{
    vector<string> names;
    vector<uint8_t> ages;
    vector<uint32_t> dobs;

    // Appends one person by pushing each field onto its own vector.
    void addPerson(const string& n, uint8_t a, uint32_t d)
    {
        names.push_back(n);
        ages.push_back(a);
        dobs.push_back(d);
    }

    // Returns the integer average of the name lengths, or 0 when there are no names.
    uint64_t averageNameLen() const
    {
        if (names.empty())
            return 0;  // avoid dividing by zero

        const uint64_t total = accumulate(begin(names), end(names), uint64_t{0},
            [](uint64_t sum, const string& n) { return sum + n.length(); });
        return total / names.size();
    }

    // Returns the integer average of the ages, or 0 when there are no ages.
    uint64_t averageAge() const
    {
        if (ages.empty())
            return 0;  // avoid dividing by zero

        return accumulate(begin(ages), end(ages), uint64_t{0}) / ages.size();
    }

    // Returns the integer average of the dob values, or 0 when there are none.
    uint64_t averageDob() const
    {
        if (dobs.empty())
            return 0;  // avoid dividing by zero

        return accumulate(begin(dobs), end(dobs), uint64_t{0}) / dobs.size();
    }
};

// Prints "<label> duration <milliseconds> ms" using the stream's current
// floating-point formatting.
void printDuration(const char* label, steady_clock::duration elapsed)
{
    cout << label << " duration "
         << duration_cast<microseconds>(elapsed).count() / 1000.f
         << " ms\n";
}

// Fills both layouts with the same ELEMS records, times the three averages
// over each layout, and prints the two durations.
int main()
{
    // Every name is as long as the capacity of an empty string. Presumably this
    // keeps the names inside the small-string buffer so they do not allocate;
    // that depends on the standard library implementation.
    const string sampleName(string().capacity(), 'N');

    cout << fixed << setprecision(3);

    // --- Array of structures ---
    VP v1;
    v1.reserve(ELEMS);
    for (int i = 0; i < ELEMS; ++i)
        addPerson(v1, Person(sampleName,
                             static_cast<uint8_t>(i % 0xFF),
                             static_cast<uint32_t>(i % 0xFFFF)));

    // steady_clock is monotonic, which is what interval measurement needs.
    auto start_time = steady_clock::now();
    auto sum = averageNameLen(v1);
    sum += averageAge(v1);
    sum += averageDob(v1);
    auto end_time = steady_clock::now();
    printDuration("AoS", end_time - start_time);

    // Ask the vector to give its storage back before the second data set is built.
    v1.clear();
    v1.shrink_to_fit();

    // --- Structure of arrays ---
    Persons p;
    p.names.reserve(ELEMS);
    p.ages.reserve(ELEMS);
    p.dobs.reserve(ELEMS);
    // Same values as the AoS run, so both layouts are measured on identical data.
    for (int i = 0; i < ELEMS; ++i)
        p.addPerson(sampleName,
                    static_cast<uint8_t>(i % 0xFF),
                    static_cast<uint32_t>(i % 0xFFFF));

    start_time = steady_clock::now();
    sum += p.averageNameLen();
    sum += p.averageAge();
    sum += p.averageDob();
    end_time = steady_clock::now();
    printDuration("SoA", end_time - start_time);

    // Returning the accumulated sum makes the results observable, so the
    // compiler cannot optimize the averageXXX calls away.
    return static_cast<int>(sum);
}
P.S. I had to use the `sum +=` and `return sum;` hack. Otherwise the compilers kept optimizing away the averageXXX calls!