I am running basic data validation over a large dataset in Node.js (v7.5.0): a matrix of 15849 x 12771 entries. For performance, the entire dataset is kept in memory, so it is important to me that memory consumption stays as low as possible, given that each number already takes 8 bytes in JavaScript.
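For context, the raw numbers alone already account for roughly 1.5 GB, which lines up with the baseline heap usage in the test output further down. A quick back-of-the-envelope check (plain arithmetic, ignoring V8's per-array overhead):

// Lower bound for the matrix itself: rows x cols x 8 bytes per double.
const rows = 15849;
const cols = 12771;
const bytes = rows * cols * 8;
console.log(`${(bytes / 1024 / 1024).toFixed(2)} MB`); // prints ~1544.28 MB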
I would like to compare two different methods to achieve the same goal:
Using forEach:
regressData.forEach((yxa, yxaIndex) => {
  yxa.forEach((yx, yxIndex) => {
    if (!_.isFinite(yx)) {
      throw new Error(`non-finite entry at [${yxaIndex}, ${yxIndex}]`);
    }
  });
});
This method performs the desired validation, but it consumes over 4 GB of my node process' memory and struggles to complete the loop in a reasonable amount of time (possibly because it has to fall back on slower swap memory).
The same validation can be achieved with a traditional for loop:
for (var yxai = 0, yxal = regressData.length; yxai < yxal; yxai++) {
  const yx = regressData[yxai];
  for (var yxi = 0, yxl = yx.length; yxi < yxl; yxi++) {
    if (!_.isFinite(yx[yxi])) {
      throw new Error(`non-finite entry at [${yxai}, ${yxi}]`);
    }
  }
}
This alternative approach adds almost no extra memory (see the measurements below) and completes the validation within seconds.
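For reference, timing could be captured with something like the sketch below (not part of my original measurements; it uses the validateFor / validateForEach helpers from the standalone test further down):

console.time('for loop');
validateFor(regressData);
console.timeEnd('for loop');
console.time('forEach loop');
validateForEach(regressData);
console.timeEnd('forEach loop');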
Is this behavior to be expected? I initially assumed that the closure scopes created inside forEach would not cause any additional memory usage compared to a more conventional for loop.
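My mental model of forEach is roughly the simplified sketch below (my assumption, not V8's actual implementation), which is why I did not expect any per-element allocation on top of what the plain for loop already does:

// Conceptual model only; Array.prototype.forEach is implemented natively in V8.
function naiveForEach(array, callback) {
  for (var i = 0; i < array.length; i++) {
    callback(array[i], i, array); // just passes existing values to the callback
  }
}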
EDIT: Standalone Test
node --expose-gc test_foreach.js
// Bail out early if the gc() hook is not exposed (a bare `!gc` check would throw a ReferenceError instead).
if (typeof gc !== 'function') throw new Error('please run node like node --expose-gc test_foreach.js');
const _ = require('lodash');
// prepare data to work with
const x = 15849;
const y = 12771;
let regressData = new Array(x);
for (var i = 0; i < x; i++) {
  regressData[i] = new Array(y);
  for (var j = 0; j < y; j++) {
    regressData[i][j] = _.random(true); // random float between 0 and 1
  }
}
// for loop
gc();
const mb_pre_for = _.round(process.memoryUsage().heapUsed / 1024 / 1024, 2);
console.log(`Memory consumption before the for loop ${mb_pre_for} megabytes`);
validateFor(regressData);
gc();
const mb_post_for = _.round(process.memoryUsage().heapUsed / 1024 / 1024, 2);
const mb_for = _.round(mb_post_for - mb_pre_for, 2);
console.log(`Memory consumption caused by the for loop ${mb_for} megabytes`);
// forEach loop
gc();
const mb_pre_foreach = _.round(process.memoryUsage().heapUsed / 1024 / 1024, 2);
console.log(`Memory consumption before the forEach loop ${mb_pre_foreach} megabytes`);
validateForEach(regressData);
gc();
const mb_post_foreach = _.round(process.memoryUsage().heapUsed / 1024 / 1024, 2);
const mb_foreach = _.round(mb_post_foreach - mb_pre_foreach, 2);
console.log(`Memory consumption caused by the forEach loop ${mb_foreach} megabytes`);
function validateFor(regressData) {
  for (var yxai = 0, yxal = regressData.length; yxai < yxal; yxai++) {
    const yx = regressData[yxai];
    for (var yxi = 0, yxl = yx.length; yxi < yxl; yxi++) {
      if (!_.isFinite(yx[yxi])) {
        throw new Error(`Non-finite entry at [${yxai}, ${yxi}]`);
      }
    }
  }
}
function validateForEach(regressData) {
  regressData.forEach((yxa, yxaIndex) => {
    yxa.forEach((yx, yxIndex) => {
      if (!_.isFinite(yx)) {
        throw new Error(`Non-finite entry at [${yxaIndex}, ${yxIndex}]`);
      }
    });
  });
}
Output:
toms-mbp-2:mem_test tommedema$ node --expose-gc test_foreach.js
Memory consumption before the for loop 1549.31 megabytes
Memory consumption caused by the for loop 0.31 megabytes
Memory consumption before the forEach loop 1549.66 megabytes
Memory consumption caused by the forEach loop 3087.9 megabytes