I have a substantial dataset containing documents that sometimes reference each other and sometimes do not. Before I can perform mapreduce operations based on these cross-references, I need to ensure that the array of cross-references is consistent (i.e. identical) across all of the documents that reference one another.
To achieve this, I utilize the following shell function to consolidate those arrays:
/**
 * Makes the `xref` arrays of mutually referencing documents in `db.catalog`
 * agree with each other.
 *
 * For every document that has a non-empty `xref` array, the `xref` arrays of
 * all documents whose `name` appears in it are merged (duplicates removed,
 * first-seen order kept) and the merged array is written back to every
 * document whose `name` appears in the merged array.
 *
 * Documents without an `xref` array (or with an empty one) are left untouched
 * but still counted for progress reporting.
 *
 * NOTE(review): each document only pulls in the references of the documents it
 * names directly (one hop), so a single pass is not guaranteed to reach a
 * fixed point for long reference chains - presumably this is why repeated
 * runs have been necessary; confirm before relying on a single pass.
 *
 * Relies on the shell globals `db` and `print`. Returns nothing.
 */
function unifyCrossReferences() {
    // Merges the xref arrays of all documents named in `names` into a new,
    // duplicate-free array. Does not depend on any Array.prototype extension.
    function collectUniqueXrefs(names) {
        var merged = [];
        var matches = db.catalog.find({"name": {$in: names}});
        while (matches.hasNext()) {
            var refs = matches.next().xref;
            if (!Array.isArray(refs)) {
                // Referenced document carries no cross-references of its own.
                continue;
            }
            for (var i = 0; i < refs.length; i++) {
                if (merged.indexOf(refs[i]) === -1) {
                    merged.push(refs[i]);
                }
            }
        }
        return merged;
    }

    var counter = 0;
    // Retrieve only the cross-reference field; nothing else is needed here.
    var cursor = db.catalog.find({}, {xref: true, _id: false});
    while (cursor.hasNext()) {
        var xref1 = cursor.next().xref;
        // Guard: an absent xref would make the $in query below invalid.
        if (Array.isArray(xref1) && xref1.length > 0) {
            // A fresh array per document, so a value already handed to an
            // update is never mutated afterwards.
            var consolidatedArray = collectUniqueXrefs(xref1);
            if (consolidatedArray.length > 0) {
                // One multi-document update replaces one update per name.
                db.catalog.update(
                    {name: {$in: consolidatedArray}},
                    {$set: {xref: consolidatedArray}},
                    false,
                    true
                );
            }
        }
        counter++;
        if (counter % 1000 === 0) {
            print("Processed " + counter + " documents.");
        }
    }
}
The current solution works, but I find myself needing to run it frequently. Any suggestions for improvements would be greatly appreciated.