Hey @Narendra_S_Sikarwar,
I actually gave this a second thought and I think I found a faster way to deal with this: by using a covered query.
- I created this little python script that creates some fake data + creates the index:
{is_workflow_processing:1, is_error:1, status: 1, control_monitorkey:1, Masterid:1, SYSTEMID:1}
from faker import Faker
from pymongo import ASCENDING
from pymongo import MongoClient
fake = Faker()


def random_docs():
    """Build 10,000 fake documents whose fields match the compound index.

    Each document carries the two boolean filter fields plus the four
    grouping fields used by the aggregation, so the index
    {is_workflow_processing, is_error, status, control_monitorkey,
    Masterid, SYSTEMID} can fully cover the query.

    Returns:
        list[dict]: 10,000 randomly generated documents.
    """
    return [
        {
            'firstname': fake.first_name(),
            'lastname': fake.last_name(),
            'is_workflow_processing': fake.pybool(),
            'is_error': fake.pybool(),
            'status': fake.pyint(),
            'control_monitorkey': fake.pyint(),
            'Masterid': fake.pyint(),
            'SYSTEMID': fake.pyint(),
        }
        for _ in range(10000)
    ]
if __name__ == '__main__':
    # Seed a local MongoDB instance with fake data, then create the
    # compound index that will cover the $match + $group aggregation.
    client = MongoClient()
    try:
        collection = client.test.coll
        collection.insert_many(random_docs())
        # Key order matters: the two equality-filtered booleans come
        # first so the index prefix matches the $match stage exactly.
        collection.create_index([
            ("is_workflow_processing", ASCENDING),
            ("is_error", ASCENDING),
            ("status", ASCENDING),
            ("control_monitorkey", ASCENDING),
            ("Masterid", ASCENDING),
            ("SYSTEMID", ASCENDING),
        ])
        print('Done!')
    finally:
        client.close()
In your case, the index might be a little fat but you have 64GB of RAM… So it should get the job done fast with the following aggregation pipeline:
- Cover the query with a $match + $group with a subset of fields.
Note that we don’t need an explicit $project right after the $match: the query planner detects which fields the rest of the pipeline uses and applies that projection automatically.
[
{
'$match': {
'is_workflow_processing': false,
'is_error': true
}
}, {
'$group': {
'_id': {
'status': '$status',
'control_monitorkey': '$control_monitorkey',
'Masterid': '$Masterid',
'SYSTEMID': '$SYSTEMID'
},
'count': {
'$sum': 1
}
}
}, {
'$project': {
'_id': 0,
'status': '$_id.status',
'control_monitorkey': '$_id.control_monitorkey',
'Masterid': '$_id.Masterid',
'SYSTEMID': '$_id.SYSTEMID',
'count': 1
}
}, {
'$sort': {
'count': -1
}
}, {
'$limit': 5
}
]
Here is the explain plan of this aggregation:
{
"stages" : [
{
"$cursor" : {
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.coll",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"is_error" : {
"$eq" : true
}
},
{
"is_workflow_processing" : {
"$eq" : false
}
}
]
},
"queryHash" : "86D43161",
"planCacheKey" : "AF368344",
"winningPlan" : {
"stage" : "PROJECTION_COVERED",
"transformBy" : {
"Masterid" : 1,
"SYSTEMID" : 1,
"control_monitorkey" : 1,
"status" : 1,
"_id" : 0
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"is_workflow_processing" : 1,
"is_error" : 1,
"status" : 1,
"control_monitorkey" : 1,
"Masterid" : 1,
"SYSTEMID" : 1
},
"indexName" : "is_workflow_processing_1_is_error_1_status_1_control_monitorkey_1_Masterid_1_SYSTEMID_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"is_workflow_processing" : [ ],
"is_error" : [ ],
"status" : [ ],
"control_monitorkey" : [ ],
"Masterid" : [ ],
"SYSTEMID" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"is_workflow_processing" : [
"[false, false]"
],
"is_error" : [
"[true, true]"
],
"status" : [
"[MinKey, MaxKey]"
],
"control_monitorkey" : [
"[MinKey, MaxKey]"
],
"Masterid" : [
"[MinKey, MaxKey]"
],
"SYSTEMID" : [
"[MinKey, MaxKey]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : {
"status" : "$status",
"control_monitorkey" : "$control_monitorkey",
"Masterid" : "$Masterid",
"SYSTEMID" : "$SYSTEMID"
},
"count" : {
"$sum" : {
"$const" : 1
}
}
}
},
{
"$project" : {
"count" : true,
"status" : "$_id.status",
"control_monitorkey" : "$_id.control_monitorkey",
"Masterid" : "$_id.Masterid",
"SYSTEMID" : "$_id.SYSTEMID",
"_id" : false
}
},
{
"$sort" : {
"sortKey" : {
"count" : -1
},
"limit" : NumberLong(5)
}
}
],
"serverInfo" : {
"host" : "hafx",
"port" : 27017,
"version" : "4.4.1",
"gitVersion" : "ad91a93a5a31e175f5cbf8c69561e788bbc55ce1"
},
"ok" : 1,
"$clusterTime" : {
"clusterTime" : Timestamp(1602536597, 4),
"signature" : {
"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
"keyId" : NumberLong(0)
}
},
"operationTime" : Timestamp(1602536597, 4)
}
As you can see above, the winning plan is just an IXSCAN
stage followed by a PROJECTION_COVERED
stage — there is no FETCH
stage. That means no documents are read from the collection at all: the query is answered entirely from the index keys, which should already be sitting in RAM.
If this query doesn’t complete in under a second, I don’t get it!
Cheers,
Maxime.