Ch4_$Facets Lab

I was able to get the solution easy enough, I couldn’t get the $setIntersection to return the correct answer (obviously, “The Godfather” – and I disagree with the video that the Godfather II was better. Both were great, but the first was amazing…

I solved it this way after refactoring a few times, I found it to be the most efficient (which helps trying to use Compass).

db.movies.aggregate([
    { "$match": {
      "imdb.rating": {"$gt": 0},
      "metacritic": {$type: "int"}
    }},
    {"$project": {
      _id: 0,
      title: 1,
      "imdb.rating": 1,
      metacritic: 1
    }},
    {"$facet": {
      "byImdbRating": [
        {$sort: {"imdb.rating": -1}},
        {$limit: 10},
        {
          "$bucketAuto": {
            "groupBy": "$imdb.rating",
            "buckets": 1,
            "output": {
              "titles": {"$push": "$title"}
              }
            }
          },
        ],
        "byMetacriticRating": [
           {$sort: {metacritic: -1}},
           {$limit: 10},
          {
            "$bucketAuto": {
              "groupBy": "$metacritic",
              "buckets": 1,
              "output": {
                "titles": {"$push": "$title"}
                }
              }
            }
          ]
      }},
      {$project: {
        "byMetacriticRating.titles":1,
        "byImdbRating.titles":1,
      }},
      {$addFields: {
        "commonToBoth": {$setIntersection: ["$byMetacriticRating.titles", "$byImdbRating.titles"]}
      }}
]).pretty()

My result set is as follows:

{
	"byImdbRating" : [
		{
			"titles" : [
				"The Power of Nightmares: The Rise of the Politics of Fear",
				"Pulp Fiction",
				"Schindler's List",
				"The Good, the Bad and the Ugly",
				"The Century of the Self",
				"The Dark Knight",
				"The Godfather: Part II",
				"The Martian",
				"The Godfather",
				"The Shawshank Redemption"
			]
		}
	],
	"byMetacriticRating" : [
		{
			"titles" : [
				"Boyhood",
				"Fanny and Alexander",
				"The Conformist",
				"Lawrence of Arabia",
				"Au Hasard Balthazar",
				"Sweet Smell of Success",
				"The Leopard",
				"The Godfather",
				"Journey to Italy",
				"Best Kept Secret"
			]
		}
	],
	"commonToBoth" : [ ]
}

I tried a bunch of different things to get that final thing working, I just didn’t have time to Trial&Error every Aggregation operator, so what step did I miss? I suspected it was something to do with the fact that my results were an object within an array, but curious…

I ran into the object within an array as well.

Here is a snippet of my code. I used $facets to produce separate IMDB and Metacritic buckets and then used $Match to get only the highest rated films from each (I previously filtered to get only movies that had a rating in both IMDB and Meta-critic).

The results came back like this:
“Metacritic” : [
{
“_id” : 100,
“titles” : [
“Fanny and Alexander”,
“Boyhood”,
“Best Kept Secret”,
“The Wizard of Oz”,
“Au Hasard Balthazar”,
“Journey to Italy”,
“Sweet Smell of Success”,
“Lawrence of Arabia”,
“The Leopard”,
“The Conformist”,
“The Godfather”
]
}
]

Metacritic and IMDB were both arrays containing a document. I could not figure out how to make the $setIntersection work with that data. I tried things like “$IMDB.0.titles” etc but nothing was working. So I ended up using an $unwind on both to bring the document out of the array it was in.

{
	"$facet": {
		"IMDB": [{
			"$match": {
				"$and": [ { "imdbRating": { $exists: true } }, { "imdbRating": { "$gte": 8.9 } } ]
			}
		},{
			"$bucket": {
				"groupBy": "$imdbRating",
				"boundaries": [ 8.9, Infinity ],
				"output": {
					"titles": { "$addToSet": "$title" }
				}
			}
		}],
		"Metacritic": [{
			"$match": {
				"$and": [ { "metacritic": { $exists: true } }, { "metacritic": { "$gte": 100 } } ] 
			}
		},{
			"$bucket": {
				"groupBy": "$metacritic",
				"boundaries": [ 100, Infinity ],
				"output": {
					"titles": { "$addToSet": "$title" }
				}
			}
		}]
	}
},{
	"$unwind": "$IMDB"
},{
	"$unwind": "$Metacritic"
}

From there I was able to $setIntersect the unwound results and get the answer that only The Godfather matched.

Was there an easier way to do this? I did a lot of Googling and searching on Mongo documentation but I couldn’t figure out how to compare two arrays that were both in nested documents within arrays.

I figured out my issue. I had tried unwinding the embedded array, not the property/field level array:

db.movies.aggregate([
    { "$match": {
      "imdb.rating": {"$gt": 0},
      "metacritic": {$type: "int"}
    }},
    {"$project": {
      _id: 0,
      title: 1,
      "imdb.rating": 1,
      metacritic: 1
    }},
    {"$facet": {
      "byImdbRating": [
        {$sort: {"imdb.rating": -1}},
        {$limit: 10},
        {
          "$bucketAuto": {
            "groupBy": "$imdb.rating",
            "buckets": 1,
            "output": {
              "titles": {"$push": "$title"}
              }
            }
          },
        ],
        "byMetacriticRating": [
          {$sort: {metacritic: -1}},
          {$limit: 10},
          {
            "$bucketAuto": {
              "groupBy": "$metacritic",
              "buckets": 1,
              "output": {
                "titles": {"$push": "$title"}
                }
              }
            }
          ]
      }},
      {$unwind: "$byImdbRating"},
      {$unwind: "$byMetacriticRating"},
      {$project: {
        "byMetacriticRating.titles": 1,
        "byImdbRating.titles":1,
      }},
      {$addFields: {
        "commonToBoth": {$setIntersection: ["$byMetacriticRating.titles", "$byImdbRating.titles"]}
      }}
]).pretty()
1 Like

3 things jump out, maybe Carlos Jose will have something differnt, but:

  1. after your $match stage, you will wanna do a $project so you’re only passing the amount of data you exactly want into the $bucket stage.
  2. You should use a $sort and $limit stage in order to make the query more performant. Sorting and limiting greatly reduces the amount of in-memory usage your pipelines will have, making everything more performant. This might be the big benefit of a $facet stage b/c you can lump all that in together as you’re essentially pipelining 2 differnt documents which you’ll eventually compare against each other…
  3. You could combine your match statments and move them to the top of the document, that’ll also improve query performance.
1 Like

This is how I got mine resolved eventually…please forgive the comments on the side. I add them so I don’t drive myself insane :slight_smile:

MongoDB Enterprise Cluster0-shard-0:PRIMARY> db.movies.aggregate([
… {
… $match : { “imdb.rating” : { $gte : 0 }, “metacritic” : { $gte : 0 } } // exclude all non-number fields from both
… },
… {
… $facet : { // define facet for 2 separate pipelines
… metacritic_rated : [ // pipeline 1 - Staring with metacritic value, give output field a name
… {
… $sort : { “metacritic” : -1 } // sort metacritic field in desc order
… },
… {
… $project : { _id : 0, rating : “$metacritic”, title : 1} // get rid of _id field
… },
… {
… $limit : 10 // Pick the first 10 highest rated films
… },
… {
… $group : { // group all ten results and…
… _id : “”,
… matched_titles : { $addToSet : “$title”} // add films to an array called matched_titles
… }
… },
… {
… $project: { _id : 0, “matched_titles” : 1} // return the top 10 rated films in an array
… }
… ],
… imdb_rated : [ // pipeline 2 - ow with imdb_rating, define field name as such
… {
… $sort : { “imdb.rating” : -1 } // sort imdb.rating field in desc order
… },
… {
… $project : { _id : 0, rating : “$imdb.rating”, title : 1} // get rid of _id field
… },
… {
… $limit : 10 // Pick the first 10 highest rated films
… },
… {
… $group : {
… _id : “”,
… matched_titles : { $addToSet : “$title”} // add films to an array called matched_titles
… }
… },
… {
… $project: { _id : 0, “matched_titles” : 1} // return the top 10 rated films in an array
… }
… ]
… }
… }, // end of $facet with 2 different pipelines
… {
… $project : { meta_rated : “$metacritic_rated.matched_titles”, imdb_rated : “$imdb_rated.matched_titles” } // Both are now arrays but nested i.e. [[ ]], [[ ]] which wont work with $setIntersect so…
… },
… {
… $unwind: “$meta_rated” // unwind first array
… },
… {
… $unwind: “$imdb_rated” // unwind second array
… },
… {
… $project : { commonFilm_count : { $size : { $setIntersection : ["$meta_rated", “$imdb_rated” ] }} } // return a count of the intersect of both arrays
… }
… ]).pretty()
{ “commonFilm_count” : 1 }
MongoDB Enterprise Cluster0-shard-0:PRIMARY>