MongoDB PHP Aggregating data + count + where

Question

I created a platform in PHP/MySQL and I am now migrating to MongoDB.

My old query for mysql :

-- Per-position game count and win total for one player
-- (positions strictly between 0 and 6), most-played position first.
SELECT
    SUM(game_won) AS game_won,
    COUNT(id)     AS total,
    position
FROM games_player_stats
WHERE position < 6
  AND position > 0
  AND user_id = :pa_id
GROUP BY position
ORDER BY total DESC

The new json format looks like this:

{
  "region": "EUW",
  "players": [
    {
      "position": 2,
      "summoner_id": 123456,
      "game_won": 1
    },
    {
      "position": 1,
      "summoner_id": 123459,
      "game_won": 0
    },
    {
      "position": 3,
      "summoner_id": 123458,
      "game_won": 1
    },
    {
      "position": 4,
      "summoner_id": 123457,
      "game_won": 0
    }
  ]
}

Given multiple documents like this, I need to find how many times summoner_id 123456 has had position 2 (or any of the other positions 1-6), and how many times he won in that position.

The Index needs to be queryable on region and summoner_id

Outcome would look like

 {
   "positions": [
     {
       "position": 1,
       "total": 123,
       "won": 65
     },
     {
       "position": 2,
       "total": 37,
       "won": 10
     }
   ]
 }

Would I need to use Map/Reduce for this?

Sede · Accepted Answer · 2016-03-29 05:19:54Z

The best results for this are obtained by the aggregation framework for MongoDB. It differs from mapReduce in that all operations are performed using "natively coded operators" as opposed to the JavaScript evaluation that is used by mapReduce.

This means "faster", and significantly so. Not to mention there are also certain parts of what you are looking for in a result that actually favour the "multiple group" concept that is inherently available to a "pipeline" of operations, that would otherwise be a fairly ugly accumulator using mapReduce.

Aggregation Pipeline Formats

The best approach will differ depending on the MongoDB "server" version you have available.

Ideally with MongoDB 3.2 you use $filter to "pre-filter" the array content before processing with $unwind:

// Aggregation pipeline for MongoDB 3.2+.
// For one summoner, counts games and sums wins per "position" (1..6 inclusive)
// and returns everything as a single document with a "positions" array.
var pipeline = [
    // Match documents with array members matching conditions
    { "$match": {
        "players": {
            "$elemMatch": {
                "summoner_id": 123456,
                "position": { "$gte": 1, "$lte": 6 }
            }
        }
    }},

    // Filter the array content for matched conditions, so that $unwind
    // only has to emit the elements we actually care about
    { "$project": {
        "players": {
            "$filter": {
                "input": "$players",
                "as": "player",
                "cond": {
                    "$and": [
                        { "$eq": [ "$$player.summoner_id", 123456 ] },
                        { "$gte": [ "$$player.position", 1 ] },
                        { "$lte": [ "$$player.position", 6 ] }
                    ]
                }
            }
        }
    }},

    // Unwind the array contents to de-normalize
    { "$unwind": "$players" },

    // Group on the inner "position".
    // The win flag is stored as "game_won" in each array element; the
    // output key stays "won" to match the requested result shape.
    { "$group": {
        "_id": "$players.position",
        "total": { "$sum": 1 },
        "won": { "$sum": "$players.game_won" }
    }},

    // Optionally sort by "total" descending, since $group output is not ordered
    { "$sort": { "total": -1 } },

    // Optionally $group to a single document response with an array
    { "$group": {
        "_id": null,
        "positions": {
            "$push": {
                "position": "$_id",
                "total": "$total",
                "won": "$won"
            }
        }
    }}
];

// Run the pipeline; "collection" presumably stands in for the real collection name
db.collection.aggregate(pipeline);

For MongoDB 2.6.x releases, still "pre-filter" but using $map and $setDifference:

// Aggregation pipeline for MongoDB 2.6.x, which has no $filter operator.
// $map + $setDifference emulate the array pre-filter; the rest is the
// same per-position count / win total as the 3.2 form.
var pipeline = [
    // Match documents with array members matching conditions
    { "$match": {
        "players": {
            "$elemMatch": {
                "summoner_id": 123456,
                "position": { "$gte": 1, "$lte": 6 }
            }
        }
    }},

    // Filter the array content for matched conditions
    { "$project": {
        "players": {
            "$setDifference": [
                // Replace every non-matching element with false ...
                { "$map": {
                    "input": "$players",
                    "as": "player",
                    "in": {
                        "$cond": {
                            "if": {
                                "$and": [
                                    { "$eq": [ "$$player.summoner_id", 123456 ] },
                                    { "$gte": [ "$$player.position", 1 ] },
                                    { "$lte": [ "$$player.position", 6 ] }
                                ]
                            },
                            "then": "$$player",
                            "else": false
                        }
                    }
                }},
                // ... then strip those false placeholders from the resulting set
                [false]
            ]
        }
    }},

    // Unwind the array contents to de-normalize
    { "$unwind": "$players" },

    // Group on the inner "position".
    // The win flag is stored as "game_won" in each array element; the
    // output key stays "won" to match the requested result shape.
    { "$group": {
        "_id": "$players.position",
        "total": { "$sum": 1 },
        "won": { "$sum": "$players.game_won" }
    }},

    // Optionally sort by "total" descending, since $group output is not ordered
    { "$sort": { "total": -1 } },

    // Optionally $group to a single document response with an array
    { "$group": {
        "_id": null,
        "positions": {
            "$push": {
                "position": "$_id",
                "total": "$total",
                "won": "$won"
            }
        }
    }}
];

And for earlier versions with the aggregation framework from MongoDB 2.2, "post filter" with $match "after" the $unwind:

// Aggregation pipeline for MongoDB 2.2 - 2.4, where array content cannot be
// filtered before $unwind. The unwanted elements are removed by a second
// $match placed after the $unwind ("post filter").
var pipeline = [
    // Match documents with array members matching conditions
    { "$match": {
        "players": {
            "$elemMatch": {
                "summoner_id": 123456,
                "position": { "$gte": 1, "$lte": 6 }
            }
        }
    }},

    // Unwind the array contents to de-normalize (all elements, matching or not)
    { "$unwind": "$players" },

    // Post filter the denormalized content
    { "$match": {
        "players.summoner_id": 123456,
        "players.position": { "$gte": 1, "$lte": 6 }
    }},

    // Group on the inner "position".
    // The win flag is stored as "game_won" in each array element; the
    // output key stays "won" to match the requested result shape.
    { "$group": {
        "_id": "$players.position",
        "total": { "$sum": 1 },
        "won": { "$sum": "$players.game_won" }
    }},

    // Optionally sort by "total" descending, since $group output is not ordered
    { "$sort": { "total": -1 } },

    // Optionally $group to a single document response with an array
    { "$group": {
        "_id": null,
        "positions": {
            "$push": {
                "position": "$_id",
                "total": "$total",
                "won": "$won"
            }
        }
    }}
];

Walkthrough

Matching the Document: This is primarily done using $elemMatch since you are looking for "multiple" conditions within the array elements. With a "single" condition on an array element it is fine to use "dot notation":
```
"players.summoner_id": 12345
```
But for anything more than "one" condition you need to use $elemMatch, otherwise all the statement is really asking is "does this match something within the array?", and that does not constrain "all" of the conditions to the same element. So even the $gte and $lte combination alone is actually "two" conditions, and therefore requires $elemMatch:
```
// Both range bounds must hold for the same array element, hence $elemMatch
"players": {
    "$elemMatch": {
        "position": { "$gte": 1, "$lte": 6 }
    }
}
```
Also note here that "1 to 6 inclusive" means "greater than or equal to" 1 ($gte) and "less than or equal to" 6 ($lte), rather than the strict $gt and $lt forms.

-

"Pre-filtering": Noting here that the eventual goal is to "group" by an element within the array, being "position". This means that eventually you are going to need to $unwind the content to do that.

However, the $unwind pipeline operation is going to be quite costly, considering that it "takes apart" the array and creates a new document to process for each array member. Since you only want "some" of the members that actually match the conditions, it's desirable to "remove" any un-matched content from the array "before" you de-normalize this content.

MongoDB 3.2 has a good method for this with the $filter operator. It performs exactly as named by "filtering" the content of the array to only elements that match a particular set of conditions.

In an aggregation pipeline stage we use the "logical variants" of operators such as $gte and $lte. These return a true/false value depending on whether the condition matched. Also within the array, the member fields can be referred to using "dot notation" on the alias given in "as", which points to the currently processed member.

The $and here is also another "logical operator" which gives the same true/false response. So this means "all" the arguments in its array of arguments must be met in order to return true. For the $filter itself, the true/false evaluated in "cond" determines whether to return the array element or not.

For MongoDB 2.6, which does not have the $filter operator, the same is represented with the combination of $map and $setDifference. Simply put, the $map looks at each element and applies an expression within "in". In this case we use $cond, which as a "ternary" operator evaluates an "if/then/else" form.

So here where the "if" returns true the expression in "then" is returned as the current array member. Where it is false, the expression in else returns, and in this case we are returning the value of false ( PHP False ).

Since all members are actually being returned by the result of $map we then emulate $filter by applying the $setDifference operator. This does a comparison to the members of the array and effectively "removes" any members where the element was returned as false from the result. So with distinct array members such as you have, the resulting "set" ( being a "set" of "unique" elements) just contains those elements where the condition was true and a non-false value was returned.
"Post" filtering: The alternate approach, which is mandatory for server versions below MongoDB 2.6, is to "post" filter the array content. Since there are no operators in these versions that allow such actions on array content before $unwind, the simple process here is to apply another $match to the content "after" the $unwind is processed:
```
// Placed after $unwind: each "players" value is now a single sub-document,
// so plain dot notation is enough to test both conditions
{ "$match": {
    "players.summoner_id": 123456,
    "players.position": { "$gte": 1, "$lte": 6 }
}}
```
Here you use "dot notation" since each array element is now actually its own document, and there is nothing else to compare to other than looking at the conditions on the specified path.

This is not ideal, since when you process $unwind all of the elements that actually don't match the conditions are still present. This ultimately means "more documents to process" and has the double cost of:
1. Had to create a new document for every member despite it not matching the conditions
2. Now you have to apply the condition across every "document" emitted as a result of $unwind
This has a potentially huge impact on performance, and for that reason the modern MongoDB releases introduce ways to act on arrays without resorting to $unwind in order to process. You still need it for the remaining processing since you are "grouping" on a property contained within the array. But it is of course desirable to "get rid of un-matched elements first".
Remaining Grouping: Now that the elements are filtered and de-normalized, it only remains to do the actual $group condition that will total things by the "position" within each element. This is a simple matter of providing the grouping key to "_id" and using the appropriate data accumulation.

In this case you have two constructs, being:
```
    "total": { "$sum": 1 },
    "won": { "$sum": "$players.won" }
```
The basic { "$sum": 1 } is just "counting" the elements matched for each group, while the second $sum references the per-player win flag (the "game_won" field in the sample document) to accumulate the "won" total. This is pretty standard usage for the $sum accumulator.

Of course your output shows the content within an "array", so the following stages are really "optional" since the real work of actually "grouping" is already done. So you could actually just use the results in the form provided up to this first $group, and the remaining just puts everything into a single document response rather than "one document per 'position' value", which would be the return at this point.

The first thing to note is that output from $group is not ordered. So if you want a specific order of results ( i.e. by position ascending ) then you must $sort after that $group stage. This will order the resulting documents of the pipeline as of the point where it is applied.

In your case you are actually asking for a sort on "total" anyway, so you would of course apply this with -1 meaning "descending" in this case. But whatever the case, you still should not presume that the output from $group is ordered in any way.

The "second" $group here is basically cosmetic in that this is what makes a "single document" response. Using null ( PHP NULL ) in the grouping key basically says "group everything" and will produce a single document in response. The $push accumulator here is what actually makes the "array" from the documents in the pipeline preceding this.

Wrap-Up

So that's the general process in accumulating data like this:

Match the documents required to the conditions, since after all it would be a waste to apply conditions later to every document when they don't even contain array elements that would match the conditions you eventually want.
Filter the array content and de-normalize. Ideally done as a "pre-filter" where possible. This gets the documents into a form for grouping, from their original array form.
Accumulate the content using appropriate operators for the task, either $sum or $avg or $push or any other available according to needs. Note also that depending on structure and conditions you can always use "more than one" $group pipeline stage.

PHP Translation

The initial example in PHP notation:

// PHP array form of the MongoDB 3.2 pipeline: for one summoner, count games
// and sum wins per "position" (1..6 inclusive), returned as a single document.
$pipeline = array(
    // Match documents with array members matching conditions
    array(
        '$match' => array(
            'players' => array(
                '$elemMatch' => array(
                    'summoner_id' => 123456,
                    'position' => array( '$gte' => 1, '$lte' => 6 )
                )
            )
        )
    ),
    // Filter the array content; the output field must be named ("players")
    array(
        '$project' => array(
            'players' => array(
                '$filter' => array(
                    'input' => '$players',
                    'as' => 'player',
                    'cond' => array(
                        '$and' => array(
                            // Operands are two-element lists so they encode as
                            // JSON arrays, e.g. { "$eq": [ "$$player.summoner_id", 123456 ] }
                            array( '$eq' => array( '$$player.summoner_id', 123456 ) ),
                            array( '$gte' => array( '$$player.position', 1 ) ),
                            array( '$lte' => array( '$$player.position', 6 ) )
                        )
                    )
                )
            )
        )
    ),
    // Unwind the array contents to de-normalize
    array( '$unwind' => '$players' ),
    // Group on the inner "position"; the win flag is stored as "game_won"
    array(
        '$group' => array(
            '_id' => '$players.position',
            'total' => array( '$sum' => 1 ),
            'won' => array( '$sum' => '$players.game_won' )
        )
    ),
    // Optionally sort by "total" descending, since $group output is not ordered
    array( '$sort' => array( 'total' => -1 ) ),
    // Optionally $group to a single document response with an array
    array(
        '$group' => array(
            '_id' => NULL,
            'positions' => array(
                '$push' => array(
                    'position' => '$_id',
                    'total' => '$total',
                    'won' => '$won'
                )
            )
        )
    )
);

// Execute the pipeline; $collection is assumed to be a collection handle created elsewhere (not shown here)
$result = $collection->aggregate($pipeline);

When making data structures in PHP that you are comparing to JSON, it is often useful to check your structure with something like:

// Print the PHP structure as pretty-printed JSON so it can be compared with the shell example
echo json_encode($pipeline, JSON_PRETTY_PRINT);

Then you can see that what you are doing in PHP notation is the same as the JSON example you are following. It's a helpful tip so that you cannot really go wrong. If it looks different then you are not doing the "same" thing.

Collectives™ on Stack Overflow

MongoDB PHP Aggregating data + count + where

1 Answer 1

Aggregation Pipeline Formats

Walkthrough

Wrap-Up

PHP Translation

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Aggregation Pipeline Formats

Walkthrough

Wrap-Up

PHP Translation

Comments

Your Answer

Sign up or log in

Post as a guest

Related