0

We are using Elasticsearch 7.9

Our index, called method_info_tree, contains two-level nested fields:

  • The top level describes a Java method,
  • the nested level below it describes a thread that ran the method, and
  • the third nested level within a thread describes the states of that thread over time.

Below is the mapping of the index in Elasticsearch:

{
  "mappings": {
    "properties": {
      "method_id" : {
        "type" : "long"
      },
      "threads": {
        "type": "nested",
        "properties": {
          "thread_id": {
            "type": "long"
          },
          "states_hit": {
            "type": "nested",
            "properties": {
              "collect_time": {
                "type": "date"
              },
              "state": {
                "type": "keyword"
              },
              "elapsed_time": {
                "type" : "integer"
              }
            }
          }

        }
      }
    }
  }
}

Here is a sample search response showing one document from the index:

{
  "took" : 13,
  "timed_out" : false,
  "_shards" : {
    "total" : 2,
    "successful" : 2,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 5198,
      "relation" : "eq"
    },
    "max_score" : 0.0,
    "hits" : [
      {
        "_index" : "method_info_tree-000001",
        "_type" : "_doc",
        "_id" : "WiHCCYQBhPdvF01n3kp1",
        "_score" : 0.0,
        "_routing" : "86163-d5c064d0-55a3-44b9-88fb-c44b7233cfa4",
        "_source" : {
          "timestamp" : 1666610993800,
          "method_id" : 140280075031760,

          "threads" : [
            {
              "thread_id" : 1,

              "states_hit" : [
                {
                  "state" : "RUNNABLE",
                  "collect_time" : 1666610994750,
                  "elapsed_time" : 50
                },
                {
                  "state" : "IO",
                  "collect_time" : 1666610994800,
                  "elapsed_time" : 50
                },
                {
                  "state" : "IO",
                  "collect_time" : 1666610994850,
                  "elapsed_time" : 50
                },
                {
                  "state" : "IO",
                  "collect_time" : 1666610994900,
                  "elapsed_time" : 50
                },
                {
                  "state" : "IO",
                  "collect_time" : 1666610994950,
                  "elapsed_time" : 50
                },
                {
                  "state" : "IO",
                  "collect_time" : 1666610995000,
                  "elapsed_time" : 50
                },
                {
                  "state" : "IO",
                  "collect_time" : 1666610995050,
                  "elapsed_time" : 50
                },
                {
                  "state" : "IO",
                  "collect_time" : 1666610995100,
                  "elapsed_time" : 50
                },
                {
                  "state" : "IO",
                  "collect_time" : 1666610995150,
                  "elapsed_time" : 50
                }
              ]

            }
          ]
        }
      }
    ]
  }
}

Note that for each method_id we have many documents for various thread_ids.

I would like, for each method_id, to calculate the sum of the elapsed_time field per state (across all threads), something like:

method_id -> 
    [
      {
        "state" : "IO",
        "elapsed_time" : 566622.0
      },
      {
        "state" : "RUNNABLE",
        "elapsed_time" : 566572.0
      },
      {
        "state" : "BLOCKED",
        "elapsed_time" : 50.0
      }
    ]

Below is my Elasticsearch query:

GET method_info_tree/_search
{
  "from": 0,
  "size": 0,
  "track_total_hits": true,
  "query": {
    "bool": {
      "filter": [
        {
          "term": {
            "session_id": "86163-d5c064d0-55a3-44b9-88fb-c44b7233cfa4"
          }
        },
        {
          "nested": {
            "path": "threads.states_hit",
            "query": {
              "bool": {
                "filter": [
                  {
                    "range": {
                      "threads.states_hit.collect_time": {
                        "gte": 0,
                        "lte": 2000000000000
                      }
                    }
                  }
                ]
              }
            }
          }
        }
      ]
    }
  },
  "aggs": {
    "top_methods_agg": {
      "terms": {
        "field": "method_id",
        "size": 20
      },
      "aggs": {
        "elapsed_time_agg": {
          "nested": {
            "path": "threads.states_hit"
          },
          "aggs": {
            "states_range": {
              "range": {
                "field": "threads.states_hit.collect_time",
                "ranges": [
                  {
                    "from": 0,
                    "to": 2000000000001
                  }
                ]
              },
              "aggs": {
                "elapsed_time_per_state_agg": {
                  "terms": {
                    "field": "threads.states_hit.state",
                    "size": 10
                  },
                  "aggs": {
                    "elapsed_time": {
                      "sum": {
                        "field": "threads.states_hit.elapsed_time"
                      }
                    }
                  }
                },
                "total_self_elapsed_time": {
                  "sum": {
                    "field": "threads.states_hit.elapsed_time"
                  }
                },
                "wasted_elapsed_time": {
                  "filter": {
                    "terms": {
                      "threads.states_hit.state": [
                        "BLOCKED",
                        "IO"
                      ]
                    }
                  },
                  "aggs": {
                    "total_wasted": {
                      "sum": {
                        "field": "threads.states_hit.elapsed_time"
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}

A sample result would be:

{
  "took" : 218,
  "timed_out" : false,
  "_shards" : {
    "total" : 2,
    "successful" : 2,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 5727,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "top_methods_agg" : {
      "doc_count_error_upper_bound" : 1,
      "sum_other_doc_count" : 73,
      "buckets" : [
        {
          "key" : 140280074341584,
          "doc_count" : 728,
          "elapsed_time_agg" : {
            "doc_count" : 21838,
            "states_range" : {
              "buckets" : [
                {
                  "key" : "1970-01-01T00:00:00.000Z-2033-05-18T03:33:20.001Z",
                  "from" : 0.0,
                  "from_as_string" : "1970-01-01T00:00:00.000Z",
                  "to" : 2.000000000001E12,
                  "to_as_string" : "2033-05-18T03:33:20.001Z",
                  "doc_count" : 21838,
                  "total_self_elapsed_time" : {
                    "value" : 1133244.0
                  },
                  "wasted_elapsed_time" : {
                    "doc_count" : 1,
                    "total_wasted" : {
                      "value" : 50.0
                    }
                  },
                  "elapsed_time_per_state_agg" : {
                    "doc_count_error_upper_bound" : 0,
                    "sum_other_doc_count" : 0,
                    "buckets" : [
                      {
                        "key" : "IO",
                        "doc_count" : 10919,
                        "elapsed_time" : {
                          "value" : 566622.0
                        }
                      },
                      {
                        "key" : "RUNNABLE",
                        "doc_count" : 10918,
                        "elapsed_time" : {
                          "value" : 566572.0
                        }
                      },
                      {
                        "key" : "BLOCKED",
                        "doc_count" : 1,
                        "elapsed_time" : {
                          "value" : 50.0
                        }
                      }
                    ]
                  }
                }
              ]
            }
          }
        },
        {
          "key" : 140282650318928,
          "doc_count" : 3,
          "elapsed_time_agg" : {
            "doc_count" : 3,
            "states_range" : {
              "buckets" : [
                {
                  "key" : "1970-01-01T00:00:00.000Z-2033-05-18T03:33:20.001Z",
                  "from" : 0.0,
                  "from_as_string" : "1970-01-01T00:00:00.000Z",
                  "to" : 2.000000000001E12,
                  "to_as_string" : "2033-05-18T03:33:20.001Z",
                  "doc_count" : 3,
                  "total_self_elapsed_time" : {
                    "value" : 150.0
                  },
                  "wasted_elapsed_time" : {
                    "doc_count" : 0,
                    "total_wasted" : {
                      "value" : 0.0
                    }
                  },
                  "elapsed_time_per_state_agg" : {
                    "doc_count_error_upper_bound" : 0,
                    "sum_other_doc_count" : 0,
                    "buckets" : [
                      {
                        "key" : "RUNNABLE",
                        "doc_count" : 3,
                        "elapsed_time" : {
                          "value" : 150.0
                        }
                      }
                    ]
                  }
                }
              ]
            }
          }
        }
      ]
    }
  }
}

Note that I deleted some of the result buckets for better coherence of the example.

My problem: I need to sort the method_id buckets by 'total_self_elapsed_time' and return only the top 5 of them. Because 'total_self_elapsed_time' is computed inside nested sub-aggregations, I do not know how to reference it from the top-level terms aggregation. How can I add sorting by this aggregated field to my query?

2
  • You have used "states_range": { "range": { "field": "threads.states_hit.collect_time", "ranges": [ { "from": 0, "to": 2000000000001 } ] } }. Do you intend to pass multiple ranges or only a single range? Commented Oct 25, 2022 at 11:51
  • I intend to pass a single range, I just couldn't find how to use one range only. In fact I tried using 'filter' yet failed. The examples given in the documentation demonstrated ranges. See elastic.co/guide/en/elasticsearch/reference/7.9/… Commented Oct 25, 2022 at 12:11

2 Answers 2

2

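I have replaced the states_range range aggregation with a filter aggregation. A range aggregation generates one bucket for each range specified, so it is a multi-bucket aggregation, and a terms aggregation cannot be ordered by a path that passes through a multi-bucket sub-aggregation. A filter aggregation produces a single bucket, so the path through it can be used for ordering.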

To sort, I have used the "order" parameter of the terms aggregation.

Query

  "aggs": {
    "top_methods_agg": {
      "terms": {
        "field": "method_id",
        "size": 20,
        "order": {
          "elapsed_time_agg>states_range>total_self_elapsed_time": "asc"
        }
      },
      "aggs": {
        "elapsed_time_agg": {
          "nested": {
            "path": "threads.states_hit"
          },
          "aggs": {
            "states_range": {
              "filter": {
                "range": {
                  "threads.states_hit.collect_time": {
                    "gte": 0,
                    "lte": 2000000000000
                  }
                }
              },
              "aggs": {
                "elapsed_time_per_state_agg": {
                  "terms": {
                    "field": "threads.states_hit.state",
                    "size": 10
                  },
                  "aggs": {
                    "elapsed_time": {
                      "sum": {
                        "field": "threads.states_hit.elapsed_time"
                      }
                    }
                  }
                },
                "total_self_elapsed_time": {
                  "sum": {
                    "field": "threads.states_hit.elapsed_time"
                  }
                },
                "wasted_elapsed_time": {
                  "filter": {
                    "terms": {
                      "threads.states_hit.state": [
                        "BLOCKED",
                        "IO"
                      ]
                    }
                  },
                  "aggs": {
                    "total_wasted": {
                      "sum": {
                        "field": "threads.states_hit.elapsed_time"
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }

Try the query above and see if it works for you.

Sign up to request clarification or add additional context in comments.

1 Comment

Your sort indeed works, yet I need to sort the result and take only the top 5 results. So I replaced your 'order' parameter with a 'bucket_sort' aggregation as a sibling of the 'elapsed_time_agg' aggregation, and there I limited the final result to 5: "top_methods_sort": { "bucket_sort": { "sort": [ { "elapsed_time_agg>states_range>total_self_elapsed_time": { "order": "desc" } } ], "size": 5 } }
0

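For completeness' sake, here's the query that works (note that bucket_sort only reorders the buckets already returned by the terms aggregation, so the terms "size" must be large enough to include every candidate method):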

  "aggs": {
    "top_methods_agg": {
      "terms": {
        "field": "method_id",
        "size": 20
      },
      "aggs": {
        "elapsed_time_agg": {
          "nested": {
            "path": "threads.states_hit"
          },
          "aggs": {
            "states_range": {
              "filter": {
                "range": {
                  "threads.states_hit.collect_time": {
                    "gte": 0,
                    "lte": 2000000000000
                  }
                }
              },
              "aggs": {
                "elapsed_time_per_state_agg": {
                  "terms": {
                    "field": "threads.states_hit.state",
                    "size": 10
                  },
                  "aggs": {
                    "elapsed_time": {
                      "sum": {
                        "field": "threads.states_hit.elapsed_time"
                      }
                    }
                  }
                },
                "total_self_elapsed_time": {
                  "sum": {
                    "field": "threads.states_hit.elapsed_time"
                  }
                },
                "wasted_elapsed_time": {
                  "filter": {
                    "terms": {
                      "threads.states_hit.state": [
                        "BLOCKED",
                        "IO"
                      ]
                    }
                  },
                  "aggs": {
                    "total_wasted": {
                      "sum": {
                        "field": "threads.states_hit.elapsed_time"
                      }
                    }
                  }
                }
              }
            }
          }
        },
        "top_methods_sort": {
          "bucket_sort": {
            "sort": [
              {
                "elapsed_time_agg>states_range>total_self_elapsed_time": {
                  "order": "desc"
                }
              }
            ],
            "size": 5
          }
        }
      }
    }
  }

Many thanks!

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.