跳到主要内容

聚合分析

聚合(Aggregations)是 Elasticsearch 强大的数据分析功能,可以对数据进行统计、分组、计算等操作。

聚合概述

聚合类型

┌─────────────────────────────────────────────────────────────┐
│ 聚合类型分类 │
├─────────────────────────────────────────────────────────────┤
│ Bucket 聚合 - 分桶聚合,将文档分组到桶中 │
│ Metric 聚合 - 指标聚合,计算数值指标 │
│ Pipeline 聚合 - 管道聚合,基于其他聚合结果计算 │
│ Matrix 聚合 - 矩阵聚合,对多个字段进行操作 │
└─────────────────────────────────────────────────────────────┘

基本语法

GET /articles/_search
{
"size": 0, # 不返回文档,只返回聚合结果
"aggs": {
"聚合名称": {
"聚合类型": {
"field": "字段名"
}
}
}
}

Bucket 聚合

terms 聚合

按字段值分组统计:

GET /articles/_search
{
"size": 0,
"aggs": {
"by_category": {
"terms": {
"field": "category.keyword",
"size": 10, # 返回前 10 个桶
"order": {
"_count": "desc" # 按文档数量排序
}
}
}
}
}

响应示例

{
"aggregations": {
"by_category": {
"buckets": [
{ "key": "Python", "doc_count": 150 },
{ "key": "Java", "doc_count": 120 },
{ "key": "Go", "doc_count": 80 }
]
}
}
}

解释

  • terms 聚合按字段的唯一值创建桶
  • 每个桶包含该值对应的文档数量
  • 适合用于分类统计、标签云等场景

range 聚合

按数值范围分组:

GET /articles/_search
{
"size": 0,
"aggs": {
"view_ranges": {
"range": {
"field": "views",
"ranges": [
{ "to": 100, "key": "低" },
{ "from": 100, "to": 1000, "key": "中" },
{ "from": 1000, "key": "高" }
]
}
}
}
}

date_histogram 聚合

按时间间隔分组:

GET /articles/_search
{
"size": 0,
"aggs": {
"articles_over_time": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "month", # 按月分组
"format": "yyyy-MM",
"min_doc_count": 0 # 没有数据的月份也返回
}
}
}
}

时间间隔选项

参数 | 说明
calendar_interval | 日历间隔:minute、hour、day、week、month、quarter、year
fixed_interval | 固定间隔:如 30d、12h

histogram 聚合

按数值间隔分组:

GET /articles/_search
{
"size": 0,
"aggs": {
"view_histogram": {
"histogram": {
"field": "views",
"interval": 500,
"min_doc_count": 1
}
}
}
}

filter 聚合

过滤后聚合:

GET /articles/_search
{
"size": 0,
"aggs": {
"published_articles": {
"filter": {
"term": { "status": "published" }
},
"aggs": {
"avg_views": {
"avg": { "field": "views" }
}
}
}
}
}

filters 聚合

多过滤器分组:

GET /articles/_search
{
"size": 0,
"aggs": {
"articles_by_status": {
"filters": {
"filters": {
"published": { "term": { "status": "published" } },
"draft": { "term": { "status": "draft" } }
}
}
}
}
}

Metric 聚合

基本统计

GET /articles/_search
{
"size": 0,
"aggs": {
"avg_views": { "avg": { "field": "views" } },
"max_views": { "max": { "field": "views" } },
"min_views": { "min": { "field": "views" } },
"sum_views": { "sum": { "field": "views" } },
"count_views": { "value_count": { "field": "views" } }
}
}

stats 聚合

一次性返回多个统计值:

GET /articles/_search
{
"size": 0,
"aggs": {
"views_stats": {
"stats": { "field": "views" }
}
}
}

# 响应
{
"aggregations": {
"views_stats": {
"count": 1000,
"min": 0,
"max": 10000,
"avg": 1500.5,
"sum": 1500500
}
}
}

extended_stats 聚合

扩展统计,包含方差、标准差等:

GET /articles/_search
{
"size": 0,
"aggs": {
"views_stats": {
"extended_stats": { "field": "views" }
}
}
}

cardinality 聚合

统计唯一值数量(近似去重):

GET /articles/_search
{
"size": 0,
"aggs": {
"unique_authors": {
"cardinality": {
"field": "author.keyword",
"precision_threshold": 1000 # 精度阈值
}
}
}
}

percentiles 聚合

计算百分位数:

GET /articles/_search
{
"size": 0,
"aggs": {
"views_percentiles": {
"percentiles": {
"field": "views",
"percents": [50, 75, 90, 95, 99]
}
}
}
}

top_hits 聚合

获取每个桶中按指定排序最靠前的若干文档:

GET /articles/_search
{
"size": 0,
"aggs": {
"by_category": {
"terms": { "field": "category.keyword" },
"aggs": {
"top_articles": {
"top_hits": {
"size": 3,
"sort": [{ "views": "desc" }],
"_source": ["title", "author", "views"]
}
}
}
}
}
}

嵌套聚合

多层分组

GET /articles/_search
{
"size": 0,
"aggs": {
"by_category": {
"terms": { "field": "category.keyword" },
"aggs": {
"by_author": {
"terms": { "field": "author.keyword", "size": 5 },
"aggs": {
"avg_views": {
"avg": { "field": "views" }
}
}
}
}
}
}
}

解释:先按分类分组,再在每个分类下按作者分组,最后计算每个作者的平均浏览量。

桶排序

GET /articles/_search
{
"size": 0,
"aggs": {
"by_category": {
"terms": {
"field": "category.keyword",
"order": {
"avg_views": "desc" # 按子聚合结果排序
}
},
"aggs": {
"avg_views": {
"avg": { "field": "views" }
}
}
}
}
}

Pipeline 聚合

derivative 聚合(导数)

计算相邻桶的差值(buckets_path 也可以设为 "_count",直接使用每个桶的文档数,无需额外的 value_count 子聚合):

GET /articles/_search
{
"size": 0,
"aggs": {
"articles_over_time": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "day"
},
"aggs": {
"daily_count": {
"value_count": { "field": "_id" }
},
"derivative": {
"derivative": {
"buckets_path": "daily_count"
}
}
}
}
}
}

cumulative_sum 聚合(累计和)

GET /articles/_search
{
"size": 0,
"aggs": {
"articles_over_time": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "day"
},
"aggs": {
"daily_count": {
"value_count": { "field": "_id" }
},
"cumulative": {
"cumulative_sum": {
"buckets_path": "daily_count"
}
}
}
}
}
}

moving_fn 聚合(移动平均,替代已废弃的 moving_avg)

GET /articles/_search
{
"size": 0,
"aggs": {
"articles_over_time": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "day"
},
"aggs": {
"daily_views": {
"sum": { "field": "views" }
},
"moving_avg": {
"moving_fn": {
"buckets_path": "daily_views",
"window": 7,
"script": "MovingFunctions.avg(values)"
}
}
}
}
}
}

综合示例

文章统计分析

GET /articles/_search
{
"size": 0,
"query": {
"bool": {
"filter": [
{ "term": { "status": "published" } }
]
}
},
"aggs": {
"by_category": {
"terms": { "field": "category.keyword" },
"aggs": {
"total_views": { "sum": { "field": "views" } },
"avg_views": { "avg": { "field": "views" } },
"top_articles": {
"top_hits": {
"size": 3,
"sort": [{ "views": "desc" }],
"_source": ["title", "views"]
}
}
}
},
"views_over_time": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "month"
},
"aggs": {
"monthly_views": { "sum": { "field": "views" } }
}
},
"popular_tags": {
"terms": {
"field": "tags.keyword",
"size": 10
}
}
}
}

小结

本章我们学习了:

  1. 聚合概述和基本语法
  2. Bucket 聚合(terms、range、date_histogram、histogram、filter、filters)
  3. Metric 聚合(avg、sum、max、min、stats、extended_stats、cardinality、percentiles、top_hits)
  4. 嵌套聚合
  5. Pipeline 聚合

练习

  1. 统计每个分类的文章数量和平均浏览量
  2. 按时间统计每月新增文章数量
  3. 找出每个分类下浏览量最高的 5 篇文章
  4. 计算浏览量的百分位数分布

参考资源