聚合分析
聚合(Aggregations)是 Elasticsearch 强大的数据分析功能,可以对数据进行统计、分组、计算等操作。
聚合概述
聚合类型
┌─────────────────────────────────────────────────────────────┐
│ 聚合类型分类 │
├─────────────────────────────────────────────────────────────┤
│ Bucket 聚合 - 分桶聚合,将文档分组到桶中 │
│ Metric 聚合 - 指标聚合,计算数值指标 │
│ Pipeline 聚合 - 管道聚合,基于其他聚合结果计算 │
│ Matrix 聚合 - 矩阵聚合,对多个字段进行操作 │
└─────────────────────────────────────────────────────────────┘
基本语法
GET /articles/_search
{
"size": 0, # 不返回文档,只返回聚合结果
"aggs": {
"聚合名称": {
"聚合类型": {
"field": "字段名"
}
}
}
}
Bucket 聚合
terms 聚合
按字段值分组统计:
GET /articles/_search
{
"size": 0,
"aggs": {
"by_category": {
"terms": {
"field": "category.keyword",
"size": 10, # 返回前 10 个桶
"order": {
"_count": "desc" # 按文档数量排序
}
}
}
}
}
响应示例:
{
"aggregations": {
"by_category": {
"buckets": [
{ "key": "Python", "doc_count": 150 },
{ "key": "Java", "doc_count": 120 },
{ "key": "Go", "doc_count": 80 }
]
}
}
}
解释:
- terms 聚合按字段的唯一值创建桶
- 每个桶包含该值对应的文档数量
- 适合用于分类统计、标签云等场景
range 聚合
按数值范围分组:
GET /articles/_search
{
"size": 0,
"aggs": {
"view_ranges": {
"range": {
"field": "views",
"ranges": [
{ "to": 100, "key": "低" },
{ "from": 100, "to": 1000, "key": "中" },
{ "from": 1000, "key": "高" }
]
}
}
}
}
date_histogram 聚合
按时间间隔分组:
GET /articles/_search
{
"size": 0,
"aggs": {
"articles_over_time": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "month", # 按月分组
"format": "yyyy-MM",
"min_doc_count": 0 # 没有数据的月份也返回
}
}
}
}
时间间隔选项:
| 参数 | 说明 |
|---|---|
| calendar_interval | 日历间隔:minute、hour、day、week、month、quarter、year |
| fixed_interval | 固定间隔:如 30d、12h |
histogram 聚合
按数值间隔分组:
GET /articles/_search
{
"size": 0,
"aggs": {
"view_histogram": {
"histogram": {
"field": "views",
"interval": 500,
"min_doc_count": 1
}
}
}
}
filter 聚合
过滤后聚合:
GET /articles/_search
{
"size": 0,
"aggs": {
"published_articles": {
"filter": {
"term": { "status": "published" }
},
"aggs": {
"avg_views": {
"avg": { "field": "views" }
}
}
}
}
}
filters 聚合
多过滤器分组:
GET /articles/_search
{
"size": 0,
"aggs": {
"articles_by_status": {
"filters": {
"filters": {
"published": { "term": { "status": "published" } },
"draft": { "term": { "status": "draft" } }
}
}
}
}
}
Metric 聚合
基本统计
GET /articles/_search
{
"size": 0,
"aggs": {
"avg_views": { "avg": { "field": "views" } },
"max_views": { "max": { "field": "views" } },
"min_views": { "min": { "field": "views" } },
"sum_views": { "sum": { "field": "views" } },
"count_views": { "value_count": { "field": "views" } }
}
}
stats 聚合
一次性返回多个统计值:
GET /articles/_search
{
"size": 0,
"aggs": {
"views_stats": {
"stats": { "field": "views" }
}
}
}
# 响应
{
"aggregations": {
"views_stats": {
"count": 1000,
"min": 0,
"max": 10000,
"avg": 1500.5,
"sum": 1500500
}
}
}
extended_stats 聚合
扩展统计,包含方差、标准差等:
GET /articles/_search
{
"size": 0,
"aggs": {
"views_stats": {
"extended_stats": { "field": "views" }
}
}
}
cardinality 聚合
统计唯一值数量(近似去重):
GET /articles/_search
{
"size": 0,
"aggs": {
"unique_authors": {
"cardinality": {
"field": "author.keyword",
"precision_threshold": 1000 # 精度阈值
}
}
}
}
percentiles 聚合
计算百分位数:
GET /articles/_search
{
"size": 0,
"aggs": {
"views_percentiles": {
"percentiles": {
"field": "views",
"percents": [50, 75, 90, 95, 99]
}
}
}
}
top_hits 聚合
获取每个桶中的文档:
GET /articles/_search
{
"size": 0,
"aggs": {
"by_category": {
"terms": { "field": "category.keyword" },
"aggs": {
"top_articles": {
"top_hits": {
"size": 3,
"sort": [{ "views": "desc" }],
"_source": ["title", "author", "views"]
}
}
}
}
}
}
嵌套聚合
多层分组
GET /articles/_search
{
"size": 0,
"aggs": {
"by_category": {
"terms": { "field": "category.keyword" },
"aggs": {
"by_author": {
"terms": { "field": "author.keyword", "size": 5 },
"aggs": {
"avg_views": {
"avg": { "field": "views" }
}
}
}
}
}
}
}
解释:先按分类分组,再在每个分类下按作者分组(每个分类最多返回 5 位作者),最后计算每位作者在该分类下文章的平均浏览量。
桶排序
GET /articles/_search
{
"size": 0,
"aggs": {
"by_category": {
"terms": {
"field": "category.keyword",
"order": {
"avg_views": "desc" # 按子聚合结果排序
}
},
"aggs": {
"avg_views": {
"avg": { "field": "views" }
}
}
}
}
}
Pipeline 聚合
derivative 聚合(导数)
计算相邻桶的差值:
GET /articles/_search
{
"size": 0,
"aggs": {
"articles_over_time": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "day"
},
"aggs": {
"daily_count": {
"value_count": { "field": "_id" }
},
"derivative": {
"derivative": {
"buckets_path": "daily_count"
}
}
}
}
}
}
cumulative_sum 聚合(累计和)
GET /articles/_search
{
"size": 0,
"aggs": {
"articles_over_time": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "day"
},
"aggs": {
"daily_count": {
"value_count": { "field": "_id" }
},
"cumulative": {
"cumulative_sum": {
"buckets_path": "daily_count"
}
}
}
}
}
}
moving_fn 聚合(移动平均,取代已弃用的 moving_avg)
GET /articles/_search
{
"size": 0,
"aggs": {
"articles_over_time": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "day"
},
"aggs": {
"daily_views": {
"sum": { "field": "views" }
},
"moving_avg": {
"moving_fn": {
"buckets_path": "daily_views",
"window": 7,
"script": "MovingFunctions.avg(values)"
}
}
}
}
}
}
综合示例
文章统计分析
GET /articles/_search
{
"size": 0,
"query": {
"bool": {
"filter": [
{ "term": { "status": "published" } }
]
}
},
"aggs": {
"by_category": {
"terms": { "field": "category.keyword" },
"aggs": {
"total_views": { "sum": { "field": "views" } },
"avg_views": { "avg": { "field": "views" } },
"top_articles": {
"top_hits": {
"size": 3,
"sort": [{ "views": "desc" }],
"_source": ["title", "views"]
}
}
}
},
"views_over_time": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "month"
},
"aggs": {
"monthly_views": { "sum": { "field": "views" } }
}
},
"popular_tags": {
"terms": {
"field": "tags.keyword",
"size": 10
}
}
}
}
小结
本章我们学习了:
- 聚合概述和基本语法
- Bucket 聚合(terms、range、date_histogram、histogram、filter、filters)
- Metric 聚合(avg、sum、max、min、stats、extended_stats、cardinality、percentiles、top_hits)
- 嵌套聚合
- Pipeline 聚合
练习
- 统计每个分类的文章数量和平均浏览量
- 按时间统计每月新增文章数量
- 找出每个分类下浏览量最高的 5 篇文章
- 计算浏览量的百分位数分布