elasticsearch 将数据转换为数组
elasticsearch transform data to array
我想使用 ES 来计算用户留存率:
- 1、事件日志到默认索引
- 2、转为中间索引:以entity为中心的数据,group by acc
- 3、使用aggs过滤器(或adjacency_matrix)计算每一天的相交结果。
问题出在第 2 步:如何编写一个合适的 transform(转换)来生成中间索引?
输入事件日志:
POST _bulk
{"index": {"_index": "test.u1"}}
{"acc":1001, "event":"create", "timestamp":"2020-08-01 09:00"}
{"index": {"_index": "test.u1"}}
{"acc":1001, "event":"login", "timestamp":"2020-08-01 10:00"}
{"index": {"_index": "test.u1"}}
{"acc":1001, "event":"login", "timestamp":"2020-08-02 10:00"}
{"index": {"_index": "test.u1"}}
{"acc":1001, "event":"login", "timestamp":"2020-08-03 10:00"}
{"index": {"_index": "test.u1"}}
{"acc":1002, "event":"create", "timestamp":"2020-08-01 10:00"}
{"index": {"_index": "test.u1"}}
{"acc":1002, "event":"login", "timestamp":"2020-08-02 10:00"}
{"index": {"_index": "test.u1"}}
{"acc":1002, "event":"login", "timestamp":"2020-08-02 11:00"}
{"index": {"_index": "test.u1"}}
{"acc":1003, "event":"create", "timestamp":"2020-08-01 10:00"}
{"index": {"_index": "test.u1"}}
{"acc":1004, "event":"create", "timestamp":"2020-08-02 10:00"}
{"index": {"_index": "test.u1"}}
{"acc":1004, "event":"login", "timestamp":"2020-08-02 10:00"}
{"index": {"_index": "test.u1"}}
{"acc":1004, "event":"login", "timestamp":"2020-08-03 10:00"}
期望的中间索引:
{"acc":1001, "create":"08-01", "login":["08-01", "08-02", "08-03"]}
{"acc":1002, "create":"08-01", "login":["08-02"]}
{"acc":1003, "create":"08-01", "login":[]}
{"acc":1004, "create":"08-02", "login":["08-02", "08-03"]}
如何生成 "login" 数组?
或者任何更好的设计都欢迎。
可以通过 aggs.scripted_metric 实现:
PUT _transform/tr-acc2-ar2
{
  "source": {
    "index": [
      "mhlog2-*"
    ]
  },
  "pivot": {
    "group_by": {
      "msg.#account_id": {
        "terms": {
          "field": "msg.#account_id"
        }
      }
    },
    "aggregations": {
      "create": {
        "filter": {
          "term": {
            "msg.#event_name.keyword": "createRole"
          }
        },
        "aggs": {
          "time": {
            "min": {
              "field": "@timestamp"
            }
          }
        }
      },
      "login": {
        "filter": {
          "term": {
            "msg.#event_name.keyword": "login"
          }
        },
        "aggs": {
          "days": {
            "scripted_metric": {
              "init_script": "state.days = [:];",
              "map_script": "if (doc['@timestamp'].size() != 0) { state.days[doc['@timestamp'].value.format(DateTimeFormatter.ISO_LOCAL_DATE)] = 1; }",
              "combine_script": "return state;",
              "reduce_script": "def days = new TreeSet(); for (s in states) { if (s != null && s.days != null) { days.addAll(s.days.keySet()); } } return new ArrayList(days);"
            }
          }
        }
      }
    }
  },
  "dest": {
    "index": "idx.tr.acc2.ar2"
  },
  "sync": {
    "time": {
      "field": "@timestamp",
      "delay": "60s"
    }
  }
}
生成的中间索引:
_id : AAAAAAAA
_index : acc.array
_score : 0
_type : _doc
create.time : Aug 18, 2020 @ 11:17:43.000
login.days : 2020-08-18T00:00:00.000Z, 2020-08-19T00:00:00.000Z, 2020-08-20T00:00:00.000Z
msg.#account_id : 12333212323
最后,“2020-08-18 创建的用户在 2020-08-19 的留存”可以很容易地通过 KQL 过滤器查出:
create.time: 2020-08-18 AND login.days: 2020-08-19
我想使用 ES 来计算用户留存率:
- 1、事件日志到默认索引
- 2、转为中间索引:以entity为中心的数据,group by acc
- 3、使用aggs过滤器(或adjacency_matrix)计算每一天的相交结果。
问题出在第 2 步:如何编写一个合适的 transform(转换)来生成中间索引?
输入事件日志:
POST _bulk
{"index": {"_index": "test.u1"}}
{"acc":1001, "event":"create", "timestamp":"2020-08-01 09:00"}
{"index": {"_index": "test.u1"}}
{"acc":1001, "event":"login", "timestamp":"2020-08-01 10:00"}
{"index": {"_index": "test.u1"}}
{"acc":1001, "event":"login", "timestamp":"2020-08-02 10:00"}
{"index": {"_index": "test.u1"}}
{"acc":1001, "event":"login", "timestamp":"2020-08-03 10:00"}
{"index": {"_index": "test.u1"}}
{"acc":1002, "event":"create", "timestamp":"2020-08-01 10:00"}
{"index": {"_index": "test.u1"}}
{"acc":1002, "event":"login", "timestamp":"2020-08-02 10:00"}
{"index": {"_index": "test.u1"}}
{"acc":1002, "event":"login", "timestamp":"2020-08-02 11:00"}
{"index": {"_index": "test.u1"}}
{"acc":1003, "event":"create", "timestamp":"2020-08-01 10:00"}
{"index": {"_index": "test.u1"}}
{"acc":1004, "event":"create", "timestamp":"2020-08-02 10:00"}
{"index": {"_index": "test.u1"}}
{"acc":1004, "event":"login", "timestamp":"2020-08-02 10:00"}
{"index": {"_index": "test.u1"}}
{"acc":1004, "event":"login", "timestamp":"2020-08-03 10:00"}
期望的中间索引:
{"acc":1001, "create":"08-01", "login":["08-01", "08-02", "08-03"]}
{"acc":1002, "create":"08-01", "login":["08-02"]}
{"acc":1003, "create":"08-01", "login":[]}
{"acc":1004, "create":"08-02", "login":["08-02", "08-03"]}
如何生成 "login" 数组? 或者任何更好的设计都欢迎。
可以通过 aggs.scripted_metric 实现:
PUT _transform/tr-acc2-ar2
{
  "source": {
    "index": [
      "mhlog2-*"
    ]
  },
  "pivot": {
    "group_by": {
      "msg.#account_id": {
        "terms": {
          "field": "msg.#account_id"
        }
      }
    },
    "aggregations": {
      "create": {
        "filter": {
          "term": {
            "msg.#event_name.keyword": "createRole"
          }
        },
        "aggs": {
          "time": {
            "min": {
              "field": "@timestamp"
            }
          }
        }
      },
      "login": {
        "filter": {
          "term": {
            "msg.#event_name.keyword": "login"
          }
        },
        "aggs": {
          "days": {
            "scripted_metric": {
              "init_script": "state.days = [:];",
              "map_script": "if (doc['@timestamp'].size() != 0) { state.days[doc['@timestamp'].value.format(DateTimeFormatter.ISO_LOCAL_DATE)] = 1; }",
              "combine_script": "return state;",
              "reduce_script": "def days = new TreeSet(); for (s in states) { if (s != null && s.days != null) { days.addAll(s.days.keySet()); } } return new ArrayList(days);"
            }
          }
        }
      }
    }
  },
  "dest": {
    "index": "idx.tr.acc2.ar2"
  },
  "sync": {
    "time": {
      "field": "@timestamp",
      "delay": "60s"
    }
  }
}
生成的中间索引:
_id : AAAAAAAA
_index : acc.array
_score : 0
_type : _doc
create.time : Aug 18, 2020 @ 11:17:43.000
login.days : 2020-08-18T00:00:00.000Z, 2020-08-19T00:00:00.000Z, 2020-08-20T00:00:00.000Z
msg.#account_id : 12333212323
最后,“2020-08-18 创建的用户在 2020-08-19 的留存”可以很容易地通过 KQL 过滤器查出:
create.time: 2020-08-18 AND login.days: 2020-08-19