MongoDB中如何通过MapReduce匹配数组元素对文档进行分组?
How to group documents by matching array elements with MapReduce in MongoDB?
我有一个数据库,其中有一列包含字符串数组。示例表:
name | words | ...
Ash | ["Apple", "Pear", "Plum"] | ...
Joe | ["Walnut", "Peanut"] | ...
Max | ["Pineapple", "Apple", "Plum"] | ...
现在我想将此 table 与给定的单词数组进行匹配,并按匹配率对文档进行分组。
具有预期结果的示例输入:
// matched for input = ["Walnut", "Peanut", "Apple"]
{
"1.00": [{name:"Joe", match:"1.00"}],
"0.33": [{name:"Ash", match:"0.33"}, {name:"Max", match:"0.33"}]
}
我正在使用以下 map 函数,以匹配率作为键来发出(emit)文档:
/**
 * MapReduce map step: emits the current document under its match-ratio key.
 *
 * Runs with `this` bound to the current document and reads `this.words` and
 * `this.name`. Neither `input` (the array of words to match against) nor
 * `emit` is defined here; both must be supplied by the surrounding scope.
 *
 * The key is (number of `input` words found in `this.words`) / `input.length`,
 * formatted with two decimals, e.g. "0.33".
 */
function map() {
  var matches = 0;
  // Index loop instead of for...in, so that only the array elements are
  // counted and never some other enumerable property attached to `input`.
  for (var i = 0; i < input.length; i++) {
    if (this.words.indexOf(input[i]) !== -1) {
      matches += 1;
    }
  }
  // toFixed already returns a string, so no extra string coercion is needed.
  var key = (matches / input.length).toFixed(2);
  emit(key, {name: this.name, match: key});
}
现在还缺少与之配套的 reduce 函数,用于将发出的键值对合并到结果对象中。
我试过这样的:
/**
 * Attempted reduce step: wraps the values received for `key` in an object
 * whose single property is that key.
 *
 * Limitation: the returned shape ({<key>: [...]}) differs from the values
 * emitted by map. If this result is handed back as one of the inputs of a
 * later reduce call for the same key, it ends up nested inside the new array.
 *
 * @param {string} key   Match ratio the values were emitted under.
 * @param {Array}  value Values received for `key`.
 * @returns {Object} Object with one property, named by `key`, holding `value`.
 */
function reduce(key, value) {
  var res = {};
  // The original read `values`, which is not defined; the parameter is `value`.
  res[key] = value;
  return res;
}
但是我对 reduce 的规范中的这一段有疑问:
MongoDB can invoke the reduce function more than once for the same
key. In this case, the previous output from the reduce function for
that key will become one of the input values to the next reduce
function invocation for that key.
...导致嵌套的结果对象。按匹配项对文档进行分组的正确方法是什么?
invoke the reduce function more than once for the same key.
这就是幂等性(idempotence),reduce 函数必须满足这一要求。
但是,为了简单起见,您只需确保 map 输出的格式与 reduce 输出的格式相同。
对于你的情况,类似这样的方法会起作用:
// Sample data: the three documents from the question's example table.
db.col.insert({"name": "Ash", "words": ["Apple", "Pear", "Plum"]})
db.col.insert({"name": "Joe", "words": ["Walnut", "Peanut"]})
db.col.insert({"name": "Max", "words": ["Pineapple", "Apple", "Plum"]})
/**
 * MapReduce map step: emits the current document under its match-ratio key.
 *
 * Runs with `this` bound to the current document (reads `this.name` and
 * `this.words`); `emit` must be provided by the calling environment.
 *
 * The value is emitted already wrapped as {users: [...]}, i.e. in the same
 * shape that reduce returns, so map output and reduce output can be mixed.
 */
function map() {
  // Declared locally: a bare `input = ...` would create an implicit global
  // (and is a ReferenceError in strict mode).
  var input = ["Walnut", "Peanut", "Apple"];
  // A document without a `words` array simply matches nothing instead of
  // throwing on `indexOf`.
  var words = Array.isArray(this.words) ? this.words : [];
  var matches = 0;
  // Index loop instead of for...in, so only the array elements are visited.
  for (var i = 0; i < input.length; i++) {
    if (words.indexOf(input[i]) !== -1) {
      matches += 1;
    }
  }
  // Two-decimal ratio string, e.g. "0.33"; used both as key and as `match`.
  var key = (matches / input.length).toFixed(2);
  emit(key, {users: [{name: this.name, match: key}]});
}
/**
 * MapReduce reduce step: merges every {users: [...]} value received for one
 * key into a single {users: [...]} object.
 *
 * The result has the same shape as the values emitted by map, so it can be
 * passed back in as an element of `value` when reduce is invoked again for
 * the same key.
 *
 * @param {string} key   Match ratio shared by all values (not used in the merge).
 * @param {Array<{users: Array}>} value Values received for `key`.
 * @returns {{users: Array}} All users from `value`, in their original order.
 */
function reduce(key, value) {
  // Accumulate into a fresh local array: the original assigned to an
  // undeclared `ret` (implicit global) and wrote the merge into value[0].
  var users = [];
  for (var i = 0; i < value.length; i++) {
    users = users.concat(value[i].users);
  }
  return {users: users};
}
// Run the job with inline output, so the returned document itself carries the
// grouped values (see `results` in the output below).
db.col.mapReduce(map, reduce, {"out": {inline:1}})
输出:
{
"results" : [
{
"_id" : "0.33",
"value" : {
"users" : [
{
"name" : "Ash",
"match" : "0.33"
},
{
"name" : "Max",
"match" : "0.33"
}
]
}
},
{
"_id" : "0.67",
"value" : {
"users" : [
{
"name" : "Joe",
"match" : "0.67"
}
]
}
}
],
"timeMillis" : 22,
"counts" : {
"input" : 3,
"emit" : 3,
"reduce" : 1,
"output" : 2
},
"ok" : 1
}
我有一个数据库,其中有一列包含字符串数组。示例表:
name | words | ...
Ash | ["Apple", "Pear", "Plum"] | ...
Joe | ["Walnut", "Peanut"] | ...
Max | ["Pineapple", "Apple", "Plum"] | ...
现在我想将此 table 与给定的单词数组进行匹配,并按匹配率对文档进行分组。
具有预期结果的示例输入:
// matched for input = ["Walnut", "Peanut", "Apple"]
{
"1.00": [{name:"Joe", match:"1.00"}],
"0.33": [{name:"Ash", match:"0.33"}, {name:"Max", match:"0.33"}]
}
我正在使用以下 map 函数,以匹配率作为键来发出(emit)文档:
/**
 * MapReduce map step: emits the current document under its match-ratio key.
 *
 * Runs with `this` bound to the current document and reads `this.words` and
 * `this.name`. Neither `input` (the array of words to match against) nor
 * `emit` is defined here; both must be supplied by the surrounding scope.
 *
 * The key is (number of `input` words found in `this.words`) / `input.length`,
 * formatted with two decimals, e.g. "0.33".
 */
function map() {
  var matches = 0;
  // Index loop instead of for...in, so that only the array elements are
  // counted and never some other enumerable property attached to `input`.
  for (var i = 0; i < input.length; i++) {
    if (this.words.indexOf(input[i]) !== -1) {
      matches += 1;
    }
  }
  // toFixed already returns a string, so no extra string coercion is needed.
  var key = (matches / input.length).toFixed(2);
  emit(key, {name: this.name, match: key});
}
现在还缺少与之配套的 reduce 函数,用于将发出的键值对合并到结果对象中。
我试过这样的:
/**
 * Attempted reduce step: wraps the values received for `key` in an object
 * whose single property is that key.
 *
 * Limitation: the returned shape ({<key>: [...]}) differs from the values
 * emitted by map. If this result is handed back as one of the inputs of a
 * later reduce call for the same key, it ends up nested inside the new array.
 *
 * @param {string} key   Match ratio the values were emitted under.
 * @param {Array}  value Values received for `key`.
 * @returns {Object} Object with one property, named by `key`, holding `value`.
 */
function reduce(key, value) {
  var res = {};
  // The original read `values`, which is not defined; the parameter is `value`.
  res[key] = value;
  return res;
}
但是我对 reduce 的规范中的这一段有疑问:
MongoDB can invoke the reduce function more than once for the same key. In this case, the previous output from the reduce function for that key will become one of the input values to the next reduce function invocation for that key.
...导致嵌套的结果对象。按匹配项对文档进行分组的正确方法是什么?
invoke the reduce function more than once for the same key.
这就是幂等性(idempotence),reduce 函数必须满足这一要求。
但是,为了简单起见,您只需确保 map 输出的格式与 reduce 输出的格式相同。
对于你的情况,类似这样的方法会起作用:
// Sample data: the three documents from the question's example table.
db.col.insert({"name": "Ash", "words": ["Apple", "Pear", "Plum"]})
db.col.insert({"name": "Joe", "words": ["Walnut", "Peanut"]})
db.col.insert({"name": "Max", "words": ["Pineapple", "Apple", "Plum"]})
/**
 * MapReduce map step: emits the current document under its match-ratio key.
 *
 * Runs with `this` bound to the current document (reads `this.name` and
 * `this.words`); `emit` must be provided by the calling environment.
 *
 * The value is emitted already wrapped as {users: [...]}, i.e. in the same
 * shape that reduce returns, so map output and reduce output can be mixed.
 */
function map() {
  // Declared locally: a bare `input = ...` would create an implicit global
  // (and is a ReferenceError in strict mode).
  var input = ["Walnut", "Peanut", "Apple"];
  // A document without a `words` array simply matches nothing instead of
  // throwing on `indexOf`.
  var words = Array.isArray(this.words) ? this.words : [];
  var matches = 0;
  // Index loop instead of for...in, so only the array elements are visited.
  for (var i = 0; i < input.length; i++) {
    if (words.indexOf(input[i]) !== -1) {
      matches += 1;
    }
  }
  // Two-decimal ratio string, e.g. "0.33"; used both as key and as `match`.
  var key = (matches / input.length).toFixed(2);
  emit(key, {users: [{name: this.name, match: key}]});
}
/**
 * MapReduce reduce step: merges every {users: [...]} value received for one
 * key into a single {users: [...]} object.
 *
 * The result has the same shape as the values emitted by map, so it can be
 * passed back in as an element of `value` when reduce is invoked again for
 * the same key.
 *
 * @param {string} key   Match ratio shared by all values (not used in the merge).
 * @param {Array<{users: Array}>} value Values received for `key`.
 * @returns {{users: Array}} All users from `value`, in their original order.
 */
function reduce(key, value) {
  // Accumulate into a fresh local array: the original assigned to an
  // undeclared `ret` (implicit global) and wrote the merge into value[0].
  var users = [];
  for (var i = 0; i < value.length; i++) {
    users = users.concat(value[i].users);
  }
  return {users: users};
}
// Run the job with inline output, so the returned document itself carries the
// grouped values (see `results` in the output below).
db.col.mapReduce(map, reduce, {"out": {inline:1}})
输出:
{
"results" : [
{
"_id" : "0.33",
"value" : {
"users" : [
{
"name" : "Ash",
"match" : "0.33"
},
{
"name" : "Max",
"match" : "0.33"
}
]
}
},
{
"_id" : "0.67",
"value" : {
"users" : [
{
"name" : "Joe",
"match" : "0.67"
}
]
}
}
],
"timeMillis" : 22,
"counts" : {
"input" : 3,
"emit" : 3,
"reduce" : 1,
"output" : 2
},
"ok" : 1
}