确定整个数据的结构
Determine structure of the entire data
比如说,你有以下数据:
# Example data: every combination of 2 continents x 3 countries x 4 states
# (24 rows). The columns are deliberately NOT ordered by granularity.
# Rows vary fastest by state, then by country, then by continent.
data <- tibble::tibble(
  Countries = rep(
    rep(c("Country 1", "Country 2", "Country 3"), each = 4L),
    times = 2L
  ),
  States = rep(1:4, times = 6L),
  Continents = rep(c("continent 1", "continent 2"), each = 12L)
)
这类数据可能包含许多格式各异、粒度级别不同的变量。我想了解数据的结构,从而可以得出这样的结论:在上述数据中,最高级别是具有 2 个不同值的大洲(Continents),下一级粒度是具有 3 个不同值的国家(Countries),最低级别是具有 4 个不同值的州(States)。
理解这一点的一种粗略方法是:把不同值数量最少的变量(即大洲)放在数据集的左侧,把不同值数量最多的变量(即州)放在数据集的右侧。
让我们更容易理解杂乱数据的另一种方法是创建某种树形图:在顶部显示粒度最粗的数据(此处为大洲),在底部显示粒度最细的数据(此处为州),作为叶子/节点(leaves/nodes)。
作为第一步,我们可以采用一些简单的约定:例如,当两个或多个变量的唯一值数量相同(出现平局)时,把其中任意一个排在最前面/最上面即可。
如果第二种方法(树形图)很难实现,我们至少该如何实现第一种方法呢?也许可以统计任意杂乱数据中每个变量的不同值的数量,然后据此对变量进行排序!任何其他附带 R 代码的方法也会很有帮助。
第一点的解决方案如下所示:
# The same 24-row example data with the columns already arranged from the
# coarsest variable (2 continents) to the finest (4 states).
# Rows vary fastest by state, then by country, then by continent.
data <- tibble::tibble(
  Continents = rep(c("continent 1", "continent 2"), each = 12L),
  Countries = rep(
    rep(c("Country 1", "Country 2", "Country 3"), each = 4L),
    times = 2L
  ),
  States = rep(1:4, times = 6L)
)
如果我没猜错,下面的代码会回答您的问题:
# Reorder the columns of `data` from the fewest to the most distinct values,
# so the coarsest variable comes first. Columns with the same count keep their
# original relative order (order() leaves unresolved ties in place).
# vapply() is used instead of sapply() so the counts are always an integer
# vector, whatever the shape of `data`.
data[order(vapply(data, function(x) length(unique(x)), integer(1)))] # returns the data in the desired order
# Simple function for plotting the 'tree': one horizontal bar per variable,
# centred on x = 0 and stacked from top to bottom in increasing order of
# `lengths`, so the variable with the fewest distinct values is drawn first.
#
# Arguments:
#   lengths  Numeric vector with one value per variable (e.g. its number of
#            distinct values). Bar i spans from -lengths[i] to lengths[i].
#   names    Labels parallel to `lengths`; each is drawn in the middle of
#            its bar.
#   space    Extra vertical gap between consecutive bars (each bar has
#            height 1).
#
# Called for its side effect of drawing on the active graphics device;
# returns NULL invisibly. Stops with an error when `lengths` is empty.
plotTree <- function(lengths, names, space = 0.3){
  # An empty input used to reach max() on a zero-length vector and then fail
  # inside plot() with an obscure message; fail early and clearly instead.
  if (length(lengths) == 0L) {
    stop("'lengths' must contain at least one value", call. = FALSE)
  }
  O <- order(lengths)
  L <- lengths[O]
  N <- names[O]
  XMax <- max(L)
  YMax <- length(L)
  # Empty canvas; FALSE is spelled out because `F` is a reassignable variable.
  plot(NULL, xlim = c(-XMax, XMax), ylim = c(-YMax, YMax), axes = FALSE, xlab = "", ylab = "")
  for (i in seq_along(L)){
    # Bar i is filled with palette colour i.
    rect(-L[i], YMax - 1 - i * (space + 1), L[i], YMax - i * (space + 1), col = i)
    # The first bar gets a white label; all other bars get black labels.
    text(0, YMax - 1/2 - i * (space + 1), N[i], col = if (i == 1) "white" else "black")
  }
  invisible(NULL)
}
# Usage: count the distinct values in every column of `data` and draw them
# as a tree. vapply() is used instead of sapply() so the counts are always a
# (named) integer vector.
plotTree(
  vapply(data, function(x) length(unique(x)), integer(1)),
  names(data),
  space = 0.3
)
比如说,你有以下数据:
# Example data: every combination of 2 continents x 3 countries x 4 states
# (24 rows). The columns are deliberately NOT ordered by granularity.
# Rows vary fastest by state, then by country, then by continent.
data <- tibble::tibble(
  Countries = rep(
    rep(c("Country 1", "Country 2", "Country 3"), each = 4L),
    times = 2L
  ),
  States = rep(1:4, times = 6L),
  Continents = rep(c("continent 1", "continent 2"), each = 12L)
)
这类数据可能包含许多格式各异、粒度级别不同的变量。我想了解数据的结构,从而可以得出这样的结论:在上述数据中,最高级别是具有 2 个不同值的大洲(Continents),下一级粒度是具有 3 个不同值的国家(Countries),最低级别是具有 4 个不同值的州(States)。
理解这一点的一种粗略方法是:把不同值数量最少的变量(即大洲)放在数据集的左侧,把不同值数量最多的变量(即州)放在数据集的右侧。
让我们更容易理解杂乱数据的另一种方法是创建某种树形图:在顶部显示粒度最粗的数据(此处为大洲),在底部显示粒度最细的数据(此处为州),作为叶子/节点(leaves/nodes)。
作为第一步,我们可以采用一些简单的约定:例如,当两个或多个变量的唯一值数量相同(出现平局)时,把其中任意一个排在最前面/最上面即可。
如果第二种方法(树形图)很难实现,我们至少该如何实现第一种方法呢?也许可以统计任意杂乱数据中每个变量的不同值的数量,然后据此对变量进行排序!任何其他附带 R 代码的方法也会很有帮助。
第一点的解决方案如下所示:
# The same 24-row example data with the columns already arranged from the
# coarsest variable (2 continents) to the finest (4 states).
# Rows vary fastest by state, then by country, then by continent.
data <- tibble::tibble(
  Continents = rep(c("continent 1", "continent 2"), each = 12L),
  Countries = rep(
    rep(c("Country 1", "Country 2", "Country 3"), each = 4L),
    times = 2L
  ),
  States = rep(1:4, times = 6L)
)
如果我没猜错,下面的代码会回答您的问题:
# Reorder the columns of `data` from the fewest to the most distinct values,
# so the coarsest variable comes first. Columns with the same count keep their
# original relative order (order() leaves unresolved ties in place).
# vapply() is used instead of sapply() so the counts are always an integer
# vector, whatever the shape of `data`.
data[order(vapply(data, function(x) length(unique(x)), integer(1)))] # returns the data in the desired order
# Simple function for plotting the 'tree': one horizontal bar per variable,
# centred on x = 0 and stacked from top to bottom in increasing order of
# `lengths`, so the variable with the fewest distinct values is drawn first.
#
# Arguments:
#   lengths  Numeric vector with one value per variable (e.g. its number of
#            distinct values). Bar i spans from -lengths[i] to lengths[i].
#   names    Labels parallel to `lengths`; each is drawn in the middle of
#            its bar.
#   space    Extra vertical gap between consecutive bars (each bar has
#            height 1).
#
# Called for its side effect of drawing on the active graphics device;
# returns NULL invisibly. Stops with an error when `lengths` is empty.
plotTree <- function(lengths, names, space = 0.3){
  # An empty input used to reach max() on a zero-length vector and then fail
  # inside plot() with an obscure message; fail early and clearly instead.
  if (length(lengths) == 0L) {
    stop("'lengths' must contain at least one value", call. = FALSE)
  }
  O <- order(lengths)
  L <- lengths[O]
  N <- names[O]
  XMax <- max(L)
  YMax <- length(L)
  # Empty canvas; FALSE is spelled out because `F` is a reassignable variable.
  plot(NULL, xlim = c(-XMax, XMax), ylim = c(-YMax, YMax), axes = FALSE, xlab = "", ylab = "")
  for (i in seq_along(L)){
    # Bar i is filled with palette colour i.
    rect(-L[i], YMax - 1 - i * (space + 1), L[i], YMax - i * (space + 1), col = i)
    # The first bar gets a white label; all other bars get black labels.
    text(0, YMax - 1/2 - i * (space + 1), N[i], col = if (i == 1) "white" else "black")
  }
  invisible(NULL)
}
# Usage: count the distinct values in every column of `data` and draw them
# as a tree. vapply() is used instead of sapply() so the counts are always a
# (named) integer vector.
plotTree(
  vapply(data, function(x) length(unique(x)), integer(1)),
  names(data),
  space = 0.3
)