重新排列 csv 文件
Rearranging a csv file
我有一个文件,其内容类似于以下内容
Boy,Football
Boy,Football
Boy,Football
Boy,Squash
Boy,Tennis
Boy,Football
Girl,Tennis
Girl,Squash
Girl,Tennis
Girl,Tennis
Boy,Football
如何使用 'awk' 或类似的方法将其重新排列为以下内容:
Football Tennis Squash
Boy 5 1 1
Girl 0 3 1
我什至不确定这是否可行,但任何帮助都会很棒。
我会直接用普通的循环来处理:
awk -F, -v OFS="\t" '
# Per record: remember the row label (field 1), the column label (field 2),
# and count how often each (field 1, field 2) pair occurs.
{ names[$1]; sport[$2]; count[$1,$2]++ }
END {
    # Header row: empty corner cell, then every sport label.
    # NOTE: "for (x in arr)" visits keys in an unspecified order.
    printf "%s", OFS
    for (i in sport)
        printf "%s%s", i, OFS
    print ""
    # One row per name; a combination that never occurred prints as 0.
    for (n in names) {
        printf "%s%s", n, OFS
        for (s in sport)
            printf "%s%s", count[n,s] ? count[n,s] : 0, OFS
        print ""
    }
}' file
这会跟踪三个数组:names[]
用于第一列,sport[]
用于第二列,count[name,sport]
用于计算每个组合的出现次数。
然后,循环遍历结果并以一种奇特的方式打印它们,并确保在 count[a,b]
不存在的情况下打印 0
。
测试
$ awk -F, -v OFS="\t" '{names[$1]; sport[$2]; count[$1,$2]++} END{printf "%s", OFS; for (i in sport) printf "%s%s", i, OFS; print ""; for (n in names) {printf "%s%s", n, OFS; for (s in sport) printf "%s%s", count[n,s]?count[n,s]:0, OFS; print ""}}' a
Squash Tennis Football
Boy 1 1 5
Girl 1 3 0
格式有点难看,有一些尾随的OFS。
去除尾随 OFS:
awk -F, -v OFS="\t" '{names[$1]; sport[$2]; count[$1,$2]++} END{printf "%s", OFS; for (i in sport) {cn++; printf "%s%s", i, (cn<length(sport)?OFS:ORS)} for (n in names) {cs=0; printf "%s%s", n, OFS; for (s in sport) {cs++; printf "%s%s", count[n,s]?count[n,s]:0, (cs<length(sport)?OFS:ORS)}}}' a
您始终可以通过管道传输到 column -t
以获得不错的输出。
$ cat tst.awk
BEGIN { FS = ","; OFS = "\t" }
# Per record: note the gender (field 1), the sport (field 2),
# and bump the counter for that (gender, sport) pair.
{
    genders[$1]
    sports[$2]
    count[$1,$2]++
}
END {
    # Header row: OFS is printed BEFORE each label, which leaves the
    # corner cell empty and avoids a trailing separator.
    printf ""
    for (sport in sports) {
        printf "%s%s", OFS, sport
    }
    print ""
    # One row per gender; "+0" turns a never-seen pair into 0.
    for (gender in genders) {
        printf "%s", gender
        for (sport in sports) {
            printf "%s%s", OFS, count[gender,sport]+0
        }
        print ""
    }
}
$ awk -f tst.awk file
Squash Tennis Football
Boy 1 1 5
Girl 1 3 0
通常,当您知道循环的终点时,您会在每个字段后放置 OFS 或 ORS:
# Known loop end: follow every field with OFS, except the last (i == n), which gets ORS.
for (i=1; i<=n; i++) {
printf "%s%s", $i, (i<n?OFS:ORS)
}
但如果不知道循环的终点,则在第二个及后续字段之前输出 OFS,并在循环结束后打印 ORS:
# Unknown loop end: put OFS in front of every element except the first.
# Assumes the counter i is unset (0) when the loop starts.
for (x in array) {
printf "%s%s", (++i>1?OFS:""), array[x]
}
# Finish the line after the loop (print appends ORS).
print ""
我喜欢:
# Take the element count up front so the last element printed can be followed by ORS.
# Assumes the counter i is unset (0) when the loop starts.
n = length(array)
for (x in array) {
printf "%s%s", array[x], (++i<n?OFS:ORS)
}
我也喜欢用上面这种写法来结束循环,但是 length(array)
是特定于 gawk 的。
要考虑的另一种方法:
$ cat tst.awk
BEGIN { FS = ","; OFS = "\t" }
{
    # Record each distinct value of every field in first-seen order:
    # map[i,k] is the k-th distinct value of field i, num[i] is how many so far.
    for (i=1; i<=NF; i++) {
        if (!seen[i,$i]++) {
            map[i,++num[i]] = $i
        }
    }
    # Count occurrences of each (field 1, field 2) pair.
    count[$1,$2]++
}
END {
    # Header row: i starts at 0 so the unset map[2,0] yields the empty corner cell.
    for (i=0; i<=num[2]; i++) {
        printf "%s%s", map[2,i], (i<num[2]?OFS:ORS)
    }
    # One row per distinct field-1 value, columns in first-seen order;
    # "+0" turns a never-seen pair into 0.
    for (i=1; i<=num[1]; i++) {
        printf "%s%s", map[1,i], OFS
        for (j=1; j<=num[2]; j++) {
            printf "%s%s", count[map[1,i],map[2,j]]+0, (j<num[2]?OFS:ORS)
        }
    }
}
$ awk -f tst.awk file
Football Squash Tennis
Boy 5 1 1
Girl 0 1 3
最后一个将按照读取的顺序打印行和列。虽然它的工作原理并不那么明显:-)。
我有一个文件,其内容类似于以下内容
Boy,Football
Boy,Football
Boy,Football
Boy,Squash
Boy,Tennis
Boy,Football
Girl,Tennis
Girl,Squash
Girl,Tennis
Girl,Tennis
Boy,Football
如何使用 'awk' 或类似的方法将其重新排列为以下内容:
Football Tennis Squash
Boy 5 1 1
Girl 0 3 1
我什至不确定这是否可行,但任何帮助都会很棒。
我会直接用普通的循环来处理:
awk -F, -v OFS="\t" '
# Per record: remember the row label (field 1), the column label (field 2),
# and count how often each (field 1, field 2) pair occurs.
{ names[$1]; sport[$2]; count[$1,$2]++ }
END {
    # Header row: empty corner cell, then every sport label.
    # NOTE: "for (x in arr)" visits keys in an unspecified order.
    printf "%s", OFS
    for (i in sport)
        printf "%s%s", i, OFS
    print ""
    # One row per name; a combination that never occurred prints as 0.
    for (n in names) {
        printf "%s%s", n, OFS
        for (s in sport)
            printf "%s%s", count[n,s] ? count[n,s] : 0, OFS
        print ""
    }
}' file
这会跟踪三个数组:names[]
用于第一列,sport[]
用于第二列,count[name,sport]
用于计算每个组合的出现次数。
然后,循环遍历结果并以一种奇特的方式打印它们,并确保在 count[a,b]
不存在的情况下打印 0
。
测试
$ awk -F, -v OFS="\t" '{names[$1]; sport[$2]; count[$1,$2]++} END{printf "%s", OFS; for (i in sport) printf "%s%s", i, OFS; print ""; for (n in names) {printf "%s%s", n, OFS; for (s in sport) printf "%s%s", count[n,s]?count[n,s]:0, OFS; print ""}}' a
Squash Tennis Football
Boy 1 1 5
Girl 1 3 0
格式有点难看,有一些尾随的OFS。
去除尾随 OFS:
awk -F, -v OFS="\t" '{names[$1]; sport[$2]; count[$1,$2]++} END{printf "%s", OFS; for (i in sport) {cn++; printf "%s%s", i, (cn<length(sport)?OFS:ORS)} for (n in names) {cs=0; printf "%s%s", n, OFS; for (s in sport) {cs++; printf "%s%s", count[n,s]?count[n,s]:0, (cs<length(sport)?OFS:ORS)}}}' a
您始终可以通过管道传输到 column -t
以获得不错的输出。
$ cat tst.awk
BEGIN { FS = ","; OFS = "\t" }
# Per record: note the gender (field 1), the sport (field 2),
# and bump the counter for that (gender, sport) pair.
{
    genders[$1]
    sports[$2]
    count[$1,$2]++
}
END {
    # Header row: OFS is printed BEFORE each label, which leaves the
    # corner cell empty and avoids a trailing separator.
    printf ""
    for (sport in sports) {
        printf "%s%s", OFS, sport
    }
    print ""
    # One row per gender; "+0" turns a never-seen pair into 0.
    for (gender in genders) {
        printf "%s", gender
        for (sport in sports) {
            printf "%s%s", OFS, count[gender,sport]+0
        }
        print ""
    }
}
$ awk -f tst.awk file
Squash Tennis Football
Boy 1 1 5
Girl 1 3 0
通常,当您知道循环的终点时,您会在每个字段后放置 OFS 或 ORS:
# Known loop end: follow every field with OFS, except the last (i == n), which gets ORS.
for (i=1; i<=n; i++) {
printf "%s%s", $i, (i<n?OFS:ORS)
}
但如果不知道循环的终点,则在第二个及后续字段之前输出 OFS,并在循环结束后打印 ORS:
# Unknown loop end: put OFS in front of every element except the first.
# Assumes the counter i is unset (0) when the loop starts.
for (x in array) {
printf "%s%s", (++i>1?OFS:""), array[x]
}
# Finish the line after the loop (print appends ORS).
print ""
我喜欢:
# Take the element count up front so the last element printed can be followed by ORS.
# Assumes the counter i is unset (0) when the loop starts.
n = length(array)
for (x in array) {
printf "%s%s", array[x], (++i<n?OFS:ORS)
}
我也喜欢用上面这种写法来结束循环,但是 length(array)
是特定于 gawk 的。
要考虑的另一种方法:
$ cat tst.awk
BEGIN { FS = ","; OFS = "\t" }
{
    # Record each distinct value of every field in first-seen order:
    # map[i,k] is the k-th distinct value of field i, num[i] is how many so far.
    for (i=1; i<=NF; i++) {
        if (!seen[i,$i]++) {
            map[i,++num[i]] = $i
        }
    }
    # Count occurrences of each (field 1, field 2) pair.
    count[$1,$2]++
}
END {
    # Header row: i starts at 0 so the unset map[2,0] yields the empty corner cell.
    for (i=0; i<=num[2]; i++) {
        printf "%s%s", map[2,i], (i<num[2]?OFS:ORS)
    }
    # One row per distinct field-1 value, columns in first-seen order;
    # "+0" turns a never-seen pair into 0.
    for (i=1; i<=num[1]; i++) {
        printf "%s%s", map[1,i], OFS
        for (j=1; j<=num[2]; j++) {
            printf "%s%s", count[map[1,i],map[2,j]]+0, (j<num[2]?OFS:ORS)
        }
    }
}
$ awk -f tst.awk file
Football Squash Tennis
Boy 5 1 1
Girl 0 1 3
最后一个将按照读取的顺序打印行和列。虽然它的工作原理并不那么明显:-)。