Bash:按指定的列名裁剪 CSV 文件
Bash: Cut a CSV File from specific Column Names
我得到一个包含大量无用信息的 CSV 文件,我想将我需要的信息从该文件导入另一个 CSV 文件。
当前状态:
First Name,Middle Name, Last Name, Title, Suffix, Nickname, Given Yomi, Surname Yomi....
Angel,,Romero,,,Romi,, ....
在新文件中应该是这样的:
First Name, Last Name, Nickname
Angel, Romero, Romi
我想通过使用诸如 cut 和列名之类的东西来做到这一点,而不仅仅是字段编号。
有点像这样:
cut -d',' -f"First Name" file
我知道这行不通,但还有其他方法吗?
awk -v tags='First Name,Last Name,Nickname' '
BEGIN {
    # Input fields are separated by a comma plus any spaces that follow it.
    FS = ", *"; OFS = ", "
    # split() without a separator argument uses FS, so tags is split like the input.
    numOutFlds = split(tags, outFldNr2tag)
}
NR == 1 {
    # Header row: remember which input column each name lives in.
    for (inFldNr = 1; inFldNr <= NF; inFldNr++)
        tag2inFldNr[$inFldNr] = inFldNr
}
{
    # Every row, header included: print the requested columns in the requested order.
    for (outFldNr = 1; outFldNr <= numOutFlds; outFldNr++) {
        tag = outFldNr2tag[outFldNr]
        # A name that is absent from the header has no column number. Without this
        # guard the empty index evaluates to $0 and the whole record is printed.
        val = (tag in tag2inFldNr) ? $(tag2inFldNr[tag]) : ""
        printf "%s%s", val, (outFldNr < numOutFlds ? OFS : ORS)
    }
}
' file
First Name, Last Name, Nickname
Angel, Romero, Romi
$ cat csvcut.awk
# csvcut.awk
function csvsplit(str, arr, i,j,n,s,fs,qt,c,len) {
    # Split one CSV record (str) into arr[1..n] and return n.
    #   - fields are separated by commas;
    #   - a field wrapped in double quotes may contain commas;
    #   - inside a quoted field, "" stands for one literal double quote.
    # Names after arr in the parameter list are locals; callers pass only str and arr.
    # The input is assumed to be well formed; an unterminated quoted field is
    # returned as-is (without unescaping), as before.
    delete arr; s = "START"; n = 0; fs = ","; qt = "\""
    len = length(str)
    for (i = 1; i <= len; i++) {
        c = substr(str, i, 1)
        if (s == "START") {
            # j records where the current field's text begins
            if (c == fs) { arr[++n] = "" }
            else if (c == qt) { j = i + 1; s = "INQUOTES" }
            else { j = i; s = "INFIELD" }
        }
        else if (s == "INFIELD") {
            if (c == fs) { arr[++n] = substr(str, j, i - j); j = 0; s = "START" }
        }
        else if (s == "INQUOTES") {
            # a quote either closes the field or starts an escaped "" pair
            if (c == qt) { s = "MAYBEDOUBLE" }
        }
        else if (s == "MAYBEDOUBLE") {
            if (c == qt) {
                # second quote of a "" pair: still inside the quoted field, so
                # commas that follow must not end the field
                s = "INQUOTES"
            }
            else if (c == fs) {
                # previous quote was the closing one; drop it from the field text
                arr[++n] = substr(str, j, i - j - 1)
                gsub(qt qt, qt, arr[n]); j = 0; s = "START"
            }
        }
    }
    # flush whatever field was in progress when the record ended
    if (s == "INFIELD" || s == "INQUOTES") { arr[++n] = substr(str, j) }
    else if (s == "MAYBEDOUBLE") {
        arr[++n] = substr(str, j, len - j); gsub(qt qt, qt, arr[n]) }
    else if (s == "START") { arr[++n] = "" }   # empty record or trailing comma
    return n
}
BEGIN {
    # Every command-line operand is an output column name, in output order.
    i = 0
    while (++i < ARGC) {
        nfields++
        fields[nfields] = ARGV[i]
        ARGV[i] = ""    # blank the operand so awk skips it instead of opening it as a file
    }
}
NR == 1 {
    # Header record: map each input column name to its position, then write
    # the output header from the requested names.
    ncols = csvsplit($0, arr)       # split once instead of on every loop test
    for (i = 1; i <= ncols; i++) { names[arr[i]] = i }
    sep = ""
    for (i = 1; i <= nfields; i++) {
        out = fields[i]
        # re-quote names that contain a comma or quote so the header stays valid CSV
        if (out ~ /[",]/) { gsub(/"/, "\"\"", out); out = "\"" out "\"" }
        printf "%s%s", sep, out
        sep = ","
    }
    printf "\n"
}
NR > 1 {
    # Data record: split it and write the requested columns in the requested order.
    sep = ""
    csvsplit($0, csv)               # csvsplit empties csv before filling it
    for (i = 1; i <= nfields; i++) {
        # a requested name that is not in the header produces an empty field
        out = (fields[i] in names) ? csv[names[fields[i]]] : ""
        # re-quote values that contain a comma or quote so the output stays valid CSV
        if (out ~ /[",]/) { gsub(/"/, "\"\"", out); out = "\"" out "\"" }
        printf "%s%s", sep, out
        sep = ","
    }
    printf "\n"
}
$ cat mahmoud.input
FirstName,MiddleName,LastName,Title,Suffix,Nickname,GivenYomi,SurnameYomi
Angel,,Romero,,,Romi,,
$ awk -f csvcut.awk FirstName LastName Nickname <mahmoud.input
FirstName,LastName,Nickname
Angel,Romero,Romi
如果您的 CSV 格式规范、逗号后面没有数量不定的空格,就可以直接使用 Ruby 的 csv 解析器(无需先清理 csv 文件...)
鉴于:
cat file
First Name,Middle Name,Last Name,Title,Suffix,Nickname,Given Yomi,Surname Yomi
Angel,,Romero,,,Romi,,
您可以只过滤每个 csv 行:
ruby -r CSV -e 'BEGIN{wanted=["First Name", "Last Name", "Nickname"]
puts wanted.to_csv
}
CSV.parse($<.read, headers:true).each{
|h| puts h.to_hash.select{
|k,v| wanted.include?(k) }.values.to_csv}' file
打印:
First Name,Last Name,Nickname
Angel,Romero,Romi
这里的优点是支持完整的 csv 文件,包括带有嵌入式分隔符的引号字段。
也可以使用 Miller 这个工具:
# cut -f keeps only the named fields; -o writes them in the order listed here
# instead of the order in which they appear in the input.
mlr --csv cut -o -f 'field A,field B' input.csv > output.csv
这里是 cut 动词的文档。
回答可能有点晚了,而且这个方法不够通用,但如果您不需要重复使用该脚本,这样做非常简单:
# Columns 1, 3 and 6 are First Name, Last Name and Nickname in the sample header.
awk 'BEGIN {FS=", *"; OFS=","}{print $1,$3,$6}' input.csv > output.csv
我得到一个包含大量无用信息的 CSV 文件,我想将我需要的信息从该文件导入另一个 CSV 文件。
当前状态:
First Name,Middle Name, Last Name, Title, Suffix, Nickname, Given Yomi, Surname Yomi....
Angel,,Romero,,,Romi,, ....
在新文件中应该是这样的:
First Name, Last Name, Nickname
Angel, Romero, Romi
我想通过使用诸如 cut 和列名之类的东西来做到这一点,而不仅仅是字段编号。 有点像这样:
cut -d',' -f"First Name" file
我知道这行不通,但还有其他方法吗?
awk -v tags='First Name,Last Name,Nickname' '
BEGIN {
    # Input fields are separated by a comma plus any spaces that follow it.
    FS = ", *"; OFS = ", "
    # split() without a separator argument uses FS, so tags is split like the input.
    numOutFlds = split(tags, outFldNr2tag)
}
NR == 1 {
    # Header row: remember which input column each name lives in.
    for (inFldNr = 1; inFldNr <= NF; inFldNr++)
        tag2inFldNr[$inFldNr] = inFldNr
}
{
    # Every row, header included: print the requested columns in the requested order.
    for (outFldNr = 1; outFldNr <= numOutFlds; outFldNr++) {
        tag = outFldNr2tag[outFldNr]
        # A name that is absent from the header has no column number. Without this
        # guard the empty index evaluates to $0 and the whole record is printed.
        val = (tag in tag2inFldNr) ? $(tag2inFldNr[tag]) : ""
        printf "%s%s", val, (outFldNr < numOutFlds ? OFS : ORS)
    }
}
' file
First Name, Last Name, Nickname
Angel, Romero, Romi
$ cat csvcut.awk
# csvcut.awk
function csvsplit(str, arr, i,j,n,s,fs,qt,c,len) {
    # Split one CSV record (str) into arr[1..n] and return n.
    #   - fields are separated by commas;
    #   - a field wrapped in double quotes may contain commas;
    #   - inside a quoted field, "" stands for one literal double quote.
    # Names after arr in the parameter list are locals; callers pass only str and arr.
    # The input is assumed to be well formed; an unterminated quoted field is
    # returned as-is (without unescaping), as before.
    delete arr; s = "START"; n = 0; fs = ","; qt = "\""
    len = length(str)
    for (i = 1; i <= len; i++) {
        c = substr(str, i, 1)
        if (s == "START") {
            # j records where the current field's text begins
            if (c == fs) { arr[++n] = "" }
            else if (c == qt) { j = i + 1; s = "INQUOTES" }
            else { j = i; s = "INFIELD" }
        }
        else if (s == "INFIELD") {
            if (c == fs) { arr[++n] = substr(str, j, i - j); j = 0; s = "START" }
        }
        else if (s == "INQUOTES") {
            # a quote either closes the field or starts an escaped "" pair
            if (c == qt) { s = "MAYBEDOUBLE" }
        }
        else if (s == "MAYBEDOUBLE") {
            if (c == qt) {
                # second quote of a "" pair: still inside the quoted field, so
                # commas that follow must not end the field
                s = "INQUOTES"
            }
            else if (c == fs) {
                # previous quote was the closing one; drop it from the field text
                arr[++n] = substr(str, j, i - j - 1)
                gsub(qt qt, qt, arr[n]); j = 0; s = "START"
            }
        }
    }
    # flush whatever field was in progress when the record ended
    if (s == "INFIELD" || s == "INQUOTES") { arr[++n] = substr(str, j) }
    else if (s == "MAYBEDOUBLE") {
        arr[++n] = substr(str, j, len - j); gsub(qt qt, qt, arr[n]) }
    else if (s == "START") { arr[++n] = "" }   # empty record or trailing comma
    return n
}
BEGIN {
    # Every command-line operand is an output column name, in output order.
    i = 0
    while (++i < ARGC) {
        nfields++
        fields[nfields] = ARGV[i]
        ARGV[i] = ""    # blank the operand so awk skips it instead of opening it as a file
    }
}
NR == 1 {
    # Header record: map each input column name to its position, then write
    # the output header from the requested names.
    ncols = csvsplit($0, arr)       # split once instead of on every loop test
    for (i = 1; i <= ncols; i++) { names[arr[i]] = i }
    sep = ""
    for (i = 1; i <= nfields; i++) {
        out = fields[i]
        # re-quote names that contain a comma or quote so the header stays valid CSV
        if (out ~ /[",]/) { gsub(/"/, "\"\"", out); out = "\"" out "\"" }
        printf "%s%s", sep, out
        sep = ","
    }
    printf "\n"
}
NR > 1 {
    # Data record: split it and write the requested columns in the requested order.
    sep = ""
    csvsplit($0, csv)               # csvsplit empties csv before filling it
    for (i = 1; i <= nfields; i++) {
        # a requested name that is not in the header produces an empty field
        out = (fields[i] in names) ? csv[names[fields[i]]] : ""
        # re-quote values that contain a comma or quote so the output stays valid CSV
        if (out ~ /[",]/) { gsub(/"/, "\"\"", out); out = "\"" out "\"" }
        printf "%s%s", sep, out
        sep = ","
    }
    printf "\n"
}
$ cat mahmoud.input
FirstName,MiddleName,LastName,Title,Suffix,Nickname,GivenYomi,SurnameYomi
Angel,,Romero,,,Romi,,
$ awk -f csvcut.awk FirstName LastName Nickname <mahmoud.input
FirstName,LastName,Nickname
Angel,Romero,Romi
如果您的 CSV 格式规范、逗号后面没有数量不定的空格,就可以直接使用 Ruby 的 csv 解析器(无需先清理 csv 文件...)
鉴于:
cat file
First Name,Middle Name,Last Name,Title,Suffix,Nickname,Given Yomi,Surname Yomi
Angel,,Romero,,,Romi,,
您可以只过滤每个 csv 行:
ruby -r CSV -e 'BEGIN{wanted=["First Name", "Last Name", "Nickname"]
puts wanted.to_csv
}
CSV.parse($<.read, headers:true).each{
|h| puts h.to_hash.select{
|k,v| wanted.include?(k) }.values.to_csv}' file
打印:
First Name,Last Name,Nickname
Angel,Romero,Romi
这里的优点是支持完整的 csv 文件,包括带有嵌入式分隔符的引号字段。
也可以使用 Miller 这个工具:
# cut -f keeps only the named fields; -o writes them in the order listed here
# instead of the order in which they appear in the input.
mlr --csv cut -o -f 'field A,field B' input.csv > output.csv
这里是 cut 动词的文档。
回答可能有点晚了,而且这个方法不够通用,但如果您不需要重复使用该脚本,这样做非常简单:
# Columns 1, 3 and 6 are First Name, Last Name and Nickname in the sample header.
awk 'BEGIN {FS=", *"; OFS=","}{print $1,$3,$6}' input.csv > output.csv