如何计算给定路径内 s3 文件夹的数量?
How to count the number of s3 folders inside given path?
我一直试图搜索此解决方案,但并不走运。希望能在这里快速找到一些解决方案。我在 S3 中有一些已迁移的文件,现在需要确定给定路径中涉及的文件夹数量。假设我有一些文件如下。
如果我执行 aws s3 ls s3://my-bucket/foo1 --recursive >> file_op.txt
"cat file_op.txt" 的输出如下所示:
my-bucket/foo1/foo2/foo3/foo4/foo5/foo6/foo7/file1.txt
my-bucket/foo1/foo2/foo3/foo4/foo5/foo6/foo7/file2.txt
my-bucket/foo1/foo2/foo3/foo4/foo5/foo6/file1.pdf
my-bucket/foo1/foo2/foo3/foo4/foo6/file2.txt
my-bucket/foo1/foo2/foo3/file3.txt
my-bucket/foo1/foo8/file1.txt
my-bucket/foo1/foo9/foo10/file4.csv
我已将输出存储在一个文件中并通过 wc -l
处理以查找文件数
但是我找不到路径中涉及的文件夹数。
我需要如下输出:
number of files : 7
number of folders : 9
编辑 1:
更正了预期的文件夹数量。
(不包括 my-bucket
和 foo1
)
(foo6
在 foo5
和 foo4
目录中)
下面是我计算目录数失败的代码:
#!/bin/bash
#
# Report how many objects ("files") and folders sit under an S3 prefix.
#
# Usage:    <this script> "my-bucket/foo1"
# Requires: aws CLI, awk, sed, sort, uniq, tee.
# Output:   a coloured summary on stdout, also appended to "<prefix>.out",
#           where <prefix> is "<$0>_<user>_<timestamp>_<input_code>". The key
#           listing, folder list and per-column counts are left behind in
#           "<prefix>", "<prefix>_folders" and "<prefix>_dir_cnt".
#
# NOTE(review): the "$<digit>" tokens ($0, $1, awk field references) were
# missing from the listing this was restored from. $0, "input=$1", the key
# column of "aws s3 ls" ($4) and the split around $pathfinder ($2) follow
# from how the values are used; the fields behind input_code and db_name are
# a best guess based on the "my-bucket/foo1" usage example -- confirm.
#
# NOTE: the folder total is still computed with the per-column change
# counter as posted, i.e. the approach the author reported as not giving the
# expected number. The counting approach itself is intentionally unchanged.
if [[ "$#" -ne 1 ]]; then
  echo "Usage: $0 \"s3 folder path\" <eg. \"my-bucket/foo1\"> "
  exit 1
else
  start=$SECONDS
  input=$1

  # Pieces of the "bucket/prefix" argument.
  input_code=$(printf '%s\n' "$input" | awk -F'/' '{print $1 "_" $2}')
  #input_length=$(echo $input | awk -F'/' '{print NF}' )
  s3bucket=$(printf '%s\n' "$input" | awk -F'/' '{print $1}')
  db_name=$(printf '%s\n' "$input" | awk -F'/' '{print $2}')
  # Everything after the bucket name, with a trailing "/" (e.g. "foo1/").
  pathfinder=$(printf '%s\n' "$input" \
    | awk 'BEGIN{FS=OFS="/"} {$1=""; print}' \
    | sed -e 's#^/##' -e 's#$#/#')

  myn=$(whoami)
  cdt=$(date +%Y%m%d%H%M%S)
  filename="${0}_${myn}_${cdt}_${input_code}"
  folders="${filename}_folders"
  dcountfile="${filename}_dir_cnt"

  # Keep only the key column of the listing (date, time, size, key).
  aws s3 ls "s3://${input}" --recursive | awk '{print $4}' > "$filename"

  # Strip the leading prefix and the file name, then de-duplicate.
  # "NF > 0" skips empty lines so NF is never decremented below zero.
  # "sort -n | uniq" is kept as posted because the counter below depends on
  # the order of the lines.
  awk -F"$pathfinder" '{print $2}' "$filename" \
    | awk 'BEGIN{FS=OFS="/"} NF > 0 {NF--; print}' \
    | sort -n | uniq > "$folders"
  #grep -oP '(?<="$input_code" ).*'

  fcount=$(wc -l < "$filename")

  # For each "/"-separated column: start at 1 and add 1 every time the value
  # differs from the value seen on the previous line in that column. One
  # total per column is written to $dcountfile.
  awk 'BEGIN{FS="/"}
  {
    if (NF > maxNF) {
      for (i = maxNF + 1; i <= NF; i++)
        count[i] = 1;
      maxNF = NF;
    }
    for (i = 1; i <= NF; i++) {
      if (col[i] != "" && $i != col[i])
        count[i]++;
      col[i] = $i;
    }
  }
  END {
    for (i = 1; i <= maxNF; i++)
      print count[i];
  }' "$folders" > "$dcountfile"

  # Sum the per-column totals (prints 0 when there are none).
  dcount=$(awk '{t += $1} END {print t + 0}' "$dcountfile")

  # Values are passed as printf arguments rather than spliced into the
  # format string, so a "%" or "\" inside them is printed literally.
  {
    printf 'Bucket name : \e[1;31m %s \e[0m\n' "$s3bucket"
    printf 'DB name : \e[1;31m %s \e[0m\n' "$db_name"
    printf 'Given folder path : \e[1;31m %s \e[0m\n' "$input"
    printf 'The number of folders in the given directory are\e[1;31m %s \e[0m\n' "$dcount"
    printf 'The number of files in the given directory are\e[1;31m %s \e[0m\n' "$fcount"
  } | tee -a "${filename}.out"

  # "end" is seconds since the shell started; "elapsed" is measured from the
  # point where the argument check passed.
  end=$SECONDS
  elapsed=$((end - start))
  printf '\n*** Script completed in %d:%02d:%02d - Elapsed %d:%02d:%02d ***\n' \
    $((end / 3600)) $((end / 60 % 60)) $((end % 60)) \
    $((elapsed / 3600)) $((elapsed / 60 % 60)) $((elapsed % 60)) \
    | tee -a "${filename}.out"
  exit 0
fi
您已阐明要计算唯一名称,忽略前两层(my-bucket
和 foo1
)和最后一层(文件名)。
perl -F/ -lane'
   # Each input line is one file path, already split on "/" into @F.
   $files += 1;
   # Remember every component after the first two levels and before the
   # final one (the file name).
   $seen{$_} = 1 for @F[ 2 .. $#F - 1 ];
   END {
      my $dir_total = keys %seen;
      print "Number of files: " . ( $files // 0 );
      print "Number of dirs: " . ( $dir_total // 0 );
   }
'
输出:
Number of files: 7
Number of dirs: 9
你的问题不清楚。
如果我们统计列表中唯一的相对文件夹路径,则共有 12 个:
my-bucket/foo1/foo2/foo3/foo4/foo5/foo6/foo7
my-bucket/foo1/foo2/foo3/foo4/foo5/foo6
my-bucket/foo1/foo2/foo3/foo4/foo6
my-bucket/foo1/foo2/foo3/foo4/foo5
my-bucket/foo1/foo2/foo3/foo4
my-bucket/foo1/foo2/foo3
my-bucket/foo1/foo2
my-bucket/foo1/foo8
my-bucket/foo1/foo9/foo10
my-bucket/foo1/foo9
my-bucket/foo1
my-bucket
计算这个的awk
脚本是:
# Prints "<record count> <number of distinct folder paths>".
BEGIN { FS = "/" }                  # path components are separated by "/"

{
    prefix = ""                     # folder path accumulated for this line
    # Walk every component except the last one, which is the file name.
    for (depth = 1; depth < NF; depth++) {
        # The separator is only inserted from the second component onward.
        prefix = (depth == 1) ? $depth : prefix FS $depth
        seen[prefix]                # referencing the key stores it
    }
}

END {
    print NR " " length(seen)       # records read, distinct prefixes stored
}
如果我们计算唯一的文件夹名称,则有 11 个:
my-bucket
foo1
foo2
foo3
foo4
foo5
foo6
foo7
foo8
foo9
foo10
计算这个的awk
脚本是:
awk -F/ '
  # Every field before the last one (the file name) is a folder name.
  { for (n = 1; n < NF; n++) names[$n] }
  # Report the record count and how many distinct names were collected.
  END { print NR " " length(names) }
' input.txt
如果您不介意使用管道并调用 awk 两次,那么它相当干净:
# Two-stage pipeline that prints a single number: how many distinct lines
# the first stage emits.
#
# Stage 1 (first mawk): FS is "/" and OFS is set to ORS, so a rebuilt record
# is printed with one field per line. `_^=_` raises the unset variable `_`
# to its own power (presumably 0^0 = 1 -- TODO confirm for your awk). For
# records with more than `_+_` fields, the pattern decrements NF and assigns
# "" to `$_`; there is no action block, so matching records are printed.
# NOTE(review): this relies on both side effects (`--NF` and `$_=""`) being
# applied inside the pattern and on the `~ ""` match succeeding -- verify.
#
# Stage 2 (second mawk): for every line with NF != 0, referencing `_[$__]`
# creates an array entry; `__` is never assigned here, so `$__` is presumably
# `$0`. END prints the number of entries in `_`.
mawk 'BEGIN {OFS=ORS;FS="/";_^=_}_+_<NF && --NF~($_="")' file \
\
| mawk 'NF {_[$__]} END { print length(_) }'
我一直试图搜索此解决方案,但并不走运。希望能在这里快速找到一些解决方案。我在 S3 中有一些已迁移的文件,现在需要确定给定路径中涉及的文件夹数量。假设我有一些文件如下。
如果我执行 aws s3 ls s3://my-bucket/foo1 --recursive >> file_op.txt
"cat file_op.txt" 的输出如下所示:
my-bucket/foo1/foo2/foo3/foo4/foo5/foo6/foo7/file1.txt
my-bucket/foo1/foo2/foo3/foo4/foo5/foo6/foo7/file2.txt
my-bucket/foo1/foo2/foo3/foo4/foo5/foo6/file1.pdf
my-bucket/foo1/foo2/foo3/foo4/foo6/file2.txt
my-bucket/foo1/foo2/foo3/file3.txt
my-bucket/foo1/foo8/file1.txt
my-bucket/foo1/foo9/foo10/file4.csv
我已将输出存储在一个文件中并通过 wc -l
处理以查找文件数
但是我找不到路径中涉及的文件夹数。
我需要如下输出:
number of files : 7
number of folders : 9
编辑 1: 更正了预期的文件夹数量。
(不包括 my-bucket
和 foo1
)
(foo6
在 foo5
和 foo4
目录中)
下面是我计算目录数失败的代码:
#!/bin/bash
#
# Report how many objects ("files") and folders sit under an S3 prefix.
#
# Usage:    <this script> "my-bucket/foo1"
# Requires: aws CLI, awk, sed, sort, uniq, tee.
# Output:   a coloured summary on stdout, also appended to "<prefix>.out",
#           where <prefix> is "<$0>_<user>_<timestamp>_<input_code>". The key
#           listing, folder list and per-column counts are left behind in
#           "<prefix>", "<prefix>_folders" and "<prefix>_dir_cnt".
#
# NOTE(review): the "$<digit>" tokens ($0, $1, awk field references) were
# missing from the listing this was restored from. $0, "input=$1", the key
# column of "aws s3 ls" ($4) and the split around $pathfinder ($2) follow
# from how the values are used; the fields behind input_code and db_name are
# a best guess based on the "my-bucket/foo1" usage example -- confirm.
#
# NOTE: the folder total is still computed with the per-column change
# counter as posted, i.e. the approach the author reported as not giving the
# expected number. The counting approach itself is intentionally unchanged.
if [[ "$#" -ne 1 ]]; then
  echo "Usage: $0 \"s3 folder path\" <eg. \"my-bucket/foo1\"> "
  exit 1
else
  start=$SECONDS
  input=$1

  # Pieces of the "bucket/prefix" argument.
  input_code=$(printf '%s\n' "$input" | awk -F'/' '{print $1 "_" $2}')
  #input_length=$(echo $input | awk -F'/' '{print NF}' )
  s3bucket=$(printf '%s\n' "$input" | awk -F'/' '{print $1}')
  db_name=$(printf '%s\n' "$input" | awk -F'/' '{print $2}')
  # Everything after the bucket name, with a trailing "/" (e.g. "foo1/").
  pathfinder=$(printf '%s\n' "$input" \
    | awk 'BEGIN{FS=OFS="/"} {$1=""; print}' \
    | sed -e 's#^/##' -e 's#$#/#')

  myn=$(whoami)
  cdt=$(date +%Y%m%d%H%M%S)
  filename="${0}_${myn}_${cdt}_${input_code}"
  folders="${filename}_folders"
  dcountfile="${filename}_dir_cnt"

  # Keep only the key column of the listing (date, time, size, key).
  aws s3 ls "s3://${input}" --recursive | awk '{print $4}' > "$filename"

  # Strip the leading prefix and the file name, then de-duplicate.
  # "NF > 0" skips empty lines so NF is never decremented below zero.
  # "sort -n | uniq" is kept as posted because the counter below depends on
  # the order of the lines.
  awk -F"$pathfinder" '{print $2}' "$filename" \
    | awk 'BEGIN{FS=OFS="/"} NF > 0 {NF--; print}' \
    | sort -n | uniq > "$folders"
  #grep -oP '(?<="$input_code" ).*'

  fcount=$(wc -l < "$filename")

  # For each "/"-separated column: start at 1 and add 1 every time the value
  # differs from the value seen on the previous line in that column. One
  # total per column is written to $dcountfile.
  awk 'BEGIN{FS="/"}
  {
    if (NF > maxNF) {
      for (i = maxNF + 1; i <= NF; i++)
        count[i] = 1;
      maxNF = NF;
    }
    for (i = 1; i <= NF; i++) {
      if (col[i] != "" && $i != col[i])
        count[i]++;
      col[i] = $i;
    }
  }
  END {
    for (i = 1; i <= maxNF; i++)
      print count[i];
  }' "$folders" > "$dcountfile"

  # Sum the per-column totals (prints 0 when there are none).
  dcount=$(awk '{t += $1} END {print t + 0}' "$dcountfile")

  # Values are passed as printf arguments rather than spliced into the
  # format string, so a "%" or "\" inside them is printed literally.
  {
    printf 'Bucket name : \e[1;31m %s \e[0m\n' "$s3bucket"
    printf 'DB name : \e[1;31m %s \e[0m\n' "$db_name"
    printf 'Given folder path : \e[1;31m %s \e[0m\n' "$input"
    printf 'The number of folders in the given directory are\e[1;31m %s \e[0m\n' "$dcount"
    printf 'The number of files in the given directory are\e[1;31m %s \e[0m\n' "$fcount"
  } | tee -a "${filename}.out"

  # "end" is seconds since the shell started; "elapsed" is measured from the
  # point where the argument check passed.
  end=$SECONDS
  elapsed=$((end - start))
  printf '\n*** Script completed in %d:%02d:%02d - Elapsed %d:%02d:%02d ***\n' \
    $((end / 3600)) $((end / 60 % 60)) $((end % 60)) \
    $((elapsed / 3600)) $((elapsed / 60 % 60)) $((elapsed % 60)) \
    | tee -a "${filename}.out"
  exit 0
fi
您已阐明要计算唯一名称,忽略前两层(my-bucket
和 foo1
)和最后一层(文件名)。
perl -F/ -lane'
   # Each input line is one file path, already split on "/" into @F.
   $files += 1;
   # Remember every component after the first two levels and before the
   # final one (the file name).
   $seen{$_} = 1 for @F[ 2 .. $#F - 1 ];
   END {
      my $dir_total = keys %seen;
      print "Number of files: " . ( $files // 0 );
      print "Number of dirs: " . ( $dir_total // 0 );
   }
'
输出:
Number of files: 7
Number of dirs: 9
你的问题不清楚。
如果我们统计列表中唯一的相对文件夹路径,则共有 12 个:
my-bucket/foo1/foo2/foo3/foo4/foo5/foo6/foo7
my-bucket/foo1/foo2/foo3/foo4/foo5/foo6
my-bucket/foo1/foo2/foo3/foo4/foo6
my-bucket/foo1/foo2/foo3/foo4/foo5
my-bucket/foo1/foo2/foo3/foo4
my-bucket/foo1/foo2/foo3
my-bucket/foo1/foo2
my-bucket/foo1/foo8
my-bucket/foo1/foo9/foo10
my-bucket/foo1/foo9
my-bucket/foo1
my-bucket
计算这个的awk
脚本是:
# Prints "<record count> <number of distinct folder paths>".
BEGIN { FS = "/" }                  # path components are separated by "/"

{
    prefix = ""                     # folder path accumulated for this line
    # Walk every component except the last one, which is the file name.
    for (depth = 1; depth < NF; depth++) {
        # The separator is only inserted from the second component onward.
        prefix = (depth == 1) ? $depth : prefix FS $depth
        seen[prefix]                # referencing the key stores it
    }
}

END {
    print NR " " length(seen)       # records read, distinct prefixes stored
}
如果我们计算唯一的文件夹名称,则有 11 个:
my-bucket
foo1
foo2
foo3
foo4
foo5
foo6
foo7
foo8
foo9
foo10
计算这个的awk
脚本是:
awk -F/ '
  # Every field before the last one (the file name) is a folder name.
  { for (n = 1; n < NF; n++) names[$n] }
  # Report the record count and how many distinct names were collected.
  END { print NR " " length(names) }
' input.txt
如果您不介意使用管道并调用 awk 两次,那么它相当干净:
# Two-stage pipeline that prints a single number: how many distinct lines
# the first stage emits.
#
# Stage 1 (first mawk): FS is "/" and OFS is set to ORS, so a rebuilt record
# is printed with one field per line. `_^=_` raises the unset variable `_`
# to its own power (presumably 0^0 = 1 -- TODO confirm for your awk). For
# records with more than `_+_` fields, the pattern decrements NF and assigns
# "" to `$_`; there is no action block, so matching records are printed.
# NOTE(review): this relies on both side effects (`--NF` and `$_=""`) being
# applied inside the pattern and on the `~ ""` match succeeding -- verify.
#
# Stage 2 (second mawk): for every line with NF != 0, referencing `_[$__]`
# creates an array entry; `__` is never assigned here, so `$__` is presumably
# `$0`. END prints the number of entries in `_`.
mawk 'BEGIN {OFS=ORS;FS="/";_^=_}_+_<NF && --NF~($_="")' file \
\
| mawk 'NF {_[$__]} END { print length(_) }'