SFTP 如何列出大量文件
SFTP How to List Large # of Files
我目前正在把 SFTP 上的文件加载到 GCS 存储桶。对于文件数量有限的 SFTP 目录,我可以先获取文件列表,再按文件的绝对路径逐个处理。但是,如果目录中的文件太多(或文件位于子文件夹中),我就无法通过简单的 ls 获取要从 SFTP 下载的文件列表。以下是从 SFTP 递归获取任意给定目录中文件列表的可用代码:
import sys
from stat import S_ISDIR, S_ISREG
import paramiko
# Connection settings read by main(); the values are placeholders to be
# replaced with the real server address and login.
sftp_url = '<URL>'
sftp_user = '<USER>'
sftp_pwd = '<PWD>'
def get_sftp_obj(sftp_cred_dict):
    """Open an SFTP session to the server described by *sftp_cred_dict*.

    Args:
        sftp_cred_dict: Dict with the keys ``server``, ``username``,
            ``password`` and ``timeout_min`` (channel timeout in minutes).
            An optional ``port`` key overrides the default port 22.

    Returns:
        A ``paramiko.SFTPClient`` whose channel timeout is set to
        ``timeout_min * 60`` seconds.

    Raises:
        KeyError: If a required key is missing from *sftp_cred_dict*.
    """
    server = sftp_cred_dict['server']
    username = sftp_cred_dict['username']
    password = sftp_cred_dict['password']
    timeout_min = sftp_cred_dict['timeout_min']
    port = sftp_cred_dict.get('port', 22)

    # Class attribute: this changes the request size for every SFTPFile in
    # the process, not only for the session created below (2**22 = 4 MiB).
    paramiko.sftp_file.SFTPFile.MAX_REQUEST_SIZE = pow(2, 22)

    transport = paramiko.Transport((server, port))
    try:
        transport.connect(username=username, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        sftp.get_channel().settimeout(timeout_min * 60)
    except Exception:
        # Do not leave the transport open when login or channel setup fails.
        transport.close()
        raise
    return sftp
def sftp_get_recursive_files(path, skip_dir_list, sftp, sftp_files=None):
    """Collect the paths of all regular files below *path*, depth first.

    Args:
        path: Remote directory to list.
        skip_dir_list: Directory names (not full paths) that are not
            descended into; a message is printed for each one skipped.
        sftp: Object providing ``listdir_attr(path)`` whose entries expose
            ``st_mode`` and ``filename``.
        sftp_files: Optional list to append results to. A fresh list is
            created when omitted, so separate calls never share results.

    Returns:
        The list of file paths, each built as ``<parent> + '/' + <name>``.
        Entries that are neither directories nor regular files are ignored.
    """
    # A ``[]`` default would be created once and reused by every call;
    # compare against None so an empty list passed by the caller (or by the
    # recursion below) is still filled in place.
    if sftp_files is None:
        sftp_files = []

    for entry in sftp.listdir_attr(path):
        mode = entry.st_mode
        name = entry.filename
        child_path = path + '/' + name
        if S_ISDIR(mode):
            if name in skip_dir_list:
                print('skip directory files: ' + child_path)
            else:
                sftp_get_recursive_files(child_path, skip_dir_list, sftp, sftp_files)
        elif S_ISREG(mode):
            sftp_files.append(child_path)
    return sftp_files
def main():
    """List the regular files below the remote directory given on the CLI.

    Reads the directory from ``sys.argv[1]``, connects with the module-level
    credentials, walks the tree while skipping directories named
    ``archive``, and prints the directory followed by the number of files.

    Raises:
        SystemExit: If no directory argument was supplied.
    """
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit('usage: ' + sys.argv[0] + ' <remote_dir>')
    ls_dir = sys.argv[1]

    sftp_cred_dict = {
        "server": sftp_url,
        "username": sftp_user,
        "password": sftp_pwd,
        "timeout_min": 60,
    }
    skip_dir_list = ["archive"]

    print(ls_dir)
    sftp = get_sftp_obj(sftp_cred_dict)
    try:
        files = sftp_get_recursive_files(ls_dir, skip_dir_list, sftp, [])
    finally:
        # Close the SFTP session even when the listing fails part-way.
        sftp.close()
    print(len(files))


if __name__ == "__main__":
    main()
执行一段时间后出现以下异常:
(venv-sftp) user@poc-sftp:~/experiments/sftp-v1$ python ls-sftp.py /BU/SYSTEM/outbound/SYSTEM_Txn_Payment
Traceback (most recent call last):
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 852, in _read_response
t, data = self._read_packet()
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp.py", line 201, in _read_packet
x = self._read_all(4)
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp.py", line 188, in _read_all
raise EOFError()
EOFError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "ls-sftp.py", line 62, in <module>
main()
File "ls-sftp.py", line 57, in main
files = sftp_get_recursive_files(ls_dir, skip_dir_list, sftp, [])
File "ls-sftp.py", line 27, in sftp_get_recursive_files
item_list = sftp.listdir_attr(path)
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 246, in listdir_attr
t, msg = self._request(CMD_READDIR, handle)
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 822, in _request
return self._read_response(num)
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 854, in _read_response
raise SSHException("Server connection dropped: {}".format(e))
paramiko.ssh_exception.SSHException: Server connection dropped:
[更新 1]
我尝试用以下代码执行 find 命令,但得到了 paramiko.SSHException: Channel closed。
def sftp_get_all_files(path, sftp_cred_dict):
    """Return the output lines of a remote ``find`` run below *path*.

    Directories named ``archive`` are pruned: they are neither printed nor
    descended into. Every other entry (files and directories, including
    ``.``) is printed relative to *path*.

    Args:
        path: Remote directory to search.
        sftp_cred_dict: Dict with the keys ``server``, ``username`` and
            ``password``.

    Returns:
        The lines read from the command's standard output (as returned by
        ``readlines()``, so each keeps its trailing newline).

    NOTE(review): this needs a server that allows SSH command execution;
    an SFTP-only server may reject the request -- confirm with the admin.
    """
    import shlex  # local import so the block does not depend on file-level imports

    # - shlex.quote: a path with spaces or shell metacharacters cannot break
    #   out of the command.
    # - '&&': do not run find in the login directory when cd fails.
    # - '-prune': the previous '! -path archive' never matched because every
    #   path printed by 'find .' starts with './'.
    command = (
        'cd ' + shlex.quote(path)
        + ' && find . -type d -name archive -prune -o -print'
    )

    ssh = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without
    # verification; prefer known_hosts checking outside of experiments.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        ssh.connect(
            sftp_cred_dict['server'],
            port=22,
            username=sftp_cred_dict['username'],
            password=sftp_cred_dict['password'],
        )
        _stdin, stdout, _stderr = ssh.exec_command(command)
        # Read everything before the connection is closed in ``finally``.
        return stdout.readlines()
    finally:
        ssh.close()
[更新-2]
我尝试使用 rclone
配置 sftp 连接
sftp-v1$rclone ls -vv --dump headers --exclude=/archive/** dpprdsftp:/BU/SYSTEM/outbound/SYSTEM_Txn_Payment
DEBUG : rclone: Version "v1.57.0" starting with parameters ["rclone" "ls" "-vv" "--dump" "headers" "--exclude=/archive/**" "dpprdsftp:/BU/SYSTEM/outbound/SYSTEM_Txn_Payment"]
DEBUG : Creating backend with remote "dpprdsftp:/BU/SYSTEM/outbound/SYSTEM_Txn_Payment"
DEBUG : Using config file from "/home/user/.config/rclone/rclone.conf"
DEBUG : sftp://dpprdsftp@xx.xxx.xx.xx:22//BU/SYSTEM/outbound/SYSTEM_Txn_Payment: New connection xx.xxx.x.x:xxxxx->yy.yyy.y.yy:22 to "SSH-2.0-CrushFTPSSHD"
DEBUG : sftp://dpprdsftp@xx.xxx.xx.xx:22//BU/SYSTEM/outbound/SYSTEM_Txn_Payment: Connection failed, closing: connection lost
ERROR : : error listing: error listing "": connection lost
DEBUG : 2 go routines active
Failed to ls with 2 errors: last error was: error listing "": connection lost
我需要请求 sftp 管理员从源中启用某些功能吗?
您可以使用 find(1) 在 ssh 中执行 find 命令快速获取文件列表:
ssh user@host "cd /some/where/in/the/filesystem ; find ."
您可以在 find 中使用 ! -path skip_this_dir ! -path skip_this_dir_too 来跳过目录。此示例跳过路径中含有“archive”的所有内容:
ssh user@host "cd /some/where/in/the/filesystem ; find . ! -path archive "
你可以用 paramiko 来做:
import paramiko

# "somewhere", "host", "username" and "password" are placeholders.
# '-prune' skips directories named "archive"; a bare '! -path archive' never
# matches because every path printed by 'find .' starts with './'.
# '&&' keeps find from running in the wrong directory when cd fails.
command = "cd somewhere && find . -type d -name archive -prune -o -print"
ssh = paramiko.SSHClient()
# NOTE(review): AutoAddPolicy accepts unknown host keys without verification.
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# Keyword arguments make the parameter order explicit; the port is an integer.
ssh.connect("host", port=22, username="username", password="password")
try:
    _, stdout, _ = ssh.exec_command(command)
    all_files = stdout.readlines()
finally:
    # Always release the connection, even if the command fails.
    ssh.close()
我目前正在把 SFTP 上的文件加载到 GCS 存储桶。对于文件数量有限的 SFTP 目录,我可以先获取文件列表,再按文件的绝对路径逐个处理。但是,如果目录中的文件太多(或文件位于子文件夹中),我就无法通过简单的 ls 获取要从 SFTP 下载的文件列表。以下是从 SFTP 递归获取任意给定目录中文件列表的可用代码:
import sys
from stat import S_ISDIR, S_ISREG
import paramiko
# Connection settings read by main(); the values are placeholders to be
# replaced with the real server address and login.
sftp_url = '<URL>'
sftp_user = '<USER>'
sftp_pwd = '<PWD>'
def get_sftp_obj(sftp_cred_dict):
    """Open an SFTP session to the server described by *sftp_cred_dict*.

    Args:
        sftp_cred_dict: Dict with the keys ``server``, ``username``,
            ``password`` and ``timeout_min`` (channel timeout in minutes).
            An optional ``port`` key overrides the default port 22.

    Returns:
        A ``paramiko.SFTPClient`` whose channel timeout is set to
        ``timeout_min * 60`` seconds.

    Raises:
        KeyError: If a required key is missing from *sftp_cred_dict*.
    """
    server = sftp_cred_dict['server']
    username = sftp_cred_dict['username']
    password = sftp_cred_dict['password']
    timeout_min = sftp_cred_dict['timeout_min']
    port = sftp_cred_dict.get('port', 22)

    # Class attribute: this changes the request size for every SFTPFile in
    # the process, not only for the session created below (2**22 = 4 MiB).
    paramiko.sftp_file.SFTPFile.MAX_REQUEST_SIZE = pow(2, 22)

    transport = paramiko.Transport((server, port))
    try:
        transport.connect(username=username, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        sftp.get_channel().settimeout(timeout_min * 60)
    except Exception:
        # Do not leave the transport open when login or channel setup fails.
        transport.close()
        raise
    return sftp
def sftp_get_recursive_files(path, skip_dir_list, sftp, sftp_files=None):
    """Collect the paths of all regular files below *path*, depth first.

    Args:
        path: Remote directory to list.
        skip_dir_list: Directory names (not full paths) that are not
            descended into; a message is printed for each one skipped.
        sftp: Object providing ``listdir_attr(path)`` whose entries expose
            ``st_mode`` and ``filename``.
        sftp_files: Optional list to append results to. A fresh list is
            created when omitted, so separate calls never share results.

    Returns:
        The list of file paths, each built as ``<parent> + '/' + <name>``.
        Entries that are neither directories nor regular files are ignored.
    """
    # A ``[]`` default would be created once and reused by every call;
    # compare against None so an empty list passed by the caller (or by the
    # recursion below) is still filled in place.
    if sftp_files is None:
        sftp_files = []

    for entry in sftp.listdir_attr(path):
        mode = entry.st_mode
        name = entry.filename
        child_path = path + '/' + name
        if S_ISDIR(mode):
            if name in skip_dir_list:
                print('skip directory files: ' + child_path)
            else:
                sftp_get_recursive_files(child_path, skip_dir_list, sftp, sftp_files)
        elif S_ISREG(mode):
            sftp_files.append(child_path)
    return sftp_files
def main():
    """List the regular files below the remote directory given on the CLI.

    Reads the directory from ``sys.argv[1]``, connects with the module-level
    credentials, walks the tree while skipping directories named
    ``archive``, and prints the directory followed by the number of files.

    Raises:
        SystemExit: If no directory argument was supplied.
    """
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit('usage: ' + sys.argv[0] + ' <remote_dir>')
    ls_dir = sys.argv[1]

    sftp_cred_dict = {
        "server": sftp_url,
        "username": sftp_user,
        "password": sftp_pwd,
        "timeout_min": 60,
    }
    skip_dir_list = ["archive"]

    print(ls_dir)
    sftp = get_sftp_obj(sftp_cred_dict)
    try:
        files = sftp_get_recursive_files(ls_dir, skip_dir_list, sftp, [])
    finally:
        # Close the SFTP session even when the listing fails part-way.
        sftp.close()
    print(len(files))


if __name__ == "__main__":
    main()
执行一段时间后出现以下异常:
(venv-sftp) user@poc-sftp:~/experiments/sftp-v1$ python ls-sftp.py /BU/SYSTEM/outbound/SYSTEM_Txn_Payment
Traceback (most recent call last):
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 852, in _read_response
t, data = self._read_packet()
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp.py", line 201, in _read_packet
x = self._read_all(4)
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp.py", line 188, in _read_all
raise EOFError()
EOFError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "ls-sftp.py", line 62, in <module>
main()
File "ls-sftp.py", line 57, in main
files = sftp_get_recursive_files(ls_dir, skip_dir_list, sftp, [])
File "ls-sftp.py", line 27, in sftp_get_recursive_files
item_list = sftp.listdir_attr(path)
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 246, in listdir_attr
t, msg = self._request(CMD_READDIR, handle)
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 822, in _request
return self._read_response(num)
File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 854, in _read_response
raise SSHException("Server connection dropped: {}".format(e))
paramiko.ssh_exception.SSHException: Server connection dropped:
[更新 1]
我尝试用以下代码执行 find 命令,但得到了 paramiko.SSHException: Channel closed。
def sftp_get_all_files(path, sftp_cred_dict):
    """Return the output lines of a remote ``find`` run below *path*.

    Directories named ``archive`` are pruned: they are neither printed nor
    descended into. Every other entry (files and directories, including
    ``.``) is printed relative to *path*.

    Args:
        path: Remote directory to search.
        sftp_cred_dict: Dict with the keys ``server``, ``username`` and
            ``password``.

    Returns:
        The lines read from the command's standard output (as returned by
        ``readlines()``, so each keeps its trailing newline).

    NOTE(review): this needs a server that allows SSH command execution;
    an SFTP-only server may reject the request -- confirm with the admin.
    """
    import shlex  # local import so the block does not depend on file-level imports

    # - shlex.quote: a path with spaces or shell metacharacters cannot break
    #   out of the command.
    # - '&&': do not run find in the login directory when cd fails.
    # - '-prune': the previous '! -path archive' never matched because every
    #   path printed by 'find .' starts with './'.
    command = (
        'cd ' + shlex.quote(path)
        + ' && find . -type d -name archive -prune -o -print'
    )

    ssh = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without
    # verification; prefer known_hosts checking outside of experiments.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        ssh.connect(
            sftp_cred_dict['server'],
            port=22,
            username=sftp_cred_dict['username'],
            password=sftp_cred_dict['password'],
        )
        _stdin, stdout, _stderr = ssh.exec_command(command)
        # Read everything before the connection is closed in ``finally``.
        return stdout.readlines()
    finally:
        ssh.close()
[更新-2]
我尝试使用 rclone
配置 sftp 连接
sftp-v1$rclone ls -vv --dump headers --exclude=/archive/** dpprdsftp:/BU/SYSTEM/outbound/SYSTEM_Txn_Payment
DEBUG : rclone: Version "v1.57.0" starting with parameters ["rclone" "ls" "-vv" "--dump" "headers" "--exclude=/archive/**" "dpprdsftp:/BU/SYSTEM/outbound/SYSTEM_Txn_Payment"]
DEBUG : Creating backend with remote "dpprdsftp:/BU/SYSTEM/outbound/SYSTEM_Txn_Payment"
DEBUG : Using config file from "/home/user/.config/rclone/rclone.conf"
DEBUG : sftp://dpprdsftp@xx.xxx.xx.xx:22//BU/SYSTEM/outbound/SYSTEM_Txn_Payment: New connection xx.xxx.x.x:xxxxx->yy.yyy.y.yy:22 to "SSH-2.0-CrushFTPSSHD"
DEBUG : sftp://dpprdsftp@xx.xxx.xx.xx:22//BU/SYSTEM/outbound/SYSTEM_Txn_Payment: Connection failed, closing: connection lost
ERROR : : error listing: error listing "": connection lost
DEBUG : 2 go routines active
Failed to ls with 2 errors: last error was: error listing "": connection lost
我需要请求 sftp 管理员从源中启用某些功能吗?
您可以使用 find(1) 在 ssh 中执行 find 命令快速获取文件列表:
ssh user@host "cd /some/where/in/the/filesystem ; find ."
您可以在 find 中使用 ! -path skip_this_dir ! -path skip_this_dir_too 来跳过目录。此示例跳过路径中含有“archive”的所有内容:
ssh user@host "cd /some/where/in/the/filesystem ; find . ! -path archive "
你可以用 paramiko 来做:
import paramiko

# "somewhere", "host", "username" and "password" are placeholders.
# '-prune' skips directories named "archive"; a bare '! -path archive' never
# matches because every path printed by 'find .' starts with './'.
# '&&' keeps find from running in the wrong directory when cd fails.
command = "cd somewhere && find . -type d -name archive -prune -o -print"
ssh = paramiko.SSHClient()
# NOTE(review): AutoAddPolicy accepts unknown host keys without verification.
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# Keyword arguments make the parameter order explicit; the port is an integer.
ssh.connect("host", port=22, username="username", password="password")
try:
    _, stdout, _ = ssh.exec_command(command)
    all_files = stdout.readlines()
finally:
    # Always release the connection, even if the command fails.
    ssh.close()