SFTP 如何列出大量文件

SFTP How to List Large # of Files

我目前正在把 SFTP 上的文件加载到 GCS 存储桶。当目录中的文件数量有限时,我可以先获取文件列表,再按文件的绝对路径逐个处理。但是,如果目录(或其子文件夹)中的文件太多,我就无法通过简单的 ls 获取需要从 SFTP 下载的文件列表。以下是从 SFTP 递归获取给定目录中文件列表的代码(文件数量不多时可以正常工作):

import sys
from stat import S_ISDIR, S_ISREG
import paramiko


# Placeholder SFTP connection settings (host, user, password); main() copies
# them into the credential dict it passes to get_sftp_obj().
# NOTE(review): these look like values to be filled in before running —
# avoid committing a real password here.
sftp_url = '<URL>'
sftp_user = '<USER>'
sftp_pwd = '<PWD>'


def get_sftp_obj(sftp_cred_dict):
    """Open an SFTP session to the configured server on port 22.

    Args:
        sftp_cred_dict: Mapping with the keys ``server``, ``username``,
            ``password`` and ``timeout_min`` (channel timeout, in minutes).

    Returns:
        The client returned by ``paramiko.SFTPClient.from_transport``, with
        its channel timeout set to ``timeout_min * 60`` seconds.

    Raises:
        KeyError: If one of the required keys is missing from the mapping.
        Exception: Any error raised while authenticating or opening the SFTP
            session is re-raised after the transport has been closed.
    """
    server = sftp_cred_dict['server']
    username = sftp_cred_dict['username']
    password = sftp_cred_dict['password']
    timeout_min = sftp_cred_dict['timeout_min']

    # Process-wide side effect: overrides the class attribute with
    # 2**22 bytes (4 MiB) for every SFTPFile, not just this session.
    paramiko.sftp_file.SFTPFile.MAX_REQUEST_SIZE = pow(2, 22)

    transport = paramiko.Transport((server, 22))
    try:
        transport.connect(username=username, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        sftp.get_channel().settimeout(timeout_min * 60)
    except Exception:
        # Do not leave a half-open transport behind when authentication or
        # session setup fails; the caller never receives a handle to close.
        transport.close()
        raise
    return sftp


 def sftp_get_recursive_files(path, skip_dir_list, sftp, sftp_files=None):
     """Recursively collect the paths of regular files below *path*.

     Args:
         path: Remote directory to list; child paths are built as
             ``path + '/' + name``.
         skip_dir_list: Directory names (not full paths) that are not
             descended into, at any depth.
         sftp: Object providing ``listdir_attr(path)`` whose entries expose
             ``filename`` and ``st_mode`` (e.g. a paramiko SFTP client).
         sftp_files: Optional list to append results to. When omitted, a
             fresh list is created for this call.

     Returns:
         The list of collected file paths (the same object as *sftp_files*
         when one was supplied).
     """
     # A ``[]`` default would be created once and shared by every call that
     # omits the argument, so results from earlier calls would leak into
     # later ones.
     if sftp_files is None:
         sftp_files = []

     for entry in sftp.listdir_attr(path):
         mode = entry.st_mode
         name = entry.filename
         child_path = path + '/' + name
         if S_ISDIR(mode):
             if name in skip_dir_list:
                 print('skip directory files: ' + child_path)
             else:
                 # Pass the accumulator down so all levels share one list.
                 sftp_get_recursive_files(child_path, skip_dir_list, sftp, sftp_files)
         elif S_ISREG(mode):
             sftp_files.append(child_path)
         # Entries that are neither directories nor regular files are ignored.
     return sftp_files


 def main():
     """List the files under the SFTP directory given on the command line.

     Reads the directory from ``sys.argv[1]``, opens an SFTP session with the
     module-level connection settings, prints the directory and then the
     number of regular files found below it (directories named ``archive``
     are skipped). The session and its transport are closed on exit, even if
     the listing fails.
     """
     # Fail with a readable message instead of an IndexError traceback.
     if len(sys.argv) < 2:
         sys.exit('usage: pass the SFTP directory to list as the first argument')

     sftp_cred_dict = {
         "server": sftp_url,
         "username": sftp_user,
         "password": sftp_pwd,
         "timeout_min": 60
     }
     skip_dir_list = ["archive"]
     ls_dir = sys.argv[1]
     print(ls_dir)

     sftp = get_sftp_obj(sftp_cred_dict)
     # Grab the transport up front so it can be shut down after the session.
     transport = sftp.get_channel().get_transport()
     try:
         files = sftp_get_recursive_files(ls_dir, skip_dir_list, sftp, [])
         print(len(files))
     finally:
         sftp.close()
         transport.close()


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
     main()

执行一段时间后出现以下异常:

(venv-sftp) user@poc-sftp:~/experiments/sftp-v1$ python ls-sftp.py /BU/SYSTEM/outbound/SYSTEM_Txn_Payment

  Traceback (most recent call last):
  File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 852, in _read_response
    t, data = self._read_packet()
  File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp.py", line 201, in _read_packet
    x = self._read_all(4)
  File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp.py", line 188, in _read_all
    raise EOFError()
EOFError


During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "ls-sftp.py", line 62, in <module>
    main()
  File "ls-sftp.py", line 57, in main
    files = sftp_get_recursive_files(ls_dir, skip_dir_list, sftp, [])
  File "ls-sftp.py", line 27, in sftp_get_recursive_files
    item_list = sftp.listdir_attr(path)
  File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 246, in listdir_attr
    t, msg = self._request(CMD_READDIR, handle)
  File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 822, in _request
    return self._read_response(num)
  File "/home/user/experiments/sftp-v1/venv-sftp/lib/python3.7/site-packages/paramiko/sftp_client.py", line 854, in _read_response
    raise SSHException("Server connection dropped: {}".format(e))
paramiko.ssh_exception.SSHException: Server connection dropped:

[更新 1]

我尝试用以下代码通过 find 命令获取文件列表,但得到了 paramiko.SSHException: Channel closed. 异常:

def sftp_get_all_files(path, sftp_cred_dict):
    """List the entries under *path* on the remote host by running ``find`` over SSH.

    Directories named ``archive`` are pruned, so neither they nor their
    contents are listed.

    Args:
        path: Remote directory to search; it is shell-quoted before use.
        sftp_cred_dict: Mapping with the keys ``server``, ``username`` and
            ``password``. The connection uses port 22.

    Returns:
        The lines read from the remote command's stdout (as returned by
        ``readlines()``), each a path relative to *path*.

    Raises:
        KeyError: If a required key is missing from *sftp_cred_dict*.
    """
    import shlex  # local import: only this function needs it

    # - shlex.quote keeps spaces or shell metacharacters in *path* from being
    #   interpreted by the remote shell.
    # - '&&' prevents find from running in the wrong directory when cd fails.
    # - '! -path archive' never matched anything because find emits paths
    #   such as './archive'; '-prune' actually skips those directories.
    command = (
        "cd " + shlex.quote(path)
        + " && find . -type d -name archive -prune -o -print"
    )

    ssh = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without
    # verification; confirm this is acceptable for the target environment.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        ssh.connect(
            sftp_cred_dict['server'],
            username=sftp_cred_dict['username'],
            password=sftp_cred_dict['password'],
            port=22,
        )
        _stdin, stdout, _stderr = ssh.exec_command(command)
        # Read everything before the connection is closed in ``finally``.
        return stdout.readlines()
    finally:
        ssh.close()

[更新-2] 我尝试使用 rclone

配置 sftp 连接
sftp-v1$rclone ls -vv --dump headers --exclude=/archive/** dpprdsftp:/BU/SYSTEM/outbound/SYSTEM_Txn_Payment
DEBUG : rclone: Version "v1.57.0" starting with parameters ["rclone" "ls" "-vv" "--dump" "headers" "--exclude=/archive/**" "dpprdsftp:/BU/SYSTEM/outbound/SYSTEM_Txn_Payment"]
DEBUG : Creating backend with remote "dpprdsftp:/BU/SYSTEM/outbound/SYSTEM_Txn_Payment"
DEBUG : Using config file from "/home/user/.config/rclone/rclone.conf"
DEBUG : sftp://dpprdsftp@xx.xxx.xx.xx:22//BU/SYSTEM/outbound/SYSTEM_Txn_Payment: New connection xx.xxx.x.x:xxxxx->yy.yyy.y.yy:22 to "SSH-2.0-CrushFTPSSHD"
DEBUG : sftp://dpprdsftp@xx.xxx.xx.xx:22//BU/SYSTEM/outbound/SYSTEM_Txn_Payment: Connection failed, closing: connection lost
ERROR : : error listing: error listing "": connection lost
DEBUG : 2 go routines active
Failed to ls with 2 errors: last error was: error listing "": connection lost

我需要请求 sftp 管理员从源中启用某些功能吗?

您可以通过 ssh 在远程主机上执行 find(1) 命令,快速获取文件列表:

ssh user@host "cd /some/where/in/the/filesystem ; find ."

您可以在 find 命令中使用 ! -path skip_this_dir ! -path skip_this_dir_too 来跳过某些目录。下面的示例会跳过路径中带有“archive”的所有内容:

ssh user@host "cd /some/where/in/the/filesystem ; find . ! -path archive "

你可以用 paramiko 来做:

# Example: run `find` on the remote host over SSH and read back its output.
import paramiko

# NOTE(review): find prints paths relative to '.', e.g. './archive', so the
# pattern 'archive' given to -path may not exclude anything — confirm against
# find(1); '-name archive -prune' is the usual way to skip a directory.
command = "cd somewhere; find . ! -path archive"
ssh = paramiko.SSHClient()
# Accept host keys that are not already known instead of rejecting them.
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# The four arguments are placeholders to be replaced with real values.
# NOTE(review): presumably the port must be an integer in real use — confirm
# against the paramiko SSHClient.connect documentation.
ssh.connect("host", "port", "username", "password")
# Only stdout is kept; the stdin and stderr handles are discarded.
_, stdout, _= ssh.exec_command(command)

# One list element per line of output from the remote command.
all_files= stdout.readlines()