Downloading large data from an SFTP server to Google Cloud Storage using AsyncSSH fails


I'm using Python's AsyncSSH library to download files from an SFTP server and upload them to Google Cloud Storage using Cloud Functions. The code works fine when the number of files and directories is small. However, when I try to download a large amount of data (1GB or more), the code fails to download all the files and directories.

Here is the code I am using:

import os
import asyncio
import asyncssh
import stat
from flask import jsonify, make_response
import functions_framework
from google.cloud import storage

async def download_files(server_url, username, private_key_path, remote_path, bucket_name):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    downloaded_files = []

    async with asyncssh.connect(
        server_url, username=username, client_keys=[private_key_path], known_hosts=None
    ) as conn:
        async with conn.start_sftp_client() as sftp:
            await recursive_download(sftp, remote_path, bucket, downloaded_files)

    return downloaded_files

async def recursive_download(sftp, remote_path, bucket, downloaded_files):
    for filename in await sftp.listdir(remote_path):
        remote_file = remote_path + "/" + filename
        attrs = await sftp.stat(remote_file)
        if stat.S_ISREG(attrs.permissions):  # Check if it's a regular file
            # Open the remote file in binary mode so read() returns bytes
            async with sftp.open(remote_file, "rb") as remote_file_obj:
                # Read the entire file into memory
                file_data = await remote_file_obj.read()

            # Create a blob in GCS
            blob = bucket.blob(remote_file)

            # Upload the file data to GCS
            blob.upload_from_string(file_data)

            print(f"Downloaded {remote_file}")
            downloaded_files.append(remote_file)
        elif stat.S_ISDIR(attrs.permissions):  # Check if it's a directory
            await recursive_download(sftp, remote_file, bucket, downloaded_files)

The download_files function is called from the Cloud Functions HTTP entry point:


@functions_framework.http
def main(request):
    try:
        server_url = "my_server_url"
        username = "my_username"
        private_key_path = "my_private_key_path"
        remote_path = "my_remote_path"
        bucket_name = "my_bucket_name" 

        downloaded_files = asyncio.run(download_files(server_url, username, private_key_path, remote_path, bucket_name))

        return make_response(jsonify({"message": f"Files downloaded successfully. Total files: {len(downloaded_files)}. Files: {downloaded_files}"}), 200)
    except Exception as e:
        return make_response(jsonify({"error": str(e)}), 500)


I'm not sure why the code fails when the amount of data is large. Any insights on improving the code would be greatly appreciated.
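
My suspicion is that reading each file into memory in a single read() call is what breaks down on large files, since the function only has a fixed amount of memory available. Here is a chunked variant of the download step I am considering. This is an untested sketch: stream_file_to_gcs is a hypothetical helper, the 8 MiB chunk size is an arbitrary choice, and it assumes google-cloud-storage's Blob.open("wb") streaming writer.

async def stream_file_to_gcs(sftp, remote_file, bucket, chunk_size=8 * 1024 * 1024):
    # Copy the file in fixed-size chunks instead of one full read(),
    # so memory use stays bounded regardless of file size.
    blob = bucket.blob(remote_file)
    async with sftp.open(remote_file, "rb") as remote_file_obj:
        with blob.open("wb") as gcs_writer:
            while True:
                chunk = await remote_file_obj.read(chunk_size)
                if not chunk:  # an empty result means EOF
                    break
                gcs_writer.write(chunk)

If that is the right direction, recursive_download would call stream_file_to_gcs(sftp, remote_file, bucket) for regular files instead of the read()/upload_from_string pair.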

I am using AsyncSSH rather than Paramiko or pysftp because I had trouble authenticating to the server with those libraries; for some reason AsyncSSH works.
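
One more thing I am unsure about is whether the SSH connection can be dropped mid-transfer when a download runs for a long time. asyncssh.connect() accepts a keepalive_interval option, so I may also try something like the following (the 30-second interval is an arbitrary guess):

async with asyncssh.connect(
    server_url,
    username=username,
    client_keys=[private_key_path],
    known_hosts=None,
    keepalive_interval=30,  # send an SSH keepalive every 30 seconds
) as conn:
    ...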
