How can I improve the speed of data downloading from an ftp server?

659 Views Asked by At

I am writing a Python script that downloads data from the National Digital Forecast Database (NDFD) server. The files on the FTP server are organized as Year/YearMonth/YearMonthDay, and I have to download one file from each day's folder, go up a folder, download the file for the next day, and so on. My current code is very slow: it takes around 20 seconds to download one day's data, which translates into about 2 hours for one year. I would like it to be much faster. Please find my code below.

from ftplib import FTP  
import ftplib
import os
import datetime as dt
import pandas as pd 
import time

def ndfd_download(keyword, days_, forecast_hour):
    """Download NDFD files for the given days from the NOAA FTP server.

    For every day in ``days_`` the function changes into
    ``/NDFD/<YYYYMM>/<YYYYMMDD>/`` on the server, lists the files whose
    names start with ``keyword`` and downloads those whose last four
    characters, read as an HHMM time, lie within 30 minutes of
    ``forecast_hour``.  Files are stored under
    ``<YYYY>/<YYYYMM>/<YYYYMMDD>/`` relative to the current working
    directory; the working directory itself is never changed.

    Args:
        keyword: Prefix a remote file name must start with.
        days_: Iterable of objects exposing ``year``, ``month`` and ``day``
            attributes (e.g. ``datetime.datetime``).
        forecast_hour: Forecast hour as a number or numeric string, e.g.
            ``'14'``.  The window arithmetic assumes a whole hour.

    Returns:
        None.  If the connection or login fails, the error is printed and
        the function returns without downloading anything.

    Raises:
        ftplib.all_errors: FTP errors other than a permanent error while
            entering or listing a day's directory propagate to the caller
            after the connection has been closed.
    """
    # search for the files between 30 minutes on either side of the
    # forecast hour; the extra -40 turns "hour*100 - 30" back into HHMM
    # form (e.g. 1400 - 30 - 40 = 1330)
    time_start = int(float(forecast_hour) * 100 - 30 - 40)
    time_end = int(float(forecast_hour) * 100 + 30)

    print('Starting connection to NOAA database')

    # Try connecting to the NCDC server; without a connection there is
    # nothing useful left to do, so report the problem and stop
    ftp = None
    try:
        ftp = FTP('nomads.ncdc.noaa.gov')
        ftp.login()
        print('Connect successful')
    except ftplib.all_errors as e:
        print('Error', e)
        if ftp is not None:
            ftp.close()
        return

    try:
        ftp.cwd('/NDFD/')
        print('Current working directory is %s' % ftp.pwd())

        # go through all the days
        for day_ in days_:
            start = time.time()

            # directory name components derived from day_
            year = '{:04d}'.format(day_.year)
            year_month = '{}{:02d}'.format(year, day_.month)
            year_month_day = '{}{:02d}'.format(year_month, day_.day)

            try:
                # change to the day's NDFD directory and keep only the
                # files that carry the desired keyword
                ftp.cwd('/NDFD/{}/{}/'.format(year_month, year_month_day))
                remote_names = [name for name in ftp.nlst()
                                if name.startswith(keyword)]
            except ftplib.error_perm as e:
                # e.g. the day's directory does not exist; skip this day
                print('Error', e)
            else:
                # local directory that mirrors the remote layout
                local_dir = os.path.join(year, year_month, year_month_day)
                os.makedirs(local_dir, exist_ok=True)

                print('Downloading data for {}'.format(year_month_day))
                _download_matching_files(ftp, remote_names, local_dir,
                                         time_start, time_end)

            print(time.time() - start)
    finally:
        # release the connection on success and on error alike
        ftp.close()


def _download_matching_files(ftp, names, local_dir, time_start, time_end):
    """Fetch each file in *names* whose HHMM suffix lies within the window."""
    for name in names:
        # the last 4 characters of the file name contain the forecast time
        try:
            file_time = float(name[-4:])
        except ValueError:
            # no numeric time suffix, so the file cannot match the window
            continue

        if not time_start <= file_time <= time_end:
            continue

        # basename() keeps a server-supplied name from escaping local_dir
        local_path = os.path.join(local_dir, os.path.basename(name))
        failed = False
        with open(local_path, 'wb') as local_file:
            try:
                # save the file under the same name
                ftp.retrbinary('RETR %s' % name, local_file.write)
            except ftplib.all_errors as e:
                print('Error', e)
                failed = True

        if failed:
            # do not leave a truncated file that looks like valid data
            os.remove(local_path)

if __name__ == "__main__":
    # File-name prefix of the files to fetch and the forecast hour whose
    # +/- 30 minute window is downloaded.
    keyword = "YAUZ98"
    forecast_hour = '14'

    # NOTE: only the first 100 days are requested; a full year would be
    # 366 days in a leap year and 365 otherwise.
    no_of_days = 100

    years = [2018]
    for year in years:
        # consecutive days starting on January 30th of the given year
        first_day = dt.datetime(year, 1, 30)
        days_ = [first_day + dt.timedelta(days=offset)
                 for offset in range(no_of_days)]

        ndfd_download(keyword, days_, forecast_hour)
0

There are 0 best solutions below