I am writing a Python script that downloads data from the National Digital Forecast Database (NDFD) server. The FTP server's files are organized as Year/YearMonth/YearMonthDay, and I have to download one file from each day's folder, go up one level, download the next day's file, and so on. My current code is very slow: it takes around 20 seconds to download one day's data, which translates into roughly 2 hours for one year. I would like it to be much faster. Please find my code below.
from ftplib import FTP
import ftplib
import os
import datetime as dt
import pandas as pd
import time
def ndfd_download(keyword, days_, forecast_hour):
    """Download NDFD files from the NOAA NOMADS FTP server.

    For each date in *days_*, change into /NDFD/<YYYYMM>/<YYYYMMDD>/ on the
    server, list the files whose names start with *keyword*, and download the
    ones whose trailing 4-digit HHMM timestamp falls within +/-30 minutes of
    *forecast_hour*.  Files are saved locally under <YYYY>/<YYYYMM>/<YYYYMMDD>/.

    Parameters
    ----------
    keyword : str
        Filename prefix to filter on (e.g. "YAUZ98").
    days_ : iterable of datetime.datetime
        Days to download.
    forecast_hour : str
        Hour of the forecast, e.g. '14'.
    """
    # Search for files within 30 minutes on either side of the forecast
    # hour; the extra -40 converts the 100-based HHMM arithmetic so the
    # lower bound lands in the previous hour's minutes (100 -> 60 min).
    time_start = int(float(forecast_hour) * 100 - 30 - 40)
    time_end = int(float(forecast_hour) * 100 + 30)

    print('Starting connection to NOAA database')
    try:
        ftp = FTP('nomads.ncdc.noaa.gov')
        ftp.login()
        print('Connect successful')
    except ftplib.all_errors as e:
        # Without a connection there is nothing more to do.  (The original
        # code fell through here and crashed on an undefined `ftp`.)
        print(str(e).split(None, 1)[0])
        return

    try:
        ftp.cwd('/NDFD/')
        print('Current working directory is %s' % ftp.pwd())

        for day_ in days_:
            start = time.time()
            # Build the YYYY / YYYYMM / YYYYMMDD path components.
            year = day_.strftime('%Y')
            year_month = day_.strftime('%Y%m')
            year_month_day = day_.strftime('%Y%m%d')
            try:
                # Change to the desired NDFD directory for this day.
                ftp.cwd('/NDFD/{}/{}/'.format(year_month, year_month_day))
                # Keep only the files matching the requested keyword.
                wanted = [name for name in ftp.nlst() if name.startswith(keyword)]

                # Mirror the server layout locally; build absolute paths
                # instead of os.chdir() so an exception mid-loop cannot
                # leave the process in the wrong working directory.
                directory_name = os.path.join(year, year_month, year_month_day)
                os.makedirs(directory_name, exist_ok=True)
                print('Downloading data for {}'.format(year_month_day))

                for fname in wanted:
                    # The last 4 characters of the name hold the forecast
                    # time as HHMM.
                    file_time = float(fname[-4:])
                    if time_start <= file_time <= time_end:
                        local_path = os.path.join(directory_name, fname)
                        try:
                            # Context manager guarantees the file is closed
                            # even if the transfer raises.  A 1 MiB block
                            # size cuts round-trips versus the 8 KiB
                            # default and noticeably speeds large files.
                            with open(local_path, 'wb') as fh:
                                ftp.retrbinary('RETR %s' % fname, fh.write,
                                               blocksize=1024 * 1024)
                        except ftplib.all_errors as e:
                            print('Error', str(e).split(None, 1)[0])
                            # Remove the partial file so a retry starts clean.
                            if os.path.exists(local_path):
                                os.remove(local_path)
            except ftplib.error_perm as e:
                # Directory missing on the server (e.g. no data for that
                # day) -- report and continue with the next day.
                print('Error', e)
            print(time.time() - start)
    finally:
        # Always release the connection, even on unexpected errors.
        ftp.close()
if __name__ == "__main__":
keyword = "YAUZ98"
years = [2018]
for year in years:
month = 1
day = 30
days_ = []
# no_of_days = 366 if calendar.isleap(year) else 365
no_of_days = 100
t = dt.datetime(year,month,day)
for i in range(no_of_days):
days_.append((t))
t = t + dt.timedelta(days = 1)
forecast_hour = '14'
ndfd_download(keyword, days_, forecast_hour)