I am trying to download a large number of HDF5 files to my computer from URLs provided by NASA's Earthdata Search. A download script (third tab) is provided by NASA, but since the rest of this project is written in R, I have been trying to translate this script into R as well. It appears to work in that it creates files with the expected names, but they are not valid HDF5 files when I then go to open them.
My current code (requires Earthdata log-in credentials to reproduce):
library(curl)
library(httr)
cookiejar <- tempfile(fileext = ".cookies")
netrc <- tempfile(fileext = ".netrc")
prompt_credentials <- function() {
cat("Enter your Earthdata Login or other provider supplied credentials\n")
username <- readline("Username (dylan.titmuss): ")
username <- ifelse(nchar(username) == 0, "dylan.titmuss", username)
password <- readline("Password: ")
cat(paste("machine urs.earthdata.nasa.gov login", username, "password", password), file = netrc)
cat("\n")
}
exit_with_error <- function(msg) {
cat("\nUnable to Retrieve Data\n\n", msg, "\n\nhttps://data.nsidc.earthdatacloud.nasa.gov/nsidc-cumulus-prod-protected/ATLAS/ATL07/006/2023/06/20/ATL07-02_20230620165624_00082001_006_01.h5\n\n")
quit(save = "no", status = 1)
}
prompt_credentials()
detect_app_approval <- function() {
res <- GET(
url = "https://data.nsidc.earthdatacloud.nasa.gov/nsidc-cumulus-prod-protected/ATLAS/ATL07/006/2023/06/20/ATL07-02_20230620165624_00082001_006_01.h5",
config = add_headers(`User-Agent` = "Mozilla/5.0"),
cookies = cookiejar,
verbose()
)
if (res$status_code != 200 && res$status_code != 301 && res$status_code != 302) {
exit_with_error("Please ensure that you have authorized the remote application by visiting the link below ")
}
}
setup_auth_curl <- function() {
status <- GET(
url = "https://data.nsidc.earthdatacloud.nasa.gov/nsidc-cumulus-prod-protected/ATLAS/ATL07/006/2023/06/20/ATL07-02_20230620165624_00082001_006_01.h5",
config = add_headers(`User-Agent` = "Mozilla/5.0"),
cookies = cookiejar,
verbose()
)
if (status$status_code != 200 && status$status_code != 304) {
detect_app_approval()
}
}
fetch_urls <- function() {
urls <- readLines(textConnection(urls_data)) # Assuming `urls_data` contains the list of URLs
for (url in urls) {
filename <- basename(url)
stripped_query_params <- sub("\\?.*", "", filename)
# Create a curl handle
handle <- new_handle(
useragent = "Mozilla/5.0",
verbose = TRUE
)
# Perform the GET request
response <- curl_fetch_memory(url, handle = handle)
if (response$status_code == 200) {
# Write the content to a file
bin_file <- file(stripped_query_params, "wb")
writeBin(response$content, bin_file)
close(bin_file)
cat("\n")
} else {
exit_with_error("Command failed with error. Please retrieve the data manually.")
}
}
}
urls_data <- c("https://data.nsidc.earthdatacloud.nasa.gov/nsidc-cumulus-prod-protected/ATLAS/ATL07/006/2023/06/20/ATL07-02_20230620165624_00082001_006_01.h5")
# other URLs are listed above
fetch_urls()
# Clean up temporary files
unlink(c(cookiejar, netrc))
I am very new to using the curl package, so I'm not particularly confident in the above code. Could the issue be with the writeBin function?