Reading a text file with multiple data frames separated by headers in R

40 Views Asked by At

I have a large text file that contains multiple data frames, separated by header lines, that I am trying to read into R. The first header line contains the time variable. I want to separate the data frames according to their time variable. The data looks like this:

data = c("** TIME:          41670", "** PROPERTY: Pressure", "** UNITS:    psi", 
"<      X       > <      Y       > <Layer    1>", "      2106604.41       3434119.83      7952.25", 
"      2111884.40       3434119.83      7970.05", "      2037964.57       3439399.82      7658.27", 
"      2043244.56       3439399.82         7754", "      2048524.55       3439399.82      7828.24", 
"      2053804.53       3439399.82      7879.78", "      2059084.52       3439399.82      7914.57", 
"      2064364.50       3439399.82      7944.66", "      2069644.49       3439399.82      7974.44", 
"      2074924.48       3439399.82      7999.03", "      2080204.46       3439399.82      8014.14", 
"      2085484.46       3439399.82      8016.27", "      2090764.46       3439399.82      8005.63", 
"", "", "** TIME:          41670", "** PROPERTY: Pressure", "** UNITS:    psi", 
"<      X       > <      Y       > <Layer    2>", "      2106604.41       3434119.83      8038.52", 
"      2111884.40       3434119.83      8066.89", "      2037964.57       3439399.82      7723.84", 
"      2043244.56       3439399.82      7821.79", "      2048524.55       3439399.82      7899.46", 
"      2053804.53       3439399.82      7955.23", "      2059084.52       3439399.82      7993.75", 
"      2064364.50       3439399.82      8026.08", "      2069644.49       3439399.82      8056.41", 
"      2074924.48       3439399.82      8080.33", "      2080204.46       3439399.82      8094.15", 
"      2085484.46       3439399.82      8095.07", "      2090764.46       3439399.82      8084.03", 
"      2096044.44       3439399.82      8068.33", "      2101324.41       3439399.82      8060.14", 
"      2106604.41       3439399.82      8073.08", "      2111884.40       3439399.82      8107.82", 
"      2117164.38       3439399.82      8145.84", "      2122444.37       3439399.82      8160.57"
)

I am using readLines to read in the text file.

I would ideally like a list with the timestamp attached to a data frame of values like:

[[1]]$date
[1] "2014-01-31"
[[1]]$data
   X          Y          Layer1
1 2106604.41  3434119.83 7952.25
2 2111884.40  3434119.83 7970.05
3 2037964.57  3439399.82 7658.27
4 2043244.56  3439399.82 7754

[[2]]$date
[1] "2014-01-31"
[[2]]$data
   X          Y          Layer2
1 2106604.41 3434119.83 8038.52
2 2111884.40 3434119.83 8066.89
3 2037964.57 3439399.82 7723.84
4 2043244.56 3439399.82 7821.79

This is what I tried:

data <- readLines("tmp.txt")

# Initialize an empty list to store data frames
dfs <- list()

# Initialize variables
current_time <- NULL
current_df <- NULL
property <- NULL

# Loop through each line of the file
for (line in data) {
  if (startsWith(line, "** TIME:")) {
    # Extract the time from the header line and convert to datetime
    current_time <- as.Date(as.numeric(trimws(sub("\\*{2}\\s+TIME:\\s+", "", line))), origin = "1899-12-30", format = "%Y-%m-%d")
    # Create a new data frame for the current time
    current_df <- data.frame()
  } else if (startsWith(line, "** PROPERTY:")) {
    next
  } else if (startsWith(line, "** UNITS:")) {
    next
  } else if (startsWith(line, "<")) {
    # Extract column names from header line 4
   clean_header <- gsub("<|>", "", line)
   clean_header <- trimws(clean_header)
   col_names <- strsplit(clean_header, " ")
   col_names <- unlist(col_names)
   col_names <- col_names[col_names != ""]
   col_names[3] <- paste0(col_names[3], col_names[4])
   col_names <- col_names[-4]
  } else if (!startsWith(line, "**")) {
    # Split the line by whitespace and create a new row in the data frame
    parts <- strsplit(line, "\\s+")[[1]]
    parts <- parts[parts != ""]
    current_df <- rbind(current_df, as.numeric(parts))
  } else {
    # End of current data frame, store it in the list
    colnames(current_df) <- col_names
    dfs[[length(dfs) + 1]] <- list(date = current_time, data = current_df)
    current_df <- NULL
  }
}

The code is producing the current_df, which stores the most recent data frame it is looped over, but the column names are not being added. Additionally, the current_df is not being saved to the dfs list , so it is overwritten by the new current_df as the loop continues.

2

There are 2 best solutions below

0
A. S. K. On

Very close! Only three small changes are needed:

  1. The penultimate condition, !startsWith(line, "**"), matches an empty line. This means that empty lines were being treated as data, and the final condition (which finalizes the dataframe and adds it to the list) was never reached. I changed the condition to nchar(line) > 0 & !startsWith(line, "**").
  2. In the sample data, there are two blank lines between dataframes. This means that the final condition runs twice before we start building a new dataframe. But the final condition sets current_df to NULL, and then the next time the loop runs, the operations on current_df fail. To avoid this problem, I changed the final else to else if(!is.null(current_df)).
  3. In the sample data, the last line of data contains a row of data; this means that the final condition never runs for the last dataframe, and therefore the last dataframe never gets added to the list. I added one more "" line to data to fix this problem. (Alternatively, we could copy the contents of that last condition and run them one more time after the whole loop has run.)

Here's what the code looks like with those three changes:

data = c(data, "")
for (line in data) {
  if (startsWith(line, "** TIME:")) {
    # Extract the time from the header line and convert to datetime
    current_time <- as.Date(as.numeric(trimws(sub("\\*{2}\\s+TIME:\\s+", "", line))), origin = "1899-12-30", format = "%Y-%m-%d")
    # Create a new data frame for the current time
    current_df <- data.frame()
  } else if (startsWith(line, "** PROPERTY:")) {
    next
  } else if (startsWith(line, "** UNITS:")) {
    next
  } else if (startsWith(line, "<")) {
    # Extract column names from header line 4
    clean_header <- gsub("<|>", "", line)
    clean_header <- trimws(clean_header)
    col_names <- strsplit(clean_header, " ")
    col_names <- unlist(col_names)
    col_names <- col_names[col_names != ""]
    col_names[3] <- paste0(col_names[3], col_names[4])
    col_names <- col_names[-4]
  } else if (nchar(line) > 0 & !startsWith(line, "**")) {
    # Split the line by whitespace and create a new row in the data frame
    parts <- strsplit(line, "\\s+")[[1]]
    parts <- parts[parts != ""]
    current_df <- rbind(current_df, as.numeric(parts))
  } else if(!is.null(current_df)) {
    # End of current data frame, store it in the list
    colnames(current_df) <- col_names
    dfs[[length(dfs) + 1]] <- list(date = current_time, data = current_df)
    current_df <- NULL
  }
}
0
Grzegorz Sapijaszko On

My lazy attempt taking advantage of the fact that the file structure is uniform and the number of header lines is always the same:

a <- readLines(con = "~/projekty/test/f.txt") 

index <- which(grepl(pattern = "** TIME", a, fixed = TRUE))
index <- c(index, length(a)+1)

all <- list()

format_my_list <- function(i) {
  l <- list()
  l$date <- a[index[i]] |>
    stringr::str_split(pattern = "(\\s){2,}") |>
    purrr::pluck(1) |>
    purrr::pluck(2) |>
    as.numeric() |>
    as.Date(origin = "1899-12-30", format = "%Y-%m-%d")
  
  col_names <- a[index[i]+3] |>
    stringr::str_replace_all(pattern = "<|>", replacement = "") |>
    stringr::str_replace(pattern = "Layer(\\s)+", replacement = "Layer ") |>
    stringr::str_trim() |>
    strsplit(split = "(\\s\\s)+") |>
    purrr::pluck(1)
  
  data <- a[(index[i]+4):(index[i+1]-1)] |>
    stringr::str_replace_all(pattern = "<|>", replacement = "") |>
    stringr::str_replace(pattern = "Layer(\\s)+", replacement = "") |>
    stringr::str_trim() |>
    strsplit(split = "(\\s\\s)+") |>
    unlist() |>
    as.numeric() |>
    matrix(ncol = 3, byrow = TRUE) |>
    as.data.frame()

  names(data) <- col_names
  l$data <- data
  all <- all |>
    rbind(l)
}

all <- lapply(c(1:(length(index)-1)), function(i) format_my_list(i))

all
#> [[1]]
#>   date       data        
#> l 2014-01-31 data.frame,3
#> 
#> [[2]]
#>   date       data        
#> l 2014-01-31 data.frame,3

all[[1]][1,2]
#> [[1]]
#>          X       Y Layer 1
#> 1  2106604 3434120 7952.25
#> 2  2111884 3434120 7970.05
#> 3  2037965 3439400 7658.27
#> 4  2043245 3439400 7754.00
#> 5  2048525 3439400 7828.24
#> 6  2053805 3439400 7879.78
#> 7  2059085 3439400 7914.57
#> 8  2064364 3439400 7944.66
#> 9  2069644 3439400 7974.44
#> 10 2074924 3439400 7999.03
#> 11 2080204 3439400 8014.14
#> 12 2085484 3439400 8016.27
#> 13 2090764 3439400 8005.63

Created on 2024-03-20 with reprex v2.1.0