Merge multiple dataframes while retaining their names as column names?

73 Views Asked by At

I'm trying to import multiple CSV files in the RStudio while keeping their filenames.

library(readr)
library(dplyr)
library(purrr)

#importing all csv files at once
csv_files = list.files(pattern ="*Con.csv")
myfiles = lapply(csv_files , read.delim, header = TRUE, sep = "," )

#merging all files by identifiers
Samp_merg <- myfiles %>% reduce(full_join, by=c("chr", "start","end"))

After doing this I could import the files but the names of the files were missing from the list myfiles.

myfiles <- dir(pattern = "*Con.csv", full.names = FALSE) 
myfiles_data <- lapply(myfiles, data.table::fread) 

# assign names to list items
names(myfiles_data) <- myfiles 

#merging the files
dat_merg <- myfiles_data %>% reduce(full_join, by=c("chr", "start", "end"))

Here, using this script I can import the files by keeping their names in the myfiles_data object. However, after joining by three identifiers I'm unable to retain their file names as column names. I want to keep the colname of the merged df as the individual file name without extension (.csv).

There are around 90 CSV files present in the directory with the same header.

$ls
01AvPMPpCon.csv
02AvPMPpCon.csv
03AvPMPpCon.csv
04AvPMPpCon.csv
05AvPMPpCon.csv

$head 01AvPMPpCon.csv 
chr,start,end,CpG
chr1,2017424,2017750,10
chr1,24901325,24901700,11
chr1,24902268,24902701,25
chr1,24927215,24927416,4
chr1,26861926,26862173,5
chr1,26864186,26864613,15
chr1,35576334,35576451,3
chr1,36304606,36304817,7

At now, the merged file looks like this,

$head(dat_merg)
    chr    start      end CpG.x CpG.y CpG.x.x CpG.y.y CpG.x.x.x CpG.y.y.y
1: chr1  3903250  3903277     4    NA      NA      NA         4        NA
2: chr1  4657240  4657314     3    NA      NA      NA        NA        NA
3: chr1 24900249 24900468     5    NA       5      NA        NA        NA
4: chr1 46484938 46485047     4    NA       4      NA        NA        NA
5: chr1 47223634 47223758     4    NA      NA      NA         4         4
6: chr1 66752822 66753167    12    12      NA      NA        12        NA

So, my expected output should look like,

 $head(dat_merg)
        chr    start      end   01Av  02Av    03Av    04Av      05Av      06Av
    1: chr1  3903250  3903277     4    NA      NA      NA         4        NA
    2: chr1  4657240  4657314     3    NA      NA      NA        NA        NA
    3: chr1 24900249 24900468     5    NA       5      NA        NA        NA
    4: chr1 46484938 46485047     4    NA       4      NA        NA        NA
    5: chr1 47223634 47223758     4    NA      NA      NA         4         4
    6: chr1 66752822 66753167    12    12      NA      NA        12        NA
2

There are 2 best solutions below

0
margusl On BEST ANSWER

What about pivot_wider() instead of reduce(full_join, ...)?

Prepare reprex, 99 4-row csv files:

library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(readr)
library(purrr)

csv_ <- read_csv("chr,start,end,CpG
chr1,2017424,2017750,10
chr1,24901325,24901700,11
chr1,24902268,24902701,25
chr1,24927215,24927416,4
chr1,26861926,26862173,5
chr1,26864186,26864613,15
chr1,35576334,35576451,3
chr1,36304606,36304817,7", show_col_types = FALSE)

sprintf("%.2dAvPMPpCon.csv", 1:99) |>
  walk(\(f_) slice_sample(csv_, n = 4) |> write_csv(f_))

read_csv() can read from a list of files and stores file names in id column, c(chr, start, end) will be used for pivot_wider() id_cols :

list.files(pattern ="*Con.csv") |>
  read_csv(id = "src") |>
  mutate(src = substr(src, 1, 4)) |>
  pivot_wider(names_from = src, values_from = CpG)
#> Rows: 396 Columns: 5
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (1): chr
#> dbl (3): start, end, CpG
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Result:

#> # A tibble: 8 × 102
#>   chr      start     end `01Av` `02Av` `03Av` `04Av` `05Av` `06Av` `07Av` `08Av`
#>   <chr>    <dbl>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#> 1 chr1  26864186  2.69e7     15     NA     15     15     NA     NA     NA     NA
#> 2 chr1  26861926  2.69e7      5      5     NA      5      5     NA     NA     NA
#> 3 chr1  24902268  2.49e7     25     25     25     25     25     25     NA     NA
#> 4 chr1  24927215  2.49e7      4      4     NA     NA      4     NA      4      4
#> 5 chr1  35576334  3.56e7     NA      3      3     NA     NA      3      3     NA
#> 6 chr1   2017424  2.02e6     NA     NA     10     10     NA     NA     10     10
#> 7 chr1  24901325  2.49e7     NA     NA     NA     NA     11     11     11     11
#> 8 chr1  36304606  3.63e7     NA     NA     NA     NA     NA      7     NA      7
#> # ℹ 91 more variables: `09Av` <dbl>, `10Av` <dbl>, `11Av` <dbl>, `12Av` <dbl>,
#> #   `13Av` <dbl>, `14Av` <dbl>, `15Av` <dbl>, `16Av` <dbl>, `17Av` <dbl>,
#> #   `18Av` <dbl>, `19Av` <dbl>, `20Av` <dbl>, `21Av` <dbl>, `22Av` <dbl>,
#> #   `23Av` <dbl>, `24Av` <dbl>, `25Av` <dbl>, `26Av` <dbl>, `27Av` <dbl>,
#> #   `28Av` <dbl>, `29Av` <dbl>, `30Av` <dbl>, `31Av` <dbl>, `32Av` <dbl>,
#> #   `33Av` <dbl>, `34Av` <dbl>, `35Av` <dbl>, `36Av` <dbl>, `37Av` <dbl>,
#> #   `38Av` <dbl>, `39Av` <dbl>, `40Av` <dbl>, `41Av` <dbl>, `42Av` <dbl>, …

Created on 2024-01-18 with reprex v2.0.2

0
M-- On
## libraries

library(dplyr)
library(readr)
library(purrr)
## sample data (credit to margusl)

csv_ <- read_csv("
chr , start,    end,      CpG
chr1, 2017424,  2017750,  10
chr1, 24901325, 24901700, 11
chr1, 24902268, 24902701, 25
chr1, 24927215, 24927416, 4
chr1, 26861926, 26862173, 5
chr1, 26864186, 26864613, 15
chr1, 35576334, 35576451, 3
chr1, 36304606, 36304817, 7", 
show_col_types = FALSE)

set.seed(123)
sprintf("%.2dAvPMPpCon.csv", 1:3) |>
  walk(\(f_) slice_sample(csv_, n = 4) |> write_csv(f_))
## import and pre-process

# importing all csv files at once
myfiles = lapply(list.files(pattern ="*Con.csv"), 
                 read.delim, 
                 header = TRUE, sep = "," )

# set the names (removing "PMPpCon.csv" and only keeping "##Av" part)
names(myfiles) <- gsub("PMPpCon.csv", "", csv_files)
## solution

# renaming the columns and appending the dataframe name prior to joining
myfiles %>%  
  imap(function(x, y) x %>% 
         rename_with(~paste(y, ., sep = "_"), 
                     -c("chr", "start", "end"))) %>%  
  reduce(full_join, by = c("chr", "start", "end"))
## results

#>    chr    start      end 01Av_CpG 02Av_CpG 03Av_CpG
#> 1 chr1 35576334 35576451        3       NA       NA
#> 2 chr1 36304606 36304817        7        7       NA
#> 3 chr1 24902268 24902701       25       25       NA
#> 4 chr1 26864186 26864613       15       15       15
#> 5 chr1 24901325 24901700       NA       11       NA
#> 6 chr1 26861926 26862173       NA       NA        5
#> 7 chr1 24927215 24927416       NA       NA        4
#> 8 chr1  2017424  2017750       NA       NA       10

Created on 2024-01-18 with reprex v2.0.2