Saving CSV files as tab delimited files while keeping the row names

141 Views Asked by At

I have a folder that is full of CSV files. Each file has genes as row names and samples as columns. I need them as TSV, so I saved each one of them as TSV and into a new path. This is the code:

folder_path <- "/original/folder/path"
csv_files <- list.files(path=folder_path, pattern="^tpm.*\\.csv$", full.names=TRUE)

new_TSV_files_path<- "/new/folder/path/"

lapply(csv_files , function(file) {
  data <- read.csv(file, stringsAsFactors=FALSE, fill=TRUE)
  
  filename <- basename(file)
  filename <- sub(".csv$", "", filename)
  
  new_file_path <- file.path(new_TSV_files_path , paste0(filename, ".txt"))
  
  write.table(data, 
              file=new_file_path, 
              sep="\t", 
              row.names=FALSE, 
              quote=FALSE)
})

The problem is after I save them as TSV, and I view them, the first column is the genes, named "X" and the row names is just numbers. I tired in many ways to keep the genes as the row names but it is not working. This is a code that I tried for example:

new_TSV_files_path <- "/tsv/files/path/"
file_list <- list.files(path=TSV_files_path , pattern="^tpm.*\\.tsv$", full.names=TRUE)

for (file_path in file_list) {
  dt <- fread(file_path, data.table=FALSE)
  rownames(dt) <- dt[, 1]
  
  dt <- dt[, -1]
  
  # Save the data back to the same tsv file
  fwrite(dt, file_path, sep="\t", row.names=TRUE)
}

I need the genes as row names and I also need to delete the X column (it is the first column) or else the web tool I'm using won't work. What can be done? Why is it that after the transition from CSV to TSV the row names become as a separate column ?

EDIT - a few line of one of the CSVs

c("\"\",\"Riaz_Nivolumab_2017-p001-ar-8813\",\"Riaz_Nivolumab_2017-p002-ar-8815\",\"Riaz_Nivolumab_2017-p017-ar-8890\",\"Riaz_Nivolumab_2017-p026-ar-8920\",\"Riaz_Nivolumab_2017-p034-ar-8929\",\"Riaz_Nivolumab_2017-p036-ar-8898\",\"Riaz_Nivolumab_2017-p037-ar-8900\",\"Riaz_Nivolumab_2017-p038-ar-8895\",\"Riaz_Nivolumab_2017-p039-ar-8819\",\"Riaz_Nivolumab_2017-p046-ar-8904\",\"Riaz_Nivolumab_2017-p047-ar-8836\",\"Riaz_Nivolumab_2017-p048-ar-8897\",\"Riaz_Nivolumab_2017-p049-ar-8824\",\"Riaz_Nivolumab_2017-p052-ar-8839\",\"Riaz_Nivolumab_2017-p065-ar-8857\",\"Riaz_Nivolumab_2017-p067-ar-8840\",\"Riaz_Nivolumab_2017-p078-ar-8864\",\"Riaz_Nivolumab_2017-p079-ar-8849\",\"Riaz_Nivolumab_2017-p082-ar-8822\",\"Riaz_Nivolumab_2017-p085-ar-8829\",\"Riaz_Nivolumab_2017-p090-ar-8866\",\"Riaz_Nivolumab_2017-p092-ar-8867\",\"Riaz_Nivolumab_2017-p098-ar-8853\",\"Riaz_Nivolumab_2017-p101-ar-8834\",\"Riaz_Nivolumab_2017-p103-ar-8872\",\"Riaz_Nivolumab_2017-p106-ar-8926\"", 
"\"A1BG\",1.0008835087938,-1.91493588495907,-0.620734484990514,-1.75775209550157,-1.23665217147917,-2.49624508855533,0.727023994594072,0.328722516597936,-0.108114351387649,-0.0836309292648272,-0.325627003998877,4.41269552131938,-0.229120400945704,3.7248031834164,-0.504990077186003,-2.28542936373545,-1.1031748886154,-0.0826211449622227,1.44674204021314,1.3682306825028,0.544742458853122,-2.21733388578132,0.231763591509675,0.0236879027615116,1.79591432017568,-0.638847949374409", 
"\"A1CF\",-0.543733010666938,-0.537513815032929,-0.481855746353474,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,5.49732418197924,-0.543733010666938,4.6852457132221,-0.543733010666938,0.117380356180772,-0.543733010666938,-0.543733010666938,1.53929625176185,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.48894973908573,-0.543733010666938,-0.543733010666938,-0.543733010666938", 
"\"A2M\",0.0979791979801679,-0.446028724661623,-0.465953763246008,-2.08487244739613,2.01411421717692,1.19837532976838,-0.745540508622708,0.356022989838996,-2.37641175531056,0.517622991895147,2.67515559703126,1.74369521852923,1.19154956592325,2.00377301045288,-0.130083699768657,-0.354493861811036,-3.736442059777,1.59429773965362,1.2081229778692,-2.95554013952561,-1.60581260283625,1.03219514758484,0.925657747621587,-2.02343920511453,-0.246282066712528,0.612339103457181", 
"\"A2ML1\",-0.363502635555151,-0.363502635555151,-0.298573188895895,-0.363502635555151,-0.363502635555151,-0.313253413224058,3.64328505792818,-0.363502635555151,-0.363502635555151,-0.363502635555151,0.0272101525164028,-0.363502635555151,-0.107062171894006,-0.363502635555151,-0.363502635555151,-0.363502635555151,-0.363502635555151,-0.212720613261966,-0.363502635555151,-0.363502635555151,3.46425626517263,-0.363502635555151,-0.197854673973831,-0.363502635555151,0.174257390070118,-0.363502635555151"
)
2

There are 2 best solutions below

9
jay.sf On

Essentially you may want to set row.names=1 in read.csv to indicate which column contains the row names.

csv_files <- list.files(folder_path, full.names=TRUE)

lapply(csv_files, \(x) {
  read.csv(x, row.names=1) |>
    write.table(file=sprintf('%s/%s', new_tsv, sub('\\.csv$', '.txt', basename(x))), 
                sep='\t', quote=FALSE)
}) |> invisible()

The invisible avoids to clutter the console with NULLs.

Test:

read.table(sprintf('%s/%s', new_tsv, 'tpm_01.txt'))
#       sample1 sample2 sample3
# gene1      14       9      14
# gene2       8      14      17
# gene3      11       9       5
# gene4      12      16      14
# gene5      11       9      13

Data:

tdir <- tempdir()
folder_path <- paste0(tdir, '/old')
dir.create(folder_path)

set.seed(42)
for (i in 1:3) {
  write.csv(matrix(rpois(15, 10), 5, 3, dimnames=list(paste0('gene', 1:5), paste0('sample', 1:3))),
            sprintf('%s/tpm_%02d.csv', folder_path, i))
}

new_tsv <- paste0(tdir, '/new')
dir.create(new_tsv)
0
margusl On

Check provided sample structure and prepare reprex:

# sample file content, truncated lines, rownames are in first column named "":
stringr::str_view(stringr::str_trunc(sample_lines, 80))
#> [1] │ "","Riaz_Nivolumab_2017-p001-ar-8813","Riaz_Nivolumab_2017-p002-ar-8815","Ria...
#> [2] │ "A1BG",1.0008835087938,-1.91493588495907,-0.620734484990514,-1.75775209550157...
#> [3] │ "A1CF",-0.543733010666938,-0.537513815032929,-0.481855746353474,-0.5437330106...
#> [4] │ "A2M",0.0979791979801679,-0.446028724661623,-0.465953763246008,-2.08487244739...
#> [5] │ "A2ML1",-0.363502635555151,-0.363502635555151,-0.298573188895895,-0.363502635...
writeLines(sample_lines, "tpm_sample.csv")

Modify original code to import row names (read.csv(..., row.names = 1)) and NOT to drop those on writing (remove row.names=FALSE from write.table() ):

folder_path <- "./"
csv_files <- list.files(path=folder_path, pattern="^tpm.*\\.csv$", full.names=TRUE)

new_TSV_files_path<- "./"
lapply(csv_files , function(file) {
  # enable reading rownames from 1st column:
  data <- read.csv(file, row.names = 1, stringsAsFactors=FALSE, fill=TRUE)
  
  filename <- basename(file)
  filename <- sub(".csv$", ".txt", filename)
  
  new_file_path <- file.path(new_TSV_files_path , filename)
  
  # do NOT diable rownmes with row.names=FALSE
  write.table(data, 
              file=new_file_path, 
              sep="\t",
              quote=FALSE
              )
})
#> [[1]]
#> NULL

Check resulting files:

tsv_files <- list.files(path=new_TSV_files_path, pattern="^tpm.*\\.txt$", full.names=TRUE)

# as long as read.table / write.table compatibe rownmes are expected, we are all set
# written tsv content, truncated lines:
stringr::str_view(stringr::str_trunc(readLines(tsv_files[1]), 80))
#> [1] │ Riaz_Nivolumab_2017.p001.ar.8813{\t}Riaz_Nivolumab_2017.p002.ar.8815{\t}Riaz_Nivolu...
#> [2] │ A1BG{\t}1.0008835087938{\t}-1.91493588495907{\t}-0.620734484990514{\t}-1.75775209550157{\t}-...
#> [3] │ A1CF{\t}-0.543733010666938{\t}-0.537513815032929{\t}-0.481855746353474{\t}-0.543733010666...
#> [4] │ A2M{\t}0.0979791979801679{\t}-0.446028724661623{\t}-0.465953763246008{\t}-2.0848724473961...
#> [5] │ A2ML1{\t}-0.363502635555151{\t}-0.363502635555151{\t}-0.298573188895895{\t}-0.36350263555...

# re-import with read.table(), check first columns and rownames attribute:
tsv_in <- read.table(tsv_files[1])
tsv_in[,1:2]
#>       Riaz_Nivolumab_2017.p001.ar.8813 Riaz_Nivolumab_2017.p002.ar.8815
#> A1BG                         1.0008835                       -1.9149359
#> A1CF                        -0.5437330                       -0.5375138
#> A2M                          0.0979792                       -0.4460287
#> A2ML1                       -0.3635026                       -0.3635026
rownames(tsv_in)
#> [1] "A1BG"  "A1CF"  "A2M"   "A2ML1"

# fread handles rownames differently and also throws a relevant warning, 
# rownmaes ending up in column "V1" is expected behaviour
dt_in <- data.table::fread(tsv_files[1])
#> Warning in data.table::fread(tsv_files[1]): Detected 26 column names but the
#> data has 27 columns (i.e. invalid file). Added 1 extra default column name for
#> the first column which is guessed to be row names or an index. Use setnames()
#> afterwards if this guess is not correct, or fix the file write command that
#> created the file to create a valid file.
dt_in[,1:3]
#>       V1 Riaz_Nivolumab_2017.p001.ar.8813 Riaz_Nivolumab_2017.p002.ar.8815
#> 1:  A1BG                        1.0008835                       -1.9149359
#> 2:  A1CF                       -0.5437330                       -0.5375138
#> 3:   A2M                        0.0979792                       -0.4460287
#> 4: A2ML1                       -0.3635026                       -0.3635026
rownames(dt_in)
#> [1] "1" "2" "3" "4"

If the issue remains, it has to do with mentioned webtool and it's format expectations.

Sample file lines:

sample_lines <- c("\"\",\"Riaz_Nivolumab_2017-p001-ar-8813\",\"Riaz_Nivolumab_2017-p002-ar-8815\",\"Riaz_Nivolumab_2017-p017-ar-8890\",\"Riaz_Nivolumab_2017-p026-ar-8920\",\"Riaz_Nivolumab_2017-p034-ar-8929\",\"Riaz_Nivolumab_2017-p036-ar-8898\",\"Riaz_Nivolumab_2017-p037-ar-8900\",\"Riaz_Nivolumab_2017-p038-ar-8895\",\"Riaz_Nivolumab_2017-p039-ar-8819\",\"Riaz_Nivolumab_2017-p046-ar-8904\",\"Riaz_Nivolumab_2017-p047-ar-8836\",\"Riaz_Nivolumab_2017-p048-ar-8897\",\"Riaz_Nivolumab_2017-p049-ar-8824\",\"Riaz_Nivolumab_2017-p052-ar-8839\",\"Riaz_Nivolumab_2017-p065-ar-8857\",\"Riaz_Nivolumab_2017-p067-ar-8840\",\"Riaz_Nivolumab_2017-p078-ar-8864\",\"Riaz_Nivolumab_2017-p079-ar-8849\",\"Riaz_Nivolumab_2017-p082-ar-8822\",\"Riaz_Nivolumab_2017-p085-ar-8829\",\"Riaz_Nivolumab_2017-p090-ar-8866\",\"Riaz_Nivolumab_2017-p092-ar-8867\",\"Riaz_Nivolumab_2017-p098-ar-8853\",\"Riaz_Nivolumab_2017-p101-ar-8834\",\"Riaz_Nivolumab_2017-p103-ar-8872\",\"Riaz_Nivolumab_2017-p106-ar-8926\"", 
  "\"A1BG\",1.0008835087938,-1.91493588495907,-0.620734484990514,-1.75775209550157,-1.23665217147917,-2.49624508855533,0.727023994594072,0.328722516597936,-0.108114351387649,-0.0836309292648272,-0.325627003998877,4.41269552131938,-0.229120400945704,3.7248031834164,-0.504990077186003,-2.28542936373545,-1.1031748886154,-0.0826211449622227,1.44674204021314,1.3682306825028,0.544742458853122,-2.21733388578132,0.231763591509675,0.0236879027615116,1.79591432017568,-0.638847949374409", 
  "\"A1CF\",-0.543733010666938,-0.537513815032929,-0.481855746353474,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,5.49732418197924,-0.543733010666938,4.6852457132221,-0.543733010666938,0.117380356180772,-0.543733010666938,-0.543733010666938,1.53929625176185,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.48894973908573,-0.543733010666938,-0.543733010666938,-0.543733010666938", 
  "\"A2M\",0.0979791979801679,-0.446028724661623,-0.465953763246008,-2.08487244739613,2.01411421717692,1.19837532976838,-0.745540508622708,0.356022989838996,-2.37641175531056,0.517622991895147,2.67515559703126,1.74369521852923,1.19154956592325,2.00377301045288,-0.130083699768657,-0.354493861811036,-3.736442059777,1.59429773965362,1.2081229778692,-2.95554013952561,-1.60581260283625,1.03219514758484,0.925657747621587,-2.02343920511453,-0.246282066712528,0.612339103457181", 
  "\"A2ML1\",-0.363502635555151,-0.363502635555151,-0.298573188895895,-0.363502635555151,-0.363502635555151,-0.313253413224058,3.64328505792818,-0.363502635555151,-0.363502635555151,-0.363502635555151,0.0272101525164028,-0.363502635555151,-0.107062171894006,-0.363502635555151,-0.363502635555151,-0.363502635555151,-0.363502635555151,-0.212720613261966,-0.363502635555151,-0.363502635555151,3.46425626517263,-0.363502635555151,-0.197854673973831,-0.363502635555151,0.174257390070118,-0.363502635555151"
)

Created on 2023-06-26 with reprex v2.0.2