Replace certain spaces with tabs (delimiters)

642 Views Asked by At

I have a one-column data.frame in which some spaces should act as delimiters and others are just ordinary spaces.

# Input data: a single column `x`, each row one space-separated record.
dat <- data.frame(
  x = c(
    "A 2 2 textA1 textA2 Z1",
    "B 4 1 textX1 textX2 textX3 Z2",
    "C 3 5 textA1 Z3"
  )
)
#                               x
# 1        A 2 2 textA1 textA2 Z1
# 2 B 4 1 textX1 textX2 textX3 Z2
# 3               C 3 5 textA1 Z3

I need to convert it to a 5-column data.frame:

# Expected output: five columns, where V4 keeps its internal spaces.
# The field separators are written as explicit "\t" escapes so the example
# still parses into five columns even if literal tab characters are lost
# when the snippet is copied or rendered.
output <- read.table(
  text = paste(
    "A\t2\t2\ttextA1 textA2\tZ1",
    "B\t4\t1\ttextX1 textX2 textX3\tZ2",
    "C\t3\t5\ttextA1\tZ3",
    sep = "\n"
  ),
  sep = "\t"
)
#   V1 V2 V3                   V4 V5
# 1  A  2  2        textA1 textA2 Z1
# 2  B  4  1 textX1 textX2 textX3 Z2
# 3  C  3  5               textA1 Z3

Essentially, I need to change the 1st, 2nd, 3rd, and last spaces to tabs (or to any other delimiter, if that makes it easier to code).

Playing with regex is not giving anything useful yet...

Note1: In the real data I have to replace the 1st, 2nd, 3rd, ..., 19th and the last space with tabs.
Note2: There is no pattern in V4, text can be anything.
Note3: Last column is one word text with variable length.

3

There are 3 best solutions below

2
On BEST ANSWER

Try

# Step 1: replace the whitespace that follows each of the first three fields
# with a comma.
head_pat <- "^([^ ]+)\\s+([^ ]+)\\s+([^ ]+)\\s+"
v1 <- gsub(head_pat, "\\1,\\2,\\3,", dat$x)
# Step 2: replace the run of spaces in front of the final word with a comma
# (the lookahead requires perl = TRUE), then parse as comma-separated text.
v2 <- sub(" +(?=[^ ]+$)", ",", v1, perl = TRUE)
read.table(text = v2, sep = ",")
#  V1 V2 V3                   V4 V5
#1  A  2  2        textA1 textA2 Z1
#2  B  4  1 textX1 textX2 textX3 Z2
#3  C  3  5               textA1 Z3

Or an option inspired from @Tensibai's post

n <- 3

# Build a PCRE pattern with three capture groups:
#   1. the first `n` words, each followed by a single space,
#   2. the free text (word characters and spaces),
#   3. the last word, separated from the free text by whitespace.
fpat <- function(n) {
  leading_words <- paste0("((?:\\w+ ){", n, "})")
  free_text <- "([\\w ]+)"
  last_word <- "(\\w+)"
  paste0("^", leading_words, free_text, "\\s+", last_word, "$")
}

# Wrap the free-text field in single quotes so that read.table(), which
# splits on whitespace by default, keeps it together as one column.
quoted <- gsub(fpat(n), "\\1'\\2' \\3", dat$x, perl = TRUE)
read.table(text = quoted)
#  V1 V2 V3                   V4 V5
#1  A  2  2        textA1 textA2 Z1
#2  B  4  1 textX1 textX2 textX3 Z2
#3  C  3  5               textA1 Z3

For more columns,

 # Same approach with 19 single-word columns in front of the free text.
 n <- 19
 v1 <- "A 24 34343 212 zea4 2323 12343 111 dsds 134d 153xd 153xe 153de 153dd dd dees eese tees3 zee2 2353 23335 23353 ddfe 3133"

 quoted <- gsub(fpat(n), "\\1'\\2' \\3", v1, perl = TRUE)
 read.table(text = quoted, sep = "")
 # V1 V2    V3  V4   V5   V6    V7  V8   V9  V10   V11   V12   V13   V14 V15
 #1  A 24 34343 212 zea4 2323 12343 111 dsds 134d 153xd 153xe 153de 153dd  dd
 #  V16  V17   V18  V19                   V20  V21
 #1 dees eese tees3 zee2 2353 23335 23353 ddfe 3133
0
On

With a variable number of columns:

library(stringr)
cols <- 3
# Capture groups: (1) the first `cols` words, each followed by a space,
# (2) the free text, (3) the last word.
m <- str_match(dat$x, paste0("((?:\\w+ ){", cols, "})([\\w ]+) (\\w+)"))
# m[, 1] is the whole match; m[, 2] to m[, 4] are the three groups. Only the
# spaces inside group 1 are turned into tabs. The result is called `tabbed`
# rather than `t` so it does not reuse the name of base::t().
tabbed <- paste0(gsub(" ", "\t", m[, 2]), m[, 3], "\t", m[, 4])

> read.table(text=tabbed,sep="\t")
  V1 V2 V3                   V4 V5
1  A  2  2        textA1 textA2 Z1
2  B  4  1 textX1 textX2 textX3 Z2
3  C  3  5               textA1 Z3

Change `cols` to set how many single-word columns come before the free-text column. For the regex:

  • ((?:\\w+ ){3}) Captures the 3 repetitions {3} of the non-capturing group (?:\w+ ), which matches at least one alphanumeric character \w+ followed by a space
  • ([\\w ]+) (\w+) Captures the free text made of alphanumeric characters or spaces [\w ]+, followed by a space, and then captures the last word with \w+

Once that is done, paste together the 3 parts returned by str_match, taking care to replace the spaces in the first group m[,2] with tabs.

m[,1] is the whole match so it's unused here.


Old answer:

A basic one matching based on a fixed number of fields:

> fixed_pat <- "(\\w+) (\\w+) (\\w+) ([\\w ]+) (\\w+)$"
> tabbed <- gsub(fixed_pat, "\\1\t\\2\t\\3\t\\4\t\\5", dat$x, perl = TRUE)
> read.table(text = tabbed, sep = "\t")
  V1 V2 V3                   V4 V5
1  A  2  2        textA1 textA2 Z1
2  B  4  1 textX1 textX2 textX3 Z2
3  C  3  5               textA1 Z3

Add as many (\w+) groups as you need at the front, and increase the number of back references (\1, \2, ...) in the replacement accordingly.

0
On

Here can be one twisted way to go that will work whatever the number of "words" you have (and that works on your data); it's based on the number of alphanum characters in your "words" compared to the number of alphanum characters in the other fields:

# Step 1: turn every space into a tab.
all_tabs <- gsub(" ", "\t", dat$x)
# Step 2: turn a tab back into a space when it is preceded and followed by
# a run of at least three word characters. \K drops the preceding run from
# the reported match and the lookahead leaves the following run unconsumed,
# so only the tab itself is replaced. Both constructs need perl = TRUE
# (spelled out, because `T` is an ordinary variable that can be reassigned).
res <- gsub("\\w{3,}\\K\\t(?=\\w{3,})", " ", all_tabs, perl = TRUE)
res
# [1] "A\t2\t2\ttextA1 textA2\tZ1"        "B\t4\t1\ttextX1 textX2 textX3\tZ2" "C\t3\t5\ttextA1\tZ3"

read.table(text=res, sep="\t")
#  V1 V2 V3                   V4 V5
#1  A  2  2        textA1 textA2 Z1
#2  B  4  1 textX1 textX2 textX3 Z2
#3  C  3  5               textA1 Z3

EDIT: A completely different way to go, only based on the number of the spaces k you need to replace before the last one:

k <- 3 # in your example

# Replace the first `k` spaces and the last space of a single string with
# tabs. If the string has at most k + 1 spaces, every space is replaced.
# A string without any space is returned unchanged.
tab_delimit <- function(x, k) {
  pos_sp <- gregexpr(" ", x, fixed = TRUE)[[1]]
  # gregexpr() reports "no match" as -1; using that as an index would
  # overwrite every character except the first, so return early instead.
  if (pos_sp[1] == -1L) {
    return(x)
  }
  n_sp <- length(pos_sp)
  if (n_sp > k + 1) {
    # seq_len() stays empty for k = 0, whereas 1:0 would count down and
    # wrongly select the first space.
    pos_sp <- pos_sp[c(seq_len(k), n_sp)]
  }
  chars <- strsplit(x, "")[[1]]
  chars[pos_sp] <- "\t"
  paste(chars, collapse = "")
}

# vapply() guarantees one character string per input row.
res <- vapply(as.character(dat$x), tab_delimit, character(1), k = k)

read.table(text=res, sep="\t")
#   V1 V2 V3                   V4 V5
# 1  A  2  2        textA1 textA2 Z1
# 2  B  4  1 textX1 textX2 textX3 Z2
# 3  C  3  5               textA1 Z3