Create categorical variable based on numeric intervals

78 Views Asked by At

I have this dataframe

data<-data.frame(class1=c("A","A","A","A","A","A","A","A","B","B","B","B","B","B","B","B"),
                 class2=c(1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8),
                        observations=c(444,475, 531,560,650,668,705,717,456,876,123,47,249,180,500,654))

and need to create a new categorical variable "class3" based on 2 unit intervals of "class2". If class2 is between 1 and 2, then "class3" is 1, and so on. "class2" is sequential.

I can create a new table with the defined intervals and then join.

intv<-data.frame(class2=c(1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8),
                 class3=c(1,1,2,2,3,3,4,4,1,1,2,2,3,3,4,4))

data.2<-left_join(data,intv,by = join_by(class2)) 



> data.2
   class1 class2 observations class3
1       A      1          444      1
2       A      1          444      1
3       A      2          475      1
4       A      2          475      1
5       A      3          531      2
6       A      3          531      2
7       A      4          560      2
8       A      4          560      2
9       A      5          650      3
10      A      5          650      3
11      A      6          668      3
12      A      6          668      3
13      A      7          705      4
14      A      7          705      4
15      A      8          717      4
16      A      8          717      4
17      B      1          456      1
18      B      1          456      1
19      B      2          876      1
20      B      2          876      1
21      B      3          123      2
22      B      3          123      2
23      B      4           47      2
24      B      4           47      2
25      B      5          249      3
26      B      5          249      3
27      B      6          180      3
28      B      6          180      3
29      B      7          500      4
30      B      7          500      4
31      B      8          654      4
32      B      8          654      4

But the real dataframe has lots of observations, so it would take a lot of time.

Is there a function to do so automatically just indicating the interval size?

3

There are 3 best solutions below

0
margusl On BEST ANSWER

For included example data, dividing by 2 and rounding up should be enough:

data<-data.frame(class1=c("A","A","A","A","A","A","A","A","B","B","B","B","B","B","B","B"),
                 class2=c(1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8),
                 observations=c(444,475, 531,560,650,668,705,717,456,876,123,47,249,180,500,654))

data$class3 <- ceiling(data$class2 / 2)
# if you need it to be categorical / factor :
data$class3_fct <- as.factor(data$class3)

head(data, n = 10)
#>    class1 class2 observations class3 class3_fct
#> 1       A      1          444      1          1
#> 2       A      2          475      1          1
#> 3       A      3          531      2          2
#> 4       A      4          560      2          2
#> 5       A      5          650      3          3
#> 6       A      6          668      3          3
#> 7       A      7          705      4          4
#> 8       A      8          717      4          4
#> 9       B      1          456      1          1
#> 10      B      2          876      1          1
str(data)
#> 'data.frame':    16 obs. of  5 variables:
#>  $ class1      : chr  "A" "A" "A" "A" ...
#>  $ class2      : num  1 2 3 4 5 6 7 8 1 2 ...
#>  $ observations: num  444 475 531 560 650 668 705 717 456 876 ...
#>  $ class3      : num  1 1 2 2 3 3 4 4 1 1 ...
#>  $ class3_fct  : Factor w/ 4 levels "1","2","3","4": 1 1 2 2 3 3 4 4 1 1 ...

Created on 2024-01-19 with reprex v2.0.2

0
Uana On

Try like this:

data$class3 <- cut(data$class2, breaks = seq(0, max(data$class2)+1, by = 2), labels = FALSE)
0
Friede On

Have a look at ?findInterval:

# wrap in as.factor() if needed
data$class3 = findInterval(data$class2, seq(1L, max(data$class2), 2L)) 

gives

> data
   class1 class2 observations class3
1       A      1          444      1
2       A      2          475      1
3       A      3          531      2
4       A      4          560      2
5       A      5          650      3
6       A      6          668      3
7       A      7          705      4
8       A      8          717      4
9       B      1          456      1
10      B      2          876      1
11      B      3          123      2
12      B      4           47      2
13      B      5          249      3
14      B      6          180      3
15      B      7          500      4
16      B      8          654      4
> 

Your sample data:

data = data.frame(class1=c("A","A","A","A","A","A","A","A","B","B","B","B","B","B","B","B"),
                 class2=c(1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8),
                 observations=c(444,475, 531,560,650,668,705,717,456,876,123,47,249,180,500,654))