R: Categorizing nested sequences in sequential data into a block or set

85 Views Asked by At

I am trying to create a block/set of nested sequences for a user. Following is a sample dataset of one user with multiple sessions:

df<- structure(home(serial_num = 1:155, 
session_id = c("a8b5", "a8b5", "1e37", "1e37", "a8b5", "a8b5", "c568", "c568", "a8b5", "11cf", "11cf", "c6fa", "c6fa", "11cf", "11cf", "c6fa", "c6fa", "11cf", "11cf", "5d87", "5d87", "11cf", "11cf", "5d87", "5d87", "11cf", "11cf", "5d87", "5d87", "11cf", "11cf", "5d87", "5d87", "11cf", "11cf", "5d87", "5d87", "11cf", "11cf", "5d87", "11cf", "11cf", "5d87", "5d87", "11cf", "11cf", "5d87", "5d87", "11cf", "11cf", "5d87", "11cf", "11cf", "5d87", "11cf", "11cf", "5d87", "5d87", "11cf", "11cf", "5d87", "5d87", "11cf", "11cf", "517b", "517b", "290a", "290a", "517b", "517b", "290a", "290a", "4322", "4322", "e6fa", "e6fa", "4322", "4322", "e6fa", "e6fa", "4322", "e6fa", "4322", "4322", "e6fa", "4322", "4322", "e6fa", "e6fa", "4322", "4322", "e6fa", "e6fa", "4322", "4322", "e6fa", "e6fa", "4322", "4322", "e6fa", "e6fa", "4322", "4322", "e6fa", "e6fa", "4322", "4322", "e6fa", "e6fa", "4322", "4322", "e6fa", "e6fa", "4322", "4322", "e6fa", "e6fa", "4322", "4322", "e6fa", "e6fa", "4322", "4322", "e6fa", "e6fa", "4322", "4322", "e6fa", "4322", "4322", "9ce3", "9ce3", "e6fa", "9ce3", "e6fa", "9ce3", "9ce3", "e6fa", "9ce3", "9ce3", "e6fa", "e6fa", "9ce3", "9ce3", "6db8", "6db8", "e6fa", "e6fa", "6db8", "6db8", "e6fa", "e6fa", "6db8", "e6fa", "e6fa"), 
Activity = c("home", "prodpg", "home", "prodpg", "home", "prodpg", "home", "prodpg", "home", "home", "prodpg", "home", "prodpg", "home", "prodpg", "home", "prodpg", "prodpg", "home", "home", "prodpg", "prodpg", "home", "prodpg", "home", "prodpg", "home", "home", "prodpg", "prodpg", "home", "home", "prodpg", "prodpg", "home", "home", "prodpg", "prodpg", "home", "home", "home", "prodpg", "prodpg", "home", "prodpg", "home", "prodpg", "home", "home", "prodpg", "prodpg", "prodpg", "home", "home", "home", "prodpg", "prodpg", "home", "prodpg", "home", "prodpg", "home", "prodpg", "home", "home", "prodpg", "home", "prodpg", "home", "prodpg", "home", "prodpg", "home",  "prodpg", "home", "prodpg", "home", "prodpg", "home", "prodpg", "home", "home", "prodpg", "home", "prodpg", "home", "prodpg", "home", "prodpg", "prodpg", "home", "prodpg", "home", "prodpg", "home", "prodpg", "home", "prodpg", "home", "prodpg", "home", "prodpg", "home", "prodpg", "home", "prodpg", "home", "prodpg", "home", "prodpg", "home", "home", "prodpg", "prodpg", "home", "home", "prodpg", "prodpg", "home", "prodpg", "home", "prodpg", "home", "prodpg", "home", "prodpg", "home", "home", "home", "prodpg", "home", "prodpg", "prodpg", "home", "home", "prodpg", "home", "prodpg", "prodpg", "home", "home", "prodpg", "prodpg", "home", "home", "prodpg", "prodpg", "home", "prodpg", "home", "home", "prodpg", "home", "home", "prodpg"), 
session_id1 = c(41029, 41029, 41030, 41030, 41031, 41031, 41032, 41032, 41033, 41036, 41036, 41037, 41037, 41038, 41038, 41039, 41039, 41040, 41040, 41041, 41041, 41042, 41042, 41043, 41043, 41044, 41044, 41045, 41045, 41046, 41046, 41047, 41047, 41048, 41048, 41049, 41049, 41050, 41050, 41051, 41052, 41052, 41053, 41053, 41054, 41054, 41055, 41055, 41056, 41056, 41057, 41058, 41058, 41059, 41060, 41060, 41061, 41061, 41062, 41062, 41063, 41063, 41064, 41064, 41065, 41065, 41066, 41066, 41067, 41067, 41068, 41068, 41070, 41070, 41071, 41071, 41072, 41072, 41073, 41073, 41074, 41075, 41076, 41076, 41077, 41078, 41078, 41079, 41079, 41080, 41080, 41081, 41081, 41082, 41082, 41083, 41083, 41084, 41084, 41085, 41085, 41086, 41086, 41087, 41087, 41088, 41088, 41089, 41089, 41090, 41090, 41091, 41091, 41092, 41092, 41093, 41093, 41094, 41094, 41095, 41095, 41096, 41096, 41097, 41097, 41098, 41098, 41099, 41100, 41100, 41101, 41101, 41102, 41103, 41104, 41105, 41105, 41106, 41107, 41107, 41108, 41108, 41109, 41109, 41110, 41110, 41111, 41111, 41112, 41112, 41113, 41113, 41114, 41115, 41115), 
session_id1_path_id = c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L), 
count_session = c(3L, 3L, 1L, 1L, 3L, 3L, 1L, 1L, 3L, 15L, 15L, 2L, 2L, 15L, 15L, 2L, 2L, 15L, 15L, 12L, 12L, 15L, 15L, 12L, 12L, 15L, 15L, 12L, 12L, 15L, 15L, 12L, 12L, 15L, 15L, 12L, 12L, 15L, 15L, 12L, 15L, 15L, 12L, 12L, 15L, 15L, 12L, 12L, 15L, 15L, 12L, 15L, 15L, 12L, 15L, 15L, 12L, 12L, 15L, 15L, 12L, 12L, 15L, 15L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 16L, 16L, 22L, 22L, 16L, 16L, 22L, 22L, 16L, 22L, 16L, 16L, 22L, 16L, 16L, 22L, 22L, 16L, 16L, 22L, 22L, 16L, 16L, 22L, 22L, 16L, 16L, 22L, 22L, 16L, 16L, 22L, 22L, 16L, 16L, 22L, 22L, 16L, 16L, 22L, 22L, 16L, 16L, 22L, 22L, 16L, 16L, 22L, 22L, 16L, 16L, 22L, 22L, 16L, 16L, 22L, 16L, 16L, 5L, 5L, 22L, 5L, 22L, 5L, 5L, 22L, 5L, 5L, 22L, 22L, 5L, 5L, 3L, 3L, 22L, 22L, 3L, 3L, 22L, 22L, 3L, 22L, 22L), 
session_path_id = c(1L, 2L, 1L, 2L, 3L, 4L, 1L, 2L, 5L, 1L, 2L, 1L, 2L, 3L, 4L, 3L, 4L, 5L, 6L, 1L, 2L, 7L, 8L, 3L, 4L, 9L, 10L, 5L, 6L, 11L, 12L, 7L, 8L, 13L, 14L, 9L, 10L, 15L, 16L, 11L, 17L, 18L, 12L, 13L, 19L, 20L, 14L, 15L, 21L, 22L, 16L, 23L, 24L, 17L, 25L, 26L, 18L, 19L, 27L, 28L, 20L, 21L, 29L, 30L, 1L, 2L, 1L, 2L, 3L, 4L, 3L, 4L, 1L, 2L, 1L, 2L, 3L, 4L, 3L, 4L, 5L, 5L, 6L, 7L, 6L, 8L, 9L, 7L, 8L, 10L, 11L, 9L, 10L, 12L, 13L, 11L, 12L, 14L, 15L, 13L, 14L, 16L, 17L, 15L, 16L, 18L, 19L, 17L, 18L, 20L, 21L, 19L, 20L, 22L, 23L, 21L, 22L, 24L, 25L, 23L, 24L, 26L, 27L, 25L, 26L, 28L, 29L, 27L, 30L, 31L, 1L, 2L, 28L, 3L, 29L, 4L, 5L, 30L, 6L, 7L, 31L, 32L, 8L, 9L, 1L, 2L, 33L, 34L, 3L, 4L, 35L, 36L, 5L, 37L, 38L), 
ts_datetime = structure(c(1655674636.2352, 1655674664.2193, 1655690537.121, 1655690551.7965, 1655691194.4165, 1655691199.1931, 1655698425.8044, 1655698435.2695, 1655698847.7784, 1655761324.3295, 1655761364.564, 1655762231.7465, 1655762313.5116, 1655762499.0529, 1655762504.6023, 1655762758.2714, 1655762763.3526, 1655762842.5405, 1655763285.7821, 1655763685.4536, 1655763698.7537, 1655763870.5954, 1655763956.5466, 1655764162.7307, 1655764568.3824, 1655766222.3856, 1655766233.9399, 1655766655.3613, 1655766660.8198, 1655767211.1262, 1655767285.9723, 1655767370.2404, 1655767375.6795, 1655768915.2796, 1655768923.0008, 1655769003.2995, 1655769034.5777, 1655769220.1281, 1655769222.8482, 1655770155.1085, 1655770606.488, 1655770612.8104, 1655770861.4025, 1655770909.4154, 1655770932.9201, 1655770953.0211, 1655773826.9808, 1655773830.968, 1655774154.4172, 1655774164.56, 1655782682.4358, 1655782731.5654, 1655782744.3249, 1655784163.1011, 1655784757.6255, 1655784776.0425, 1655785195.9602, 1655785488.8018, 1655785977.7008, 1655785982.4743, 1655786068.1609, 1655786536.73, 1655786980.6594, 1655786984.6581, 1655791102.6506, 1655791168.3472, 1655792822.8475, 1655792833.498, 1655792904.5437, 1655792910.0887, 1655794671.8648, 1655794688.3931, 1655848440.6548, 1655848479.9303, 1655848821.7101, 1655848840.6029, 1655848855.622, 1655848858.8962, 1655848870.756, 1655848873.6855, 1655848893.4526, 1655848958.6219, 1655848972.1148, 1655849007.1902, 1655850020.245, 1655850201.9252, 1655850208.1445, 1655850711.0854, 1655850726.0265, 1655851158.78, 1655851176.8804, 1655851183.5749, 1655851194.4651, 1655851285.9132, 1655851299.5342, 1655852337.5937, 1655852404.3482, 1655852496.2435, 1655852499.9042, 1655852761.555, 1655852765.7347, 1655852791.8767, 1655852801.2688, 1655854420.2235, 1655854713.4631, 1655854752.1309, 1655854755.8025, 1655856276.8895, 1655856646.2767, 1655857790.6265, 1655857806.8367, 1655859568.7553, 1655859576.3453, 1655860585.6185, 1655860598.1842, 1655860612.5926, 1655860626.4001, 1655861495.1906, 1655861791.0385, 1655861806.0011, 1655861824.6202, 1655861846.5502, 1655861859.9434, 1655862538.8729, 1655862696.9316, 1655863394.0939, 1655863436.2152, 1655864000.972, 1655864994.4364, 1655865003.4833, 1655865339.4813, 1655865382.1872, 1655865447.5564, 1655865449.6552, 1655865611.7772, 1655865629.7218, 1655865658.8231, 1655865675.4274, 1655865721.084, 1655865781.0875, 1655866797.208, 1655866809.4637, 1655866890.6427, 1655866900.8175, 1655869866.9963, 1655871272.8328, 1655875804.706, 1655876134.4226, 1655878129.4573, 1655878207.0818, 1655878224.8019, 1655878391.0742, 1655878684.007, 1655878755.5302, 1655878793.0587), 
tzone = "UTC", 
class = c("POSIXct", "POSIXt"))), 
row.names = c(NA, -155L), 
class = c("tbl_df", "tbl", "data.frame"))

My idea is to create a block of nested sequences as so:

serial_num 1 to 9 is block 1

serial num 10 to 64 is block 2

serial num 65 to 72 is block 3

serial num 73 to 155 is block 4

What I tried: I created session_id1 which tells me how many times one session_id has been fragmented. This count is in count_session. I created two separate path ids: session_id1_path_id which is the row index corresponding to session_id1 and session_path_id which is the row index corresponding to session_id.

I am trying to put all this information together so that I can create blocks of shopping sessions (as explained earlier).

Any ideas on how to go about this would be much much appreciated. Thank you.

1

There are 1 best solutions below

9
On BEST ANSWER

Here's an idea using a data.table anti-join.

library(data.table)

# get the fist/last row index for each session_id
dt <- setDT(df)[, as.list(range(.I)), session_id]
# get the length of each sequential "block"
blockLens <- diff(c(0, setorder(data.table(X = dt[[3]])[!dt, on = .(X > V1, X < V2)])[[1]]))
df[, block := rep.int(seq(blockLens), blockLens)][]
#>      serial_num session_id Activity session_id1 session_id1_path_id count_session session_path_id         ts_datetime block
#>   1:          1       a8b5     home       41029                   1             3               1 2022-06-19 21:37:16     1
#>   2:          2       a8b5   prodpg       41029                   2             3               2 2022-06-19 21:37:44     1
#>   3:          3       1e37     home       41030                   1             1               1 2022-06-20 02:02:17     1
#>   4:          4       1e37   prodpg       41030                   2             1               2 2022-06-20 02:02:31     1
#>   5:          5       a8b5     home       41031                   1             3               3 2022-06-20 02:13:14     1
#>  ---                                                                                                                       
#> 151:        151       e6fa     home       41113                   1            22              35 2022-06-22 06:10:24     4
#> 152:        152       e6fa   prodpg       41113                   2            22              36 2022-06-22 06:13:11     4
#> 153:        153       6db8     home       41114                   1             3               5 2022-06-22 06:18:04     4
#> 154:        154       e6fa     home       41115                   1            22              37 2022-06-22 06:19:15     4
#> 155:        155       e6fa   prodpg       41115                   2            22              38 2022-06-22 06:19:53     4

I'll attempt to explain what is going on.

dt is a table showing the first and last row where each session_id appears. We simply want to check which session_ids end outside any other session_id's range. This means the "block" of grouped session_ids is ending. This is accomplished with a data.table anti-join:

data.table(X = dt[[3]])[!dt, on = .(X > V1, X < V2)]

Those are the rows at which each block ends. Assign the block column accordingly.