sankey diagram visualisation

155 Views Asked by At

I am trying to visualize my data via a sankey diagram.

I have the following dataframe:

sankey1 <- structure(list(pat_id = c(10037, 10264, 10302, 10302, 10302, 
10344, 10482, 10482, 10482, 10613, 10613, 10613, 10628, 10851, 
11052, 11203, 11214, 11214, 11566, 11684, 11821, 11945, 11945, 
11952, 11952, 12122, 12183, 12774, 13391, 13573, 13643, 14298, 
14556, 14556, 14648, 14862, 14935, 14935, 14999, 15514, 15811, 
16045, 16045, 16190, 16190, 16190, 16220, 16220, 16220, 16220
), contactnummer = c(1, 1, 1, 2, 3, 1, 1, 2, 3, 1, 2, 3, 1, 1, 
1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 
1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 3, 1, 2, 3, 99), Combo2 = c(1, 
1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 
2, 4, 4, 1, 5, 1, 1, 1, 1, 3, 3, 1, 5, 1, 1, 3, 1, 1, 1, 1, 1, 
3, 6, 3, 1, 1, 1, 1), treatment = c(99, 0, 0, 1, 1, 0, 99, 99, 
99, 99, 99, 1, 1, 0, 1, 99, 99, 99, 0, 99, 99, 0, 0, 0, 1, 99, 
99, 0, 0, 0, 0, 0, 1, 1, 1, 99, 99, 1, 0, 0, 1, 0, 0, 0, 1, 1, 
99, 99, 99, 99)), row.names = c(NA, 50L), class = c("data.table", 
"data.frame"))

# A tibble: 50 x 4
   pat_id contactnummer Combo2 treatment
    <dbl>         <dbl>  <dbl>     <dbl>
 1  10037             1      1        99
 2  10264             1      1         0
 3  10302             1      1         0
 4  10302             2      1         1
 5  10302             3      2         1
 6  10344             1      1         0
 7  10482             1      2        99
 8  10482             2      1        99
 9  10482             3      1        99
10  10613             1      1        99

The dataframe contains information about participants ("pat_id") who visit a GP. In a visit, or contact ("contactnummer"), the GP evaluates the combination of symptoms ("combo2") and gives them a treatment ("treatment"). Some participants (not all) visit the GP for a second (or even third) contact. For each contact the GP will evaluate the symptoms and give them a treatment.

The aim is to illustrates the path of these participants. Which symptoms lead to which treatment and when (what contact). I hope to do this in an sankey diagram.(https://r-graph-gallery.com/321-introduction-to-interactive-sankey-diagram-2.html)

I aim to visualize it like this:

  • to visualize each combination of symptoms with a certain color
  • to visualize each treatment option (the nodes) with a certain color

Ideally the desired output would look like this: enter image description here or this: enter image description here

I would like to have the combinations ("Combo2") as arrows, showed in different colours per unique combination. These arrows should then lead to a treatment. But then i would like them continue, so after contact 1 - if an ID number has a second contact, the arrow shows again what combinations after that treatment occurs and to what treatment it leads in the second contact.

AFTER EDIT

With help from user s__, I've used the following script

# messing up with data: the goal is to create data.frame
# with source and targets to feed the sankey
df <-
sankey1 %>%  
  # wide format to gives an order
  pivot_wider(id_cols = pat_id
               , names_from = contactnummer
               , values_from = c(Combo2,treatment)
               ,names_glue = "{contactnummer}_{.value}"
               ,names_sort=TRUE) %>% 
  # put in a long format
  pivot_longer(!pat_id, names_to = 'variable', values_to = 'value') %>%
  # remove nas
  filter(!is.na(value)) %>%
  # grouping and creating the source field by pat_id
  group_by(pat_id) %>% 
  mutate(source = paste(substr(variable,1,15),value, sep = '_')) %>% 
  # useful columns
  select(pat_id, source) %>% 
  # arrange 
  arrange(pat_id, source) %>% 
  # adding by group the target column
  mutate(target = c(source[2:length(source)],NA)) 

# define source and target
links <- data.frame(source =df$source,
                    target   =df$target) %>% 
  filter(!is.na(target))

# getting unique nodes
nodes <- data.frame(name = as.character(unique(c(links$source, links$target)))) 


# now convert as character
links$source <- as.character(links$source)
links$target<- as.character(links$target)

# matching links and node, then indexing to 0
links$source <- match(links$source, nodes$name) - 1
links$target <- match(links$target, nodes$name) - 1

# group by (we are grouping by number of rows)
links <- links %>% group_by(source, target) %>% tally()
   
# plot it!
sankeyNetwork(Links = links
              , Nodes = nodes
              , Source = 'source'
              , Target = 'target'
              , Value = 'n'
              , NodeID = 'name'
              ,fontSize = 15)

This comes pretty close, but is not yet the desired output. I've tried editing the source, target and nodes like below, however that definitely isn't the desired output.

 df <-
        sankey2 %>%  
        # wide format to gives an order
        pivot_wider(id_cols = pat_id
                    , names_from = contactnummer
                    , values_from = c(Combo2,treatment)
                    ,names_glue = "{contactnummer}_{.value}"
                    ,names_sort=TRUE) %>% 
        # put in a long format
        pivot_longer(!pat_id, names_to = 'variable', values_to = 'value') %>%
        # remove nas
        filter(!is.na(value)) %>%
        # grouping and creating the source field by pat_id
        group_by(pat_id) %>% 
        mutate(source = paste(substr(variable,1,15),value, sep = '_')) %>% 
        # useful columns
        select(pat_id, source) %>% 
        # arrange 
        arrange(pat_id, source)  %>% 
        mutate(number = ave(pat_id, FUN = seq_along)) %>%
        # adding by group the target column
        pivot_wider(pat_id, values_from = source, names_from = number  )#
      
      names(df)[names(df) == '1'] <- 'Combo2_1'
      names(df)[names(df) == '2'] <- 'treatment_1'
      names(df)[names(df) == '3'] <- 'Combo2_2'
      names(df)[names(df) == '4'] <- 'treatment_2'
      names(df)[names(df) == '5'] <- 'Combo2_3'
      names(df)[names(df) == '6'] <- 'treatment_3'
      
      df <- df %>%
            pivot_longer(!pat_id, names_to = c(".value", "contact"), names_sep = "_")
      df <- df[!is.na(df$Combo2),]
      df <- df %>%
            select(pat_id, Combo2, treatment)
      
      names(df)[names(df) == 'Combo2'] <- 'source'
      names(df)[names(df) == 'treatment'] <- 'target'
            
      # define source and target
      links <- data.frame(source =df$source,
                          target   =df$target) %>% 
        filter(!is.na(target))
      
      # getting unique nodes
      nodes <- data.frame(name = as.character(unique(c(links$source, links$target)))) 
      
      
      # now convert as character
      links$source <- as.character(links$source)
      links$target<- as.character(links$target)
      
      # matching links and node, then indexing to 0
      links$source <- match(links$source, nodes$name) - 1
      links$target <- match(links$target, nodes$name) - 1
      
      # group by (we are grouping by number of rows)
      links <- links %>% group_by(source, target) %>% tally()
      
      # plot it!
      sankeyNetwork(Links = links
                    , Nodes = nodes
                    , Source = 'source'
                    , Target = 'target'
                    , Value = 'n'
                    , NodeID = 'name'
                    ,fontSize = 15
                    )

I really cant figure it out. Any help would be much appreciated!

2

There are 2 best solutions below

0
RvS On

I came to the conclusion, after also having contact with the current maintainer of the networkD3 package, that the outcome i aimed for was not possible with a sankey diagram.

0
CJ Yetman On
library(tidyverse)

Your "nodes" are Combo2 and treatment, so let's shape your data so that each node (or event) is on its own row while retaining the data that specifies to whom and when they occurred so we can order them properly.

sankey1 %>% 
  pivot_longer(cols = c(Combo2, treatment))
#> # A tibble: 100 × 4
#>    pat_id contactnummer name      value
#>     <dbl>         <dbl> <chr>     <dbl>
#>  1  10037             1 Combo2        1
#>  2  10037             1 treatment    99
#>  3  10264             1 Combo2        1
#>  4  10264             1 treatment     0
#>  5  10302             1 Combo2        1
#>  6  10302             1 treatment     0
#>  7  10302             2 Combo2        1
#>  8  10302             2 treatment     1
#>  9  10302             3 Combo2        2
#> 10  10302             3 treatment     1
#> # ℹ 90 more rows

Each node (event) is a combination of name (whether it's a Combo2 or a treatment) and its value, so let's combine them into one, which will be our "source" node.

sankey1 %>% 
  pivot_longer(cols = c(Combo2, treatment)) %>% 
  mutate(source = paste0(name, "-", value))
#> # A tibble: 100 × 5
#>    pat_id contactnummer name      value source      
#>     <dbl>         <dbl> <chr>     <dbl> <chr>       
#>  1  10037             1 Combo2        1 Combo2-1    
#>  2  10037             1 treatment    99 treatment-99
#>  3  10264             1 Combo2        1 Combo2-1    
#>  4  10264             1 treatment     0 treatment-0 
#>  5  10302             1 Combo2        1 Combo2-1    
#>  6  10302             1 treatment     0 treatment-0 
#>  7  10302             2 Combo2        1 Combo2-1    
#>  8  10302             2 treatment     1 treatment-1 
#>  9  10302             3 Combo2        2 Combo2-2    
#> 10  10302             3 treatment     1 treatment-1 
#> # ℹ 90 more rows

You also want each Combo2 and treatment to have a unique node for when it occurred, so let's also add the contactnummer to its unique "source" name, e.g. now you can have "treatment-1_contact1" and "treatment-1_contact2" as distinct nodes.

sankey1 %>% 
  pivot_longer(cols = c(Combo2, treatment)) %>% 
  mutate(source = paste0(name, "-", value, "_contact", contactnummer))
#> # A tibble: 100 × 5
#>    pat_id contactnummer name      value source               
#>     <dbl>         <dbl> <chr>     <dbl> <chr>                
#>  1  10037             1 Combo2        1 Combo2-1_contact1    
#>  2  10037             1 treatment    99 treatment-99_contact1
#>  3  10264             1 Combo2        1 Combo2-1_contact1    
#>  4  10264             1 treatment     0 treatment-0_contact1 
#>  5  10302             1 Combo2        1 Combo2-1_contact1    
#>  6  10302             1 treatment     0 treatment-0_contact1 
#>  7  10302             2 Combo2        1 Combo2-1_contact2    
#>  8  10302             2 treatment     1 treatment-1_contact2 
#>  9  10302             3 Combo2        2 Combo2-2_contact3    
#> 10  10302             3 treatment     1 treatment-1_contact3 
#> # ℹ 90 more rows

Let's make sure that the nodes are ordered properly by the contactnummer first and also that "Combo2" comes before "treatment", and then we can determine what is the "target" by taking the following node (event) within each patient (group).

sankey1 %>% 
  pivot_longer(cols = c(Combo2, treatment)) %>% 
  mutate(source = paste0(name, "-", value, "_contact", contactnummer)) %>%
  arrange(pat_id, contactnummer, if_else(name == "Combo2", 1, 2)) %>% 
  mutate(target = lead(source), .by = pat_id)
#> # A tibble: 100 × 6
#>    pat_id contactnummer name      value source                target            
#>     <dbl>         <dbl> <chr>     <dbl> <chr>                 <chr>             
#>  1  10037             1 Combo2        1 Combo2-1_contact1     treatment-99_cont…
#>  2  10037             1 treatment    99 treatment-99_contact1 <NA>              
#>  3  10264             1 Combo2        1 Combo2-1_contact1     treatment-0_conta…
#>  4  10264             1 treatment     0 treatment-0_contact1  <NA>              
#>  5  10302             1 Combo2        1 Combo2-1_contact1     treatment-0_conta…
#>  6  10302             1 treatment     0 treatment-0_contact1  Combo2-1_contact2 
#>  7  10302             2 Combo2        1 Combo2-1_contact2     treatment-1_conta…
#>  8  10302             2 treatment     1 treatment-1_contact2  Combo2-2_contact3 
#>  9  10302             3 Combo2        2 Combo2-2_contact3     treatment-1_conta…
#> 10  10302             3 treatment     1 treatment-1_contact3  <NA>              
#> # ℹ 90 more rows

The last node (event) for each patient will not have a following event, so its "target" will be NA. Let's get rid of those because they are invalid / links to nowhere.

sankey1 %>% 
  pivot_longer(cols = c(Combo2, treatment)) %>% 
  mutate(source = paste0(name, "-", value, "_contact", contactnummer)) %>%
  arrange(pat_id, contactnummer, if_else(name == "Combo2", 1, 2)) %>% 
  mutate(target = lead(source), .by = pat_id) %>% 
  filter(!is.na(target))
#> # A tibble: 67 × 6
#>    pat_id contactnummer name      value source                target            
#>     <dbl>         <dbl> <chr>     <dbl> <chr>                 <chr>             
#>  1  10037             1 Combo2        1 Combo2-1_contact1     treatment-99_cont…
#>  2  10264             1 Combo2        1 Combo2-1_contact1     treatment-0_conta…
#>  3  10302             1 Combo2        1 Combo2-1_contact1     treatment-0_conta…
#>  4  10302             1 treatment     0 treatment-0_contact1  Combo2-1_contact2 
#>  5  10302             2 Combo2        1 Combo2-1_contact2     treatment-1_conta…
#>  6  10302             2 treatment     1 treatment-1_contact2  Combo2-2_contact3 
#>  7  10302             3 Combo2        2 Combo2-2_contact3     treatment-1_conta…
#>  8  10344             1 Combo2        1 Combo2-1_contact1     treatment-0_conta…
#>  9  10482             1 Combo2        2 Combo2-2_contact1     treatment-99_cont…
#> 10  10482             1 treatment    99 treatment-99_contact1 Combo2-1_contact2 
#> # ℹ 57 more rows

So now we have our links data frame. Let's build a nodes data frame that has a single row for each unique node.

links <-
  sankey1 %>% 
  pivot_longer(cols = c(Combo2, treatment)) %>% 
  mutate(source = paste0(name, "-", value, "_contact", contactnummer)) %>%
  arrange(pat_id, contactnummer, if_else(name == "Combo2", 1, 2)) %>% 
  mutate(target = lead(source), .by = pat_id) %>% 
  filter(!is.na(target))

nodes <- data.frame(id = unique(c(links$source, links$target)))
nodes
#>                        id
#> 1       Combo2-1_contact1
#> 2    treatment-0_contact1
#> 3       Combo2-1_contact2
#> 4    treatment-1_contact2
#> 5       Combo2-2_contact3
#> 6       Combo2-2_contact1
#> 7   treatment-99_contact1
#> 8   treatment-99_contact2
#> 9       Combo2-1_contact3
#> 10      Combo2-3_contact1
#> 11      Combo2-2_contact2
#> 12      Combo2-4_contact1
#> 13      Combo2-4_contact2
#> 14      Combo2-5_contact1
#> 15   treatment-1_contact1
#> 16      Combo2-3_contact2
#> 17      Combo2-6_contact2
#> 18      Combo2-3_contact3
#> 19  treatment-99_contact3
#> 20     Combo2-1_contact99
#> 21   treatment-1_contact3
#> 22   treatment-0_contact2
#> 23 treatment-99_contact99

We probably don't need the "_contact*" bit displayed on the plot because the position on the plot will make it clear when it happened, so let's add a label column for the name we'd like to display for each node on the plot (these will not be unique anymore, but we keep the unique id column).

nodes$name <- sub("_contact[0-9]*$", "", nodes$id)
nodes
#>                        id         name
#> 1       Combo2-1_contact1     Combo2-1
#> 2    treatment-0_contact1  treatment-0
#> 3       Combo2-1_contact2     Combo2-1
#> 4    treatment-1_contact2  treatment-1
#> 5       Combo2-2_contact3     Combo2-2
#> 6       Combo2-2_contact1     Combo2-2
#> 7   treatment-99_contact1 treatment-99
#> 8   treatment-99_contact2 treatment-99
#> 9       Combo2-1_contact3     Combo2-1
#> 10      Combo2-3_contact1     Combo2-3
#> 11      Combo2-2_contact2     Combo2-2
#> 12      Combo2-4_contact1     Combo2-4
#> 13      Combo2-4_contact2     Combo2-4
#> 14      Combo2-5_contact1     Combo2-5
#> 15   treatment-1_contact1  treatment-1
#> 16      Combo2-3_contact2     Combo2-3
#> 17      Combo2-6_contact2     Combo2-6
#> 18      Combo2-3_contact3     Combo2-3
#> 19  treatment-99_contact3 treatment-99
#> 20     Combo2-1_contact99     Combo2-1
#> 21   treatment-1_contact3  treatment-1
#> 22   treatment-0_contact2  treatment-0
#> 23 treatment-99_contact99 treatment-99

The links data frame needs to refer to the nodes in the node data frame using the 0-based-index (the number of the row the node is in, starting with row 0).

links$source_id <- match(links$source, nodes$id) - 1
links$target_id <- match(links$target, nodes$id) - 1
links
#> # A tibble: 67 × 8
#>    pat_id contactnummer name      value source        target source_id target_id
#>     <dbl>         <dbl> <chr>     <dbl> <chr>         <chr>      <dbl>     <dbl>
#>  1  10037             1 Combo2        1 Combo2-1_con… treat…         0         6
#>  2  10264             1 Combo2        1 Combo2-1_con… treat…         0         1
#>  3  10302             1 Combo2        1 Combo2-1_con… treat…         0         1
#>  4  10302             1 treatment     0 treatment-0_… Combo…         1         2
#>  5  10302             2 Combo2        1 Combo2-1_con… treat…         2         3
#>  6  10302             2 treatment     1 treatment-1_… Combo…         3         4
#>  7  10302             3 Combo2        2 Combo2-2_con… treat…         4        20
#>  8  10344             1 Combo2        1 Combo2-1_con… treat…         0         1
#>  9  10482             1 Combo2        2 Combo2-2_con… treat…         5         6
#> 10  10482             1 treatment    99 treatment-99… Combo…         6         2
#> # ℹ 57 more rows

Now we can plot it. To prevent the sankey plot from stretching links across the full width we use sinksRight = FALSE. This will keep nodes from the same "contact" aligned vertically. I also added a group column to nodes and used the NodeGroup = "group" parameter to help distinguish symptom and treatment nodes.

library(dplyr)
library(tidyr)
library(networkD3)

sankey1 <- data.frame(
  pat_id = c(10037,10264,10302,10302,10302,
             10344,10482,10482,10482,10613,10613,10613,10628,10851,
             11052,11203,11214,11214,11566,11684,11821,11945,
             11945,11952,11952,12122,12183,12774,13391,13573,13643,
             14298,14556,14556,14648,14862,14935,14935,14999,
             15514,15811,16045,16045,16190,16190,16190,16220,16220,
             16220,16220),
  contactnummer = c(1,1,1,2,3,1,1,2,3,1,2,3,1,
                    1,1,1,1,2,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,
                    2,1,1,1,2,1,1,1,1,2,1,2,3,1,2,3,99),
  Combo2 = c(1,1,1,1,2,1,2,1,1,1,1,1,1,
             1,3,1,1,1,1,1,1,1,2,4,4,1,5,1,1,1,1,3,3,
             1,5,1,1,3,1,1,1,1,1,3,6,3,1,1,1,1),
  treatment = c(99,0,0,1,1,0,99,99,99,99,99,
                1,1,0,1,99,99,99,0,99,99,0,0,0,1,99,99,0,
                0,0,0,0,1,1,1,99,99,1,0,0,1,0,0,0,1,1,99,
                99,99,99)
)

links <-
  sankey1 %>% 
  pivot_longer(cols = c(Combo2, treatment)) %>% 
  mutate(source = paste0(name, "-", value, "_contact", contactnummer)) %>%
  arrange(pat_id, contactnummer, if_else(name == "Combo2", 1, 2)) %>% 
  mutate(target = lead(source), .by = pat_id) %>% 
  filter(!is.na(target))

nodes <- data.frame(id = unique(c(links$source, links$target)))
nodes$name <- sub("_contact[0-9]*$", "", nodes$id)

links$source_id <- match(links$source, nodes$id) - 1
links$target_id <- match(links$target, nodes$id) - 1

nodes$group <- sub("-[0-9]*$", "", nodes$name)

sankeyNetwork(
  Links = links,
  Nodes = nodes, 
  Source = "source_id",
  Target = "target_id",
  Value = 1, 
  NodeID = "name",
  fontSize = 15,
  sinksRight = FALSE, 
  NodeGroup = "group"
)