How to make a sankey graph using ggalluvial in R?

110 Views Asked by At

I am trying to make a Sankey graph using the ggalluvial package in R.

I have the following data: an ID, Cluster3 (values ranging from 1 to 3), Cluster6 (values ranging from 1 to 6), and x and y coordinates (as this is a spatial dataset).

I merely want to plot a graph showing the relationship between the two (i.e. how the 3 clusters are split into the 6 clusters).

I keep getting the following error message:

Error: Continuous value supplied to discrete scale
In addition: Warning messages:
1: In to_lodes_form(data = data, axes = axis_ind, discern = params$discern) :
  Some strata appear at multiple axes.
2: In to_lodes_form(data = data, axes = axis_ind, discern = params$discern) :
  Some strata appear at multiple axes.
3: In to_lodes_form(data = data, axes = axis_ind, discern = params$discern) :
  Some strata appear at multiple axes.

I suspect this is because of the ID values, which are continuous from 1 to 13336.

I tried using the x and y coordinates as "markers/identifiers" but here too, I get the same error.

I first tried using the networkD3 package in R, but the igraph package it relies on doesn't seem to work. So, as an alternative, I tried ggalluvial using the code below:

library(ggalluvial)
library(raster)
library(dplyr)
library(tidyverse)
# Read the 3-cluster and 6-cluster classification rasters.
Clust3 <- raster("//.....3Cluster.tif")
Clus6 <- raster("//.....6Cluster.tif")

# Turn each raster into a point data frame (cell value plus x/y) and tag
# every row with a zero-padded running ID.
stack_sdf3 <- rasterToPoints(Clust3, spatial = TRUE)
stack_df3 <- as.data.frame(stack_sdf3)
stack_df3$ID <- sprintf("%03d", 1:nrow(stack_df3))

stack_sdf6 <- rasterToPoints(Clus6, spatial = TRUE)
stack_df6 <- as.data.frame(stack_sdf6)
stack_df6$ID <- sprintf("%03d", 1:nrow(stack_df6))

# Give the columns descriptive names, then put ID first.
colnames(stack_df3) <- c("Cluster_Number3", "x", "y", "ID")
colnames(stack_df6) <- c("Cluster_Number6", "x", "y", "ID")

df3 <- stack_df3[, c("ID", "Cluster_Number3", "x", "y")]
df6 <- stack_df6[, c("ID", "Cluster_Number6", "x", "y")]

# Match the two classifications cell by cell via their coordinates; the
# shared ID column comes back as ID.x / ID.y after the join.
data <- right_join(df3, df6, by = c("x", "y"))
data2 <- data %>%
  select(ID.x, Cluster_Number3, Cluster_Number6)

# Alluvial plot: 3-cluster solution on the left axis, 6-cluster on the right.
# NOTE(review): Cluster_Number3 appears to be numeric here while
# scale_fill_viridis_d() is a discrete scale -- presumably the source of the
# "Continuous value supplied to discrete scale" error; confirm.
ggplot(
  data2,
  aes(axis1 = Cluster_Number3, axis2 = Cluster_Number6, y = ID.x)
) +
  geom_alluvium(aes(fill = Cluster_Number3)) +
  geom_stratum() +
  geom_text(
    stat = "stratum",
    aes(label = after_stat(stratum))
  ) +
  scale_x_discrete(
    limits = c("Main Cluster", "Diverging clusters"),
    expand = c(0.15, 0.05)
  ) +
  scale_fill_viridis_d() +
  theme_void()

enter image description here

Could you please help me with this, as I am a newbie in R.

1

There is 1 best solution below

0
CJ Yetman On

Given example data in the format you stated (ignoring the x and y values/columns because they're irrelevant here)...

library(tidyverse)

# Simulated stand-in for the joined data: one row per observation with a
# zero-padded ID, a label from each clustering, and dummy x/y coordinates.
ids <- seq_len(133)
n_ids <- length(ids)
id_width <- nchar(max(ids)) # number of digits in the largest ID

data <- data.frame(
  ID.x = str_pad(ids, width = id_width, pad = "0"),
  Cluster_Number3 = sample.int(n = 3, size = n_ids, replace = TRUE),
  Cluster_Number6 = sample.int(n = 6, size = n_ids, replace = TRUE),
  x = sample.int(n = 100, size = n_ids, replace = TRUE),
  y = sample.int(n = 100, size = n_ids, replace = TRUE)
)

data
#>     ID.x Cluster_Number3 Cluster_Number6   x   y
#> 1    001               2               3  47  80
#> 2    002               1               3  10  61
#> 3    003               3               6   9  61
#> 4    004               1               5  28  60
#> 5    005               2               1  38  80
#> 6    006               3               5  71  51
#> 7    007               3               6   1  47
#> 8    008               1               6  59  91
#> 9    009               3               4   7  34
#> 10   010               2               2  46  33
# ℹ 123 more rows
# ℹ Use `print(n = ...)` to see more rows

Using {ggalluvial}...

library(ggalluvial)

# Alluvial plot of how observations flow from the 3-cluster labels (left
# axis) to the 6-cluster labels (right axis), coloured by the 3-cluster label.
data %>%
  # Treat the cluster numbers as categorical labels rather than numbers.
  mutate(across(c(Cluster_Number3, Cluster_Number6), as.factor)) %>%
  ggplot(
    mapping = aes(
      axis1 = Cluster_Number3,
      axis2 = Cluster_Number6,
      fill = Cluster_Number3
    )
  ) +
  geom_alluvium() +
  geom_stratum() +
  geom_text(
    stat = "stratum",
    aes(label = after_stat(stratum))
  ) +
  scale_x_discrete(
    limits = c("Main Cluster", "Diverging clusters"),
    expand = c(0.15, 0.05)
  ) +
  # theme_void() removes all axis text, so the x-axis labels are re-enabled
  # afterwards.
  theme_void() +
  theme(axis.text.x = element_text())
#> Warning in to_lodes_form(data = data, axes = axis_ind, discern =
#> params$discern): Some strata appear at multiple axes.

#> Warning in to_lodes_form(data = data, axes = axis_ind, discern =
#> params$discern): Some strata appear at multiple axes.

#> Warning in to_lodes_form(data = data, axes = axis_ind, discern =
#> params$discern): Some strata appear at multiple axes.

Using {networkD3}...

library(networkD3)

# Build the link table for the Sankey diagram.
# Labels are suffixed so that, e.g., cluster "1" of the 3-cluster solution and
# cluster "1" of the 6-cluster solution become two distinct nodes.
links <-
  data %>%
  mutate(Cluster_Number3 = paste0(Cluster_Number3, " (cl3)")) %>%
  mutate(Cluster_Number6 = paste0(Cluster_Number6, " (cl6)")) %>%
  # Collapse to one row per source/target pair, with `value` holding the
  # number of observations in that pair. Passing one row per observation
  # would hand sankeyNetwork() the same pair many times over.
  count(
    source = Cluster_Number3,
    target = Cluster_Number6,
    name = "value"
  )

# One node per distinct label appearing on either side of a link.
nodes <- data.frame(name = unique(c(links$source, links$target)))

# The Source/Target columns hold zero-based row positions in `nodes`,
# hence the `- 1` after match().
links$source_id <- match(links$source, nodes$name) - 1
links$target_id <- match(links$target, nodes$name) - 1

sankeyNetwork(
  Links = links,
  Nodes = nodes,
  Source = "source_id",
  Target = "target_id",
  Value = "value",
  NodeID = "name"
)