Take a groups from one table and use other tables to calculate euclidean distance

273 Views Asked by At

I would like to calculate euclidean distance between specific profiles. The biggest problem is how to get specific rows together to calculate a distance between them. In the first table I have stored the groups with the names of rows from different tables which should be taken for distance calculation. First table looks like that:

        Activity     Person ValueOfComp
    1   Football Mark_1_OUT           4
    2   Football Greg_1_OUT           4
    3   Football Mark_1_INT           4
    4   Football Greg_1_INT           4
    5 Volleyball  Tim_1_INT          6
    6 Volleyball  Tim_1_OUT          6
    7 Volleyball  Tom_1_INT          6
    8 Volleyball  Tom_1_OUT          6
    9 Volleyball  Sim_1_INT          6
    10 Volleyball  Sim_1_OUT          6
    11 Handball  Karl_1_OUT          8
    12 Handball  Karl_1_INT          8
    13 Handball  Matt_1_OUT          8
    14 Handball  Matt_1_INT          8
    15 Handball  Jake_1_INT          8
    16 Handball  Jake_1_OUT          8
    17 Handball  Sonya_1_OUT          8
    18 Handball  Sonya_1_INT          8

There are two table which store the profiles of mentioned variables which should be sued for Euclidean distance calculation.

Table 1 lets say that one is for variables ending with INT:

                10         34        59 84        110       134       165       199
Mark_1 0.000000000 0.00000000 0.0000000  1 0.12345123 0.1160406 0.2847189 0.4636836
Greg_1 0.000000000 0.00000000 0.1719200  1 0.68940000 0.2087267 0.2469333 0.2358933
Tim_1 0.000000000 0.00000000 0.0000000  1 0.123415551 0.55321234 0.0000000 0.0000000
Tom_1 0.000000000 0.00000000 0.0000000  0 1 0.11234120 0.1755712 0.2344607
Sim_1 0.000000000 0.00000000 0.0000000  1 0.324532121 0.123412666 0.0000000 0.0000000
Karl_1 1 0.123256312 0.34312334  0 0.00000000 0.0000000 0.0000000 0.0000000
Matt_1 0.000000000 0.03978242 0.1272671  1 0.00000000 0.0000000 0.0000000 0.0000000
Moham_1 0.5123412423 0.12423561 0.1775713  1 0.01186404 0.0000000 0.0000000 0.0000000
Teraq_1 0.009915695 0.13451256 0.2211453  1 0.01186404 0.0000000 0.0000000 0.0000000
Jake_1 0.066915225 0.20623498 0.53215713  1 0.01186404 0.0000000 0.0000000 0.0000000
Sonya_1 0.000000000 0.21341411 0.5323123  1 0.00000000 0.0000000 0.0000000 0.0000000
Monique_1 1 0.4311223 0.22343212  0 0.00000000 0.0000000 0.0000000 0.0000000

Table 2 lets say that one is for variables ending with OUT:

                10         34        59 84        110       134       165       199
Mark_1 0.000000000 0.00000000 0.0000000  1 0.33345123 0.2530406 0.2147189 0.4636836
Greg_1 0.000000000 0.00000000 0.1719200  1 0.48240000 0.22345726 0.2122233 0.2358933
Tim_1 0.000000000 0.00000000 0.0000000  1 0.623415551 0.35321234 0.0000000 0.0000000
Tom_1 0.000000000 0.00000000 0.0000000  0 1 0.4122120 0.3755712 0.2324607
Sim_1 0.000000000 0.00000000 0.0000000  1 0.33352121 0.223412666 0.0000000 0.0000000
Karl_1 1 0.553256312 0.24312334  0 0.00000000 0.0000000 0.0000000 0.0000000
Matt_1 0.000000000 0.11978242 0.1272671  1 0.00000000 0.0000000 0.0000000 0.0000000
Moham_1 0.5123412423 0.52423561 0.6775713  1 0.31186404 0.0000000 0.0000000 0.0000000
Teraq_1 0.119915695 0.16451256 0.2433253  1 0.09186404 0.0000000 0.0000000 0.0000000
Jake_1 0.264915225 0.33123498 0.39215713  1 0.11186404 0.0000000 0.0000000 0.0000000
Sonya_1 0.000000000 0.33341411 0.4323123  1 0.00000000 0.0000000 0.0000000 0.0000000
Monique_1 1 0.5511223 0.44343212  0 0.00000000 0.0000000 0.0000000 0.0000000

So based on the groups from the first table Football, Volleyball, etc I would like to take all the profiles from this group and calculated the euclidean distance between them. The profiles can be found in other tables. The distance should be calculated between all profiles from this group even if there are taken from the same table.

Would be nice if the results would be stored as a separate table with pairs, activity and calculated distance.

My real data consists few thousands of rows but I have CPU power to run the loop as well.

Can someone help me with an answer ?

EDIT: Reproducible example:

> dput(repr_data)
structure(list(Activity = structure(c(1L, 1L, 1L, 1L, 3L, 3L, 
3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Football", 
"Handball", "Volleyball"), class = "factor"), Person = structure(c(8L, 
7L, 2L, 1L, 15L, 16L, 17L, 18L, 11L, 12L, 6L, 5L, 10L, 9L, 3L, 
4L, 14L, 13L), .Label = c("Greg_1_INT", "Greg_1_OUT", "Jake_1_INT", 
"Jake_1_OUT", "Karl_1_INT", "Karl_1_OUT", "Mark_1_INT", "Mark_1_OUT", 
"Matt_1_INT", "Matt_1_OUT", "Sim_1_INT", "Sim_1_OUT", "Sonya_1_INT", 
"Sonya_1_OUT", "Tim_1_INT", "Tim_1_OUT", "Tom_1_INT", "Tom_1_OUT"
), class = "factor"), ValueOfComp = c(4, 4, 4, 4, 6, 6, 6, 6, 
6, 6, 8, 8, 8, 8, 8, 8, 8, 8)), .Names = c("Activity", "Person", 
"ValueOfComp"), row.names = c(NA, -18L), class = "data.frame")

Table 1:

> dput(INT_tbl)
structure(c(0, 0, 0, 0, 0, 1, 0.22123412423, 0.0123915695, 0.0126915225, 
0.4312, 1, 0, 0, 0, 0, 0, 0.323256312, 0.32423561, 0.44451256, 
0.33623498, 0.21341411, 0.321223, 0.232, 0.57192, 0, 0, 0, 0.31312334, 
0.2775713, 0.1311453, 0.63215713, 0.4423123, 0.132212, 1, 1, 
1, 0, 1, 0, 1, 1, 1, 1, 0, 0.55345123, 0.689875, 0.423415551, 
1, 0.444532121, 0, 0.01186404, 0.22132204, 0.21186404, 0, 0, 
0.234126, 0.33347267, 0.35321234, 0.4123412, 0.333412666, 0, 
0, 0, 0.3123, 0, 0, 0.1147189, 0.12343, 0.3155, 0.2755712, 0.123, 
0, 0, 0, 0, 0, 0, 0.1236836, 0.0058933, 0, 0.1344607, 0, 0, 0, 
0, 0, 0, 0), .Dim = c(11L, 8L), .Dimnames = list(c("Mark_1", 
"Greg_1", "Tim_1", "Tom_1", "Sim_1", "Karl_1", "Moham_1", "Teraq_1", 
"Jake_1", "Sonya_1", "Monique_1"), c("10", "34", "59", "84", 
"110", "134", "165", "199")))

Table 2:

> dput(OUT_tbl)
structure(c(0.236915225, 0, 0, 0, 0, 0, 1, 1, 0.22123412423, 
0.0123915695, 0.0126915225, 0.4312, 1, 0.26666498, 0, 0, 0, 0, 
0, 0.323256312, 0.52356312, 0.32423561, 0.44451256, 0.33623498, 
0.21341411, 0.321223, 0.123415713, 0.232, 0.57192, 0, 0, 0, 0.31312334, 
0.12342332, 0.2775713, 0.1311453, 0.63215713, 0.4423123, 0.132212, 
1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0.2235404, 0.55345123, 
0.689875, 0.423415551, 1, 0.444532121, 0, 0, 0.01186404, 0.22132204, 
0.21186404, 0, 0, 0.123, 0.234126, 0.33347267, 0.35321234, 0.4123412, 
0.333412666, 0, 0, 0, 0, 0.3123, 0, 0, 0, 0.1147189, 0.12343, 
0.3155, 0.2755712, 0.123, 0, 0, 0, 0, 0, 0, 0, 0, 0.1236836, 
0.0058933, 0, 0.1344607, 0, 0, 0, 0, 0, 0, 0, 0), .Dim = c(13L, 
8L), .Dimnames = list(c("Karsten_1", "Mark_1", "Greg_1", "Tim_1", 
"Tom_1", "Sim_1", "Karl_1", "Johan_1", "Moham_1", "Teraq_1", 
"Jake_1", "Sonya_1", "Monique_1"), c("10", "34", "59", "84", 
"110", "134", "165", "199")))

Desired output:

        Activity     Person 1   Person 2    EUC.DIST
    1   Football Mark_1_OUT    Greg_1_OUT      XX
    2   Football Mark_1_OUT    Mark_1_INT      XX
    3   Football Mark_1_OUT    Greg_1_INT      XX
    4   Football Greg_1_INT    Greg_1_OUT      XX
    5   Football Greg_1_INT    Mark_1_INT      XX
    6   Football Greg_1_OUT    Mark_1_INT      XX
    ........
    and so on with other combinations withing rest of the groups.
3

There are 3 best solutions below

0
On BEST ANSWER

Here's an alternative using dplyr. I think it works better (and perhaps easier to understand) to combine INT_tbl and OUT_tbl after updating the row names accordingly:

rownames(INT_tbl) <- paste0(rownames(INT_tbl), "_INT")
rownames(OUT_tbl) <- paste0(rownames(OUT_tbl), "_OUT")
BOTH_tbl <- rbind(INT_tbl, OUT_tbl)

You have a name in repr_data that is not present in the data. If you need the pair of people to remain with an NA distance, then use solution number one; if you do not want/need the pair in the data, use solution number two. (Performance is essentially the same.) Regardless, to deal with it, we need to know all possibles ahead of time:

allpeople <- rownames(BOTH_tbl)
library(dplyr)

Solution One

If you need to keep missing people visible with NA distance:

repr_data %>%
  group_by(Activity) %>%
  do({
    people <- as.character(unique(.$Person))
    peoplei <- match(people, allpeople)
    d <- dist(BOTH_tbl[peoplei,])
    n <- length(people) - 1
    data.frame(
      Person1 = rev(people[-1])[unlist(mapply(`:`, n:1, 1))],
      Person2 = rep(people, times = n:0),
      Dist = unclass(d),
      stringsAsFactors = FALSE
    )
  }) %>%
  ungroup()
# # A tibble: 49 × 4
#    Activity    Person1    Person2      Dist
#      <fctr>      <chr>      <chr>     <dbl>
# 1  Football Mark_1_INT Mark_1_OUT 0.0000000
# 2  Football Greg_1_OUT Mark_1_OUT 0.3974635
# 3  Football Greg_1_INT Mark_1_OUT 0.3974635
# 4  Football Greg_1_OUT Mark_1_INT 0.3974635
# 5  Football Greg_1_INT Mark_1_INT 0.3974635
# 6  Football Greg_1_INT Greg_1_OUT 0.0000000
# 7  Handball Karl_1_INT Karl_1_OUT 0.0000000
# 8  Handball Matt_1_OUT Karl_1_OUT        NA
# 9  Handball Matt_1_INT Karl_1_OUT        NA
# 10 Handball Jake_1_INT Karl_1_OUT 1.4896801
# # ... with 39 more rows

Solution Two

If missing people can be omitted from the results.

repr_data %>%
  group_by(Activity) %>%
  do({
    people <- intersect(as.character(unique(.$Person)), allpeople)
    d <- dist(BOTH_tbl[people,])
    n <- length(people) - 1
    data.frame(
      Person1 = rev(people[-1])[unlist(mapply(`:`, n:1, 1))],
      Person2 = rep(people, times = n:0),
      Dist = unclass(d),
      stringsAsFactors = FALSE
    )
  }) %>%
  ungroup()
# # A tibble: 36 × 4
#    Activity     Person1    Person2      Dist
#      <fctr>       <chr>      <chr>     <dbl>
# 1  Football  Mark_1_INT Mark_1_OUT 0.0000000
# 2  Football  Greg_1_OUT Mark_1_OUT 0.3974635
# 3  Football  Greg_1_INT Mark_1_OUT 0.3974635
# 4  Football  Greg_1_OUT Mark_1_INT 0.3974635
# 5  Football  Greg_1_INT Mark_1_INT 0.3974635
# 6  Football  Greg_1_INT Greg_1_OUT 0.0000000
# 7  Handball  Karl_1_INT Karl_1_OUT 0.0000000
# 8  Handball  Jake_1_INT Karl_1_OUT 1.4896801
# 9  Handball  Jake_1_OUT Karl_1_OUT 1.4896801
# 10 Handball Sonya_1_OUT Karl_1_OUT 1.1628794
# # ... with 26 more rows
0
On

Ok, this can get a little messy but bear with me.

First we take INT_tbl & OUT_tbl and do some work on those. We make them data frames, add the rownames as a column and add a suffix in each entry. This is done in order to rbind both Out and Int tables into a full data frame, i.e.

library(dplyr)
library(tidyr)

out <- setNames(data.frame(paste0(rownames(OUT_tbl), '_OUT'), OUT_tbl, 
                row.names = NULL, stringsAsFactors = FALSE), c('Person', colnames(OUT_tbl)))

int <- setNames(data.frame(paste0(rownames(INT_tbl), '_INT'), INT_tbl, 
                row.names = NULL, stringsAsFactors = FALSE), c('Person', colnames(INT_tbl)))

full_d <- rbind(out, int)

#which gives,
rbind(head(full_d, 3), tail(full_d, 3))
#          Person         10        34        59 84       110       134       165       199
#1  Karsten_1_OUT 0.23691523 0.2666650 0.1234157  1 0.2235404 0.1230000 0.0000000 0.0000000
#2     Mark_1_OUT 0.00000000 0.0000000 0.2320000  1 0.5534512 0.2341260 0.1147189 0.1236836
#3     Greg_1_OUT 0.00000000 0.0000000 0.5719200  1 0.6898750 0.3334727 0.1234300 0.0058933
#22    Jake_1_INT 0.01269152 0.3362350 0.6321571  1 0.2118640 0.3123000 0.0000000 0.0000000
#23   Sonya_1_INT 0.43120000 0.2134141 0.4423123  1 0.0000000 0.0000000 0.0000000 0.0000000
#24 Monique_1_INT 1.00000000 0.3212230 0.1322120  0 0.0000000 0.0000000 0.0000000 0.0000000

We then create a function to calculate the distance between all possible pairs of Persons, i.e.

#define the Euclidean distance first
euc.dist <- function(i, j) {sqrt(sum((i - j) ^ 2))}

#Create the function
Get_dist <- function(x){
    d12 <- setNames(as.data.frame(cbind(as.character(x$Activity[1]), t(combn(as.character(x$Person), 2))), 
                                  stringsAsFactors = FALSE), c('Activity', 'Person1', 'Person2')) 
    new_d <- d12 %>% 
      gather(new, label, -Activity) %>% 
      left_join(., full_d, by = c('label' = 'Person'))
    l1 <- split(new_d, new_d$new)
    d12$EUC.DIST <- as.numeric(mapply(euc.dist, as.data.frame(t(l1[[1]][-c(1:3)])), 
                                      as.data.frame(t(l1[[2]][-c(1:3)]))))
    return(d12)
}

To apply the function

we split the data frame by Activity, apply the function and use bind_rows to convert it (from list) to a data frame. i.e,

final_d <- bind_rows(lapply(split(df, df$Activity), Get_dist))

final_d
#     Activity     Person1     Person2  EUC.DIST
#1    Football  Mark_1_OUT  Mark_1_INT 0.0000000
#2    Football  Mark_1_OUT  Greg_1_OUT 0.3974635
#3    Football  Mark_1_OUT  Greg_1_INT 0.3974635
#4    Football  Mark_1_INT  Greg_1_OUT 0.3974635
#5    Football  Mark_1_INT  Greg_1_INT 0.3974635
#6    Football  Greg_1_OUT  Greg_1_INT 0.0000000
#7    Handball  Karl_1_OUT  Karl_1_INT 0.0000000
#8    Handball  Karl_1_OUT  Matt_1_OUT        NA
#9    Handball  Karl_1_OUT  Matt_1_INT        NA
#10   Handball  Karl_1_OUT  Jake_1_INT 1.4896801

If you want to exclude NA values from the final data frame then simply,

final_d <- final_d[!is.na(final_d$EUC.DIST),]
0
On

Please check this.

#Convert to data.frame and cleanup
INT_tbl = as.data.frame(INT_tbl)
OUT_tbl = as.data.frame(OUT_tbl)
INT_tbl$Remarks = "INT"
OUT_tbl$Remarks = "OUT"
INT_tbl$Names = rownames(INT_tbl)
OUT_tbl$Names = rownames(OUT_tbl)
rownames(INT_tbl) = NULL
rownames(OUT_tbl) = NULL

# Initiate empty lists
Name_Pair1 = list()
Name_Pair2 = list()
EDistance = list()

m = 1

#Compute distance between all names in INT and OUT and add to lists
while (m < nrow(INT_tbl)*nrow(OUT_tbl)){
    for (i in 1:nrow(INT_tbl)){
        for (j in 1:nrow(OUT_tbl)){
            Name_Pair1[m] = paste(INT_tbl$Names[i],"_INT-",OUT_tbl$Names[j],"_OUT",sep="")
            Name_Pair2[m] = paste(OUT_tbl$Names[j],"_OUT-",INT_tbl$Names[i],"_INT",sep="")
            EDistance[m] = sqrt((INT_tbl$`10`[i]-OUT_tbl$`10`[i])^2+
                                    (INT_tbl$`34`[i]-OUT_tbl$`34`[i])^2+
                                    (INT_tbl$`59`[i]-OUT_tbl$`59`[i])^2+
                                    (INT_tbl$`84`[i]-OUT_tbl$`84`[i])^2+
                                    (INT_tbl$`110`[i]-OUT_tbl$`110`[i])^2+
                                    (INT_tbl$`134`[i]-OUT_tbl$`134`[i])^2+
                                    (INT_tbl$`165`[i]-OUT_tbl$`165`[i])^2+
                                    (INT_tbl$`199`[i]-OUT_tbl$`199`[i])^2)
            m = m+1
        }
    }
}

#COmbine lists into data.frame and cleanup 
DDistance = data.frame(cbind(Name_Pair1,Name_Pair2,EDistance))
DDistance$Name_Pair1 = as.character(DDistance$Name_Pair1)
DDistance$Name_Pair2 = as.character(DDistance$Name_Pair2)
DDistance$EDistance = as.numeric(DDistance$EDistance)

#Initiate OUTPUT data.frame 
Out.put = data.frame(V1 = NA,V2=NA,Name_Pair=NA,EDistance=NA,Activity=NA)

#Obtain list of unique Activity 
Activity = as.character(unique(repr_data$Activity))

for (i in 1:length(Activity)){
    df = repr_data[repr_data$Activity == Activity[i],] #Subset for unique activity
    x = as.data.frame(combn(df$Person,2,simplify = FALSE)) #Get all combination of names in the subset
    x= t(x)
    rownames(x) = NULL
    x= as.data.frame(x)

    #Lookup distance for each row based on Name1(V1) and Name2(V2)
    for (j in 1:nrow(x)){
        x$Name_Pair[j] = paste(x$V1[j],x$V2[j],sep="-")
        for (k in 1:nrow(DDistance)){
            if (x$Name_Pair[j] == DDistance$Name_Pair1[k] | x$Name_Pair[j] == DDistance$Name_Pair2[k])
                x$EDistance = DDistance$EDistance[k]
            next
        }
        x$Activity = Activity[i]
    }
    Out.put = rbind(Out.put,x) #Append to Out.put
}

Out.put = Out.put[2:nrow(Out.put),] #Cleanup