How to calculate a proportion in R

379 Views Asked by At

I have this reproducible DataFrame:

 structure(list(age = c(62.84998, 60.33899, 52.74698, 42.38498, 
 79.88495, 93.01599, 62.37097, 86.83899, 85.65594, 42.25897), 
     death = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1), sex = c("male", 
     "female", "female", "female", "female", "male", "male", "male", 
     "male", "female"), hospdead = c(0, 1, 0, 0, 0, 1, 0, 0, 0, 
     0), slos = c(5, 4, 17, 3, 16, 4, 9, 7, 12, 8), d.time = c(2029, 
     4, 47, 133, 2029, 4, 659, 142, 63, 370), dzgroup = c("Lung Cancer", 
     "Cirrhosis", "Cirrhosis", "Lung Cancer", "ARF/MOSF w/Sepsis", 
     "Coma", "CHF", "CHF", "Lung Cancer", "Colon Cancer"), dzclass = c("Cancer", 
     "COPD/CHF/Cirrhosis", "COPD/CHF/Cirrhosis", "Cancer", "ARF/MOSF", 
     "Coma", "COPD/CHF/Cirrhosis", "COPD/CHF/Cirrhosis", "Cancer", 
     "Cancer"), num.co = c(0, 2, 2, 2, 1, 1, 1, 3, 2, 0), edu = c(11, 
     12, 12, 11, NA, 14, 14, NA, 12, 11), income = c("$11-$25k", 
     "$11-$25k", "under $11k", "under $11k", NA, NA, "$25-$50k", 
     NA, NA, "$25-$50k"), scoma = c(0, 44, 0, 0, 26, 55, 0, 26, 
     26, 0), charges = c(9715, 34496, 41094, 3075, 50127, 6884, 
     30460, 30460, NA, 9914), totcst = c(NA_real_, NA_real_, NA_real_, 
     NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
     NA_real_), totmcst = c(NA_real_, NA_real_, NA_real_, NA_real_, 
     NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
     ), avtisst = c(7, 29, 13, 7, 18.666656, 5, 8, 6.5, 8.5, 8
     ), race = c("other", "white", "white", "white", "white", 
     "white", "white", "white", "black", "hispanic"), sps = c(33.8984375, 
     52.6953125, 20.5, 20.0976562, 23.5, 19.3984375, 17.296875, 
     21.5976562, 15.8984375, 2.2998047), aps = c(20, 74, 45, 19, 
     30, 27, 46, 53, 17, 9), surv2m = c(0.262939453, 0.0009999275, 
     0.790893555, 0.698974609, 0.634887695, 0.284973145, 0.892944336, 
     0.670898438, 0.570922852, 0.952880859), surv6m = c(0.0369949341, 
     0, 0.664916992, 0.411987305, 0.532958984, 0.214996338, 0.820922852, 
     0.498962402, 0.24899292, 0.887939453), hday = c(1, 3, 4, 
     1, 3, 1, 1, 1, 1, 1), diabetes = c(0, 0, 0, 0, 0, 0, 0, 1, 
     0, 0), dementia = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 0), ca = c("metastatic", 
     "no", "no", "metastatic", "no", "no", "no", "no", "metastatic", 
     "metastatic"), prg2m = c(0.5, 0, 0.75, 0.899999619, 0.899999619, 
     0, NA, 0.799999714, 0.049999982, NA), prg6m = c(0.25, 0, 
     0.5, 0.5, 0.8999996, 0, 0.6999998, 0.3999999, 0.0001249999, 
     NA), dnr = c("no dnr", NA, "no dnr", "no dnr", "no dnr", 
     "no dnr", "no dnr", "no dnr", "dnr after sadm", "no dnr"), 
     dnrday = c(5, NA, 17, 3, 16, 4, 9, 7, 2, 8), meanbp = c(97, 
     43, 70, 75, 59, 110, 78, 72, 97, 84), wblc = c(6, 17.0976562, 
     8.5, 9.09960938, 13.5, 10.3984375, 11.6992188, 13.5996094, 
     9.69921875, 11.2988281), hrt = c(69, 112, 88, 88, 112, 101, 
     120, 100, 56, 94), resp = c(22, 34, 28, 32, 20, 44, 28, 26, 
     20, 20), temp = c(36, 34.59375, 37.39844, 35, 37.89844, 38.39844, 
     37.39844, 37.59375, 36.59375, 38.19531), pafi = c(388, 98, 
     231.65625, NA, 173.3125, 266.625, 309.5, 404.75, 357.125, 
     NA), alb = c(1.7998047, NA, NA, NA, NA, NA, 4.7998047, NA, 
     NA, 4.6992188), bili = c(0.19998169, NA, 2.19970703, NA, 
     NA, NA, 0.39996338, NA, 0.39996338, 0.19998169), crea = c(1.19995117, 
     5.5, 2, 0.79992676, 0.79992676, 0.69995117, 1.59985352, 2, 
     1, 0.79992676), sod = c(141, 132, 134, 139, 143, 140, 132, 
     139, 143, 139), ph = c(7.459961, 7.25, 7.459961, NA, 7.509766, 
     7.65918, 7.479492, 7.509766, 7.449219, NA), glucose = c(NA_real_, 
     NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
     NA_real_, NA_real_, NA_real_), bun = c(NA_real_, NA_real_, 
     NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
     NA_real_, NA_real_), urine = c(NA_real_, NA_real_, NA_real_, 
     NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
     NA_real_), adlp = c(7, NA, 1, 0, NA, NA, 0, NA, NA, 0), adls = c(7, 
     1, 0, 0, 2, 1, 1, 0, 7, NA), sfdm2 = c(NA, "<2 mo. follow-up", 
     "<2 mo. follow-up", "no(M2 and SIP pres)", "no(M2 and SIP pres)", 
     "<2 mo. follow-up", "no(M2 and SIP pres)", NA, NA, NA), adlsc = c(7, 
     1, 0, 0, 2, 1, 1, 0, 7, 0.4947999)), row.names = c(NA, 10L
 ), class = "data.frame")

I am needing to calculate the proportion of patients who died in the hospital in patients with an active DNR order on day 3 and in patients without an active DNR order on day 3. To group which patients had an active DNR on day 3 and which did not, I used the subset function below:

SB_xlsx1 = SB_xlsx[!is.na(SB_xlsx$dnrday), ]
YesDNR = subset(SB_xlsx1, dnrday <= 3)
NoDNR = subset(SB_xlsx1, dnrday > 3)

However, I don't know how to calculate the proportion of patients that died in the hospital for those with a DNR and without a DNR. The 'hospdead' variable has all 0s and 1s, where 0 = not dead and 1 = dead. However, I don't know how to get the proportion that died for having a DNR at day 3 and did not have a DNR at day 3. What code could I use for my desired result. SB_xlsx also just represents my DataFrame name.

3

There are 3 best solutions below

0
On

There's a few ways to do this but the simplest is probably via the aggregate function.

> aggregate( hospdead ~ (dnrday<=3) , SB_xlsx1 , mean)

  dnrday <= 3  hospdead
1       FALSE 0.1428571
2        TRUE 0.0000000
0
On

EDIT: I apologize; I misread your post when providing my original answer. I've revised it below.

You referred to the hospdeath variable, but in the toy data set it has just one nonzero entry, so I'm using the death variable instead to demonstrate the principle.

First, abase R approach:

mean(SB_xlsx1[SB_xlsx1$death == 1, ]$dnrday <= 3)
mean(SB_xlsx1[SB_xlsx1$death == 1, ]$dnrday > 3)

The idea is to restrict to the subset of rows for which a death occurred, then perform a logical check to see which entries have dnrday greater than 3.

Note that if you have NA entries in death, you'll want to remove them first as you did with those in dnrday.


For a dplyr approach:

library(dplyr)
SB_xlsx1 %>%
  filter(death == 1) %>%
  summarize(mean(dnrday <= 3), mean(dnrday > 3))

or, for a slightly nicer-looking table,

SB_xlsx1 %>%
  filter(death == 1) %>%
  group_by(dnrday <= 3) %>%
  summarize(prop = n() / nrow(.))
2
On

You may use tapply to group deaths by the condition dnrday <= 3, i.e. with an active DNR on day 3 and calculate the mean.

(res <- proportions(xtabs(death ~ dnrday <= 3, SB_xlsx)))
# dnrday <= 3
#     FALSE      TRUE 
# 0.7142857 0.2857143 

where

sum(res)
# [1] 1