apply vs map: can't filter list-column dataframe with map using tidyverse in R

68 Views Asked by At

I am trying to filter the rows contained in the data column of data_split_ based off information in other columns of data_split_ and my_perts_df_.

So specifically, I want to:

  1. filter the my_perts_df_ table by excluding any pert_iname in exclude
  2. then filter the same table according to grouping_var, which can be either pert_class or pert_iname
  3. Finally, use this filtered table to filter the data column in the data_split_ table.

These are the two tibbles:

my_perts_df_ <- structure(list(pert_iname = c("DMSO", "gsk126", "jq1-s", "unc-0646", 
"geldanamycin", "decitabine", "gsk-j4", "vorinostat", "ly-294002", 
"staurosporine", "ly-294002", "kn-62", "ruxolitinib", "losmapimod", 
"ar a014418", "tofacitinib", "sp600125"), pert_class = c("control", 
"Epigenetic", "Epigenetic", "Epigenetic", "Epigenetic", "Epigenetic", 
"Epigenetic", "Epigenetic", "Epigenetic", "Kinase inhibitor", 
"Kinase inhibitor", "Kinase inhibitor", "Kinase inhibitor", "Kinase inhibitor", 
"Kinase inhibitor", "Kinase inhibitor", "Kinase inhibitor"), 
    P100 = c("P100", "P100", "P100", "P100", "P100", "P100", 
    "P100", "P100", "P100", "P100", "P100", "P100", "P100", "P100", 
    "P100", "P100", "P100"), GCP = c("GCP", "GCP", "GCP", "GCP", 
    "GCP", "GCP", "GCP", "GCP", "GCP", "GCP", "GCP", "GCP", NA, 
    "GCP", "GCP", "GCP", "GCP"), keep = c(TRUE, TRUE, TRUE, TRUE, 
    TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
    TRUE, TRUE, TRUE)), row.names = c(NA, -17L), class = c("tbl_df", 
"tbl", "data.frame"))
data_split_ <- structure(list(dataset_type = c("P100", "P100", "GCP", "GCP"), 
    grouping_var = c("pert_class", "pert_iname", "pert_class", 
    "pert_iname"), filter_vars = list("Kinase inhibitor", "DMSO", 
        "Epigenetic", "DMSO"), exclude = list(NA_character_, 
        NA_character_, NA_character_, NA_character_), output_dir = c("output_final/p100", 
    "output_final/p100", "output_final/gcp", "output_final/gcp"
    ), data = list(structure(list(replicate_id = c("NPC--cabozantinib--VEGFR inhibitor::E01_acq_01::P-0063", 
    "PC3--DMSO--control::A02_acq_01::P-0034", "PC3--bms-345541--Kinase inhibitor::E12_acq_01::P-0034", 
    "MCF7--unc1215--Epigenetic::G09_DIA_acq_01::P-0015", "PC3--dasatinib--CML TKI::B05_acq_01::P-0061", 
    NA, "HUVEC--okadaic acid--Other::F11_acq_01::P-0057", "HUVEC--decitabine--Epigenetic::A05_acq_01::P-0038", 
    "PC3--ruxolitinib--Kinase inhibitor::E02_acq_01::P-0024", 
    "NPC--pd-0332991--Kinase inhibitor::B05_acq_02::P-0027", 
    "A549--vorinostat--Epigenetic::H05_acq_02::P-0058", "YAPC--dasatinib--CML TKI::B06_acq_01::P-0062", 
    "NPC--vorinostat--Epigenetic::H05_DIA_acq_01::P-0020", "A375--tbb--Kinase inhibitor::B10_acq_01::P-0022", 
    "NPC--nilotinib--CML TKI::B11_acq_03::P-0027", "A549--vandetanib--VEGFR inhibitor::E10_acq_01::P-0058", 
    "HUVEC--eplerenone--CV::D02_acq_01::P-0038", NA, "PC3--afuresertib--Kinase inhibitor::G08_acq_01::P-0034", 
    "NPC--ponatinib--CML TKI::A12_acq_01::P-0063", "PC3--sotrastaurin--IMiD::F10_acq_01::P-0024", 
    "HUVEC--atorvastatin--CV::A05_acq_01::P-0057", "HUVEC--losmapimod--Kinase inhibitor::C11_acq_01::P-0038", 
    "HUVEC--aspirin--CV::CS20180418_P100_HUVEC3_Batch2_P-0069_B11_acq_01::P-0069", 
    "A375--belinostat--Epigenetic::D08_DIA_acq_01::P-0017"), 
        pert_iname = c("cabozantinib", "DMSO", "bms-345541", 
        "unc1215", "dasatinib", "gefitinib", "okadaic acid", 
        "decitabine", "ruxolitinib", "pd-0332991", "vorinostat", 
        "dasatinib", "vorinostat", "tbb", "nilotinib", "vandetanib", 
        "eplerenone", "isoproterenol", "afuresertib", "ponatinib", 
        "sotrastaurin", "atorvastatin", "losmapimod", "aspirin", 
        "belinostat"), pert_class = c("VEGFR inhibitor", "control", 
        "Kinase inhibitor", "Epigenetic", "CML TKI", NA, "Other", 
        "Epigenetic", "Kinase inhibitor", "Kinase inhibitor", 
        "Epigenetic", "CML TKI", "Epigenetic", "Kinase inhibitor", 
        "CML TKI", "VEGFR inhibitor", "CV", NA, "Kinase inhibitor", 
        "CML TKI", "IMiD", "CV", "Kinase inhibitor", "CV", "Epigenetic"
        ), pr_gene_symbol = c("pS142 DPF2", "pS222 RBM17", "pS515 ZC3H14", 
        "pS235 RPS6", "pS405 C13orf8", "pS403 RNF169", "pS1035 GPATCH8", 
        "pS1075 AP1GBP1", "pS12 EIF4A3", "pS56 NOC2L", "pS230 SH3KBP1", 
        "pS2218 MAP4", "pS275 NANS", "pS405 C13orf8", "pS75 DDX54", 
        "pS446 BRAF", "pS500 C17orf85", "pS692 FAM129B", "pT596 MARK2", 
        "pS163 MAP3K2", "pS386 PFKP", "pS75 DDX54", "pS652 NUFIP2", 
        "pS275 NANS", "pS207 FASN"), value = c(0.181722044944763, 
        0.6737335, 0.021798491, 0.777894914150238, 0.0691990852355957, 
        0.2050631, 0.19466971, -0.30280375, -0.0263868570327759, 
        -0.196122169494629, 0.0888378620147705, -0.07376492, 
        0.05050297, 0.309347808361053, -0.146979808807373, -0.0501706600189209, 
        0.20668465, -0.279913723468781, 0.15067881, -0.0937120914459229, 
        -0.129467606544495, -0.07893503, -0.09858775, 0.312014818191528, 
        -0.0723450779914856)), row.names = c(NA, -25L), class = c("tbl_df", 
    "tbl", "data.frame")), structure(list(replicate_id = c("MCF7--decitabine--Epigenetic::F02_DIA_acq_01::P-0015", 
    "PC3--kn-93--Kinase inhibitor::E07_acq_01::P-0024", "NPC--verapamil--CV::E06_acq_02::P-0063", 
    "A549--ar a014418--Kinase inhibitor::D12_acq_01::P-0033", 
    "A375--gsk525762a--Epigenetic::C01_DIA_acq_01::P-0017", "YAPC--gsk126--Epigenetic::A05_acq_01::P-0030", 
    "YAPC--trametinib--Kinase inhibitor::D10_acq_01::P-0062", 
    "NPC--pri-724--Other::E07_acq_03::P-0027", "PC3--gossypetin--Kinase inhibitor::F09_acq_01::P-0024", 
    "A549--jq1-s--Epigenetic::D03_DIA_acq_01::P-0019", NA, "HUVEC--calyculin a--Kinase inhibitor::D08_acq_01::P-0057", 
    "MCF7--imatinib--CML TKI::C08_acq_01::P-0060", "NPC--byl719--Kinase inhibitor::H07_acq_03::P-0027", 
    "MCF7--c646--Epigenetic::C07_acq_02::P-0023", "NPC--tbb--Kinase inhibitor::B10_DIA_acq_01::P-0020", 
    "YAPC--ms-275--Epigenetic::A07_acq_01::P-0030", "MCF7--ms-275--Epigenetic::A08_DIA_acq_01::P-0015", 
    "HUVEC--curcumin--Other::D08_acq_01::P-0038", "MCF7--etoposide--Other::B02_acq_01::P-0023", 
    "MCF7--unc-0321--Epigenetic::E06_DIA_acq_01::P-0015", "NPC--ex527--Epigenetic::C20150526_P-0016_NPC_T1_cmpds_P-0016_C11_DIA_acq_01::P-0016", 
    "A549--sch 900776--Kinase inhibitor::B08_acq_01::P-0033", 
    "A549--gsk126--Epigenetic::A06_DIA_acq_01::P-0019", NA), 
        pert_iname = c("decitabine", "kn-93", "verapamil", "ar a014418", 
        "gsk525762a", "gsk126", "trametinib", "pri-724", "gossypetin", 
        "jq1-s", "cyclosporine", "calyculin a", "imatinib", "byl719", 
        "c646", "tbb", "ms-275", "ms-275", "curcumin", "etoposide", 
        "unc-0321", "ex527", "sch 900776", "gsk126", "ceritinib"
        ), pert_class = c("Epigenetic", "Kinase inhibitor", "CV", 
        "Kinase inhibitor", "Epigenetic", "Epigenetic", "Kinase inhibitor", 
        "Other", "Kinase inhibitor", "Epigenetic", NA, "Kinase inhibitor", 
        "CML TKI", "Kinase inhibitor", "Epigenetic", "Kinase inhibitor", 
        "Epigenetic", "Epigenetic", "Other", "Other", "Epigenetic", 
        "Epigenetic", "Kinase inhibitor", "Epigenetic", NA), 
        pr_gene_symbol = c("pS207 FASN", "pS230 SH3KBP1", "pS465 WDR20", 
        "pS652 NUFIP2", "pS601 LARP5", "pT3893 PLEC1_1", "pY321 DYRK1A", 
        "pS163 MAP3K2", "pS2 PAK2", "pS200 FOSL2_1", "pS402 SRRM1", 
        "pS2 TMSB4X", "pS1012 NUP214", "pS1075 AP1GBP1", "pS1219 BAT2", 
        "pS353 SRRM2", "pS142 DPF2", "pT2675 BAT2D1", "pS1035 GPATCH8", 
        "pS275 NANS", "pS556 ULK1", "pS103 DHX16", "pS515 ZC3H14", 
        "pS1075 AP1GBP1", "pS207 FASN"), value = c(0.0880586504936218, 
        -0.1592857837677, -0.232870101928711, 0.145888328552246, 
        -0.233265995979309, 0.659018278121948, 0.03665614, -0.080927848815918, 
        0.115837097167969, 0.234739303588867, 0.192527055740356, 
        0, 0.315743684768677, -0.426863610744476, -0.0367576479911804, 
        -0.16833115, 0.579586148262024, 0.263529172516428, -0.7783067, 
        -0.214210987091064, 0.105207920074463, 0.728229999542236, 
        -0.03392493724823, -0.106773972511292, -0.0157837867736816
        )), row.names = c(NA, -25L), class = c("tbl_df", "tbl", 
    "data.frame")), structure(list(replicate_id = c(NA, "A375--tretinoin--Other::A06_acq_01::G-0022", 
    "HAoSMC--dexamethasone--Other::A08_acq_01::G-0039a", "YAPC--epz-5687--Epigenetic::T20151105_Plate30_YAPC_T1_G-0030_G09_acq_01::G-0030", 
    "NPC--tbb--Kinase inhibitor::B12_acq_01::G-0020", "PC3--vandetanib--VEGFR inhibitor::E12_acq_02::G-0061", 
    "A549--DMSO--control::A01_acq_01::G-0033", "YAPC--flavopiridol--Kinase inhibitor::A04_acq_01::G-0032", 
    "HUVEC--tretinoin--Other::B09_acq_01::G-0038", "A375--sotrastaurin--IMiD::T_F12_acq_02::G-0022", 
    "A375--afuresertib--Kinase inhibitor::G07_acq_01::G-0028", 
    "MCF7--vx-970--Kinase inhibitor::C09_acq_02::G-0029", "A375--axitinib--VEGFR inhibitor::C01_acq_01::G-0059", 
    "NPC--trichostatin a--Epigenetic::C07_acq_02::G-0016R", "MCF7--nilotinib--CML TKI::B10_acq_01::G-0029", 
    "A549--DMSO--control::A01_acq_01::G-0058", "NPC--vx-970--Kinase inhibitor::C07_acq_01::G-0027", 
    "A375--tacrolimus--Other::H01_acq_01::G-0022", "A375--epz004777--Epigenetic::H01_acq_01::G-0017", 
    "A375--axitinib--VEGFR inhibitor::C01_acq_01::G-0059", "NPC--axitinib--VEGFR inhibitor::C01_acq_01::G-0063", 
    "HAoSMC--gsk126--Epigenetic::B05_acq_01::G-0039a", "NPC--ly-294002--Epigenetic::H12_acq_01::G-0016R", 
    "PC3--vemurafenib--Kinase inhibitor::E03_acq_01::G-0034", 
    "A375--lenalidomide--IMiD::C12_acq_01::G-0028"), pert_iname = c("olmesartan", 
    "tretinoin", "dexamethasone", "epz-5687", "tbb", "vandetanib", 
    "DMSO", "flavopiridol", "tretinoin", "sotrastaurin", "afuresertib", 
    "vx-970", "axitinib", "trichostatin a", "nilotinib", "DMSO", 
    "vx-970", "tacrolimus", "epz004777", "axitinib", "axitinib", 
    "gsk126", "ly-294002", "vemurafenib", "lenalidomide"), pert_class = c(NA, 
    "Other", "Other", "Epigenetic", "Kinase inhibitor", "VEGFR inhibitor", 
    "control", "Kinase inhibitor", "Other", "IMiD", "Kinase inhibitor", 
    "Kinase inhibitor", "VEGFR inhibitor", "Epigenetic", "CML TKI", 
    "control", "Kinase inhibitor", "Other", "Epigenetic", "VEGFR inhibitor", 
    "VEGFR inhibitor", "Epigenetic", "Epigenetic", "Kinase inhibitor", 
    "IMiD"), pr_gene_symbol = c("H3K9me2S10ph1K14ac0", "H3K27me1K36me3", 
    "H3K27me2K36me2", "H3K9me2S10ph1K14ac0", "H3K27me3K36me1", 
    "H4(4to17)K5ac1K8ac1K12ac1K16ac1me0", "H3K27me3K36me2", "H3K27me2K36me1", 
    "H3K9ac1S10ph1K14ac1", "H3K9me1K14ac1", "H3K27me2K36me2", 
    "H3K9me1S10ph1K14ac0", "H3K9me1K14ac0", "H3K9me1S10ph1K14ac1", 
    "H3K27me2K36me2", "H3K27me2K36me1", "H3K27me3K36me0", "H3K27ac1K36me0", 
    "H3K9me3S10ph1K14ac0", "H3K9me2S10ph1K14ac1", "H4(20to23)K20me3", 
    "H3NORM(41-49)", "H3K18ac0K23ac1", "H3K27ac1K36me0", "H3K27me3K36me2"
    ), value = c(0.0708658695220947, -0.27732902765274, 0.335925102233887, 
    0.148061037063599, 0.0192266702651978, -0.06201318, -0.32350385, 
    -0.113838315010071, 0.888034820556641, 0.0549858212471008, 
    -0.474118649959564, -0.0354797840118408, -0.1448524, -1.6616924, 
    -0.0739709138870239, -0.191317439079285, 0.404392004013062, 
    -1.12328338623047, -0.106091976, -0.45647645, 0.166058421134949, 
    -0.0442338585853577, -0.59617877, -0.9624169, 0.0658320188522339
    )), row.names = c(NA, -25L), class = c("tbl_df", "tbl", "data.frame"
    )), structure(list(replicate_id = c("NPC--pd0325901--Kinase inhibitor::C02_acq_01::G-0027", 
    "NPC--resveratrol--Other::B10_acq_02::G-0016R", "HAoSMC--sirolimus--Other::F04_acq_01::G-0039a", 
    "PC3--rgfp966--Epigenetic::E12_acq_01::G-0024", "MCF7--rolipram--Other::D10_acq_01::G-0023", 
    "PC3--sirolimus--Other::F11_acq_01::G-0018", "A375--sorafenib--VEGFR inhibitor::C10_acq_01::G-0059", 
    "PC3--tbb--Kinase inhibitor::B10_acq_01::G-0024", NA, "MCF7--staurosporine--Kinase inhibitor::G11_acq_01::G-0029", 
    "A375--pravastatin--CV::H02_acq_01::G-0028", "PC3--selumetinib--Kinase inhibitor::B01_acq_01::G-0034", 
    NA, "A549--sp600125--Kinase inhibitor::G04_acq_01::G-0033", 
    "A549--salermide--Epigenetic::D11_acq_01::G-0019", "MCF7--roscovitine--Kinase inhibitor::C11_acq_01::G-0023", 
    "YAPC--everolimus--Kinase inhibitor::D02_acq_01::G-0032", 
    "YAPC--verteporfin--Other::F06_acq_02::G-0032", "PC3--osi-027--Kinase inhibitor::A12_acq_02::G-0018", 
    "A549--dexamethasone--Other::D04_acq_01::G-0025", "HAoSMC--jq1-s--Epigenetic::G03_acq_01::G-0039a", 
    "HUVEC--tofacitinib--Kinase inhibitor::H08_acq_01::G-0038", 
    "MCF7--methylstat--Epigenetic::E09_acq_02::G-0015", NA, "PC3--DMSO--control::A03_acq_01::G-0018"
    ), pert_iname = c("pd0325901", "resveratrol", "sirolimus", 
    "rgfp966", "rolipram", "sirolimus", "sorafenib", "tbb", "lapatinib", 
    "staurosporine", "pravastatin", "selumetinib", "ceritinib", 
    "sp600125", "salermide", "roscovitine", "everolimus", "verteporfin", 
    "osi-027", "dexamethasone", "jq1-s", "tofacitinib", "methylstat", 
    "lapatinib", "DMSO"), pert_class = c("Kinase inhibitor", 
    "Other", "Other", "Epigenetic", "Other", "Other", "VEGFR inhibitor", 
    "Kinase inhibitor", NA, "Kinase inhibitor", "CV", "Kinase inhibitor", 
    NA, "Kinase inhibitor", "Epigenetic", "Kinase inhibitor", 
    "Kinase inhibitor", "Other", "Kinase inhibitor", "Other", 
    "Epigenetic", "Kinase inhibitor", "Epigenetic", NA, "control"
    ), pr_gene_symbol = c("H3K9me0S10ph1K14ac0", "H3K27me2K36me1", 
    "H3K9me2K14ac0", "H3NORM(41-49)", "H3K4me2", "H3K27me0K36me0", 
    "H3K9me1K14ac1", "H3K9ac1K14ac1", "H4(4to17)K5ac1me0", "H3K18ac0K23ac0", 
    "H3K4me2", "H3K4me1", "H3K27me3K36me0", "H3K27me2K36me2", 
    "H3K9me2K14ac0", "H3K27me1K36me0", "H3K27me3K36me2", "H3K27ac1K36me0", 
    "H3K18ac1K23ac1", "H3K79me0", "H3K27me3K36me1", "H3K27me3K36me2", 
    "H3K27me3K36me0", "H4(4to17)K8ac1K12ac1me0", "H3K4me1"), 
        value = c(-0.601195812225342, -0.056107312, 0.131532583385706, 
        -0.121334552764893, -0.216784358024597, -0.261847794055939, 
        -0.218853, -0.189859390258789, 0.0388565063476562, -0.322605133056641, 
        -0.0750547647476196, 0, -0.29474682, 0.3334179, -0.381944835186005, 
        -0.677833437919617, 0.150129675865173, 0.153744220733643, 
        1.26011848449707, 0.225029110908508, 0.124394655227661, 
        0.0346214175224304, 0.271708130836487, 0.57789421081543, 
        -0.622473895549774)), row.names = c(NA, -25L), class = c("tbl_df", 
    "tbl", "data.frame")))), row.names = c(NA, -4L), class = c("tbl_df", 
"tbl", "data.frame"))

Here's my code so far:

 data_split_ %>%
  mutate(perturbation_filtered_data = pmap(.l = list(grouping_var, filter_vars, exclude, data), .f = function(g_var, f_var, e_var, d){
    
    if (any(is.na(unlist(e_var)))) {
      exclude_term <- ""
    } else {
      exclude_term <- unlist(e_var)
    }
    
    new_pert_df <- my_perts_df_ %>%
      filter(!(pert_iname %in% exclude_term)) %>%
      filter(sym(g_var) == f_var)
    res <- d %>% 
      filter(pert_iname %in% new_pert_df$pert_iname)
    return(res)
  }))

# A tibble: 4 x 7
  dataset_type grouping_var filter_vars exclude   output_dir        data              perturbation_filtered_data
  <chr>        <chr>        <list>      <list>    <chr>             <list>            <list>                    
1 P100         pert_class   <chr [1]>   <chr [1]> output_final/p100 <tibble [25 x 5]> <tibble [0 x 5]>          
2 P100         pert_iname   <chr [1]>   <chr [1]> output_final/p100 <tibble [25 x 5]> <tibble [0 x 5]>          
3 GCP          pert_class   <chr [1]>   <chr [1]> output_final/gcp  <tibble [25 x 5]> <tibble [0 x 5]>          
4 GCP          pert_iname   <chr [1]>   <chr [1]> output_final/gcp  <tibble [25 x 5]> <tibble [0 x 5]>          

The filtered datasets all come back with 0 rows, which is wrong. I'm expecting a couple retained rows in each of the datasets. Any help is much appreciated!

0

There are 0 best solutions below