Follow

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use
Contact

Match substrings in a dataframe to another dataframe

The rst.mva.one.leave.out["cpg_ids"] column contains cg* substrings that may correspond with the methreg$probeID.
For all the matches, I want to retrieve the corresponding rows in rst.mva.one.leave.out and store them as rst.subset.

# Attempted approach (fails with the error shown in the traceback): for each
# row, split the comma-separated cpg_ids and keep the rows whose IDs occur in
# methreg$probeID.
# NOTE(review): rst.mva.one.leave.out["cpg_ids"] is a one-column data.frame,
# not a character vector, so strsplit() receives a non-character argument.
# NOTE(review): rst.subset is reassigned on every iteration, so only the
# result of the final iteration would be kept.
for (i in 1:nrow(rst.mva.one.leave.out)) {
  rst.subset <- rst.mva.one.leave.out[strsplit(rst.mva.one.leave.out["cpg_ids"][i], ",", fixed=T) %in% methreg$probeID,]
}

Traceback:

Error in h(simpleError(msg, call)) : 
  error in evaluating the argument 'x' in selecting a method for function '%in%': non-character argument

Data:

MEDevel.com: Open-source for Healthcare and Education

Collecting and validating open-source software for healthcare, education, enterprise, development, medical imaging, medical records, and digital pathology.

Visit Medevel

dput(rst.mva.one.leave.out[1:20,2:8])

structure(list(sample = c("TCGA.A3.3357.01", "TCGA.A3.3357.01", 
"TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3357.01", 
"TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3357.01", 
"TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3367.01", "TCGA.A3.3367.01", 
"TCGA.A3.3367.01", "TCGA.A3.3367.01", "TCGA.A3.3367.01", "TCGA.A3.3367.01", 
"TCGA.A3.3367.01", "TCGA.A3.3367.01"), chromosome = c("7", "3", 
"7", "6", "8", "8", "14", "16", "1", "19", "16", "6", "1", "17", 
"16", "5", "7", "5", "6", "3"), start = c(25989524L, 50275446L, 
134142981L, 146348486L, 116679763L, 145747140L, 37641261L, 73125573L, 
27874258L, 37407041L, 57770390L, 43043691L, 46769035L, 26925852L, 
2652948L, 43039592L, 79082898L, 131563095L, 26195697L, 36985996L
), end = c(25989763L, 50275694L, 134144288L, 146348913L, 116680346L, 
145748059L, 37641600L, 73126120L, 27874526L, 37407284L, 57771195L, 
43043815L, 46769320L, 26926511L, 2653839L, 43040885L, 79083753L, 
131563764L, 26197071L, 36986642L), sz = c(239L, 248L, 1307L, 
427L, 583L, 919L, 339L, 547L, 268L, 243L, 805L, 124L, 285L, 659L, 
891L, 1293L, 855L, 669L, 1374L, 646L), cpg_n = c(3, 3, 4, 3, 
3, 4, 3, 3, 3, 3, 3, 3, 5, 6, 6, 6, 6, 3, 5, 6), cpg_ids = c("cg08767938,cg07184013,cg03853208", 
"cg06508783,cg00813746,cg05060704", "cg00174851,cg06864853,cg02215070,cg04663564", 
"cg06121352,cg09179079,cg03478199", "cg06368590,cg03615269,cg05998283", 
"cg03949978,cg01099220,cg02479773,cg08872590", "cg01168865,cg07535928,cg03611555", 
"cg08992827,cg08187089,cg00293191", "cg00178877,cg05590948,cg09060489", 
"cg03584288,cg02370417,cg02459604", "cg00816177,cg03029127,cg01656750", 
"cg02057561,cg08747889,cg00252032", "cg02537149,cg03979582,cg04410181,cg06961071,cg00122254", 
"cg01626899,cg00449941,cg05564086,cg06774283,cg01724566,cg06329022", 
"cg03846022,cg08981282,cg02512202,cg01195053,cg03314158,cg00433159", 
"cg04122815,cg08205639,cg05551979,cg03723510,cg01313313,cg00257271", 
"cg04671932,cg04652097,cg08641579,cg02523844,cg05270344,cg07448060", 
"cg01211041,cg09140281,cg05501285", "cg08117800,cg02612650,cg03181300,cg03785755,cg02902477", 
"cg06686826,cg06163735,cg00325599,cg01756288,cg03450370,cg07696485"
)), row.names = c(NA, 20L), class = "data.frame")

dput(methreg[,1:3])

structure(list(regionID = c("chr7:87152539-87152540", "chr19:51905083-51905084", 
"chr19:36687587-36687588", "chr12:53985031-53985032", "chr8:85177989-85177990", 
"chr1:40161274-40161275", "chr12:51083289-51083290", "chr7:87152539-87152540", 
"chr19:55581239-55581240", "chr19:49927656-49927657", "chr19:55581239-55581240", 
"chr18:75212075-75212076", "chr7:99505269-99505270", "chr20:2692357-2692358", 
"chr2:173964147-173964148", "chr5:179024077-179024078", "chr19:52369920-52369921"
), probeID = c("cg08767938", "cg04425820", "cg09307868", "cg08737755", 
"cg04950789", "cg08707192", "cg04396637", "cg07560681", "cg06560912", 
"cg07469215", "cg06560912", "cg05448504", "cg00155700", "cg03227128", 
"cg07040405", "cg04681963", "cg03005603"), target_symbol = c("DMTF1", 
"ZNF649", "ZNF567", "HOXC10", "E2F5", "RLF", "CSRNP2", "DMTF1", 
"ZNF579", "ATF5", "ZNF579", "TSHZ1", "ZNF394", "EBF4", "SP3", 
"ZNF879", "ZNF880")), class = "data.frame", row.names = c("14", 
"87", "78", "43", "23", "1", "40", "13", "94", "83", "92", "62", 
"15", "53", "4", "131", "51"))

Expected output:

structure(list(sample = "TCGA.A3.3357.01", chromosome = "7", 
    start = 25989524L, end = 25989763L, sz = 239L, cpg_n = 3, 
    cpg_ids = "cg08767938,cg07184013,cg03853208"), row.names = 1L, class = "data.frame")

Solution:

I’m not sure I have understood you correctly, but you can try the following.

# Keep every row whose comma-separated cpg_ids contain at least one probe ID
# listed in methreg$probeID.
# strsplit() yields one character vector of IDs per row; fixed = TRUE treats
# "," as a literal separator rather than as a regular expression.
cpg_id_list <- strsplit(rst.mva.one.leave.out$cpg_ids, ",", fixed = TRUE)

# vapply() always returns a logical vector (logical(0) for zero rows), whereas
# sapply() returns an empty list in that case, which is not a valid row index.
has_probe_match <- vapply(
  cpg_id_list,
  function(ids) any(ids %in% methreg$probeID),
  logical(1)
)

rst.subset <- rst.mva.one.leave.out[has_probe_match, ]

rst.subset

#           sample chromosome    start      end  sz cpg_n                          cpg_ids
#1 TCGA.A3.3357.01          7 25989524 25989763 239     3 cg08767938,cg07184013,cg03853208

This splits each string in rst.mva.one.leave.out$cpg_ids on commas and selects a row if any of its values matches an entry in methreg$probeID.


Another approach is pattern matching with grepl, provided methreg$probeID is not very large.

# Alternative: build one regular expression from all probe IDs and test each
# cpg_ids string against it.
# unique() drops repeated probe IDs so the pattern is no longer than needed.
# The \\b word boundaries make an ID match only as a whole comma-separated
# token, not as a fragment of a longer ID (assumes IDs contain only letters
# and digits -- confirm for your data).
probe_pattern <- paste0(
  "\\b(", paste(unique(methreg$probeID), collapse = "|"), ")\\b"
)
rst.mva.one.leave.out[grepl(probe_pattern, rst.mva.one.leave.out$cpg_ids), ]
Add a comment

Leave a Reply

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use

Discover more from Dev solutions

Subscribe now to keep reading and get access to the full archive.

Continue reading