R: How to subset the string in a dataframe column?

March 16, 2022

Samples are either wild-type or mutant. I want to find the number of samples that are wild-type wt versus mutant mut with respect to the respondents Responder in the Group1 column of the res.sig dataframe. In the n_mutated_group1 column, the first letter indicates the number of mutant and the last two digits (after or) indicates the total number for that group. wt is the difference between total and mut.

# Number of samples in wild-type versus mutant with respect to the most significantly enriched genes 
total <- sum(as.numeric(res.sig %>%
                       filter(Group1=="Responder") %>%
                       mutate(last_letter = substr(n_mutated_group1,-2,-1)) %>%
                       pull(last_letter)))

mut <- sum(as.numeric(res.sig %>%
                        filter(Group1=="Responder") %>%
                        mutate(first_letter = substr(n_mutated_group1,1,1)) %>%
                        pull(first_letter)))

wt <- total-mut

res.sig

> dput(res.sig)
structure(list(Hugo_Symbol = c("ERCC2", "ERCC2", "AKAP9", "AKAP9", 
"HERC1", "HERC1", "HECTD1", "HECTD1", "MACF1", "MACF1", "MROH2B", 
"MROH2B", "KMT2C", "KMT2C"), Group1 = c("Non-Responder", "Responder", 
"Non-Responder", "Responder", "Non-Responder", "Responder", "Non-Responder", 
"Responder", "Non-Responder", "Responder", "Non-Responder", "Responder", 
"Non-Responder", "Responder"), Group2 = c("Rest", "Rest", "Rest", 
"Rest", "Rest", "Rest", "Rest", "Rest", "Rest", "Rest", "Rest", 
"Rest", "Rest", "Rest"), n_mutated_group1 = c("0 of 25", "9 of 25", 
"0 of 25", "6 of 25", "0 of 25", "6 of 25", "0 of 25", "6 of 25", 
"0 of 25", "6 of 25", "0 of 25", "6 of 25", "1 of 25", "7 of 25"
), n_mutated_group2 = c("9 of 25", "0 of 25", "6 of 25", "0 of 25", 
"6 of 25", "0 of 25", "6 of 25", "0 of 25", "6 of 25", "0 of 25", 
"6 of 25", "0 of 25", "7 of 25", "1 of 25"), p_value = c(0.00163083541184905, 
0.00163083541184905, 0.022289766970618, 0.022289766970618, 0.022289766970618, 
0.022289766970618, 0.022289766970618, 0.022289766970618, 0.022289766970618, 
0.022289766970618, 0.022289766970618, 0.022289766970618, 0.0487971536957187, 
0.0487971536957187), OR = c(0, Inf, 0, Inf, 0, Inf, 0, Inf, 0, 
Inf, 0, Inf, 0.111488645279478, 8.96952328636894), OR_low = c(0, 
2.56647319276964, 0, 1.33358819424024, 0, 1.33358819424024, 0, 
1.33358819424024, 0, 1.33358819424024, 0, 1.33358819424024, 0.00228988507629356, 
1.0079479819766), OR_high = c(0.38963976043749, Inf, 0.749856668137133, 
Inf, 0.749856668137133, Inf, 0.749856668137133, Inf, 0.749856668137133, 
Inf, 0.749856668137133, Inf, 0.992114690322592, 436.703138665198
), fdr = c(0.109265972593886, 0.109265972593886, 0.248902397838568, 
0.248902397838568, 0.248902397838568, 0.248902397838568, 0.248902397838568, 
0.248902397838568, 0.248902397838568, 0.248902397838568, 0.248902397838568, 
0.248902397838568, 0.467058471087594, 0.467058471087594)), row.names = c(NA, 
-14L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x000002adab171ef0>)

Problem:

Here, I’m getting total as NA_real_ instead of the total number.

>Solution :

You can extract for total everything after the of string in your column and for mut you can extract everything before of. You can use this code:

library(tidyverse)
  total <- sum(as.numeric(res.sig %>%
                          filter(Group1=="Responder") %>%
                          mutate(first_letter = sub('.+of(.+)', '\\1', n_mutated_group1)) %>%
                          pull(first_letter)))
  
  mut <- sum(as.numeric(res.sig %>%
                            filter(Group1 == "Responder") %>%
                            mutate(last_letter = sub("\\of.*", "", n_mutated_group1)) %>%
                            pull(last_letter)))
  
  wt <- total-mut
  wt

Output wt: