With a dataset like original:
id <- c("JF", "GH", "GH", "ANN", "GH", "ROG", "JF")
group <- c("most", "least", "most", "least", "least", "most", "least")
NP <- c(4,6,18,1,3,12,8)
iso_USA <- c(1, 0, 0, 0, 0, 1, 1)
iso_CHN <- c(0, 1, 1, 0, 0, 0, 0)
color <- c("blue", "orange", "blue", "blue", "red", "orange", "black")
original <- data.frame(id, group, NP, iso_USA, iso_CHN, color)
numeric <- unlist(lapply(original, is.numeric))
numeric <- names(original[ , numeric])
char <- unlist(lapply(original, is.character))
char <- names(original[ , char])
char <- char[-1] #remove id from variables of interest
I want to group by "group" and calculate the median for the numeric variables and the mode for the character variables. Therefore, the data looks like original2. Note that my actual dataset has way more columns than the mock version presented here:
group <- c("least", "most")
NP <- c(6,12)
iso_USA <- c(0,1)
iso_CHN <- c(0, 0)
color <- c("orange", "blue")
original2 <- data.frame(group, NP, iso_USA, iso_CHN, color)
Any clue?
>Solution :
Using dplyr‘s across functionality and the the accepted answer at the FAQ about implementing a mode function:
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
library(dplyr)
original %>%
select(-id) %>%
group_by(group) %>%
summarize(
across(where(is.numeric), median),
across(where(is.character), Mode)
)
# # A tibble: 2 × 6
# group NP iso_USA iso_CHN color
# <chr> <dbl> <dbl> <dbl> <chr>
# 1 least 4.5 0 0 orange
# 2 most 12 1 0 blue