Advertisements
In a data.frame of differential expression values, count the genes per group that are significantly up and down-regulated. Significance shall be defined by FDR (false discovery rate = adjusted p-value from Benjamini) and fold-change. Results should be a plot with up and down regs per group.
(Sweet bonus: show in the plot the different Fc levels (eg: 0.5, 1, 2, 4, >4).
My solution seems way too complicated, there must be an easier way.
Example data
# create dex df
gene_creator <- paste("gene",1:1000,sep="")
genes = sample(gene_creator,100)
dex_A <- data.frame(
gene = genes,
group = "group_A",
logFC = sample(c(-5:5), replace=T, size=100),
FDR = sample(c(0.01,1), replace=T, size=100)
)
dex_B <- data.frame(
gene = genes,
group = "group_B",
logFC = sample(c(-5:5), replace=T, size=100),
FDR = sample(c(0.01,1), replace=T, size=100)
)
dex_C <- data.frame(
gene = genes,
group = "group_C",
logFC = sample(c(-5:5), replace=T, size=100),
FDR = sample(c(0.01,1), replace=T, size=100)
)
dex_D <- data.frame(
gene = genes,
group = "group_D",
logFC = sample(c(-5:5), replace=T, size=100),
FDR = sample(c(0.01,1), replace=T, size=100)
)
dex_df <- rbind(dex_A, dex_B, dex_C, dex_D)
Solution
library("tidyverse")
# FC up
dex_up <- dex_df %>%
group_by(group) %>%
filter(FDR <= 0.05) %>%
filter(logFC > 0.5 ) %>%
summarise(n_up = n())
# Fc down
dex_down <- dex_df %>%
group_by(group) %>%
filter(FDR <= 0.05) %>%
filter(logFC < 0.5 ) %>%
summarise(n_down = n())
# format
dex_comb <- left_join(dex_up, dex_down, by = c("group"))
dex_comb$n_down <- dex_comb$n_down * -1
dex_comb_long <- dex_comb %>% pivot_longer(!group, names_to = "direction", values_to = "n")
# plot
dex_comb_long %>%
ggplot(aes(x = group, y = n, fill = direction)) +
geom_bar(stat="identity", position="identity") +
geom_text(aes(label=n, vjust = -sign(n))) +
ggtitle("Dex numbers per group")
>Solution :
The usual way to count the number of times a condition is met is to sum()
that condition:
dex_summary = dex_df %>%
group_by(group) %>%
summarize(
n_up = sum(FDR <= 0.05 & logFC > 0.5),
n_down = -sum(FDR <= 0.05 & logFC < 0.5)
) %>%
pivot_longer(-group, names_to = "direction", values_to = "n")
# plot
dex_summary %>%
ggplot(aes(x = group, y = n, fill = direction)) +
## using geom_col() instead of geom_bar(stat = "identity")
geom_col() +
geom_text(aes(label=n, vjust = -sign(n))) +
## adding a little padding to the y scale for the numbers
scale_y_continuous(expand = expansion(add = 0.5)) +
ggtitle("Dex numbers per group")
Illustrated with this simplified sample data:
set.seed(47)
gene_creator <- paste("gene",1:100,sep="")
genes = sample(gene_creator,8)
dex_A <- data.frame(
gene = genes,
group = "group_A",
logFC = sample(c(-5:5), replace=T, size=8),
FDR = sample(c(0.01,1), replace=T, size=8)
)
dex_B <- data.frame(
gene = genes,
group = "group_B",
logFC = sample(c(-5:5), replace=T, size=8),
FDR = sample(c(0.01,1), replace=T, size=8)
)
dex_df <- rbind(dex_A, dex_B)
Here’s my suggestion for including logFC
values in the plot:
## re-ran sample data with 20 samples per group
dex_df %>%
filter(FDR <= 0.05 & abs(logFC) > 0.5) %>%
count(group, logFC) %>%
mutate(
direction = sign(logFC),
n_dir = n * sign(direction)
) %>%
ggplot(aes(x = factor(logFC), y = n_dir, fill = factor(direction))) +
geom_col() +
facet_wrap(~group)