Summarize number of significantly up and down-regulated genes per group

Advertisements

In a data.frame of differential expression values, count the genes per group that are significantly up and down-regulated. Significance shall be defined by FDR (false discovery rate = adjusted p-value from Benjamini) and fold-change. Results should be a plot with up and down regs per group.
(Sweet bonus: show in the plot the different Fc levels (eg: 0.5, 1, 2, 4, >4).
My solution seems way too complicated, there must be an easier way.

Example data

# create dex df
gene_creator <- paste("gene",1:1000,sep="")
genes = sample(gene_creator,100)

dex_A <- data.frame(
  gene = genes,
  group = "group_A",
  logFC = sample(c(-5:5), replace=T, size=100),
  FDR = sample(c(0.01,1), replace=T, size=100)
)

dex_B <- data.frame(
  gene = genes,
  group = "group_B",
  logFC = sample(c(-5:5), replace=T, size=100),
  FDR = sample(c(0.01,1), replace=T, size=100)
)

dex_C <- data.frame(
  gene = genes,
  group = "group_C",
  logFC = sample(c(-5:5), replace=T, size=100),
  FDR = sample(c(0.01,1), replace=T, size=100)
)

dex_D <- data.frame(
  gene = genes,
  group = "group_D",
  logFC = sample(c(-5:5), replace=T, size=100),
  FDR = sample(c(0.01,1), replace=T, size=100)
)


dex_df <- rbind(dex_A, dex_B, dex_C, dex_D)

Solution

library("tidyverse")

# FC up
dex_up <- dex_df %>% 
  group_by(group) %>%
  filter(FDR <= 0.05) %>% 
  filter(logFC > 0.5 ) %>%
  summarise(n_up = n())

# Fc down
dex_down <- dex_df %>% 
  group_by(group) %>%
  filter(FDR <= 0.05) %>% 
  filter(logFC < 0.5 ) %>%
  summarise(n_down = n())

# format
dex_comb <- left_join(dex_up, dex_down, by = c("group"))
dex_comb$n_down <- dex_comb$n_down * -1
dex_comb_long <- dex_comb %>% pivot_longer(!group, names_to = "direction", values_to = "n")

# plot
dex_comb_long %>%
  ggplot(aes(x = group, y = n, fill = direction)) + 
  geom_bar(stat="identity", position="identity") +
  geom_text(aes(label=n, vjust = -sign(n))) +
  ggtitle("Dex numbers per group")

>Solution :

The usual way to count the number of times a condition is met is to sum() that condition:

dex_summary = dex_df %>%
  group_by(group) %>%
  summarize(
    n_up = sum(FDR <= 0.05 & logFC > 0.5),
    n_down = -sum(FDR <= 0.05 & logFC < 0.5)
  ) %>%
  pivot_longer(-group, names_to = "direction", values_to = "n")

# plot
dex_summary %>%
  ggplot(aes(x = group, y = n, fill = direction)) + 
  ## using geom_col() instead of geom_bar(stat = "identity")
  geom_col() +
  geom_text(aes(label=n, vjust = -sign(n))) +
  ## adding a little padding to the y scale for the numbers
  scale_y_continuous(expand = expansion(add = 0.5)) +
  ggtitle("Dex numbers per group")


Illustrated with this simplified sample data:

set.seed(47)
gene_creator <- paste("gene",1:100,sep="")
genes = sample(gene_creator,8)

dex_A <- data.frame(
  gene = genes,
  group = "group_A",
  logFC = sample(c(-5:5), replace=T, size=8),
  FDR = sample(c(0.01,1), replace=T, size=8)
)

dex_B <- data.frame(
  gene = genes,
  group = "group_B",
  logFC = sample(c(-5:5), replace=T, size=8),
  FDR = sample(c(0.01,1), replace=T, size=8)
)

dex_df <- rbind(dex_A, dex_B)

Here’s my suggestion for including logFC values in the plot:

## re-ran sample data with 20 samples per group
dex_df %>%
  filter(FDR <= 0.05 & abs(logFC) > 0.5) %>%
  count(group, logFC) %>%
  mutate(
    direction = sign(logFC),
    n_dir = n * sign(direction)
  ) %>%
  ggplot(aes(x = factor(logFC), y = n_dir, fill = factor(direction))) +
  geom_col() +
  facet_wrap(~group)

Leave a ReplyCancel reply