Remove NAs before group_by and summarise in user-defined function

December 14, 2023

Mock data:

# Mock data: four rows, with one missing country and one missing score.
df <- data.frame(
  country = c("USA", "USA", "Japan", NA),
  dimension = c("economic", "cultural", "economic", "economic"),
  score = c(NA, "high", "high", "low"),
  stringsAsFactors = FALSE
)

I wrote this function to summarise frequencies of each variable, then export them to a csv with the name of the variable as the file name:

# Tabulate every column of `df` and export one frequency table per column.
#
# For each column, counts the rows per distinct value, adds the share of each
# value (Percent) and the total number of counted rows (N), writes the table
# to "<column name>.csv" in the working directory and prints it.
#
# Args:
#   df: A data frame; every column is tabulated in turn.
#
# NA is kept as a category of its own here, so missing values are counted and
# contribute to both Percent and N.
export <- function(df) {
  for (col in colnames(df)) {
    freq <- df %>%
      # Group through the data mask instead of indexing the outer `df`, so the
      # grouping column always has exactly as many rows as the piped data.
      group_by(across(all_of(col))) %>%
      summarise(Count = n()) %>%
      mutate(
        Percent = Count / sum(Count) * 100,
        N = sum(Count)
      )
    write.csv(freq, paste0(col, ".csv"), row.names = FALSE)
    print(freq)
  }
}

Calling export(df) works as expected.

However, I would like to remove the NAs before grouping the data and computing the frequencies. This is what I tried:

# Second attempt: drop the rows that are NA in the current column before
# counting. As written, this stops with an error from group_by() as soon as a
# column contains NA: filter() shortens the piped data, but group_by(df[col])
# still indexes the function argument `df`, which keeps all of its rows, so
# the grouping column no longer matches the piped data in size.
export <- function(df){   
  for (col in colnames(df)) {  
    table <- df %>%
      filter(!is.na(df[col])) %>%  # removes the NA rows from the piped data only
      group_by(df[col]) %>%  # df[col] is the unfiltered argument, not the filtered rows
      summarise(Count = n()) %>% 
      mutate(Percent = Count / sum(Count)*100,
             N = sum(Count))
    write.csv(table, paste0(col, ".csv"), row.names = F)
    print(table)
  }                            
}  

Calling export(df) now fails with this error message:

Error in `group_by()`:
ℹ In argument: `df[col]`.
Caused by error:
! `df[col]` must be size 3 or 1, not 4.

How do I remove these NAs? I must have made a silly mistake somewhere.

CURRENT OUTPUT (only the first of the three variables to iterate over is shown):

country Count   Percent N
Japan   1       25      4
USA     2       50      4
NA      1       25      4

DESIRED OUTPUT (only the first of the three variables to iterate over is shown):

country Count   Percent     N
Japan   1       33.33333    3
USA     2       66.66667    3

Note that the NAs are discarded and are not counted in the frequencies.

>Solution :

Here are a few options, all giving the same result:

# Option 1: keep only the rows whose value in the current column is not NA
# (if_any() over that single column), then count the rows per value with `.by`.
# Prints one frequency table per column; the CSV export is left disabled.
export2 <- function(df) {
  for (col in colnames(df)) {
    non_missing <- filter(df, if_any(all_of(col), \(x) !is.na(x)))
    counts <- summarise(non_missing, Count = n(), .by = all_of(col))
    freq <- mutate(
      counts,
      Percent = Count / sum(Count) * 100,
      N = sum(Count)
    )
    # write.csv(freq, paste0(col, ".csv"), row.names = FALSE)
    print(freq)
  }
}

# Each column is filtered on its own NAs, so N can differ between the tables:
# 3 for country and score (one NA each), 4 for dimension (no NA).
export2(df)
#   country Count  Percent N
# 1     USA     2 66.66667 3
# 2   Japan     1 33.33333 3
#   dimension Count Percent N
# 1  economic     3      75 4
# 2  cultural     1      25 4
#   score Count  Percent N
# 1  high     2 66.66667 3
# 2   low     1 33.33333 3


# Option 2: reduce the data to the current column, drop its NA rows with
# na.omit(), then count the rows per value with `.by`.
# Prints one frequency table per column; the CSV export is left disabled.
export3 <- function(df) {
  for (col in colnames(df)) {
    complete_rows <- na.omit(select(df, all_of(col)))
    freq <- complete_rows |>
      summarise(Count = n(), .by = all_of(col)) |>
      mutate(
        Percent = Count / sum(Count) * 100,
        N = sum(Count)
      )
    # write.csv(freq, paste0(col, ".csv"), row.names = FALSE)
    print(freq)
  }
}

export3(df)
# prints the same three tables as export2(df)

# Option 3: reduce the data to the current column, drop its NA rows with
# na.omit(), then let count() tally the rows per value into `Count`.
# Prints one frequency table per column; the CSV export is left disabled.
export4 <- function(df) {
  for (col in colnames(df)) {
    complete_rows <- na.omit(select(df, all_of(col)))
    tallied <- count(complete_rows, .data[[col]], name = "Count")
    freq <- mutate(
      tallied,
      Percent = Count / sum(Count) * 100,
      N = sum(Count)
    )
    # write.csv(freq, paste0(col, ".csv"), row.names = FALSE)
    print(freq)
  }
}

export4(df)
# same counts, percentages and N as above
# NOTE(review): count() presumably returns its groups sorted by value, so rows
# may print in a different order than above (e.g. Japan before USA) - confirm.