Follow

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use
Contact

How to perform for loop to apply custom function with grouping

I’m trying to use a for loop to apply a custom summarise function to all the numeric columns in a data frame. The for loop output seems to ignore the grouping factor; however, if I run the function on a single column (without the for loop), it gives the correct output.

#sample df
structure(list(participant = c("pt04", "pt75", "pt21", "pt73", 
"pt27", "pt39", "pt43", "pt52", "pt69", "pt49", "pt50", "pt56", 
"pt62", "pt68", "pt22", "pt64", "pt54", "pt79", "pt36", "pt26", 
"pt65", "pt38"), group = structure(c(1L, 2L, 2L, 1L, 1L, 2L, 
1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L
), .Label = c("c", "e"), class = "factor"), sex = structure(c(2L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 
2L, 1L, 1L, 2L, 1L), .Label = c("m", "f"), class = "factor"), 
    fm_bdc3 = c(18.535199635968, 23.52996574649, 17.276246451976, 
    11.526088555461, 23.805048656112, 23.08597823716, 28.691020942436, 
    28.968097858499, 23.378093165331, 22.491725344661, 14.609015054932, 
    19.734914019306, 31.947412973684, 25.152298171274, 12.007356801787, 
    20.836128108938, 22.322230884349, 14.777652101515, 21.389572717608, 
    16.992853675086, 14.138189878472, 17.777235203826), fm_rec3 = c(18.545007190636, 
    23.017181869742, 17.031403417007, 11.227201061887, 23.581434653208, 
    21.571120542136, 28.919246372213, 28.138632765662, 22.990408911436, 
    22.274932676852, 14.012586350504, 19.066675709151, 30.897705534847, 
    24.491614222412, 11.670939246332, 20.306494543464, 22.052263684182, 
    14.252973638341, 21.028701096846, 17.207104923059, 13.172159777361, 
    17.610831079442), fm_chg = c(0.00980755466799721, -0.512783876747999, 
    -0.244843034968998, -0.298887493573998, -0.223614002904, 
    -1.514857695024, 0.228225429777002, -0.829465092836998, -0.387684253894999, 
    -0.216792667809003, -0.596428704428, -0.668238310155001, 
    -1.049707438837, -0.660683948862001, -0.336417555455, -0.529633565474001, 
    -0.269967200167002, -0.524678463173998, -0.360871620761998, 
    0.214251247972999, -0.966030101111, -0.166404124383998), 
    fm_percchg = c(0.00052913132097943, -0.0217928016671462, 
    -0.0141722355981437, -0.0259313896588437, -0.00939355370091154, 
    -0.0656180855522784, 0.00795459423472242, -0.0286337438132355, 
    -0.0165832282022865, -0.00963877445980213, -0.0408260722701251, 
    -0.0338607155572751, -0.0328573534170568, -0.0262673392452288, 
    -0.028017619615079, -0.025419001203338, -0.0120940958619099, 
    -0.0355048596062299, -0.0168713805332318, 0.0126083147698213, 
    -0.0683277073949869, -0.00936051767758492)), row.names = c(NA, 
-22L), class = "data.frame")

#my function:
 # Summarise one column of the global `dexadf` within each level of `group`:
 # row count, mean and sd, followed by a standard error and a 95% t-interval
 # that hard-code a group size of 11 (sqrt(11) and 11 - 1 degrees of freedom).
 # `x` is embraced with {{ }}, so it is meant to be a bare column name; a plain
 # vector supplied from outside is evaluated as-is, in full, inside every group.
 summbygrp <- function(x) {
  group_by(dexadf, group) %>%
        summarise(
          count = n(),
          mean = mean({{x}}, na.rm = TRUE),
          sd = sd({{x}}, na.rm = TRUE)
        ) %>%
        # se and the interval bounds are derived from the per-group mean and sd
        mutate(se = sd / sqrt(11),
               lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
               upper.ci = mean + qt(1 - (0.05 / 2), 11 - 1) * se
              ) 
} 

#apply function to all numeric columns and print column names before output
# Position of the current column; used only to look up its name for printing.
coln = 1

# Iterating over a data frame yields each column's values (a vector), not its
# name.
for (col in dexadf) {
  print(colnames(dexadf)[coln])
  coln = coln + 1
  if(is.numeric(col)) {
    # `col` holds the values of the whole column here, so summbygrp() is handed
    # a vector rather than a column name.
    print(summbygrp(col))
  } else {next}
}

#output: 
[1] "fm_bdc3"
# A tibble: 2 × 7
  group count  mean    sd    se lower.ci upper.ci
  <fct> <int> <dbl> <dbl> <dbl>    <dbl>    <dbl>
1 c        11  20.6  5.48  1.65     16.9     24.3
2 e        11  20.6  5.48  1.65     16.9     24.3
[1] "fm_rec3"
# A tibble: 2 × 7
  group count  mean    sd    se lower.ci upper.ci
  <fct> <int> <dbl> <dbl> <dbl>    <dbl>    <dbl>
1 c        11  20.1  5.41  1.63     16.5     23.8
2 e        11  20.1  5.41  1.63     16.5     23.8
[1] "fm_chg"
# A tibble: 2 × 7
  group count   mean    sd    se lower.ci upper.ci
  <fct> <int>  <dbl> <dbl> <dbl>    <dbl>    <dbl>
1 c        11 -0.450 0.406 0.122   -0.723   -0.178
2 e        11 -0.450 0.406 0.122   -0.723   -0.178
[1] "fm_percchg"
# A tibble: 2 × 7
  group count    mean     sd      se lower.ci upper.ci
  <fct> <int>   <dbl>  <dbl>   <dbl>    <dbl>    <dbl>
1 c        11 -0.0227 0.0198 0.00598  -0.0360 -0.00942
2 e        11 -0.0227 0.0198 0.00598  -0.0360 -0.00942

As you can see, all the means for both groups are the same, and I know this shouldn’t be true. Could someone identify the error in the code? Thank you!

MEDevel.com: Open-source for Healthcare and Education

Collecting and validating open-source software for healthcare, education, enterprise, development, medical imaging, medical records, and digital pathology.

Visit Medevel

>Solution :

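The for loop iterates over the column values, so summbygrp(col) receives a plain vector of all 22 observations; {{x}} then evaluates to that whole vector inside every group, which is why both groups get identical statistics. Pass the column name as a string instead and look it up with the .data pronoun. Then, instead of a for loop, you can map over the names of the numeric columns: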

library(dplyr)
library(rlang)
library(purrr)
library(tibble)


# Rebuild the example data: 22 participants, four numeric fm_* columns and two
# factor columns (group with values "c"/"e", sex with values "f"/"m").
dexadf <- data.frame(
  # keep participant as a character column; group and sex are made factors
  # explicitly via as.factor() below
  stringsAsFactors = FALSE,
  participant = c("pt04","pt75","pt21","pt73",
                  "pt27","pt39","pt43","pt52","pt69","pt49","pt50",
                  "pt56","pt62","pt68","pt22","pt64","pt54","pt79",
                  "pt36","pt26","pt65","pt38"),
  fm_bdc3 = c(18.535199635968,23.52996574649,
              17.276246451976,11.526088555461,23.805048656112,
              23.08597823716,28.691020942436,28.968097858499,
              23.378093165331,22.491725344661,14.609015054932,19.734914019306,
              31.947412973684,25.152298171274,12.007356801787,
              20.836128108938,22.322230884349,14.777652101515,
              21.389572717608,16.992853675086,14.138189878472,17.777235203826),
  fm_rec3 = c(18.545007190636,
              23.017181869742,17.031403417007,11.227201061887,23.581434653208,
              21.571120542136,28.919246372213,28.138632765662,
              22.990408911436,22.274932676852,14.012586350504,19.066675709151,
              30.897705534847,24.491614222412,11.670939246332,
              20.306494543464,22.052263684182,14.252973638341,
              21.028701096846,17.207104923059,13.172159777361,17.610831079442),
  fm_chg = c(0.00980755466799721,
             -0.512783876747999,-0.244843034968998,-0.298887493573998,
             -0.223614002904,-1.514857695024,0.228225429777002,
             -0.829465092836998,-0.387684253894999,-0.216792667809003,
             -0.596428704428,-0.668238310155001,-1.049707438837,
             -0.660683948862001,-0.336417555455,-0.529633565474001,
             -0.269967200167002,-0.524678463173998,-0.360871620761998,
             0.214251247972999,-0.966030101111,-0.166404124383998),
  fm_percchg = c(0.00052913132097943,
                 -0.0217928016671462,-0.0141722355981437,-0.0259313896588437,
                 -0.00939355370091154,-0.0656180855522784,
                 0.00795459423472242,-0.0286337438132355,-0.0165832282022865,
                 -0.00963877445980213,-0.0408260722701251,-0.0338607155572751,
                 -0.0328573534170568,-0.0262673392452288,-0.028017619615079,
                 -0.025419001203338,-0.0120940958619099,
                 -0.0355048596062299,-0.0168713805332318,0.0126083147698213,
                 -0.0683277073949869,-0.00936051767758492),
  group = as.factor(c("c","e",
                      "e","c","c","e","c","e","c","e","e","c",
                      "e","c","c","e","e","c","e","c","e",
                      "c")),
  sex = as.factor(c("f","m",
                    "m","m","m","m","m","f","m","f","f","f",
                    "f","f","f","f","m","f","m","m","f",
                    "m"))
)

# Convert the data frame to a tibble before summarising.
dexadf <- as_tibble(dexadf)

# Note the use of the .data pronoun, since columns will be passed to this function as character strings

# Summarise one numeric column of `df` within each level of its `group` column.
#
# Arguments:
#   df         Data frame (or tibble) containing a `group` column and the
#              column named by `x`.
#   x          Name of the column to summarise, as a single character string.
#              It is looked up with the .data pronoun because the name arrives
#              as a string rather than as a bare symbol.
#   conf_level Confidence level of the t-based interval (default 0.95).
#
# Returns a tibble with one row per group and the columns
# group, count, mean, sd, se, lower.ci and upper.ci.
#
# The standard error and the t quantile use the number of non-missing values
# in each group instead of a hard-coded group size, so the function stays
# correct for unequal groups and for columns that contain NA.
summbygrp <- function(df, x, conf_level = 0.95) {
  stopifnot(
    is.character(x), length(x) == 1L, x %in% names(df),
    is.numeric(conf_level), length(conf_level) == 1L,
    conf_level > 0, conf_level < 1
  )

  df %>%
    group_by(group) %>%
    summarise(
      count = n(),
      # observations actually used by mean() and sd() once NAs are dropped
      n_obs = sum(!is.na(.data[[x]])),
      mean = mean(.data[[x]], na.rm = TRUE), # use of .data
      sd = sd(.data[[x]], na.rm = TRUE) # use of .data
    ) %>%
    mutate(
      se = sd / sqrt(n_obs),
      # two-sided critical value with n_obs - 1 degrees of freedom
      t_crit = qt(1 - (1 - conf_level) / 2, df = n_obs - 1),
      lower.ci = mean - t_crit * se,
      upper.ci = mean + t_crit * se
    ) %>%
    # drop the helper columns so the result keeps its original seven columns
    select(-n_obs, -t_crit)
}

# Collect the names of all numeric columns in the dataset
cols <- colnames(select(dexadf, where(is.numeric)))

cols 
#> [1] "fm_bdc3"    "fm_rec3"    "fm_chg"     "fm_percchg"

# A single map() call replaces the for loop: one summary tibble per column name
map(cols, function(col_name) summbygrp(dexadf, col_name))

#> [[1]]
#> # A tibble: 2 × 7
#>   group count  mean    sd    se lower.ci upper.ci
#>   <fct> <int> <dbl> <dbl> <dbl>    <dbl>    <dbl>
#> 1 c        11  19.3  5.49  1.66     15.6     23.0
#> 2 e        11  21.9  5.40  1.63     18.2     25.5
#> 
#> [[2]]
#> # A tibble: 2 × 7
#>   group count  mean    sd    se lower.ci upper.ci
#>   <fct> <int> <dbl> <dbl> <dbl>    <dbl>    <dbl>
#> 1 c        11  19.1  5.54  1.67     15.3     22.8
#> 2 e        11  21.2  5.31  1.60     17.7     24.8
#> 
#> [[3]]
#> # A tibble: 2 × 7
#>   group count   mean    sd     se lower.ci upper.ci
#>   <fct> <int>  <dbl> <dbl>  <dbl>    <dbl>    <dbl>
#> 1 c        11 -0.256 0.311 0.0938   -0.465  -0.0470
#> 2 e        11 -0.645 0.407 0.123    -0.918  -0.371 
#> 
#> [[4]]
#> # A tibble: 2 × 7
#>   group count    mean     sd      se lower.ci upper.ci
#>   <fct> <int>   <dbl>  <dbl>   <dbl>    <dbl>    <dbl>
#> 1 c        11 -0.0149 0.0167 0.00503  -0.0261 -0.00368
#> 2 e        11 -0.0306 0.0203 0.00611  -0.0442 -0.0170

# -------------------------------------------------------------------
# we can also bind all the output results (dataframes) in a single dataframe;
# naming the input vector makes `.id` record the column name in `vars`
# instead of a positional index
map_dfr(.x = setNames(cols, cols), ~ summbygrp(dexadf, .x), .id = "vars")

#> # A tibble: 8 × 8
#>   vars       group count    mean     sd      se lower.ci upper.ci
#>   <chr>      <fct> <int>   <dbl>  <dbl>   <dbl>    <dbl>    <dbl>
#> 1 fm_bdc3    c        11 19.3    5.49   1.66     15.6    23.0    
#> 2 fm_bdc3    e        11 21.9    5.40   1.63     18.2    25.5    
#> 3 fm_rec3    c        11 19.1    5.54   1.67     15.3    22.8    
#> 4 fm_rec3    e        11 21.2    5.31   1.60     17.7    24.8    
#> 5 fm_chg     c        11 -0.256  0.311  0.0938   -0.465  -0.0470 
#> 6 fm_chg     e        11 -0.645  0.407  0.123    -0.918  -0.371  
#> 7 fm_percchg c        11 -0.0149 0.0167 0.00503  -0.0261 -0.00368
#> 8 fm_percchg e        11 -0.0306 0.0203 0.00611  -0.0442 -0.0170

Created on 2022-07-09 by the reprex package (v2.0.1)

Add a comment

Leave a Reply

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use

Discover more from Dev solutions

Subscribe now to keep reading and get access to the full archive.

Continue reading