Calculate propensity-scores within strata

November 2, 2023

Assume the following data:

library(cobalt)
library(dplyr)

set.seed(123)

# Extending lalonde data by `group` variable:
lalonde <- cbind(lalonde,
                 group = sample(c("A", "B", "AB"), size = 614, replace = TRUE))

# Creating variable `set` for AB group:
lalonde$set[lalonde$group == "AB"] <- sample(c(5, 10, 15, 3), sum(lalonde$group == "AB"), replace = TRUE)

# Filling 'set' column for `group` 'A' and 'B':
prob_80 <- sample(c(0, 1), nrow(lalonde), replace = TRUE, prob = c(0.8, 0.2))
lalonde$set[(lalonde$group == "A" | lalonde$group == "B") & prob_80 == 1] <- sample(c(5, 10, 15, 3), sum((lalonde$group == "A" | lalonde$group == "B") & prob_80 == 1), replace = TRUE)
lalonde$set[(lalonde$group == "A" | lalonde$group == "B") & is.na(lalonde$set)] <- sample(1:100, sum((lalonde$group == "A" | lalonde$group == "B") & is.na(lalonde$set)), replace = TRUE)

Now we have a group variable containing either A, B, AB and a variable named set.

Now I want to fit a logistic PS model stratified by the set values of group == "AB", predicting being in group AB.

First I would pull the distinct values of set, being in group == AB.

unique_set_values <- unique(lalonde[lalonde$group == "AB", "set"])  %>% 
  print()

which are:

+   print()
[1]  5 15 10  3

I use those to get all observations which belong to one of the set values:

filtered_data <- lalonde %>% 
  filter(set %in% unique_set_values)

Then I split the data and replace AB with 1, else 0:

# For AB and A:
AB_A <- filtered_data %>% 
  filter(group %in% c("AB", "A")) %>%
  mutate(group = ifelse(group == "AB", 1, 0)) 

# For AB and B:
AB_B <- filtered_data %>% 
  filter(group %in% c("AB", "B")) %>%
  mutate(group = ifelse(group == "AB", 1, 0))

Now I can calculate the stratified PS for AB and A as well as AB and B:

# Creating a formula:
formula <- group ~ age + educ + race + married + nodegree + re74 + re75 + re78

But how to calculate the PS stratified by set in such scenario?

I tried:

AB_A_PS <- AB_A %>%
  group_by(set) %>%
  mutate(pscore = glm(formula, data = ., family = binomial(link = "logit"))$fitted.values)

but all I get is an error:

Error in `mutate()`:
ℹ In argument: `pscore = predict(glm(formula, data = ., family = binomial(link = "logit")))`.
ℹ In group 1: `set = 3`.
Caused by error:
! `pscore` must be size 66 or 1, not 238.

So, obviously, it does not work.

Thank you

>Solution :

You are passing the whole grouped data frame to glm for each group within the data frame, hence the error. Instead, you can pass the subset of the data frame that contains only the rows in the current group:

AB_A %>%
  group_by(set) %>%
  mutate(pscore = glm(formula, data = .[cur_group_rows(),], 
                    family = binomial(link = "logit"))$fitted.values)
#> # A tibble: 238 x 12
#> # Groups:   set [4]
#>    treat   age  educ race   married nodegree  re74  re75   re78 group   set pscore
#>    <int> <int> <int> <fct>    <int>    <int> <dbl> <dbl>  <dbl> <dbl> <dbl>  <dbl>
#>  1     1    37    11 black        1        1     0     0  9930.     1     5  0.706
#>  2     1    22     9 hispan       0        1     0     0  3596.     1    15  0.942
#>  3     1    30    12 black        0        0     0     0 24909.     1    15  0.993
#>  4     1    33     8 black        0        1     0     0   290.     1    15  0.977
#>  5     1    22    16 black        0        0     0     0  2164.     1    15  0.835
#>  6     1    17     7 black        0        1     0     0  3024.     1    15  0.908
#>  7     1    27    13 black        0        0     0     0 14582.     1    10  1    
#>  8     1    23    10 black        0        1     0     0  7693.     1    15  0.939
#>  9     1    26    12 black        0        0     0     0 10747.     0     3  0.809
#> 10     1    38     9 white        0        1     0     0  6409.     1    10  1    
#> # i 228 more rows
#> # i Use `print(n = ...)` to see more rows