R: Random Sampling of Longitudinal Data

December 10, 2022

I have the following dataset in R (e.g. the same students take an exam each year and their results are recorded):

student_id = c(1,1,1,1,1, 2,2,2, 3,3,3,3)
exam_number = c(1,2,3,4,5,1,2,3,1,2,3,4)
exam_result = rnorm(12, 80,10)
my_data = data.frame(student_id, exam_number, exam_result)

  student_id exam_number exam_result
1          1           1    72.79595
2          1           2    81.12950
3          1           3    93.29906
4          1           4    79.33229
5          1           5    76.64106
6          2           1    95.14271

Suppose I take a random sample from this data:

library(dplyr)
random_sample = sample_n(my_data, 5, replace = TRUE)

  student_id exam_number exam_result
1          3           1    76.19691
2          3           3    87.52431
3          2           2    91.89661
4          2           3    80.05088
5          2           2    91.89661

Now, I can take the highest "exam_number" per student from this random sample:

max_value = random_sample %>%
    group_by(student_id) %>%
    summarize(max = max(exam_number))

# A tibble: 2 x 2
  student_id   max
       <dbl> <dbl>
1          2     3
2          3     3

Based on these results – I want to accomplish the following. For the students that were selected in "random_sample":

Create a dataset that contains all rows occurring AFTER the "max exam number" (e.g. call this dataset "data_after")
Create a dataset that contains all rows occurring BEFORE (and equal to) the "max exam number" (e.g. call this dataset "data_before")

In the example I have created, this would look something like this:

# after
  student_id exam_number exam_result
1          3           4    105.5805

# before

  student_id exam_number exam_result
1          2           1    95.14000
2          2           2    91.89000
3          2           3    80.05000
4          3           1    76.19691
5          3           2   102.00875
6          3           3    87.52431

Currently, I am trying to do this in a very indirect way using JOINS and ANTI_JOINS:

    max_3 = as.numeric(max_value[2,2])
    max_s3 =  max_3 - 1
    student_3 = seq(1, max_s3 , by = 1)

    before_student_3 =  my_data[is.element(my_data$exam_number, student_3) & my_data$student_id == 3,]

remainder_student_3 = my_data[my_data$student_id == 3,]
after_student_3 =  anti_join(remainder_student_3, before_student_3)

But I don’t think I am doing this correctly – can someone please show me how to do this?

Thanks!

>Solution :

The code above also uses a join, like it is said in the question. Then, the wanted data sets are created by filtering the join result.

student_id = c(1,1,1,1,1, 2,2,2, 3,3,3,3)
exam_number = c(1,2,3,4,5,1,2,3,1,2,3,4)
exam_result = rnorm(12, 80,10)
my_data = data.frame(student_id, exam_number, exam_result)

suppressPackageStartupMessages({
  library(dplyr)
})

set.seed(2022)
(random_sample = sample_n(my_data, 5, replace = TRUE))
#>   student_id exam_number exam_result
#> 1          1           4    73.97148
#> 2          1           3    84.77151
#> 3          2           2    78.76927
#> 4          3           3    69.35063
#> 5          1           4    73.97148


max_value = random_sample %>%
  group_by(student_id) %>%
  summarize(max = max(exam_number))

# join only once
max_value %>%
  left_join(my_data, by = "student_id") -> join_data
join_data
#> # A tibble: 12 × 4
#>    student_id   max exam_number exam_result
#>         <dbl> <dbl>       <dbl>       <dbl>
#>  1          1     4           1        71.0
#>  2          1     4           2        69.1
#>  3          1     4           3        84.8
#>  4          1     4           4        74.0
#>  5          1     4           5        80.7
#>  6          2     2           1        77.4
#>  7          2     2           2        78.8
#>  8          2     2           3        69.5
#>  9          3     3           1        83.9
#> 10          3     3           2        62.7
#> 11          3     3           3        69.4
#> 12          3     3           4       102.

data_before <- join_data %>%
  group_by(student_id) %>%
  filter(exam_number <= max) %>%
  ungroup() %>%
  select(-max)

data_after <- join_data %>%
  group_by(student_id) %>%
  filter(exam_number > max) %>%
  ungroup() %>%
  select(-max)

data_before
#> # A tibble: 9 × 3
#>   student_id exam_number exam_result
#>        <dbl>       <dbl>       <dbl>
#> 1          1           1        71.0
#> 2          1           2        69.1
#> 3          1           3        84.8
#> 4          1           4        74.0
#> 5          2           1        77.4
#> 6          2           2        78.8
#> 7          3           1        83.9
#> 8          3           2        62.7
#> 9          3           3        69.4

data_after
#> # A tibble: 3 × 3
#>   student_id exam_number exam_result
#>        <dbl>       <dbl>       <dbl>
#> 1          1           5        80.7
#> 2          2           3        69.5
#> 3          3           4       102.

# final clean-up
rm(join_data)

^{Created on 2022-12-10 with reprex v2.0.2}