Follow

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use
Contact

`dplyr` wrong results across groups with filter and separate_rows

I have data on pupillary responses during Questions in conversation. The pupil data are stringed together in columns A*, B*, and C*, with columns ending in *intpl_new showing the interpolated pupil area values and the columns ending with *dur giving the durations of each pupil observation. Dataframe df contains the data for two illustrative Question Sequences:

df <- structure(list(Speaker = c("ID08.A", "ID08.A"), Utterance = c("what so you're going to a £!party! in Italy£=", 
                                                                    "so what's your work"), Sequ = c(379, 380), File = c("F08", "F08"
                                                                    ), A_pupil_dur = c("9,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,6", 
                                                                                       "4,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,18,16,16,16"
                                                                    ), B_pupil_dur = c("5,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,10", 
                                                                                       "0,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,3"
                                                                    ), C_pupil_dur = c("13,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,2", 
                                                                                       "8,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,17,17,16,12"
                                                                    ), A_Area_av_intpl_new = c("439.711, 439.711, 644.691, 731.96, 747.196, 748.7275, 754.4505, 762.587, 760.0615, 767.383, 770.2685, 775.8755, 770.246, 780.847, 785.325, 787.0505, 796.164, 792.5955, 803.6015, 805.0865, 812.2755, 811.0335, 817.3775, 809.7155, 804.172, 784.5615, 785.306, 773.668, 787.1165, 780.432, 770.338, 772.0045, 775.238, 765.1055, 770.028, 765.566, 761.882, 762.095, 764.063, 769.38, 767.2175, 787.711, 773.056, 794.4395, 775.7075, 772.3825, 764.177, 759.029, 758.788, 756.673, 755.184, 762.655, 777.8965, 787.1845, 782.783, 779.6265, 790.063, 790.716, 792.483, 813.861, 796.0765, 793.4735, 786.3455, 782.2775, 782.3615, 765.5825, 748.5415, 733.5605, 740.209, 716.731, 714.564, 692.6865, 677.8545, 663.384, 653.542, 661.2205, 642.244, 630.52, 634.3305, 623.638, 614.8775, 598.457, 550.281, 533.3955, 534.857, 556.981, 520.699, 504.2235, 483.121, 533.9275, 517.6205, 564.656, 486.9505, 496.907, 482.0425, 498.3545, 488.9415, 514.838, 485.5285, 458.243, 447.065, 429.248, 408.462, 381.2095, 366.082, 361.209, 351.755, 366.533, 365.6965, 372.575, 381.1985, 374.6075, 374.187, 378.831, 377.856, 380.95, 374.515, 386.3795, 372.8235, 367.6245, 311.0305, 285.302, 285.302", 
                                                                                               "450.732, 450.732, 450.732, 450.732, 450.732, 673.6895, 673.2105, 681.709, 690.044, 685.161, 694.2225, 690.4845, 693.8295, 702.695, 706.5325, 712.255, 706.277, 715.655, 716.3665, 715.771, 738.173, 746.6575, 753.729, 741.008, 747.346, 739.562, 740.895, 741.728, 746.875, 745.368, 750.3755, 740.992, 746.3275, 747.992, 745.482, 737.803, 732.567, 665.135, 590.805, 599.918, 517.5955, 232.453, 185.6525, 177.0585, 349.1175, 610.161, 715.3055, 723.8695, 731.505, 736.0385"
                                                                    ), B_Area_av_intpl_new = c("1127.5655, 1133.472, 1144.686, 1137.3125, 1146.6885, 1146.521, 1146.165, 1136.645, 1107.046, 1169.5485, 1171.6375, 1185.374, 911.0355, 514.7785, 302.262, 201.9995, 221.0165, 364.1835, 554.6255, 830.8855, 982.6595, 1055.199, 1236.219, 1313.743, 1347.7445, 1341.755, 1367.371, 1058.6165, 816.283875, 573.95125, 331.618625, 89.286, 97.243, 783.2605, 921.9195, 1023.9135, 1099.347, 1170.8585, 1191.3015, 1218.248, 1232.114, 1248.672, 1269.5065, 1274.4395, 1284.665, 1289.562, 1290.2045, 1274.3455, 1257.694, 1251.4575, 1241.0405, 1217.684, 1202.611, 1180.6035, 1161.036, 1140.593, 1127.1275, 1108.0965, 1091.227, 1076.481, 1060.9835, 1047.0645, 1026.973, 1020.352, 1010.53, 1001.069, 988.176, 987.625, 979.6735, 966.375, 965.7815, 949.6985, 944.108, 942.71, 938.5365, 936.1955, 932.9745, 928.864, 932.069, 931.5345, 928.105, 925.708, 924.489, 931.071, 939.366, 936.182, 939.049, 940.7355, 943.43, 951.3945, 953.896, 960.797, 964.893, 976.1915, 973.9215, 986.8575, 988.273, 999.3865, 1008.348, 1012.5235, 1026.1885, 1024.859, 1034.6715, 1040.397, 1053.101, 1061.7265, 1069.12, 1072.339, 1073.477, 1080.543, 1081.047, 1089.56, 1094.2435, 1099.5435, 1098.3245, 1098.1435, 1103.4685, 1112.3565, 1111.2075, 1119.047, 1111.2515, 1106.309, 1109.1225", 
                                                                                               "NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA"
                                                                    ), C_Area_av_intpl_new = c("1310.6215, 1315.454, 1322.456, 1318.831, 1323.633, 1320.5265, 1321.4315, 1321.8755, 1327.908, 1328.7995, 1327.1185, 1328.311, 1329.5015, 1275.664, 1318.0675, 1295.021, 1358.674, 1313.429, 1296.748, 1279.574, 1260.8705, 1253.4135, 1275.716, 1256.1905, 1260.968, 1266.579, 1267.513, 1265.8265, 1262.5765, 1252.0955, 1218.903, 1188.4015, 1189.9125, 1171.238, 1160.559, 1146.503, 1139.204, 1132.484, 1122.7765, 1117.295, 1101.205, 1100.462, 1094.296, 1086.83, 1077.7585, 1071.4795, 1064.321, 1066.358, 1026.922, 1055.0745, 1035.7405, 1047.858, 1033.132, 1039.793, 1021.0405, 1029.5575, 1027.678, 1029.3585, 1026.111, 1028.1745, 1025.4395, 1030.1245, 1022.905, 1027.13, 1034.6355, 1036.9165, 1031.264, 1040.098, 1035.224, 1038.8585, 1036.9795, 1046.472, 1049.66, 1051.9625, 1057.0575, 1057.5905, 1057.0835, 1064.5255, 1066.282, 1062.145, 1068.6555, 1068.9065, 1070.3065, 1075.3095, 1077.345, 1076.921, 1082.6835, 1080.1855, 1087.028, 1091.712, 1089.9055, 1097.7805, 1092.521, 1096.772, 1095.7135, 1096.116, 1099.6245, 1094.1805, 1096.947, 1100.037, 1095.832, 1101.221, 1097.206, 1102.7895, 1097.554, 1100.1235, 1105.832, 1101.3535, 1093.1925, 1104.321, 1097.7195, 1074.774, 1075.12, 806.9485, 272.4795, 270.039625, 267.59975, 265.159875, 262.72, 175.0135, 126.3875, 126.3875, 126.3875", 
                                                                                               "1135.087, 1126.958, 1134.757, 1137.2935, 1144.709, 1143.489, 1120.5105, 1168.576, 1175.9775, 1207.8175, 1169.8565, 1187.4535, 1191.8855, 1208.294, 1222.445, 1217.7895, 1213.134, 1216.726, 1223.882, 1214.1995, 1171.062, 1144.856, 1150.527, 1153.6485, 1133.79, 1123.989, 1116.856, 1110.5225, 1098.6115, 1087.8905, 1080.286, 1077.1005, 1067.366, 1058.493, 1048.9075, 1050.1335, 1039.5365, 1036.7075, 1033.1095, 1025.224, 1022.4685, 1017.6755, 1018.1755, 1012.436, 1009.0545, 1010.995, 1009.37, 1010.7515, 1012.3335, 1006.5675"
                                                                    )), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
                                                                    ))

I’m stuck in a seemingly simple task: to find the maximum *intpl_new value for each participant A, B, and C and the time when that maximum value occurs. When I filter Sequ==379, everything works out fine with this code:

df %>%
  
  # Filter `Sequ`:
  filter(Sequ == 379) %>%
  
  # cast longer:
  pivot_longer(cols = matches("(_Area_av_intpl_new|pupil_dur)$"),
               names_to = c("PP_by", ".value"),
               # 1. capt. gr.: initial character ("A", "B", or "C"); 
               # 2. capt. gr.: everything after underscore                
               names_pattern = "^(.)_(.*)$") %>% 
  # separate each `Area_av_intpl` and corresponding `PP_by` value into their own row:
  separate_rows(c(Area_av_intpl_new, pupil_dur), sep = ",", convert = TRUE) %>%
  # rename:
  rename(Area = Area_av_intpl_new, PP_dur = pupil_dur) %>% 
  
  # Remove `NA` and " NA" (with preceding whitespace!):
  filter(!(is.na(Area) | Area == " NA")) %>%
  
  ### Create `PP_dur_cumsum`:
  group_by(Utterance, PP_by, Sequ) %>%
  mutate(PP_dur_cumsum = cumsum(PP_dur)) %>%
  
  ### Find maximal PP values:
  summarise(across(c(Speaker,File), first),
    max_Area_Q = max(Area),
    time_max_Area_Q = PP_dur_cumsum[Area == max_Area_Q]) %>% 
  arrange(Sequ) %>%
  
  ### Put new values back onto unique `Utterance` row:
  pivot_wider( 
    names_from = PP_by, 
    names_glue = "{PP_by}_{.value}",
    values_from = c(time_max_Area_Q, max_Area_Q)) %>% 
  ungroup()
# A tibble: 1 × 10
  Utterance               Sequ Speaker File  A_time_max_Area_Q B_time_max_Area… C_time_max_Area… A_max_Area_Q B_max_Area_Q C_max_Area_Q
  <chr>                  <dbl> <chr>   <chr>             <int>            <int>            <int>        <dbl>        <dbl>        <dbl>
1 what so you're going …   379 ID08.A  F08                 375              438              279         817.        1367.        1359.

However, when I do not filter (i.e., comment out the filter operation) but run the code over both Sequ, the values get incorrect:

MEDevel.com: Open-source for Healthcare and Education

Collecting and validating open-source software for healthcare, education, enterprise, development, medical imaging, medical records, and digital pathology.

Visit Medevel

# A tibble: 2 × 10
  Utterance               Sequ Speaker File  A_time_max_Area_Q B_time_max_Area… C_time_max_Area… A_max_Area_Q B_max_Area_Q C_max_Area_Q
  <chr>                  <dbl> <chr>   <chr>             <int>            <int>            <int> <chr>        <chr>        <chr>       
1 what so you're going …   379 ID08.A  F08                   9                5               13 439.711      1127.5655    1310.6215   
2 so what's your work      380 ID08.A  F08                   4               NA                8 450.732      NA           1135.087 

My hunch is that the incorrect values are caused by the NA values in Sequ==380. But I can’t seem to find the right way to handle them and to find a right solution. Help with this task and this admittedly difficult type of data is much appreciated!

>Solution :

Indeed, the error occurs because separate_rows(..., convert = TRUE) doesn’t convert to numeric when chars are present. Consider df2 as a base case:

df2 <- df %>%
  # Filter `Sequ`:
  filter(Sequ == 379) %>%
  
  # cast longer:
  pivot_longer(cols = matches("(_Area_av_intpl_new|pupil_dur)$"),
               names_to = c("PP_by", ".value"),
               # 1. capt. gr.: initial character ("A", "B", or "C"); 
               # 2. capt. gr.: everything after underscore                
               names_pattern = "^(.)_(.*)$") %>% 
  # separate each `Area_av_intpl` and corresponding `PP_by` value into their own row:
separate_rows(c(Area_av_intpl_new, pupil_dur), sep = ",", convert = TRUE) %>%
  # rename:
  rename(Area = Area_av_intpl_new, PP_dur = pupil_dur)

And df3 the unfiltered tibble:

df3 <- df %>%
  # cast longer:
  pivot_longer(cols = matches("(_Area_av_intpl_new|pupil_dur)$"),
               names_to = c("PP_by", ".value"),
               # 1. capt. gr.: initial character ("A", "B", or "C"); 
               # 2. capt. gr.: everything after underscore                
               names_pattern = "^(.)_(.*)$") %>% 
  # separate each `Area_av_intpl` and corresponding `PP_by` value into their own row:
  separate_rows(c(Area_av_intpl_new, pupil_dur), sep = ",", convert = TRUE) %>%
  # rename:
  rename(Area = Area_av_intpl_new, PP_dur = pupil_dur)

res2 <- df3 %>% filter(Sequ == 379) 

This isn’t identical because Area is of character.

identical(df2, res2)
[1] FALSE

Adding an explicit conversion after the filter fixes the problem.

df3 <- df %>%
  # cast longer:
  pivot_longer(cols = matches("(_Area_av_intpl_new|pupil_dur)$"),
               names_to = c("PP_by", ".value"),
               # 1. capt. gr.: initial character ("A", "B", or "C"); 
               # 2. capt. gr.: everything after underscore                
               names_pattern = "^(.)_(.*)$") %>% 
  separate_rows(c(Area_av_intpl_new, pupil_dur), sep = ",", convert = TRUE) %>%
  rename(Area = Area_av_intpl_new, PP_dur = pupil_dur) %>% 
  # filtering
  filter(!(is.na(Area) | Area == " NA")) %>%
  # ...and then converting 
  mutate(Area=as.numeric(Area))


res2 <- df3 %>% filter(Sequ == 379) 
identical(df2, res2)
[1] TRUE

This allows us to continue chaining, and yields your expected result.

df3 %>%       
group_by(Utterance, PP_by, Sequ) %>%
  mutate(PP_dur_cumsum = cumsum(PP_dur)) %>%
  
  ### Find maximal PP values:
  summarise(across(c(Speaker,File), first),
    max_Area_Q = max(Area),
    time_max_Area_Q = PP_dur_cumsum[Area == max_Area_Q]) %>% 
  arrange(Sequ) %>%
  
  ### Put new values back onto unique `Utterance` row:
  pivot_wider( 
    names_from = PP_by, 
    names_glue = "{PP_by}_{.value}",
    values_from = c(time_max_Area_Q, max_Area_Q)) %>% 
  ungroup()
# A tibble: 2 × 10
  Utterance                                      Sequ Speaker File  A_time_max_Area_Q B_time_max_Area_Q C_time_max_Area_Q A_max_Area_Q
  <chr>                                         <dbl> <chr>   <chr>             <int>             <int>             <int>        <dbl>
1 what so you're going to a £!party! in Italy£=   379 ID08.A  F08                 375               438               279         817.
2 so what's your work                             380 ID08.A  F08                 372                NA               308         754.
  B_max_Area_Q C_max_Area_Q
         <dbl>        <dbl>
1        1367.        1359.
2          NA         1224.

The values you saw were probably representing levels of factor conversion from character.

For added clarity, I would suggest creating intermediate results if memory is not an issue.

Add a comment

Leave a Reply

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use

Discover more from Dev solutions

Subscribe now to keep reading and get access to the full archive.

Continue reading