How I can sort these data out in R?

March 31, 2023

Here is my data:

data <- data.frame(


   ID = 1:6,
      Course1A = c("A1", "A1", "A1", "A1", "A1", "A1"),
      Time1A = c(1, 1, 1, 1, 1, 1),
      Score1A = c(20, 17, 20, 16, 20, 11),
      Course2B = c("B1", "B1", "B1", "B1", "B1", "B1"),
      Time2B = c(1, 1, 1, 1, 1, 1),
      Score2B = c(12, 11, 15, 15, 10, 15),
      Course3C = c("C1", "C1", "C1", "C1", "C1", "C1"),
      TimeC3 = c(1, 1, 1, 1, 1, 1),
      ScoreC3 = c(10, 12, 12, 10, 10, 10),
      Course4D = c("D1", "D1", "D1", "D1", "D1", "D1"),
      TimeD4 = c(1, 1, 1, 1, 1, 1),
      ScoreD4 = c(20, 20, 18, 20, 17, 20),
      Course5E = c("E1", "E1", "E1", "E1", "E1", "E1"),
      TimeE5 = c(2, 2, 2, 2, 2, 2),
      Score5E = c(12, 12, 12, 12, 11, 11),
      Course6F = c("F1", "F1", "F1", "F1", "F1", "F1"),
      TimeF6 = c(2, 2, 2, 2, 2, 2),
      ScoreF6 = c(10, 10, 10, 10, 10, 9),
      Course7G = c("G1", "G1", "G1", "G1", "G1", "G1"),
      TimeG7 = c(3, 3, 3, 3, 3, 3),
      Score7G = c(12, 11, 6, 4, 12, 9),
      Course8H = c("H1", "H1", "H1", "H1", "H1", "H1"),
      TimeH8 = c(3, 3, 3, 3, 3, 3),
      Score8H = c(12, 12, 12, 12, 10, 10),
      Gender = c("F", "F", "F", "F", "F", "F"),
      Race = c("A", "B", "C", "C", "C", "C"),
      Health = c("Yes", "Yes", "Yes", "Yes", "Yes", "Yes"),
      stringsAsFactors = FALSE

I want to get columns for ID, Course, Time, Gender, Race and health and Score.

I have used the following codes, but I get a wrong table

reshaped_data <- data %>% 
  pivot_longer(
    cols = -c(ID, Gender, Race, Health),
    names_to = c(".value", "Group"),
    names_pattern = "([A-Za-z]+)([0-9]+[A-Z])"
  )

>Solution :

I think it might be easiest if you rationalized the names for the variables to make the suffixes the same. For example, you want to make it so that 3C and C3 get the same value, then the reshape will work easily.

library(dplyr)
library(tidyr)
library(stringr)
data <- data.frame(
  ID = 1:6,
  Course1A = c("A1", "A1", "A1", "A1", "A1", "A1"),
  Time1A = c(1, 1, 1, 1, 1, 1),
  Score1A = c(20, 17, 20, 16, 20, 11),
  Course2B = c("B1", "B1", "B1", "B1", "B1", "B1"),
  Time2B = c(1, 1, 1, 1, 1, 1),
  Score2B = c(12, 11, 15, 15, 10, 15),
  Course3C = c("C1", "C1", "C1", "C1", "C1", "C1"),
  TimeC3 = c(1, 1, 1, 1, 1, 1),
  ScoreC3 = c(10, 12, 12, 10, 10, 10),
  Course4D = c("D1", "D1", "D1", "D1", "D1", "D1"),
  TimeD4 = c(1, 1, 1, 1, 1, 1),
  ScoreD4 = c(20, 20, 18, 20, 17, 20),
  Course5E = c("E1", "E1", "E1", "E1", "E1", "E1"),
  TimeE5 = c(2, 2, 2, 2, 2, 2),
  Score5E = c(12, 12, 12, 12, 11, 11),
  Course6F = c("F1", "F1", "F1", "F1", "F1", "F1"),
  TimeF6 = c(2, 2, 2, 2, 2, 2),
  ScoreF6 = c(10, 10, 10, 10, 10, 9),
  Course7G = c("G1", "G1", "G1", "G1", "G1", "G1"),
  TimeG7 = c(3, 3, 3, 3, 3, 3),
  Score7G = c(12, 11, 6, 4, 12, 9),
  Course8H = c("H1", "H1", "H1", "H1", "H1", "H1"),
  TimeH8 = c(3, 3, 3, 3, 3, 3),
  Score8H = c(12, 12, 12, 12, 10, 10),
  Gender = c("F", "F", "F", "F", "F", "F"),
  Race = c("A", "B", "C", "C", "C", "C"),
  Health = c("Yes", "Yes", "Yes", "Yes", "Yes", "Yes"),
  stringsAsFactors = FALSE)

To do this, I would first find the variables you want and save the stem (i.e., Time, Course, Score) and the suffix (i.e., the two-character ending).

sfx <- gsub("(Time|Course|Score)(.*)", "\\2", grep("Time|Score|Course", names(data), value=TRUE))
stem <- gsub("(Time|Course|Score)(.*)", "\\1", grep("Time|Score|Course", names(data), value=TRUE))

Next, you can split the two suffix characters apart, then sort them and paste them back together.

sfx <- str_split(sfx, "", simplify=TRUE)
sfx <- apply(sfx, 1, function(x)paste(sort(x), collapse=""))

Then, you can paste the newly rationalized suffixes on to the stems and rename the relevant columns:

names(data)[grep("Time|Score|Course", names(data))] <- paste(stem, sfx, sep="")

Then, the reshape works as expected.

reshaped_data <- data %>% 
  pivot_longer(
    cols = -c(ID, Gender, Race, Health),
    names_to = c(".value", "Group"),
    names_pattern = "(Course|Time|Score)(.{2})$"
  )
reshaped_data
#> # A tibble: 48 × 8
#>       ID Gender Race  Health Group Course  Time Score
#>    <int> <chr>  <chr> <chr>  <chr> <chr>  <dbl> <dbl>
#>  1     1 F      A     Yes    1A    A1         1    20
#>  2     1 F      A     Yes    2B    B1         1    12
#>  3     1 F      A     Yes    3C    C1         1    10
#>  4     1 F      A     Yes    4D    D1         1    20
#>  5     1 F      A     Yes    5E    E1         2    12
#>  6     1 F      A     Yes    6F    F1         2    10
#>  7     1 F      A     Yes    7G    G1         3    12
#>  8     1 F      A     Yes    8H    H1         3    12
#>  9     2 F      B     Yes    1A    A1         1    17
#> 10     2 F      B     Yes    2B    B1         1    11
#> # … with 38 more rows

^{Created on 2023-03-31 with reprex v2.0.2}