IF- function with pivot longer in data frame R

September 12, 2022

So I have the issue that I use the pivot- longer function to separate columns into no rows but the output is wrong. I tried with this code:

    AT_ID <- c(1,2,3,4)
DEPARTURE_AIRPORT <- c("ZRH","ZRH","ZRH", "ZRH")
STOPOVER_1 <- c(NA, "BEL", "DUB", NA)
STOPOVER_2 <- c(NA, "RUO", NA, "IAE")
ARRIVAL_AIRPORT <- c("IAD", "LAX","BUD", "NOZ")

test_df <- data.frame(AT_ID, DEPARTURE_AIRPORT, STOPOVER_1, STOPOVER_2, ARRIVAL_AIRPORT)

print(test_df)

test_df$intinerary_id <- FALSE
  
split_rows <- function(file){

  #Split rows for intinerary_id
  
  file %>% 
  pivot_longer(
    cols =c(DEPARTURE_AIRPORT, STOPOVER_1, STOPOVER_2),
    names_to = "name",
    values_to = "DEPARTURE_AIRPORT"
  ) %>% 
  filter(!is.na(DEPARTURE_AIRPORT)) %>% 
  mutate(intinerary_id = AT_ID,
         AT_ID = row_number()) %>% relocate(DEPARTURE_AIRPORT, .before = ARRIVAL_AIRPORT)
}


test_df_preprocessed <- split_rows(test_df)

The problem is that for instance for AT_ID 2 the Deprature Airport and Arrival Aiport should be ZRH- BEL, then from BEL to RUO and from RUO to to LAX. Instead the Arrival airports are always the original arrival airports and not the correct stop overs, so instead from A – B, B – C, C -D the code creates A -D, B -D and C – D. I hope my explanation is logical and makes sense. Thank you for the help!

>Solution :

Using lag, lead and coalesce you could do:

library(tidyr)
library(dplyr)

split_rows <- function(file) {
  test_df %>%
    pivot_longer(
      cols = c(DEPARTURE_AIRPORT, STOPOVER_1, STOPOVER_2),
      names_to = "name",
      values_to = "DEPARTURE_AIRPORT"
    ) %>%
    filter(!is.na(DEPARTURE_AIRPORT)) %>%
    group_by(AT_ID) %>%
    mutate(
      DEPARTURE_AIRPORT = coalesce(DEPARTURE_AIRPORT, lag(DEPARTURE_AIRPORT)),
      ARRIVAL_AIRPORT = coalesce(lead(DEPARTURE_AIRPORT), ARRIVAL_AIRPORT),
    ) %>%
    ungroup() %>%
    mutate(
      intinerary_id = AT_ID,
      AT_ID = row_number()
    ) %>%
    relocate(DEPARTURE_AIRPORT, .before = ARRIVAL_AIRPORT)
}

split_rows(test_df)
#> # A tibble: 8 × 5
#>   AT_ID DEPARTURE_AIRPORT ARRIVAL_AIRPORT intinerary_id name             
#>   <int> <chr>             <chr>                   <dbl> <chr>            
#> 1     1 ZRH               IAD                         1 DEPARTURE_AIRPORT
#> 2     2 ZRH               BEL                         2 DEPARTURE_AIRPORT
#> 3     3 BEL               RUO                         2 STOPOVER_1       
#> 4     4 RUO               LAX                         2 STOPOVER_2       
#> 5     5 ZRH               DUB                         3 DEPARTURE_AIRPORT
#> 6     6 DUB               BUD                         3 STOPOVER_1       
#> 7     7 ZRH               IAE                         4 DEPARTURE_AIRPORT
#> 8     8 IAE               NOZ                         4 STOPOVER_2