I am quite puzzled. While I know how to count the sum of distinct_values per group with the n_distinct(), its seems challenging to me at the moment to find the sum of the unique of the unique observations.
I want to group by id, and then sum each unique value and…..
library(tidyverse)
df <- tibble(col1 = c("apple","apple","pple", "banana", "banana","bananna"),
col2 = c("pple","app","app", "bananna", "banan", "banan"),
counts_col1 = c(100,100,2,200,200,2),
counts_col2 = c(2,50,50,2,20,20),
id=c(1,1,1,2,2,2))
df1 <- df %>%
pivot_longer(c(counts_col1:counts_col2),names_to ="strings",values_to = "value") %>%
group_by(id,col1,col2) %>%
ungroup() %>%
group_by(id)
df1
#> # A tibble: 12 × 5
#> # Groups: id [2]
#> col1 col2 id strings value
#> <chr> <chr> <dbl> <chr> <dbl>
#> 1 apple pple 1 counts_col1 100
#> 2 apple pple 1 counts_col2 2
#> 3 apple app 1 counts_col1 100
#> 4 apple app 1 counts_col2 50
#> 5 pple app 1 counts_col1 2
#> 6 pple app 1 counts_col2 50
#> 7 banana bananna 2 counts_col1 200
#> 8 banana bananna 2 counts_col2 2
#> 9 banana banan 2 counts_col1 200
#> 10 banana banan 2 counts_col2 20
#> 11 bananna banan 2 counts_col1 2
#> 12 bananna banan 2 counts_col2 20
Created on 2022-03-16 by the reprex package (v2.0.1)
…end up in something like this
#> col1 col2 id strings value sum_distinct
#> <chr> <chr> <dbl> <chr> <dbl>
#> 1 apple pple 1 counts_col1 100 152
#> 2 apple pple 1 counts_col2 2 NA
#> 3 apple app 1 counts_col1 100 NA
#> 4 apple app 1 counts_col2 50 NA
#> 5 pple app 1 counts_col1 2 NA
#> 6 pple app 1 counts_col2 50 NA
#> 7 banana bananna 2 counts_col1 200 222
#> 8 banana bananna 2 counts_col2 2 NA
#> 9 banana banan 2 counts_col1 200 NA
#> 10 banana banan 2 counts_col2 20 NA
#> 11 bananna banan 2 counts_col1 2 NA
#> 12 bananna banan 2 counts_col2 20 NA
>Solution :
We may use replace with unique
library(dplyr)
library(tidyr)
df %>%
pivot_longer(c(counts_col1:counts_col2),
names_to ="strings",values_to = "value") %>%
group_by(id,col1,col2) %>%
group_by(id) %>%
mutate(sum_distinct = replace(rep(NA_real_, n()), 1, sum(unique(value)))) %>%
ungroup
-output
# A tibble: 12 × 6
col1 col2 id strings value sum_distinct
<chr> <chr> <dbl> <chr> <dbl> <dbl>
1 apple pple 1 counts_col1 100 152
2 apple pple 1 counts_col2 2 NA
3 apple app 1 counts_col1 100 NA
4 apple app 1 counts_col2 50 NA
5 pple app 1 counts_col1 2 NA
6 pple app 1 counts_col2 50 NA
7 banana bananna 2 counts_col1 200 222
8 banana bananna 2 counts_col2 2 NA
9 banana banan 2 counts_col1 200 NA
10 banana banan 2 counts_col2 20 NA
11 bananna banan 2 counts_col1 2 NA
12 bananna banan 2 counts_col2 20 NA