scale values in one column based on separate column

September 27, 2023

I’m looking to scale values out of 100% based on the sample. My df contains the col Project that I’d like to group the col n by to normalize those values of n on a scale to 100%.

library("dplyr")

ex %>%
  dplyr::group_by(Project) %>%
  scale(n) -> perc

For my provided data, I’d expect the output perc to be

Project	reaction	norm
Ga0598239	arsenate-reduction	0.0312
Ga0598239	carbon-fixation	0.0033

et cetera.

> dput(ex)
structure(list(Project = c("Ga0598239", "Ga0598239", "Ga0598239", 
"Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239", 
"Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239", 
"Ga0598239", "Ga0598240", "Ga0598240", "Ga0598240", "Ga0598240", 
"Ga0598240", "Ga0598240"), reaction = c("arsenate-reduction", 
"carbon-fixation", "formaldehyde-oxidation", "halogenated-compounds-breakdown", 
"hydrogen-oxidation", "iron-oxidation", "iron-reduction", "manganese-oxidation", 
"methanol-oxidation", "selenate-reduction", "sulfide-oxidation", 
"sulfite-reduction", "sulfur-oxidation", "thiosulfate-disproportionation", 
"arsenate-reduction", "carbon-fixation", "formaldehyde-oxidation", 
"halogenated-compounds-breakdown", "hydrogen-oxidation", "iron-oxidation"
), n = c(103L, 11L, 157L, 90L, 2296L, 85L, 33L, 156L, 17L, 38L, 
8L, 9L, 259L, 13L, 90L, 21L, 202L, 81L, 2090L, 73L)), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -20L), groups = structure(list(
    Project = c("Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239", 
    "Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239", 
    "Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239", "Ga0598239", 
    "Ga0598240", "Ga0598240", "Ga0598240", "Ga0598240", "Ga0598240", 
    "Ga0598240"), reaction = c("arsenate-reduction", "carbon-fixation", 
    "formaldehyde-oxidation", "halogenated-compounds-breakdown", 
    "hydrogen-oxidation", "iron-oxidation", "iron-reduction", 
    "manganese-oxidation", "methanol-oxidation", "selenate-reduction", 
    "sulfide-oxidation", "sulfite-reduction", "sulfur-oxidation", 
    "thiosulfate-disproportionation", "arsenate-reduction", "carbon-fixation", 
    "formaldehyde-oxidation", "halogenated-compounds-breakdown", 
    "hydrogen-oxidation", "iron-oxidation"), .rows = structure(list(
        1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 
        14L, 15L, 16L, 17L, 18L, 19L, 20L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -20L), .drop = TRUE))

>Solution :

You want this:
Note your provided data is grouped. So first ungroup!

library(dplyr) #> 1.1.0

df %>%
  ungroup() %>% 
  mutate(norm = (n / sum(n)), .by=Project)

   Project   reaction                            n    norm
   <chr>     <chr>                           <int>   <dbl>
 1 Ga0598239 arsenate-reduction                103 0.0315 
 2 Ga0598239 carbon-fixation                    11 0.00336
 3 Ga0598239 formaldehyde-oxidation            157 0.0479 
 4 Ga0598239 halogenated-compounds-breakdown    90 0.0275 
 5 Ga0598239 hydrogen-oxidation               2296 0.701  
 6 Ga0598239 iron-oxidation                     85 0.0260 
 7 Ga0598239 iron-reduction                     33 0.0101 
 8 Ga0598239 manganese-oxidation               156 0.0476 
 9 Ga0598239 methanol-oxidation                 17 0.00519
10 Ga0598239 selenate-reduction                 38 0.0116 
11 Ga0598239 sulfide-oxidation                   8 0.00244
12 Ga0598239 sulfite-reduction                   9 0.00275
13 Ga0598239 sulfur-oxidation                  259 0.0791 
14 Ga0598239 thiosulfate-disproportionation     13 0.00397
15 Ga0598240 arsenate-reduction                 90 0.0352 
16 Ga0598240 carbon-fixation                    21 0.00821
17 Ga0598240 formaldehyde-oxidation            202 0.0790 
18 Ga0598240 halogenated-compounds-breakdown    81 0.0317 
19 Ga0598240 hydrogen-oxidation               2090 0.817  
20 Ga0598240 iron-oxidation                     73 0.0285