Here is a dummy dataset:
# Toy input: one identifier per row plus the string column to be analysed.
id <- letters[1:3]
a <- c("CC", "CCAA", "ABB")
data <- data.frame(id = id, a = a)
head(data)
# id a
#1 a CC
#2 b CCAA
#3 c ABB
We can calculate the percentage of each distinct string across the rows:
library(data.table)
# Count the rows for every distinct string in `a`.
# Note: setDT() converts `data` to a data.table in place.
data1 <- setDT(data)[, .N, by = .(a)]
# Append each count's share of the total by reference; the empty `by`
# means no grouping, so sum(N) runs over all rows.
data1[, perc := N / sum(N), by = .()]
head(data1)
# a N perc
# 1: CC 1 0.3333333
# 2: CCAA 1 0.3333333
# 3: ABB 1 0.3333333
But how can I calculate the percentage of each character across all of the strings in the "a" variable?
# [Expected output]
# a N perc
# 1: A 3 0.33
# 2: B 2 0.22
# 3: C 4 0.44
Base R and tidyverse approaches are preferred.
>Solution :
tidyverse option –
library(tidyverse)
# Explode every string into one row per character, tally the characters,
# and express each tally as a fraction of the overall character count.
data |>
  mutate(a = str_split(a, pattern = "")) |>
  unnest_longer(col = a) |>
  count(a, name = "N") |>
  mutate(perc = N / sum(N))
# A tibble: 3 × 3
# a N perc
# <chr> <int> <dbl>
#1 A 3 0.333
#2 B 2 0.222
#3 C 4 0.444
Base R –
# Pool the individual characters of every string into one vector.
a1 <- unlist(strsplit(data$a, split = ""))
# Frequency of each distinct character.
a2 <- table(a1)
# Each frequency as a fraction of the total number of characters.
a3 <- a2 / sum(a2)
# Assemble a plain data frame: character, count, share.
data.frame(
  a = names(a2),
  N = as.integer(a2),
  perc = as.numeric(a3)
)
# a N perc
#1 A 3 0.3333333
#2 B 2 0.2222222
#3 C 4 0.4444444