Advertisements
I have a data table with many cols, each cell is a status flag by compare, like following
status_codes <- c()
for (a in c(1, 2, 3, 6, 9)) {
for (b in c(1, 2, 3, 6, 9)) {
status_code <- a * 10 + b
status_codes <- c(status_codes, status_code)
}
}
we can create a example data.table
set.seed(123666)
gt_diff <- data.table(
chrom = rep(1, 1000),
pos = seq(1, 1000),
sample = sample(status_codes[1:9], 1000, replace = TRUE),
sample1 = sample(status_codes, 1000, replace = TRUE),
#pop = rep(c("pop1", "pop2"), 5),
sample2 = sample(status_codes, 1000, replace = TRUE)
)
and we also can generate a list of windows
windows=10
step=1
slide_window <- function(start, end, window, step){
ranges = data.frame(start=integer(), stop=integer())
i = start
while(i <= end){
stop = min(i + window - 1, end)
ranges[nrow(ranges) + 1, ] = c(i, stop)
i = i + step
}
return(ranges)
}
ranges <- slide_window(min(gt_diff$pos), max(gt_diff$pos), windows, step)
For each pos range in ranges, I want to summary each flag number
for example
chrom start end 12 13 16 19 22 23 26 29 32 33 36 39 ... # all status here
1 1 10 0 1 1 0 0 1 ...
>Solution :
Something like this?
library(data.table)
out <- gt_diff[, CJ(chrom = unique(chrom), end = seq(10, max(pos), by = 10))
][, start := shift(end, fill = 0) + 1][]
out[c(1:3, 98:100),]
# chrom end start
# <num> <num> <num>
# 1: 1 10 1
# 2: 1 20 11
# 3: 1 30 21
# 4: 1 980 971
# 5: 1 990 981
# 6: 1 1000 991
melt(gt_diff, c("chrom", "pos"))[out, on = .(chrom, pos >= start, pos <= end)
][, .N, by = .(chrom, start=pos, end=pos.1, value)] |>
dcast(chrom + start + end ~ value, value.var = "N", fill = 0)
# chrom start end 11 12 13 16 19 21 22 23 26 29 31 32 33
# <num> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 1 10 0 2 2 2 3 3 3 1 1 2 0 1 0
# 2: 1 11 20 1 1 5 1 2 1 1 3 4 0 1 0 1
# 3: 1 21 30 3 1 5 0 2 0 2 0 2 0 2 0 1
# 4: 1 31 40 4 4 2 0 2 3 0 4 1 1 0 1 0
# 5: 1 41 50 3 2 3 3 3 2 0 3 0 1 1 0 1
# 6: 1 51 60 4 1 2 1 2 3 1 4 1 0 0 0 1
# 7: 1 61 70 0 3 3 1 1 1 6 0 1 4 1 2 1
# 8: 1 71 80 2 1 1 1 5 2 1 5 3 0 1 0 1
# 9: 1 81 90 4 0 1 1 3 3 1 1 3 2 0 1 0
# 10: 1 91 100 0 4 2 1 5 2 1 1 2 0 0 1 2
# ---
# 91: 1 901 910 2 1 1 3 4 1 2 2 3 1 0 0 1
# 92: 1 911 920 1 2 4 2 1 3 0 1 3 1 1 0 0
# 93: 1 921 930 0 1 0 3 4 2 2 0 3 1 3 1 1
# 94: 1 931 940 2 2 3 1 2 6 1 1 1 0 0 3 1
# 95: 1 941 950 1 4 4 2 2 2 4 0 0 0 1 0 2
# 96: 1 951 960 3 4 3 3 2 2 3 0 0 0 1 1 1
# 97: 1 961 970 0 3 3 1 3 2 2 1 1 1 2 0 0
# 98: 1 971 980 2 4 2 2 0 0 5 0 2 2 0 2 0
# 99: 1 981 990 5 0 2 2 2 2 1 4 3 0 3 0 0
# 100: 1 991 1000 3 0 5 0 0 1 4 2 0 1 0 1 0
# 12 variables not shown: [36 <int>, 39 <int>, 61 <int>, 62 <int>, 63 <int>, 66 <int>, 69 <int>, 91 <int>, 92 <int>, 93 <int>, ...]