Summary data.table by window

Advertisements

I have a data table with many cols, each cell is a status flag by compare, like following

    status_codes <- c()
    for (a in c(1, 2, 3, 6, 9)) {
            for (b in c(1, 2, 3, 6, 9)) {
            status_code <- a * 10 + b  
            status_codes <- c(status_codes, status_code)  
        }
    }

we can create a example data.table

  set.seed(123666)
  gt_diff <- data.table(
    chrom = rep(1, 1000),
    pos = seq(1, 1000),
    sample = sample(status_codes[1:9], 1000, replace = TRUE),
    sample1 = sample(status_codes, 1000, replace = TRUE),
    #pop = rep(c("pop1", "pop2"), 5),
    sample2 = sample(status_codes, 1000, replace = TRUE)
  )

and we also can generate a list of windows

windows=10
step=1
    slide_window <- function(start, end, window, step){
        ranges = data.frame(start=integer(), stop=integer())
        i = start
        while(i <= end){
          stop = min(i + window - 1, end)
          ranges[nrow(ranges) + 1, ] = c(i, stop)
          i = i + step
        }
        return(ranges)
    }
ranges <- slide_window(min(gt_diff$pos), max(gt_diff$pos), windows, step)

For each pos range in ranges, I want to summary each flag number

for example

chrom start end 12 13 16 19 22 23 26 29 32 33 36 39 ... # all status here
1     1     10  0  1  1  0  0  1  ...

>Solution :

Something like this?

library(data.table)
out <- gt_diff[, CJ(chrom = unique(chrom), end = seq(10, max(pos), by = 10))
  ][, start := shift(end, fill = 0) + 1][]
out[c(1:3, 98:100),]
#    chrom   end start
#    <num> <num> <num>
# 1:     1    10     1
# 2:     1    20    11
# 3:     1    30    21
# 4:     1   980   971
# 5:     1   990   981
# 6:     1  1000   991

melt(gt_diff, c("chrom", "pos"))[out, on = .(chrom, pos >= start, pos <= end)
  ][, .N, by = .(chrom, start=pos, end=pos.1, value)] |>
  dcast(chrom + start + end ~ value, value.var = "N", fill = 0)
#      chrom start   end    11    12    13    16    19    21    22    23    26    29    31    32    33
#      <num> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#   1:     1     1    10     0     2     2     2     3     3     3     1     1     2     0     1     0
#   2:     1    11    20     1     1     5     1     2     1     1     3     4     0     1     0     1
#   3:     1    21    30     3     1     5     0     2     0     2     0     2     0     2     0     1
#   4:     1    31    40     4     4     2     0     2     3     0     4     1     1     0     1     0
#   5:     1    41    50     3     2     3     3     3     2     0     3     0     1     1     0     1
#   6:     1    51    60     4     1     2     1     2     3     1     4     1     0     0     0     1
#   7:     1    61    70     0     3     3     1     1     1     6     0     1     4     1     2     1
#   8:     1    71    80     2     1     1     1     5     2     1     5     3     0     1     0     1
#   9:     1    81    90     4     0     1     1     3     3     1     1     3     2     0     1     0
#  10:     1    91   100     0     4     2     1     5     2     1     1     2     0     0     1     2
#  ---                                                                                                
#  91:     1   901   910     2     1     1     3     4     1     2     2     3     1     0     0     1
#  92:     1   911   920     1     2     4     2     1     3     0     1     3     1     1     0     0
#  93:     1   921   930     0     1     0     3     4     2     2     0     3     1     3     1     1
#  94:     1   931   940     2     2     3     1     2     6     1     1     1     0     0     3     1
#  95:     1   941   950     1     4     4     2     2     2     4     0     0     0     1     0     2
#  96:     1   951   960     3     4     3     3     2     2     3     0     0     0     1     1     1
#  97:     1   961   970     0     3     3     1     3     2     2     1     1     1     2     0     0
#  98:     1   971   980     2     4     2     2     0     0     5     0     2     2     0     2     0
#  99:     1   981   990     5     0     2     2     2     2     1     4     3     0     3     0     0
# 100:     1   991  1000     3     0     5     0     0     1     4     2     0     1     0     1     0
# 12 variables not shown: [36 <int>, 39 <int>, 61 <int>, 62 <int>, 63 <int>, 66 <int>, 69 <int>, 91 <int>, 92 <int>, 93 <int>, ...]

Leave a ReplyCancel reply