Follow

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use
Contact

Summary data.table by window

I have a data table with many cols, each cell is a status flag by compare, like following

    status_codes <- c()
    for (a in c(1, 2, 3, 6, 9)) {
            for (b in c(1, 2, 3, 6, 9)) {
            status_code <- a * 10 + b  
            status_codes <- c(status_codes, status_code)  
        }
    }

we can create a example data.table

  set.seed(123666)
  gt_diff <- data.table(
    chrom = rep(1, 1000),
    pos = seq(1, 1000),
    sample = sample(status_codes[1:9], 1000, replace = TRUE),
    sample1 = sample(status_codes, 1000, replace = TRUE),
    #pop = rep(c("pop1", "pop2"), 5),
    sample2 = sample(status_codes, 1000, replace = TRUE)
  )

and we also can generate a list of windows

MEDevel.com: Open-source for Healthcare and Education

Collecting and validating open-source software for healthcare, education, enterprise, development, medical imaging, medical records, and digital pathology.

Visit Medevel

windows=10
step=1
    slide_window <- function(start, end, window, step){
        ranges = data.frame(start=integer(), stop=integer())
        i = start
        while(i <= end){
          stop = min(i + window - 1, end)
          ranges[nrow(ranges) + 1, ] = c(i, stop)
          i = i + step
        }
        return(ranges)
    }
ranges <- slide_window(min(gt_diff$pos), max(gt_diff$pos), windows, step)

For each pos range in ranges, I want to summary each flag number

for example

chrom start end 12 13 16 19 22 23 26 29 32 33 36 39 ... # all status here
1     1     10  0  1  1  0  0  1  ...

>Solution :

Something like this?

library(data.table)
out <- gt_diff[, CJ(chrom = unique(chrom), end = seq(10, max(pos), by = 10))
  ][, start := shift(end, fill = 0) + 1][]
out[c(1:3, 98:100),]
#    chrom   end start
#    <num> <num> <num>
# 1:     1    10     1
# 2:     1    20    11
# 3:     1    30    21
# 4:     1   980   971
# 5:     1   990   981
# 6:     1  1000   991

melt(gt_diff, c("chrom", "pos"))[out, on = .(chrom, pos >= start, pos <= end)
  ][, .N, by = .(chrom, start=pos, end=pos.1, value)] |>
  dcast(chrom + start + end ~ value, value.var = "N", fill = 0)
#      chrom start   end    11    12    13    16    19    21    22    23    26    29    31    32    33
#      <num> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#   1:     1     1    10     0     2     2     2     3     3     3     1     1     2     0     1     0
#   2:     1    11    20     1     1     5     1     2     1     1     3     4     0     1     0     1
#   3:     1    21    30     3     1     5     0     2     0     2     0     2     0     2     0     1
#   4:     1    31    40     4     4     2     0     2     3     0     4     1     1     0     1     0
#   5:     1    41    50     3     2     3     3     3     2     0     3     0     1     1     0     1
#   6:     1    51    60     4     1     2     1     2     3     1     4     1     0     0     0     1
#   7:     1    61    70     0     3     3     1     1     1     6     0     1     4     1     2     1
#   8:     1    71    80     2     1     1     1     5     2     1     5     3     0     1     0     1
#   9:     1    81    90     4     0     1     1     3     3     1     1     3     2     0     1     0
#  10:     1    91   100     0     4     2     1     5     2     1     1     2     0     0     1     2
#  ---                                                                                                
#  91:     1   901   910     2     1     1     3     4     1     2     2     3     1     0     0     1
#  92:     1   911   920     1     2     4     2     1     3     0     1     3     1     1     0     0
#  93:     1   921   930     0     1     0     3     4     2     2     0     3     1     3     1     1
#  94:     1   931   940     2     2     3     1     2     6     1     1     1     0     0     3     1
#  95:     1   941   950     1     4     4     2     2     2     4     0     0     0     1     0     2
#  96:     1   951   960     3     4     3     3     2     2     3     0     0     0     1     1     1
#  97:     1   961   970     0     3     3     1     3     2     2     1     1     1     2     0     0
#  98:     1   971   980     2     4     2     2     0     0     5     0     2     2     0     2     0
#  99:     1   981   990     5     0     2     2     2     2     1     4     3     0     3     0     0
# 100:     1   991  1000     3     0     5     0     0     1     4     2     0     1     0     1     0
# 12 variables not shown: [36 <int>, 39 <int>, 61 <int>, 62 <int>, 63 <int>, 66 <int>, 69 <int>, 91 <int>, 92 <int>, 93 <int>, ...]
Add a comment

Leave a Reply

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use

Discover more from Dev solutions

Subscribe now to keep reading and get access to the full archive.

Continue reading