Follow

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use
Contact

Interporlation of annual data into monthly data

I’m trying to interpolate annual data into monthly data. I’ve seen several examples online, but I can’t get them to work
Here’s one, here’s another, and I get errors of all kinds.

My data looks like this

structure(list(countryname = c("Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Angola", 
"Angola", "Angola", "Angola", "Angola", "Angola", "Angola", "Angola", 
"Angola", "Angola", "Angola", "Angola", "Angola", "Angola", "Angola", 
"Angola", "Angola", "Angola", "Angola", "Angola", "Anguilla", 
"Anguilla", "Anguilla", "Anguilla", "Anguilla", "Anguilla", "Anguilla", 
"Anguilla", "Anguilla", "Anguilla"), year = c(1986, 1987, 1988, 
1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 
2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 
2011, 2012, 2013, 2014, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 
1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 
1976, 1977, 1978, 1979, 1980, 1981, 1982, 1984, 1985, 1986, 1987, 
1988, 1989, 1990, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 
2014, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 
2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 1990, 
1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999), total = c(286.511, 
344.328, 467.524, 539.787, 609.052, 690.367, 742.735, 793.093, 
946.521, 1017.982, 1121.778, 1185.47, 1248.148, 1376.097, 1502.349, 
1581.668, 1783.635, 2012.886, 2090.128, 2283.757, 2382.169, 2479.276, 
2578.016, 2555.033, 2582.489, 2656.015, 2747.218, 2888.874, 3009.389, 
1527, 2037, 2244, 2162, 1760, 1713, 1867, 2252, 2395, 2449, 2488.4, 
2872.3, 2853.1, 2928, 3660.3, 4280.4, 5214.8, 5016.4, 6243.1, 
5493.6, 6770.4, 9789.5, 9603.1, 10053, 4185.38, 4787.87, 6728.06, 
8773.05, 12698.5, 17579.82, 19972.779, 26547.45, 16855.486058, 
33202.5772169922, 48354.2870012583, 64742.29199, 84061.57373, 
42170.25521, 46008.7436596187, 49215.7766771853, 46365.3341990244, 
0.718514, 26.547404, 90.561909, 90.0346783, 441.87615963, 1845.82841034, 
7453.3254978, 23556.243282338, 54332.057918413, 89699.02591742, 
138396.82327341, 273708.10979785, 492519.2833002, 801048.13352684, 
1285702.23670599, 1532272.06111931, 1973567.61658344, 2451384.32434691, 
2819717.17874511, 2852043.16889253, 135.972994003056, 165.698, 
190, 200.98, 199.009, 212.701, 240.183, 282.136, 308.716, 358.751
), hh = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
12409.359888, 2669.05197895018, 34449.662, 3595.39872, 10187.97902, 
10224.02617, 11409.47128, 9743.61473, 8246.96039, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, 50.331, 55.832, 79.786, 85.04, 93.316, 104.006, 126.771, 
147.306, 171.725), corp = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, 4446.12617, 30533.525238042, 13904.6250012583, 
61146.89327, 73873.59471, 31946.22904, 34599.2723796187, 39472.1619471853, 
38118.3738090244, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 115.367, 134.168, 121.194, 
113.969, 119.385, 136.177, 155.365, 161.41, 187.026), agr = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1.08, 2.09, 1.415, 1.276, 
1.703, 1.85, 2.434, 2.385, 2.161), manufac = c(NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, 0.103556200987511, 0.241631135637525, 
0.691812953819343, 1.02980888759802, 4.50900958466454, 5.02391402846355, 
4.91891954690677, 4.36805808887598, 9.904), real_estate = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13.327, 13.813, 6.75, 
13.223, 11.223, 13.284, 13.509, 11.515, 11.234), whole_sale = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 77.119, 92.189, 91.358, 
79.837, 76.949, 79.75, 92.241, 95.118, 109.549), transport_comm = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 14.572, 13.047, 12.774, 
11.398, 8.998, 10.174, 10.226, 10.132, 8.803), others = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 9.16544379901249, 12.7873688643625, 
8.20518704618064, 7.20519111240198, 16.0029904153354, 26.0950859715364, 
32.0360804530932, 37.891941911124, 45.375), error = c(NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_)), row.names = c(NA, 
-100L), class = c("tbl_df", "tbl", "data.frame"))

Note that this is a panel of countries, so the interpolation should be done separately for each country.

MEDevel.com: Open-source for Healthcare and Education

Collecting and validating open-source software for healthcare, education, enterprise, development, medical imaging, medical records, and digital pathology.

Visit Medevel

My best attempt so far is adapting the example in the second link

expand_data <- function(x) {
  years <- min(x$year):max(x$year)
  months_data <- 1:12
  grid <- expand.grid(month=months_data, year=years)
  x$quarter <- 1
  merged <- merge(grid, x, by=c('year', 'month'), all.x=TRUE)  # Use merge instead of left_join
  merged$country <- x$country[1]
  return(merged)
}

interpolate_data <- function(data) {
  xout <- 1:nrow(data)
  y <- data$value
  interpolation <- approx(x=xout[!is.na(y)], y=y[!is.na(y)], xout=xout)
  data$value_interpolated <- interpolation$y
  return(data)
}

expand_and_interpolate <- function(x) interpolate_data(expand_data(x))

monthly_data <- credit_data %>%
  group_by(countryname) %>%
  do(expand_and_interpolate(.))

But I get this error

Error in fix.by(by.y, y) : 'by' must specify a uniquely valid column

>Solution :

This process converts year to a Date-class column, then expands it to be every month between the first and last observed dates. From there, it uses approx across all other fields.

out <- credit_data |>
  mutate(date = as.Date(paste0(year, "-01-01"))) |>
  reframe(
    .by = countryname, 
    month = seq(min(date), max(date), by = "month"), 
    year = as.integer(format(month, format = "%Y")), 
    across(-c(year, date, month), 
           ~ if (any(!is.na(.x))) approx(date, .x, xout = month, na.rm = TRUE)$y else rep(.x[1], length(month)))
  )

filter(out, countryname == "Anguilla")[20:30,]
# # A tibble: 11 × 13
#    countryname month       year total    hh  corp   agr manufac real_estate whole_sale transport_comm others error
#    <chr>       <date>     <int> <dbl> <dbl> <dbl> <dbl>   <dbl>       <dbl>      <dbl>          <dbl>  <dbl> <chr>
#  1 Anguilla    1991-08-01  1991  180.  53.5  126.  1.67   0.184        13.6       85.9           13.7   11.3 NA   
#  2 Anguilla    1991-09-01  1991  182.  54.0  128.  1.75   0.195        13.7       87.2           13.6   11.6 NA   
#  3 Anguilla    1991-10-01  1991  184.  54.4  129.  1.84   0.207        13.7       88.4           13.4   11.9 NA   
#  4 Anguilla    1991-11-01  1991  186.  54.9  131.  1.92   0.219        13.7       89.7           13.3   12.2 NA   
#  5 Anguilla    1991-12-01  1991  188.  55.4  133.  2.00   0.230        13.8       90.9           13.2   12.5 NA   
#  6 Anguilla    1992-01-01  1992  190   55.8  134.  2.09   0.242        13.8       92.2           13.0   12.8 NA   
#  7 Anguilla    1992-02-01  1992  191.  57.9  133.  2.03   0.280        13.2       92.1           13.0   12.4 NA   
#  8 Anguilla    1992-03-01  1992  192.  59.8  132.  1.98   0.315        12.7       92.1           13.0   12.0 NA   
#  9 Anguilla    1992-04-01  1992  193.  61.8  131.  1.92   0.354        12.1       92.0           13.0   11.6 NA   
# 10 Anguilla    1992-05-01  1992  194.  63.8  130.  1.87   0.390        11.5       91.9           13.0   11.3 NA   
# 11 Anguilla    1992-06-01  1992  195.  65.8  129.  1.81   0.429        10.9       91.8           12.9   10.9 NA   

The use of .by= requires dplyr_1.1.0 or newer; if you have an older version, change from mutate(.by=c(..), stuff) to group_by(..) |> mutate(stuff) |> ungroup().

Add a comment

Leave a Reply

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use

Discover more from Dev solutions

Subscribe now to keep reading and get access to the full archive.

Continue reading