I am new at writing functions in R, and I am trying to calculate Symmetric Mean Absolute Error (SMAPE) performance by month for one of my models. The basic function works but calculates a single value instead of different values for each month in the dataset. Here is a reproducible example:
structure(list(date = structure(c(18948, 18949, 18950, 18951,
18952, 18953, 18954, 18955, 18956, 18957, 18958, 18959, 18960,
18961, 18962, 18963, 18964, 18965, 18966, 18967, 18968, 18969,
18970, 18971, 18972, 18973, 18974, 18975, 18976, 18977, 18978,
18979, 18980, 18981, 18982, 18983, 18984, 18985, 18986, 18987,
18988, 18989, 18990, 18991, 18992, 18993, 18994, 18995, 18996,
18997, 18998, 18999, 19000, 19001, 19002, 19003, 19004, 19005,
19006, 19007, 19008, 19009, 19010, 19011, 19012, 19013, 19014,
19015, 19016, 19017, 19018, 19019, 19020, 19021, 19022, 19023,
19024, 19025, 19026, 19027, 19028, 19029, 19030, 19031, 19032,
19033, 19034, 19035, 19036, 19037, 19038, 19039, 19040, 19041,
19042, 19043), class = "Date"), actual = c(2875, 2755, 2440,
2220, 1378, 1352, 2616, 1709, 1475, 2315, 2223, 4357, 3037, 1725,
2332, 2358, 3135, 3232, 3497, 2876, 2971, 3530, 4268, 4692, 3589,
3496, 4233, 4336, 5810, 6943, 8921, 7491, 8607, 10450, 11309,
13367, 18607, 23426, 19244, 29256, 21001, 27023, 29346, 39840,
41210, 37503, 38473, 35618, 40713, 39363, 43142, 44309, 38706,
34988, 33483, 28847, 32719, 31248, 31502, 19896, 19025, 23586,
20977, 22323, 23900, 22966, 15038, 14283, 15827, 13900, 18274,
18325, 17514, 10616, 8828, 10580, 8888, 15072, 14208, 14426,
7815, 6841, 7257, 8003, 11034, 10637, 10189, 6143, 4401, 5911,
6164, 8030, 10151, 4180, 6929, 3377), consensus2 = c(2899, 2735,
2485, 2199, 1297, 1414, 3026, 1535, 1588, 2435, 2341, 3095, 2241,
2480, 3098, 2513, 2886, 3289, 3427, 3060, 3050, 3564, 3803, 4204,
3188, 3184, 4071, 4063, 4974, 5839, 6641, 6146, 6620, 8446, 11112,
13071, 14963, 18807, 20670, 21149, 22824, 28484, 29376, 31969,
37669, 37706, 42511, 39104, 41362, 44855, 48043, 46670, 40384.96296,
42612.53704, 37730, 38351, 33813, 35651, 31475, 19364, 19364,
19892, 20436, 21114, 21221, 23002, 18035, 15320, 16292, 15735,
14726, 17844, 17635.77778, 11904.48148, 10763.7037, 9986.611111,
9986.611111, 10604.22222, 14246.90741, 14113.55556, 9113.425926,
8236.5, 8759.888889, 7436.462963, 10489.37037, 10507.09259, 9969.5,
5272.111111, 5729.092593, 5989.055556, 6245, 8267.314815, 7844.481481,
3176.703704, 8661.944444, 3320.055556)), row.names = c(NA, -96L
), class = c("tbl_df", "tbl", "data.frame"))
library(lubridate)
library(tidyverse)
data<- data %>% dplyr::select (date, actual, consensus2) %>%
dput()
data$month<- lubridate::month(data$date,label = TRUE)
data<- data %>% mutate(month= as.factor(month))
#Function
smape1 <- function(a, f) {for (i in 1:(nlevels(data$month))) { return (1/length(a) * sum(2*abs(f-a) / (abs(a)+abs(f))*100))}}
SMAPE_bymonth<- by(data,data$month, function(a,f)smape1(data$actual,data$consensus2))
SMAPE_bymonth
>Solution :
Not clear about the for loop inside the smape1 function. If we remove that and create the function with two arguments (a, f) that takes the columns from the data, then we just need to group by the ‘month’ and apply the function by selecting those columns
library(dplyr)
smape2 <- function(a, f)
{
return(1/length(a) * sum(2*abs(f-a) / (abs(a)+abs(f))*100))
}
data %>%
group_by(month) %>%
summarise(smape = smape2(actual, consensus2), .groups = 'drop')
# A tibble: 4 × 2
month smape
<ord> <dbl>
1 Jan 8.87
2 Feb 12.1
3 Nov 11.3
4 Dec 12.0
Or using by, the lambda function function(x) returns the blocks of grouped data from the first argument, which is used as input argument after extracting the column ‘actual’, ‘consensus2’ instead of from the whole data (data$)
by(data, droplevels(data$month), function(x) smape2(x$actual,x$consensus2))
droplevels(data$month): Jan
[1] 8.870074
-----------------------------------------------------------------------------------------------------------------------
droplevels(data$month): Feb
[1] 12.05893
-----------------------------------------------------------------------------------------------------------------------
droplevels(data$month): Nov
[1] 11.26306
-----------------------------------------------------------------------------------------------------------------------
droplevels(data$month): Dec
[1] 11.96994