I have the data below with both categorical and numeric variables I want to automatically replace all my outliers from numeric variables with this method below but I need to do it in all numeric columns automatically and not one by one
df$v3[df$v3 %in% boxplot(df)$out] <- median(df$v3)
df<-structure(list(`Project Title` = c("Cameroun - Projet d’aide humanitaire d’urgence en faveur des réfugiés hors sites et des populations hôtes situés dans les régions de l’Adamaoua, de l’Est et du Nord\r\n(Version française uniquement)",
"Multinational - Kenya-Tanzania Power Interconnection Project",
"Multinational - Kenya-Tanzania Power Interconnection Project",
"Zambia - Madison Finance Company Limited\r\n(under the Africa SME Program)",
"Eritrea - Skills Development for Employability and Entrepreneurship Project",
"Eritrea - Skills Development for Employability and Entrepreneurship Project",
"Uganda - Water Supply and Sanitation Programme - Additional activities",
"Cabo Verde - Aide d’urgence pour assister les populations affectées par l’éruption volcanique de l’île de Fogo\r\n(Version française uniquement)",
"Kenya - Mombasa-Mariakani Highway Project", "Multinational - Africa Banking Corporation Holdings Ltd"
), `Approval date Year Trend` = structure(c(1421107200, 1424217600,
1424217600, 1425254400, 1425340800, 1425340800, 1425427200, 1425600000,
1426032000, 1426032000), tzone = "UTC", class = c("POSIXct",
"POSIXt")), `Type of Financing Source` = c("Special Relief Fund (SRF)",
"ADF", "ADF", "ADB Private Sector", "ADF", "ADF", "Global Environment Facility (GEF)",
"Special Relief Fund (SRF)", "ADF", "ADB Private Sector"), `Approved Amount_UA Equivalent` = c(690221.630165446,
75290000, 27500000, 2131605.31196044, 7580000, 5920000, 5947178.82036962,
191844.478076439, 8e+07, 11842251.7331135), `Approved Amount_USDEquivalent` = c(920065.43301054,
100361570, 36657500, 2841429.88084326, 10104140, 7891360, 7927589.3675527,
255728.689275894, 106640000, 15785721.5602403), `Project ID` = c("P-CM-I00-001",
"P-Z1-FA0-052", "P-Z1-FA0-086", "P-ZM-HAB-003", "P-ER-IA0-001",
"P-ER-IA0-001", "P-UG-E00-013", "P-CV-I00-001", "P-KE-DB0-021",
"P-Z1-HAB-046"), `Employe Name` = c("BISSAKONOU Judes", "MUGUWA Andrew",
"MUGUWA Andrew", "FRENCH Laura Katherine Smith", "ELAHEEBOCUS Bibi Nawsheen",
"ELAHEEBOCUS Bibi Nawsheen", "MBIRO Andrew", "TRAORE Rokhaya",
"MAMMO Zerfu Tessema", "MACHARIA Julius Karuga"), `Financing Source` = c("OTHERS",
"ADF Including TSF", "ADF Including TSF", "ADB Private", "ADF Including TSF",
"ADF Including TSF", "OTHERS", "OTHERS", "ADF Including TSF",
"ADB Private"), `Financial Access Category` = c("Blend Countries",
"Multinational", "Blend Countries", "Blend Countries", "ADF Countries",
"ADF Countries", "ADF Countries", "ADB Countries", "Blend Countries",
"Multinational"), `Country Classifaction` = c("Non-Transition States",
"Multinational", "Non-Transition States", "Non-Transition States",
"Transition States", "Transition States", "Non-Transition States",
"Non-Transition States", "Non-Transition States", "Multinational"
), `Country Name` = c("CAMEROON", "MULTINATIONAL", "KENYA", "ZAMBIA",
"ERITREA", "ERITREA", "UGANDA", "Cape Verde", "KENYA", "MULTINATIONAL"
), `REGION NAME` = c("Central", "Multinational", "East", "South",
"East", "East", "East", "West", "East", "Multinational"), `Sectoral Analysis` = c("Social",
"Power", "Power", "Finance", "Social", "Social", "WASH Sector",
"Social", "Transport", "Finance"), `High Five Prority 1: Feed Africa` = c(NA,
NA, NA, NA, NA, NA, 178415.364611089, NA, NA, NA), `High Five Prority 2: Light Up And Power Africa` = c(NA,
75290000, 27500000, NA, NA, NA, 59471.7882036962, NA, NA, NA),
`High Five Prority 3: Industrialize Africa` = c(NA, NA, NA,
2131605.31196044, NA, NA, NA, NA, NA, 11842251.7331135),
`High Five Prority 4: Integrate Africa` = c(NA, NA, NA, NA,
NA, NA, NA, NA, 8e+07, NA), `High Five Prority 5: Improve Quality Of Life` = c(690221.630165446,
NA, NA, NA, 7580000, 5920000, 5709291.66755484, 191844.478076439,
NA, NA), `Financing Instrument` = c("Emergency Assistance",
"Investment Project", "Investment Project", "Investment Project",
"Investment Project", "Investment Project", "ISP and Investment Project",
"Emergency Assistance", "Investment Project", "Investment Project"
), `Infrastructure vs Sector` = c("Social", "Infrastructure",
"Infrastructure", "Finance", "Social", "Social", "Infrastructure",
"Social", "Infrastructure", "Finance"), `SECTOR DEPARTMENT` = c("AHAI",
"PESD", "PESD", "PIFD", "AHHD", "AHHD", "AHWS", "AHHD", "PICU",
"PIFD"), `Sector Complex Name` = c("AHVP", "PEVP", "PEVP",
"PIVP", "AHVP", "AHVP", "AHVP", "AHVP", "PIVP", "PIVP"),
`Operations Type` = c("Sovereign", "Sovereign", "Sovereign",
"Non-Sovereign", "Sovereign", "Sovereign", "Sovereign", "Sovereign",
"Sovereign", "Non-Sovereign"), `NUMBER OF PROJECT` = c(1,
1, 1, 1, 0.5, 0.5, 1, 1, 1, 1), `Amount in UA Million` = c(0.690221630165446,
75.29, 27.5, 2.13160531196044, 7.58, 5.92, 5.94717882036962,
0.191844478076439, 80, 11.8422517331135)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
>Solution :
Try this:
outlier_median <- function(z) {
stats <- stats::fivenum(z, na.rm = TRUE)
z[!between(z, stats[2], stats[4])] <- stats[3]
z
}
df %>%
mutate(across(where(is.numeric), ~ outlier_median(.)))
# # A tibble: 10 × 25
# Projec…¹ Approval date Year Tr…² Type …³ Appro…⁴ Appro…⁵ Proje…⁶ Emplo…⁷ Finan…⁸ Finan…⁹ Count…˟ Count…˟ REGIO…˟ Secto…˟ High …˟
# <chr> <dttm> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
# 1 "Camero… 2015-01-13 00:00:00.000 Specia… 6.76e6 9.02e6 P-CM-I… BISSAK… OTHERS Blend … Non-Tr… CAMERO… Central Social NA
# 2 "Multin… 2015-02-18 00:00:00.000 ADF 6.76e6 9.02e6 P-Z1-F… MUGUWA… ADF In… Multin… Multin… MULTIN… Multin… Power NA
# 3 "Multin… 2015-02-18 00:00:00.000 ADF 2.75e7 3.67e7 P-Z1-F… MUGUWA… ADF In… Blend … Non-Tr… KENYA East Power NA
# 4 "Zambia… 2015-03-02 00:00:00.000 ADB Pr… 2.13e6 2.84e6 P-ZM-H… FRENCH… ADB Pr… Blend … Non-Tr… ZAMBIA South Finance NA
# 5 "Eritre… 2015-03-03 00:00:00.000 ADF 7.58e6 1.01e7 P-ER-I… ELAHEE… ADF In… ADF Co… Transi… ERITREA East Social NA
# 6 "Eritre… 2015-03-03 00:00:00.000 ADF 5.92e6 7.89e6 P-ER-I… ELAHEE… ADF In… ADF Co… Transi… ERITREA East Social NA
# 7 "Uganda… 2015-03-04 00:00:00.000 Global… 5.95e6 7.93e6 P-UG-E… MBIRO … OTHERS ADF Co… Non-Tr… UGANDA East WASH S… 178415.
# 8 "Cabo V… 2015-03-06 00:00:00.000 Specia… 6.76e6 9.02e6 P-CV-I… TRAORE… OTHERS ADB Co… Non-Tr… Cape V… West Social NA
# 9 "Kenya … 2015-03-11 00:00:00.000 ADF 6.76e6 9.02e6 P-KE-D… MAMMO … ADF In… Blend … Non-Tr… KENYA East Transp… NA
# 10 "Multin… 2015-03-11 00:00:00.000 ADB Pr… 1.18e7 1.58e7 P-Z1-H… MACHAR… ADB Pr… Multin… Multin… MULTIN… Multin… Finance NA
# # … with 11 more variables: `High Five Prority 2: Light Up And Power Africa` <dbl>,
# # `High Five Prority 3: Industrialize Africa` <dbl>, `High Five Prority 4: Integrate Africa` <dbl>,
# # `High Five Prority 5: Improve Quality Of Life` <dbl>, `Financing Instrument` <chr>, `Infrastructure vs Sector` <chr>,
# # `SECTOR DEPARTMENT` <chr>, `Sector Complex Name` <chr>, `Operations Type` <chr>, `NUMBER OF PROJECT` <dbl>,
# # `Amount in UA Million` <dbl>, and abbreviated variable names ¹`Project Title`, ²`Approval date Year Trend`,
# # ³`Type of Financing Source`, ⁴`Approved Amount_UA Equivalent`, ⁵`Approved Amount_USDEquivalent`, ⁶`Project ID`,
# # ⁷`Employe Name`, ⁸`Financing Source`, ⁹`Financial Access Category`, ˟`Country Classifaction`, ˟`Country Name`, …
# # ℹ Use `colnames()` to see all variable names
With this sample data, you might want to add a filter for a minimum-N, since some columns here only have 2 or 3 non-NA
values which will be heavily biased on outlier-removal. A suggested edit:
outlier_median <- function(z, min_n = 5) {
if (sum(!is.na(z)) < min_n) return(z)
stats <- stats::fivenum(z, na.rm = TRUE)
z[!between(z, stats[2], stats[4])] <- stats[3]
z
}