Repalce all outliers of numeric columns with column median at once

I have the data below with both categorical and numeric variables I want to automatically replace all my outliers from numeric variables with this method below but I need to do it in all numeric columns automatically and not one by one

df$v3[df$v3 %in% boxplot(df)$out] <- median(df$v3)

df<-structure(list(`Project Title` = c("Cameroun - Projet d’aide humanitaire d’urgence en faveur des réfugiés hors sites et des populations hôtes situés dans les régions de l’Adamaoua, de l’Est et du Nord\r\n(Version française uniquement)", 
"Multinational - Kenya-Tanzania Power Interconnection Project", 
"Multinational - Kenya-Tanzania Power Interconnection Project", 
"Zambia - Madison Finance Company Limited\r\n(under the Africa SME Program)", 
"Eritrea - Skills Development for Employability and Entrepreneurship Project", 
"Eritrea - Skills Development for Employability and Entrepreneurship Project", 
"Uganda - Water Supply and Sanitation Programme - Additional activities", 
"Cabo Verde - Aide d’urgence pour assister les populations affectées par l’éruption volcanique de l’île de Fogo\r\n(Version française uniquement)", 
"Kenya - Mombasa-Mariakani Highway Project", "Multinational - Africa Banking Corporation Holdings Ltd"
), `Approval date Year Trend` = structure(c(1421107200, 1424217600, 
1424217600, 1425254400, 1425340800, 1425340800, 1425427200, 1425600000, 
1426032000, 1426032000), tzone = "UTC", class = c("POSIXct", 
"POSIXt")), `Type of Financing Source` = c("Special Relief Fund (SRF)", 
"ADF", "ADF", "ADB Private Sector", "ADF", "ADF", "Global Environment Facility (GEF)", 
"Special Relief Fund (SRF)", "ADF", "ADB Private Sector"), `Approved Amount_UA Equivalent` = c(690221.630165446, 
75290000, 27500000, 2131605.31196044, 7580000, 5920000, 5947178.82036962, 
191844.478076439, 8e+07, 11842251.7331135), `Approved Amount_USDEquivalent` = c(920065.43301054, 
100361570, 36657500, 2841429.88084326, 10104140, 7891360, 7927589.3675527, 
255728.689275894, 106640000, 15785721.5602403), `Project ID` = c("P-CM-I00-001", 
"P-Z1-FA0-052", "P-Z1-FA0-086", "P-ZM-HAB-003", "P-ER-IA0-001", 
"P-ER-IA0-001", "P-UG-E00-013", "P-CV-I00-001", "P-KE-DB0-021", 
"P-Z1-HAB-046"), `Employe Name` = c("BISSAKONOU Judes", "MUGUWA Andrew", 
"MUGUWA Andrew", "FRENCH Laura Katherine Smith", "ELAHEEBOCUS Bibi Nawsheen", 
"ELAHEEBOCUS Bibi Nawsheen", "MBIRO Andrew", "TRAORE Rokhaya", 
"MAMMO Zerfu Tessema", "MACHARIA Julius Karuga"), `Financing Source` = c("OTHERS", 
"ADF Including TSF", "ADF Including TSF", "ADB Private", "ADF Including TSF", 
"ADF Including TSF", "OTHERS", "OTHERS", "ADF Including TSF", 
"ADB Private"), `Financial Access Category` = c("Blend Countries", 
"Multinational", "Blend Countries", "Blend Countries", "ADF Countries", 
"ADF Countries", "ADF Countries", "ADB Countries", "Blend Countries", 
"Multinational"), `Country Classifaction` = c("Non-Transition States", 
"Multinational", "Non-Transition States", "Non-Transition States", 
"Transition States", "Transition States", "Non-Transition States", 
"Non-Transition States", "Non-Transition States", "Multinational"
), `Country Name` = c("CAMEROON", "MULTINATIONAL", "KENYA", "ZAMBIA", 
"ERITREA", "ERITREA", "UGANDA", "Cape Verde", "KENYA", "MULTINATIONAL"
), `REGION NAME` = c("Central", "Multinational", "East", "South", 
"East", "East", "East", "West", "East", "Multinational"), `Sectoral Analysis` = c("Social", 
"Power", "Power", "Finance", "Social", "Social", "WASH Sector", 
"Social", "Transport", "Finance"), `High Five Prority 1: Feed Africa` = c(NA, 
NA, NA, NA, NA, NA, 178415.364611089, NA, NA, NA), `High Five Prority 2: Light Up And Power Africa` = c(NA, 
75290000, 27500000, NA, NA, NA, 59471.7882036962, NA, NA, NA), 
    `High Five Prority 3: Industrialize Africa` = c(NA, NA, NA, 
    2131605.31196044, NA, NA, NA, NA, NA, 11842251.7331135), 
    `High Five Prority 4: Integrate Africa` = c(NA, NA, NA, NA, 
    NA, NA, NA, NA, 8e+07, NA), `High Five Prority 5: Improve Quality Of Life` = c(690221.630165446, 
    NA, NA, NA, 7580000, 5920000, 5709291.66755484, 191844.478076439, 
    NA, NA), `Financing Instrument` = c("Emergency Assistance", 
    "Investment Project", "Investment Project", "Investment Project", 
    "Investment Project", "Investment Project", "ISP and Investment Project", 
    "Emergency Assistance", "Investment Project", "Investment Project"
    ), `Infrastructure vs Sector` = c("Social", "Infrastructure", 
    "Infrastructure", "Finance", "Social", "Social", "Infrastructure", 
    "Social", "Infrastructure", "Finance"), `SECTOR DEPARTMENT` = c("AHAI", 
    "PESD", "PESD", "PIFD", "AHHD", "AHHD", "AHWS", "AHHD", "PICU", 
    "PIFD"), `Sector Complex Name` = c("AHVP", "PEVP", "PEVP", 
    "PIVP", "AHVP", "AHVP", "AHVP", "AHVP", "PIVP", "PIVP"), 
    `Operations Type` = c("Sovereign", "Sovereign", "Sovereign", 
    "Non-Sovereign", "Sovereign", "Sovereign", "Sovereign", "Sovereign", 
    "Sovereign", "Non-Sovereign"), `NUMBER OF PROJECT` = c(1, 
    1, 1, 1, 0.5, 0.5, 1, 1, 1, 1), `Amount in UA Million` = c(0.690221630165446, 
    75.29, 27.5, 2.13160531196044, 7.58, 5.92, 5.94717882036962, 
    0.191844478076439, 80, 11.8422517331135)), row.names = c(NA, 
-10L), class = c("tbl_df", "tbl", "data.frame"))

>Solution :

Try this:

outlier_median <- function(z) {
  stats <- stats::fivenum(z, na.rm = TRUE)
  z[!between(z, stats[2], stats[4])] <- stats[3]
  z
}
df %>%
  mutate(across(where(is.numeric), ~ outlier_median(.)))
# # A tibble: 10 × 25
#    Projec…¹ Approval date Year Tr…² Type …³ Appro…⁴ Appro…⁵ Proje…⁶ Emplo…⁷ Finan…⁸ Finan…⁹ Count…˟ Count…˟ REGIO…˟ Secto…˟ High …˟
#    <chr>    <dttm>                  <chr>     <dbl>   <dbl> <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>     <dbl>
#  1 "Camero… 2015-01-13 00:00:00.000 Specia…  6.76e6  9.02e6 P-CM-I… BISSAK… OTHERS  Blend … Non-Tr… CAMERO… Central Social      NA 
#  2 "Multin… 2015-02-18 00:00:00.000 ADF      6.76e6  9.02e6 P-Z1-F… MUGUWA… ADF In… Multin… Multin… MULTIN… Multin… Power       NA 
#  3 "Multin… 2015-02-18 00:00:00.000 ADF      2.75e7  3.67e7 P-Z1-F… MUGUWA… ADF In… Blend … Non-Tr… KENYA   East    Power       NA 
#  4 "Zambia… 2015-03-02 00:00:00.000 ADB Pr…  2.13e6  2.84e6 P-ZM-H… FRENCH… ADB Pr… Blend … Non-Tr… ZAMBIA  South   Finance     NA 
#  5 "Eritre… 2015-03-03 00:00:00.000 ADF      7.58e6  1.01e7 P-ER-I… ELAHEE… ADF In… ADF Co… Transi… ERITREA East    Social      NA 
#  6 "Eritre… 2015-03-03 00:00:00.000 ADF      5.92e6  7.89e6 P-ER-I… ELAHEE… ADF In… ADF Co… Transi… ERITREA East    Social      NA 
#  7 "Uganda… 2015-03-04 00:00:00.000 Global…  5.95e6  7.93e6 P-UG-E… MBIRO … OTHERS  ADF Co… Non-Tr… UGANDA  East    WASH S… 178415.
#  8 "Cabo V… 2015-03-06 00:00:00.000 Specia…  6.76e6  9.02e6 P-CV-I… TRAORE… OTHERS  ADB Co… Non-Tr… Cape V… West    Social      NA 
#  9 "Kenya … 2015-03-11 00:00:00.000 ADF      6.76e6  9.02e6 P-KE-D… MAMMO … ADF In… Blend … Non-Tr… KENYA   East    Transp…     NA 
# 10 "Multin… 2015-03-11 00:00:00.000 ADB Pr…  1.18e7  1.58e7 P-Z1-H… MACHAR… ADB Pr… Multin… Multin… MULTIN… Multin… Finance     NA 
# # … with 11 more variables: `High Five Prority 2: Light Up And Power Africa` <dbl>,
# #   `High Five Prority 3: Industrialize Africa` <dbl>, `High Five Prority 4: Integrate Africa` <dbl>,
# #   `High Five Prority 5: Improve Quality Of Life` <dbl>, `Financing Instrument` <chr>, `Infrastructure vs Sector` <chr>,
# #   `SECTOR DEPARTMENT` <chr>, `Sector Complex Name` <chr>, `Operations Type` <chr>, `NUMBER OF PROJECT` <dbl>,
# #   `Amount in UA Million` <dbl>, and abbreviated variable names ¹​`Project Title`, ²​`Approval date Year Trend`,
# #   ³​`Type of Financing Source`, ⁴​`Approved Amount_UA Equivalent`, ⁵​`Approved Amount_USDEquivalent`, ⁶​`Project ID`,
# #   ⁷​`Employe Name`, ⁸​`Financing Source`, ⁹​`Financial Access Category`, ˟​`Country Classifaction`, ˟​`Country Name`, …
# # ℹ Use `colnames()` to see all variable names

With this sample data, you might want to add a filter for a minimum-N, since some columns here only have 2 or 3 non-NA values which will be heavily biased on outlier-removal. A suggested edit:

outlier_median <- function(z, min_n = 5) {
  if (sum(!is.na(z)) < min_n) return(z)
  stats <- stats::fivenum(z, na.rm = TRUE)
  z[!between(z, stats[2], stats[4])] <- stats[3]
  z
}

Leave a Reply