Getting cutoff values for each ntile group

February 17, 2023

I have some data which looks like:

# A tibble: 52 × 3
   provincia     mean_price number_properties
   <chr>              <dbl>             <int>
 1 A Coruña         179833.              2508
 2 Albacete         148584.              1311
 3 Alicante         418148.             22676
 4 Almería          142338.              3902
 5 Araba - Álava    243705.               786

I am trying to compute the ntiles of the data which I can do.

df %>% 
  mutate(
    bins = ntile(mean_price, 5)
  )

So, I want to bin the data into 5 groups. However, I also would like to extract the cutoff points that were used to create each bin. I know the quantiles function would get me what I wanted but I have to pass it 52 as an input parameter (number of observations I have)- However, I would like to have just 5 groups.

Code:

x = df %>% 
  mutate(
    bins = ntile(mean_price, 52),
    bins_cutpoints = quantile(mean_price, probs = seq(0, 1, length.out = 52), na.rm = TRUE)
  )

How can I get the values for the cutoffs from the mean_price column for each ntile?

Data:

df <- structure(list(provincia = c("A Coruña", "Albacete", "Alicante", 
"Almería", "Araba - Álava", "Asturias", "Badajoz", "Barcelona", 
"Bizkaia", "Burgos", "Cantabria", "Castellón", "Ceuta", "Ciudad Real", 
"Cuenca", "Cáceres", "Cádiz", "Córdoba", "Gipuzkoa", "Girona", 
"Granada", "Guadalajara", "Huelva", "Huesca", "Illes Balears", 
"Jaén", "La Rioja", "Las Palmas", "León", "Lleida", "Lugo", 
"Madrid", "Melilla", "Murcia", "Málaga", "Navarra", "Ourense", 
"Palencia", "Pontevedra", "Salamanca", "Santa Cruz de Tenerife", 
"Segovia", "Sevilla", "Soria", "Tarragona", "Teruel", "Toledo", 
"Valencia", "Valladolid", "Zamora", "Zaragoza", "Ávila"), mean_price = c(179833.167862839, 
148583.87109077, 418148.151437643, 142337.792926704, 243704.750636132, 
149179.732438607, 147802.894486692, 462813.775190776, 286461.604484305, 
134635.540239044, 201214.437926878, 139819.15323646, 204318.181818182, 
113856.401715511, 116992.998609179, 143137.441025641, 251270.65520481, 
166586.746650426, 340663.440746753, 365733.360551724, 219325.194605466, 
173221.569037657, 142430.992912371, 141754.603535354, 749037.540922619, 
131396.292428198, 160838.235963581, 656553.777123633, 140312.919506463, 
160485.155614973, 129787.667711599, 433665.652781242, 234306.12244898, 
188635.237652749, 687127.622056842, 197388.050991501, 137829.640394089, 
123450.853813559, 230344.604904632, 159999.046077894, 328912.419004007, 
184829.536144578, 180692.511036468, 155759.235955056, 193651.956693536, 
128909.006756757, 132623.629553967, 177825.529404212, 165240.297002725, 
116820.125531915, 164934.932635983, 124977.724215247), number_properties = c(2508L, 
1311L, 22676L, 3902L, 786L, 3502L, 2104L, 35906L, 3345L, 1255L, 
2489L, 4542L, 44L, 1399L, 719L, 1170L, 5322L, 5747L, 1232L, 7250L, 
16906L, 956L, 1552L, 396L, 5376L, 1915L, 1318L, 2378L, 1702L, 
1870L, 638L, 16036L, 49L, 7856L, 15587L, 706L, 1421L, 472L, 2569L, 
1823L, 3494L, 498L, 6252L, 89L, 6327L, 296L, 2937L, 15576L, 1468L, 
470L, 2390L, 446L)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-52L))

>Solution :

We can use quantile output within cut as mentioned in the comments

library(dplyr)
df %>%   
  mutate(bins = ntile(mean_price, 52),    
    bins_cutpoints = quantile(mean_price, 
    probs = seq(0, 1, length.out = 52), na.rm = TRUE), 
  cut_bins = cut(mean_price, breaks = bins_cutpoints)   )

-output

# A tibble: 52 × 6
   provincia     mean_price number_properties  bins bins_cutpoints cut_bins             
   <chr>              <dbl>             <int> <int>          <dbl> <fct>                
 1 A Coruña         179833.              2508    30        113856. (1.778e+05,1.798e+05]
 2 Albacete         148584.              1311    19        116820. (1.478e+05,1.486e+05]
 3 Alicante         418148.             22676    47        116993. (3.657e+05,4.181e+05]
 4 Almería          142338.              3902    15        123451. (1.418e+05,1.423e+05]
 5 Araba - Álava    243705.               786    41        124978. (2.343e+05,2.437e+05]
 6 Asturias         149180.              3502    20        128909. (1.486e+05,1.492e+05]
 7 Badajoz          147803.              2104    18        129788. (1.431e+05,1.478e+05]
 8 Barcelona        462814.             35906    49        131396. (4.337e+05,4.628e+05]
 9 Bizkaia          286462.              3345    43        132624. (2.513e+05,2.865e+05]
10 Burgos           134636.              1255    10        134636. (1.326e+05,1.346e+05]
# … with 42 more rows