I have some data which looks like:
# A tibble: 52 × 3
provincia mean_price number_properties
<chr> <dbl> <int>
1 A Coruña 179833. 2508
2 Albacete 148584. 1311
3 Alicante 418148. 22676
4 Almería 142338. 3902
5 Araba - Álava 243705. 786
I am trying to compute the ntiles of the data which I can do.
df %>%
mutate(
bins = ntile(mean_price, 5)
)
So, I want to bin the data into 5 groups. However, I also would like to extract the cutoff points that were used to create each bin. I know the quantiles function would get me what I wanted but I have to pass it 52 as an input parameter (number of observations I have)- However, I would like to have just 5 groups.
Code:
x = df %>%
mutate(
bins = ntile(mean_price, 52),
bins_cutpoints = quantile(mean_price, probs = seq(0, 1, length.out = 52), na.rm = TRUE)
)
How can I get the values for the cutoffs from the mean_price column for each ntile?
Data:
df <- structure(list(provincia = c("A Coruña", "Albacete", "Alicante",
"Almería", "Araba - Álava", "Asturias", "Badajoz", "Barcelona",
"Bizkaia", "Burgos", "Cantabria", "Castellón", "Ceuta", "Ciudad Real",
"Cuenca", "Cáceres", "Cádiz", "Córdoba", "Gipuzkoa", "Girona",
"Granada", "Guadalajara", "Huelva", "Huesca", "Illes Balears",
"Jaén", "La Rioja", "Las Palmas", "León", "Lleida", "Lugo",
"Madrid", "Melilla", "Murcia", "Málaga", "Navarra", "Ourense",
"Palencia", "Pontevedra", "Salamanca", "Santa Cruz de Tenerife",
"Segovia", "Sevilla", "Soria", "Tarragona", "Teruel", "Toledo",
"Valencia", "Valladolid", "Zamora", "Zaragoza", "Ávila"), mean_price = c(179833.167862839,
148583.87109077, 418148.151437643, 142337.792926704, 243704.750636132,
149179.732438607, 147802.894486692, 462813.775190776, 286461.604484305,
134635.540239044, 201214.437926878, 139819.15323646, 204318.181818182,
113856.401715511, 116992.998609179, 143137.441025641, 251270.65520481,
166586.746650426, 340663.440746753, 365733.360551724, 219325.194605466,
173221.569037657, 142430.992912371, 141754.603535354, 749037.540922619,
131396.292428198, 160838.235963581, 656553.777123633, 140312.919506463,
160485.155614973, 129787.667711599, 433665.652781242, 234306.12244898,
188635.237652749, 687127.622056842, 197388.050991501, 137829.640394089,
123450.853813559, 230344.604904632, 159999.046077894, 328912.419004007,
184829.536144578, 180692.511036468, 155759.235955056, 193651.956693536,
128909.006756757, 132623.629553967, 177825.529404212, 165240.297002725,
116820.125531915, 164934.932635983, 124977.724215247), number_properties = c(2508L,
1311L, 22676L, 3902L, 786L, 3502L, 2104L, 35906L, 3345L, 1255L,
2489L, 4542L, 44L, 1399L, 719L, 1170L, 5322L, 5747L, 1232L, 7250L,
16906L, 956L, 1552L, 396L, 5376L, 1915L, 1318L, 2378L, 1702L,
1870L, 638L, 16036L, 49L, 7856L, 15587L, 706L, 1421L, 472L, 2569L,
1823L, 3494L, 498L, 6252L, 89L, 6327L, 296L, 2937L, 15576L, 1468L,
470L, 2390L, 446L)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-52L))
>Solution :
We can use quantile output within cut as mentioned in the comments
library(dplyr)
df %>%
mutate(bins = ntile(mean_price, 52),
bins_cutpoints = quantile(mean_price,
probs = seq(0, 1, length.out = 52), na.rm = TRUE),
cut_bins = cut(mean_price, breaks = bins_cutpoints) )
-output
# A tibble: 52 × 6
provincia mean_price number_properties bins bins_cutpoints cut_bins
<chr> <dbl> <int> <int> <dbl> <fct>
1 A Coruña 179833. 2508 30 113856. (1.778e+05,1.798e+05]
2 Albacete 148584. 1311 19 116820. (1.478e+05,1.486e+05]
3 Alicante 418148. 22676 47 116993. (3.657e+05,4.181e+05]
4 Almería 142338. 3902 15 123451. (1.418e+05,1.423e+05]
5 Araba - Álava 243705. 786 41 124978. (2.343e+05,2.437e+05]
6 Asturias 149180. 3502 20 128909. (1.486e+05,1.492e+05]
7 Badajoz 147803. 2104 18 129788. (1.431e+05,1.478e+05]
8 Barcelona 462814. 35906 49 131396. (4.337e+05,4.628e+05]
9 Bizkaia 286462. 3345 43 132624. (2.513e+05,2.865e+05]
10 Burgos 134636. 1255 10 134636. (1.326e+05,1.346e+05]
# … with 42 more rows