split a dataframe into equal parts and store the results

Advertisements

I’m relatively new to R.
I have a large dataframe which I would like to split into multiple smaller dataframes, each one centred on a row where a flag column (`entry`) equals 1. Here is a sample of the data:

structure(list(country = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = c("Bangladesh", "Barbados", "Benin", "Burundi", 
"Cameroon", "Chile", "Cyprus", "Ecuador", "Equatorial Guinea", 
"Gabon", "Ghana", "Guatemala", "Guinea", "Guyana", "Haiti", "India", 
"Jordan", "Lebanon", "Liberia", "Madagascar", "Mali", "Mexico", 
"Morocco", "Mozambique", "Nepal", "Nicaragua", "Niger", "Oman", 
"Pakistan", "Panama", "Peru", "Rwanda", "Senegal", "Seychelles", 
"Sierra Leone", "Singapore", "Sri Lanka", "Sudan", "Togo", "Tunisia", 
"Turkey", "Uganda", "Zambia"), class = c("pseries", "factor")), 
    date = structure(12:36, .Label = c("1965", "1966", "1967", 
    "1968", "1969", "1970", "1971", "1972", "1973", "1974", "1975", 
    "1976", "1977", "1978", "1979", "1980", "1981", "1982", "1983", 
    "1984", "1985", "1986", "1987", "1988", "1989", "1990", "1991", 
    "1992", "1993", "1994", "1995", "1996", "1997", "1998", "1999", 
    "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", 
    "2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", 
    "2016", "2017", "2018"), class = c("pseries", "factor")), 
    oda_gdp = c(0.15080885502447, 0.1744123099839, 0.199176897551553, 
    0.193616875061556, 0.186942991013889, 0.164744452026834, 
    0.192609744294439, 0.13752013069625, 0.156186721262664, 0.137192335225767, 
    0.131167382827501, 0.139945790928319, 0.112553104508006, 
    0.1172188903714, 0.120991133274215, 0.0940867931618562, 0.0857724612850372, 
    0.0653099752359248, 0.0714189688493898, 0.0470115150264598, 
    0.0446068588203229, 0.0414522297087586, 0.0450866627292532, 
    0.0435203084091358, 0.0404623996092304), entry = c(0, 0, 
    1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0)), row.names = c(NA, 25L), class = "data.frame")

I would like to split this into multiple dataframes around the rows where entry == 1. More concretely, each new dataframe should contain the 2 rows before the row where entry == 1, that row itself, and the 7 rows after it (10 rows in total).

Output should look like this

df1=structure(list(country = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L), .Label = c("Bangladesh", "Barbados", "Benin", 
"Burundi", "Cameroon", "Chile", "Cyprus", "Ecuador", "Equatorial Guinea", 
"Gabon", "Ghana", "Guatemala", "Guinea", "Guyana", "Haiti", "India", 
"Jordan", "Lebanon", "Liberia", "Madagascar", "Mali", "Mexico", 
"Morocco", "Mozambique", "Nepal", "Nicaragua", "Niger", "Oman", 
"Pakistan", "Panama", "Peru", "Rwanda", "Senegal", "Seychelles", 
"Sierra Leone", "Singapore", "Sri Lanka", "Sudan", "Togo", "Tunisia", 
"Turkey", "Uganda", "Zambia"), class = c("pseries", "factor")), 
    date = structure(12:21, .Label = c("1965", "1966", "1967", 
    "1968", "1969", "1970", "1971", "1972", "1973", "1974", "1975", 
    "1976", "1977", "1978", "1979", "1980", "1981", "1982", "1983", 
    "1984", "1985", "1986", "1987", "1988", "1989", "1990", "1991", 
    "1992", "1993", "1994", "1995", "1996", "1997", "1998", "1999", 
    "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", 
    "2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", 
    "2016", "2017", "2018"), class = c("pseries", "factor")), 
    oda_gdp = c(0.15080885502447, 0.1744123099839, 0.199176897551553, 
    0.193616875061556, 0.186942991013889, 0.164744452026834, 
    0.192609744294439, 0.13752013069625, 0.156186721262664, 0.137192335225767
    ), entry = c(0, 0, 1, 0, 0, 0, 1, 0, 0, 0)), row.names = c(NA, 
10L), class = "data.frame")

The dataframe above (df1) is the expected result for the first row where entry == 1.

df2=structure(list(country = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L), .Label = c("Bangladesh", "Barbados", "Benin", 
"Burundi", "Cameroon", "Chile", "Cyprus", "Ecuador", "Equatorial Guinea", 
"Gabon", "Ghana", "Guatemala", "Guinea", "Guyana", "Haiti", "India", 
"Jordan", "Lebanon", "Liberia", "Madagascar", "Mali", "Mexico", 
"Morocco", "Mozambique", "Nepal", "Nicaragua", "Niger", "Oman", 
"Pakistan", "Panama", "Peru", "Rwanda", "Senegal", "Seychelles", 
"Sierra Leone", "Singapore", "Sri Lanka", "Sudan", "Togo", "Tunisia", 
"Turkey", "Uganda", "Zambia"), class = c("pseries", "factor")), 
    date = structure(16:25, .Label = c("1965", "1966", "1967", 
    "1968", "1969", "1970", "1971", "1972", "1973", "1974", "1975", 
    "1976", "1977", "1978", "1979", "1980", "1981", "1982", "1983", 
    "1984", "1985", "1986", "1987", "1988", "1989", "1990", "1991", 
    "1992", "1993", "1994", "1995", "1996", "1997", "1998", "1999", 
    "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", 
    "2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", 
    "2016", "2017", "2018"), class = c("pseries", "factor")), 
    oda_gdp = c(0.186942991013889, 0.164744452026834, 0.192609744294439, 
    0.13752013069625, 0.156186721262664, 0.137192335225767, 0.131167382827501, 
    0.139945790928319, 0.112553104508006, 0.1172188903714), entry = c(0, 
    0, 1, 0, 0, 0, 0, 0, 0, 0)), row.names = 5:14, class = "data.frame")

The dataframe above (df2) is the expected result for the second row where entry == 1.

I’m guessing split might be the way to go, but I can’t figure out a way to include the conditions.
Notice that all newly created dataframes should be the same size.
Any help is much appreciated!

>Solution :

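I don’t think split is the right tool here, because split assigns each row to exactly one group, whereas your windows can overlap. Instead, find the row numbers where entry == 1 with which(), and for each such row i take rows i - 2 through i + 7. The result is a list with one 10-row dataframe per match: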

# Row positions of every event (entry == 1); which() also drops NA entries.
hits <- which(df$entry == 1)

# Build one 10-row window per event: 2 rows before it through 7 rows after it.
# Positions that fall outside 1..nrow(df) are replaced by NA, so a window that
# touches either edge of the data is padded with all-NA rows and every result
# keeps the same size. Without this, an index of 0 would be silently dropped
# (giving a 9-row window) and a negative index mixed with positive ones would
# raise an error.
lapply(hits, function(i) {
  idx <- (i - 2):(i + 7)
  idx[idx < 1 | idx > nrow(df)] <- NA
  df[idx, , drop = FALSE]
})
#> [[1]]
#>       country date   oda_gdp entry
#> 1  Bangladesh 1976 0.1508089     0
#> 2  Bangladesh 1977 0.1744123     0
#> 3  Bangladesh 1978 0.1991769     1
#> 4  Bangladesh 1979 0.1936169     0
#> 5  Bangladesh 1980 0.1869430     0
#> 6  Bangladesh 1981 0.1647445     0
#> 7  Bangladesh 1982 0.1926097     1
#> 8  Bangladesh 1983 0.1375201     0
#> 9  Bangladesh 1984 0.1561867     0
#> 10 Bangladesh 1985 0.1371923     0
#> 
#> [[2]]
#>       country date   oda_gdp entry
#> 5  Bangladesh 1980 0.1869430     0
#> 6  Bangladesh 1981 0.1647445     0
#> 7  Bangladesh 1982 0.1926097     1
#> 8  Bangladesh 1983 0.1375201     0
#> 9  Bangladesh 1984 0.1561867     0
#> 10 Bangladesh 1985 0.1371923     0
#> 11 Bangladesh 1986 0.1311674     0
#> 12 Bangladesh 1987 0.1399458     0
#> 13 Bangladesh 1988 0.1125531     0
#> 14 Bangladesh 1989 0.1172189     0
#> 
#> [[3]]
#>       country date    oda_gdp entry
#> 13 Bangladesh 1988 0.11255310     0
#> 14 Bangladesh 1989 0.11721889     0
#> 15 Bangladesh 1990 0.12099113     1
#> 16 Bangladesh 1991 0.09408679     0
#> 17 Bangladesh 1992 0.08577246     0
#> 18 Bangladesh 1993 0.06530998     0
#> 19 Bangladesh 1994 0.07141897     0
#> 20 Bangladesh 1995 0.04701152     0
#> 21 Bangladesh 1996 0.04460686     0
#> 22 Bangladesh 1997 0.04145223     0

Created on 2022-02-08 by the reprex package (v2.0.1)

Dev solutions

Solutions for development problems

split a dataframe into equal parts and store the results

>Solution :

Leave a ReplyCancel reply

>Solution :

Share this:

Leave a ReplyCancel reply

Discover more from Dev solutions