Getting error in the calculation in pandas

June 1, 2023

I am getting calculation errors when I use groupby with an aggregate function inside a loop. Outside the loop, however, everything works and I get the correct results.

import pandas as pd
import numpy as np

# Example DataFrame, written one record per row:
# two grouping keys (GroupA, GroupB) followed by three numeric measures.
df = pd.DataFrame(
    [
        ('A', 'X', 10, 1, 100),
        ('A', 'Y', 20, 2, 200),
        ('B', 'Z', 30, 3, 300),
        ('B', 'X', 40, 4, 400),
        ('B', 'Y', 50, 5, 500),
        ('C', 'X', 60, 6, 600),
    ],
    columns=['GroupA', 'GroupB', 'POP', 'LF', 'WRK'],
)

# Grouping configurations to iterate over; the first entry means "no grouping".
groupby_cols = [
    [],
    ['GroupA'],
    ['GroupB'],
    ['GroupA', 'GroupB'],
]

def test(df, gby):
    """Aggregate CMULT-weighted sums for one grouping configuration and print them.

    ``gby`` is an index into the module-level ``groupby_cols`` list. The frame
    is grouped by the selected columns and, per group, ``POP``, ``LF`` and
    ``WRK`` are each multiplied by ``df['CMULT']`` and summed into ``pophat``,
    ``lfhat`` and ``wrkhat``. A ``CMULT`` column is then attached to the result,
    which is printed together with the grouping columns. Nothing is returned.

    NOTE(review): the weight is applied inside the aggregation. If the caller
    has already multiplied ``POP``/``LF``/``WRK`` by ``CMULT`` (as the driver
    loop in this script does on its first iteration), the weight ends up
    applied twice.
    """
    keys = groupby_cols[gby]

    def weighted_sum(values):
        # Multiply one group's values by the frame's CMULT column, then sum.
        return np.sum(values * df['CMULT'])

    grouped = df.groupby(keys)
    w2 = grouped.agg(
        pophat=('POP', weighted_sum),
        lfhat=('LF', weighted_sum),
        wrkhat=('WRK', weighted_sum),
    ).reset_index()

    # A single grouping key is mapped through the weight table directly;
    # any other configuration looks the weight up from the GroupA column.
    weights = {'A': 0.5, 'B': 0.3, 'C': 0.2}
    label_col = keys[0] if len(keys) == 1 else 'GroupA'
    w2['CMULT'] = w2[label_col].map(weights)

    print(w2, keys)

for i, cols in enumerate(groupby_cols):
    if i != 0:
        # Every non-empty grouping configuration is delegated to test().
        test(df, i)
        continue

    # First pass (empty grouping): weight the measures in place. The
    # overwritten columns remain on df for all later iterations.
    df['CMULT'] = df['GroupA'].map({'A': 0.5, 'B': 0.3, 'C': 0.2})
    for measure in ('POP', 'LF', 'WRK'):
        df[measure] = pd.to_numeric(df[measure]) * df['CMULT']
    df['no_sam'] = df.shape[0]

    # Whole-frame totals, reshaped into a single-row DataFrame.
    agg_dict = {'POP': 'sum', 'LF': 'sum', 'WRK': 'sum', 'no_sam': 'count'}
    w2 = df.agg(agg_dict).to_frame().T

    print(w2, cols)

This is the code in which I am getting the calculation errors. The results are –

POP   LF    WRK  no_sam
0  63.0  6.3  630.0     6.0 []
  GroupA  pophat  lfhat  wrkhat  CMULT
0      A     7.5   0.75    75.0    0.5
1      B    10.8   1.08   108.0    0.3
2      C     2.4   0.24    24.0    0.2 ['GroupA']
  GroupB  pophat  lfhat  wrkhat  CMULT
0      X     8.5   0.85    85.0    NaN
1      Y     9.5   0.95    95.0    NaN
2      Z     2.7   0.27    27.0    NaN ['GroupB']
  GroupA GroupB  pophat  lfhat  wrkhat  CMULT
0      A      X     2.5   0.25    25.0    0.5
1      A      Y     5.0   0.50    50.0    0.5
2      B      X     3.6   0.36    36.0    0.3
3      B      Y     4.5   0.45    45.0    0.3
4      B      Z     2.7   0.27    27.0    0.3
5      C      X     2.4   0.24    24.0    0.2 ['GroupA', 'GroupB']

But outside the loop the results are as follows (you can verify this by changing the index used to select from groupby_cols) –

import pandas as pd
import numpy as np

# Example DataFrame
df = pd.DataFrame({
    'GroupA': ['A', 'A', 'B', 'B', 'B', 'C'],
    'GroupB': ['X', 'Y', 'Z', 'X', 'Y', 'X'],
    'POP': [10, 20, 30, 40, 50, 60],
    'LF': [1, 2, 3, 4, 5, 6],
    'WRK': [100, 200, 300, 400, 500, 600]
})

groupby_cols = [[], ['GroupA'], ['GroupB'], ['GroupA', 'GroupB']]

# Index of the grouping configuration to aggregate by. Use 1, 2 or 3:
# entry 0 is the empty list, and groupby() raises when given no keys.
i = 3

# Per-row weight looked up from GroupA. It does not depend on the grouping,
# so it is computed directly instead of through a groupby/transform.
df['CMULT'] = df['GroupA'].map({'A': 0.5, 'B': 0.3, 'C': 0.2})

# Perform groupby and aggregation based on the current groupby configuration.
# Each lambda receives one group's values for a single column, multiplies
# them by the CMULT column and sums the products. The measures here are the
# raw (unweighted) values, so the weight is applied exactly once.
w2 = df.groupby(groupby_cols[i]).agg(
    pophat=('POP', lambda x: np.sum(x * df['CMULT'])),
    lfhat=('LF', lambda x: np.sum(x * df['CMULT'])),
    wrkhat=('WRK', lambda x: np.sum(x * df['CMULT']))
).reset_index()

print(w2)

         POP       LF       WRK         no_sam
 0       63.0      6.3      630.0       6.0 

  GroupA  pophat  lfhat  wrkhat
0      A    15.0    1.5   150.0
1      B    36.0    3.6   360.0
2      C    12.0    1.2   120.0

  GroupB  pophat  lfhat  wrkhat
0      X    29.0    2.9   290.0
1      Y    25.0    2.5   250.0
2      Z     9.0    0.9    90.0

  GroupA GroupB  pophat  lfhat  wrkhat
0      A      X     5.0    0.5    50.0
1      A      Y    10.0    1.0   100.0
2      B      X    12.0    1.2   120.0
3      B      Y    15.0    1.5   150.0
4      B      Z     9.0    0.9    90.0
5      C      X    12.0    1.2   120.0

So, am I misunderstanding how groupby and aggregation work, given that the code does not work inside the loop, or do these functions behave differently inside a loop? I am puzzled about how this is possible.

>Solution :

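IIUC, multiply the columns by CMULT once, before the loop, and then aggregate with sum only. In your loop the columns are already multiplied by CMULT when i == 0, and the lambda inside test multiplies them by CMULT again, so the weight is applied twice: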

# Grouping configurations to iterate over; the first entry means "no grouping".
groupby_cols = [
    [],
    ['GroupA'],
    ['GroupB'],
    ['GroupA', 'GroupB'],
]

def test(df, gby):
    """Sum the measures for one grouping configuration and print the result.

    Args:
        df: Frame with the grouping columns and the ``POP``, ``LF`` and
            ``WRK`` measures. The calling code multiplies those measures by
            ``CMULT`` beforehand, so a plain sum is all that is needed here.
        gby: Index into the module-level ``groupby_cols`` list selecting the
            grouping columns. The selected list is passed straight to
            ``groupby``; the calling loop handles the empty configuration
            itself and never passes it here.

    Returns:
        The aggregated frame, one row per group, with the grouping columns,
        ``pophat``, ``lfhat``, ``wrkhat`` and ``CMULT``. The same frame is
        also printed together with the grouping columns.
    """
    groupby_columns = groupby_cols[gby]
    w2 = df.groupby(groupby_columns).agg(
        pophat=('POP', 'sum'),
        lfhat=('LF', 'sum'),
        wrkhat=('WRK', 'sum'),
    ).reset_index()

    # CMULT is defined per GroupA label, so it can only be looked up when
    # GroupA is one of the grouping keys. Otherwise the column is filled with
    # NaN rather than pushing unrelated labels through the GroupA table, or
    # failing with a KeyError on a multi-key grouping that lacks GroupA.
    if 'GroupA' in groupby_columns:
        w2['CMULT'] = w2['GroupA'].map({'A': 0.5, 'B': 0.3, 'C': 0.2})
    else:
        w2['CMULT'] = float('nan')

    print(w2, groupby_columns)
    return w2

# Weight the measures once, up front, so that every grouping configuration
# below only has to sum them.
df['CMULT'] = df['GroupA'].map({'A': 0.5, 'B': 0.3, 'C': 0.2})
for measure in ('POP', 'LF', 'WRK'):
    df[measure] = pd.to_numeric(df[measure]) * df['CMULT']
df['no_sam'] = df.shape[0]

for i, cols in enumerate(groupby_cols):
    if i != 0:
        # Every non-empty grouping configuration is delegated to test().
        test(df, i)
        continue

    # Empty grouping: whole-frame totals, reshaped into a single-row DataFrame.
    agg_dict = {'POP': 'sum', 'LF': 'sum', 'WRK': 'sum', 'no_sam': 'count'}
    w2 = df.agg(agg_dict).to_frame().T

    print(w2, cols)