I want to merge/concatenate two dataframes tcia and clin. In contrast to the tcia dataframe, the clin dataframe has a substring at the end of the column names (i.e., the 3rd "-" followed by subsequent letters). The dataframes should be combined irrespective of the substring but the final dataframe should have this substring.
My code does the job but I’m hoping for a more robust/concise way to do it.
Code:
clin_df = clin.copy()
clin_df.columns = clin_df.columns.str.rsplit('-', n=1).str.get(0)
df = pd.concat([clin_df, tcia], axis=0)
df.columns = clin.columns
Input:
clin
pd.DataFrame({'TCGA-2K-A9WE-01': {'admin.batch_number': '398.45.0',
'age': '53',
'days_to_initial_pathologic_diagnosis': '0',
'days_to_last_follow_up': '207.0',
'ethnicity': 'not hispanic or latino'},
'TCGA-2Z-A9J1-01': {'admin.batch_number': '398.45.0',
'age': '71',
'days_to_initial_pathologic_diagnosis': '0',
'days_to_last_follow_up': '2298.0',
'ethnicity': 'not hispanic or latino'},
'TCGA-2Z-A9J3-01': {'admin.batch_number': '398.45.0',
'age': '67',
'days_to_initial_pathologic_diagnosis': '0',
'days_to_last_follow_up': nan,
'ethnicity': 'not hispanic or latino'},
'TCGA-2Z-A9J6-01': {'admin.batch_number': '398.45.0',
'age': '60',
'days_to_initial_pathologic_diagnosis': '0',
'days_to_last_follow_up': '1731.0',
'ethnicity': 'not hispanic or latino'},
'TCGA-2Z-A9J7-01': {'admin.batch_number': '398.45.0',
'age': '63',
'days_to_initial_pathologic_diagnosis': '0',
'days_to_last_follow_up': nan,
'ethnicity': 'not hispanic or latino'}})
tcia
pd.DataFrame({'TCGA-2K-A9WE': {'ips_ctla4_neg_pd1_neg': 8.0,
'ips_ctla4_neg_pd1_pos': 7.0,
'ips_ctla4_pos_pd1_neg': 7.0,
'ips_ctla4_pos_pd1_pos': 6.0,
'patient_uuid': '73292c19-d6a8-4bc4-97bc-ccce54f264f8'},
'TCGA-2Z-A9J1': {'ips_ctla4_neg_pd1_neg': 9.0,
'ips_ctla4_neg_pd1_pos': 8.0,
'ips_ctla4_pos_pd1_neg': 9.0,
'ips_ctla4_pos_pd1_pos': 7.0,
'patient_uuid': '851a1157-e460-4794-8534-2eb6f0ae7468'},
'TCGA-2Z-A9J3': {'ips_ctla4_neg_pd1_neg': 9.0,
'ips_ctla4_neg_pd1_pos': 7.0,
'ips_ctla4_pos_pd1_neg': 8.0,
'ips_ctla4_pos_pd1_pos': 6.0,
'patient_uuid': '5195c9ac-b649-49f8-8750-f9a4787e8e52'},
'TCGA-2Z-A9J6': {'ips_ctla4_neg_pd1_neg': 9.0,
'ips_ctla4_neg_pd1_pos': 7.0,
'ips_ctla4_pos_pd1_neg': 8.0,
'ips_ctla4_pos_pd1_pos': 7.0,
'patient_uuid': '4a540448-f106-4b0e-9038-9f7ccefc785b'},
'TCGA-2Z-A9J7': {'ips_ctla4_neg_pd1_neg': 7.0,
'ips_ctla4_neg_pd1_pos': 5.0,
'ips_ctla4_pos_pd1_neg': 6.0,
'ips_ctla4_pos_pd1_pos': 5.0,
'patient_uuid': 'd66c9261-6c0c-44b0-92fa-a43757f34cb2'}})
Desired output:
pd.DataFrame({'TCGA-2K-A9WE': {'admin.batch_number': '398.45.0',
'age': '53',
'days_to_initial_pathologic_diagnosis': '0',
'days_to_last_follow_up': '207.0',
'ethnicity': 'not hispanic or latino',
'ips_ctla4_neg_pd1_neg': 8.0,
'ips_ctla4_neg_pd1_pos': 7.0,
'ips_ctla4_pos_pd1_neg': 7.0,
'ips_ctla4_pos_pd1_pos': 6.0,
'patient_uuid': '73292c19-d6a8-4bc4-97bc-ccce54f264f8'},
'TCGA-2Z-A9J1': {'admin.batch_number': '398.45.0',
'age': '71',
'days_to_initial_pathologic_diagnosis': '0',
'days_to_last_follow_up': '2298.0',
'ethnicity': 'not hispanic or latino',
'ips_ctla4_neg_pd1_neg': 9.0,
'ips_ctla4_neg_pd1_pos': 8.0,
'ips_ctla4_pos_pd1_neg': 9.0,
'ips_ctla4_pos_pd1_pos': 7.0,
'patient_uuid': '851a1157-e460-4794-8534-2eb6f0ae7468'},
'TCGA-2Z-A9J3': {'admin.batch_number': '398.45.0',
'age': '67',
'days_to_initial_pathologic_diagnosis': '0',
'days_to_last_follow_up': nan,
'ethnicity': 'not hispanic or latino',
'ips_ctla4_neg_pd1_neg': 9.0,
'ips_ctla4_neg_pd1_pos': 7.0,
'ips_ctla4_pos_pd1_neg': 8.0,
'ips_ctla4_pos_pd1_pos': 6.0,
'patient_uuid': '5195c9ac-b649-49f8-8750-f9a4787e8e52'},
'TCGA-2Z-A9J6': {'admin.batch_number': '398.45.0',
'age': '60',
'days_to_initial_pathologic_diagnosis': '0',
'days_to_last_follow_up': '1731.0',
'ethnicity': 'not hispanic or latino',
'ips_ctla4_neg_pd1_neg': 9.0,
'ips_ctla4_neg_pd1_pos': 7.0,
'ips_ctla4_pos_pd1_neg': 8.0,
'ips_ctla4_pos_pd1_pos': 7.0,
'patient_uuid': '4a540448-f106-4b0e-9038-9f7ccefc785b'},
'TCGA-2Z-A9J7': {'admin.batch_number': '398.45.0',
'age': '63',
'days_to_initial_pathologic_diagnosis': '0',
'days_to_last_follow_up': nan,
'ethnicity': 'not hispanic or latino',
'ips_ctla4_neg_pd1_neg': 7.0,
'ips_ctla4_neg_pd1_pos': 5.0,
'ips_ctla4_pos_pd1_neg': 6.0,
'ips_ctla4_pos_pd1_pos': 5.0,
'patient_uuid': 'd66c9261-6c0c-44b0-92fa-a43757f34cb2'}})
>Solution :
With df.set_axis method (to assign a new column index on-the-fly):
df = pd.concat([clin.set_axis(clin.columns.str.rpartition('-', expand=False).str[0], axis=1),
tcia]).set_axis(clin.columns, axis=1)
print(df)
TCGA-2K-A9WE-01 TCGA-2Z-A9J1-01 TCGA-2Z-A9J3-01 TCGA-2Z-A9J6-01 TCGA-2Z-A9J7-01
admin.batch_number 398.45.0 398.45.0 398.45.0 398.45.0 398.45.0
age 53 71 67 60 63
days_to_initial_pathologic_diagnosis 0 0 0 0 0
days_to_last_follow_up 207.0 2298.0 NaN 1731.0 NaN
ethnicity not hispanic or latino not hispanic or latino not hispanic or latino not hispanic or latino not hispanic or latino
ips_ctla4_neg_pd1_neg 8.0 9.0 9.0 9.0 7.0
ips_ctla4_neg_pd1_pos 7.0 8.0 7.0 7.0 5.0
ips_ctla4_pos_pd1_neg 7.0 9.0 8.0 8.0 6.0
ips_ctla4_pos_pd1_pos 6.0 7.0 6.0 7.0 5.0
patient_uuid 73292c19-d6a8-4bc4-97bc-ccce54f264f8 851a1157-e460-4794-8534-2eb6f0ae7468 5195c9ac-b649-49f8-8750-f9a4787e8e52 4a540448-f106-4b0e-9038-9f7ccefc785b d66c9261-6c0c-44b0-92fa-a43757f34cb2