I have a data frame with one column that contains a list of factors in each row.
Each row follows the same formatting:
ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.68', preferred_name='Transurethral cystoscopy (procedure)', cui='C0010707', semtypes='[diap]', trigger='[\"Cystourethroscopy\"-tx-1-\"cystourethroscopy\"-noun-0]', location='TX', pos_info='8/17', tree_codes=''). The factor value follows the factor name and an equals sign, enclosed in single quotation marks (e.g., mm='MMI').
I would like to split this column into one column per factor, i.e., the result will have the columns index, mm, score, preferred_name, cui, …, with each row containing the value given in single quotation marks. Is there an efficient way to split these lists into columns? I am attaching a sample dataset below.
Sample dataset
# Sample data, written as a literal `structure()` call (looks like dput() output).
# It builds a data frame with 37 rows (row.names = c(NA, -37L)) and two columns:
#   * an unnamed first column holding the integers 0:36, and
#   * `o3.word`, a character column with one `ConceptMMI(name='value', ...)`
#     record per row.
# Inside the R string literals, `\"` is an escaped double quote, so each record
# contains plain `"` characters within its single-quoted `trigger` value.
d <-
structure(list(0:36, o3.word = c("ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.68', preferred_name='Transurethral cystoscopy (procedure)', cui='C0010707', semtypes='[diap]', trigger='[\"Cystourethroscopy\"-tx-1-\"cystourethroscopy\"-noun-0]', location='TX', pos_info='8/17', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.50', preferred_name='0%', cui='C3842591', semtypes='[qnco]', trigger='[\"0%\"-tx-1-\"0\"-integer-0]', location='TX', pos_info='0/1', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='16.30', preferred_name='Insertion Mutation', cui='C1512796', semtypes='[genf]', trigger='[\"Insertion\"-tx-1-\"insertion\"-noun-0]', location='TX', pos_info='16/9', tree_codes='E05.393.420.601.550;G05.365.590.575;G05.558.550')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.68', preferred_name='Clinical act of insertion', cui='C0441587', semtypes='[hlca]', trigger='[\"Insertion\"-tx-1-\"insertion\"-noun-0]', location='TX', pos_info='16/9', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.68', preferred_name='Implantation procedure', cui='C0021107', semtypes='[topp]', trigger='[\"Insertion\"-tx-1-\"insertion\"-noun-0]', location='TX', pos_info='16/9', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.68', preferred_name='Insert (object)', cui='C1883719', semtypes='[ftcn]', trigger='[\"Insertion\"-tx-1-\"insertion\"-noun-0]', location='TX', pos_info='16/9', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.50', preferred_name='1+', cui='C3816745', semtypes='[fndg]', trigger='[\"1+\"-tx-1-\"1\"-integer-0]', location='TX', pos_info='0/1', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.50', preferred_name='1+ Score', cui='C2827734', semtypes='[qnco]', trigger='[\"1+\"-tx-1-\"1\"-integer-0]', location='TX', pos_info='0/1', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.50', preferred_name='1+ Score, WHO', cui='C2981698', semtypes='[clas]', trigger='[\"1+\"-tx-1-\"1\"-integer-0]', location='TX', pos_info='0/1', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.50', preferred_name='Greater than one', cui='C4280965', semtypes='[qnco]', trigger='[\">1\"-tx-1-\"1\"-integer-0]', location='TX', pos_info='0/1', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.68', preferred_name='Indwelling (qualifier value)', cui='C0439848', semtypes='[ftcn]', trigger='[\"Indwelling\"-tx-1-\"indwelling\"-adj-0]', location='TX', pos_info='15/10', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.68', preferred_name='Indwelling Device', cui='C3694424', semtypes='[medd]', trigger='[\"Indwelling\"-tx-1-\"indwelling\"-adj-0]', location='TX', pos_info='15/10', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.50', preferred_name='+2', cui='C0740116', semtypes='[qnco]', trigger='[\"+2\"-tx-1-\"2\"-integer-0]', location='TX', pos_info='0/1', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.50', preferred_name='2+', cui='C3833492', semtypes='[fndg]', trigger='[\"2+\"-tx-1-\"2\"-integer-0]', location='TX', pos_info='0/1', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.50', preferred_name='2+ Score', cui='C2827735', semtypes='[qnco]', trigger='[\"2+\"-tx-1-\"2\"-integer-0]', location='TX', pos_info='0/1', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.50', preferred_name='2+ Score, WHO', cui='C2981700', semtypes='[clas]', trigger='[\"2+\"-tx-1-\"2\"-integer-0]', location='TX', pos_info='0/1', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.50', preferred_name='PSA Level Less than Two', cui='C4526661', semtypes='[lbtr]', trigger='[\"<2\"-tx-1-\"2\"-integer-0]', location='TX', pos_info='0/1', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='9.99', preferred_name='Ureter', cui='C0041951', semtypes='[bpoc]', trigger='[\"Ureteral\"-tx-1-\"ureteral\"-adj-0]', location='TX', pos_info='17/8', tree_codes='A05.810.776')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.68', preferred_name='Ureteral Route of Drug Administration', cui='C1522613', semtypes='[ftcn]', trigger='[\"URETERAL\"-tx-1-\"ureteral\"-adj-0]', location='TX', pos_info='17/8', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.50', preferred_name='3+ Answer to Question', cui='C3838680', semtypes='[fndg]', trigger='[\"3+\"-tx-1-\"3\"-integer-0]', location='TX', pos_info='0/1', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.50', preferred_name='3+ Score', cui='C2827736', semtypes='[qnco]', trigger='[\"3+\"-tx-1-\"3\"-integer-0]', location='TX', pos_info='0/1', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.50', preferred_name='3+ Score, WHO', cui='C2981702', semtypes='[clas]', trigger='[\"3+\"-tx-1-\"3\"-integer-0]', location='TX', pos_info='0/1', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.50', preferred_name='<3 (qualifier value)', cui='C0439086', semtypes='[qnco]', trigger='[\"<3\"-tx-1-\"3\"-integer-0]', location='TX', pos_info='0/1', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='9.99', preferred_name='Stent, device', cui='C0038257', semtypes='[medd]', trigger='[\"Stent\"-tx-1-\"stent\"-noun-0]', location='TX', pos_info='20/5', tree_codes='E07.695.750')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.50', preferred_name='4+ Answer to question', cui='C3838679', semtypes='[fndg]', trigger='[\"4+\"-tx-1-\"4\"-integer-0]', location='TX', pos_info='0/1', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='17.80', preferred_name='Name', cui='C0027365', semtypes='[inpr]', trigger='[\"Name\"-tx-1-\"Name\"-noun-0]', location='TX', pos_info='0/4', tree_codes='L01.559.598.400.556')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='12.96', preferred_name='Ozone', cui='C0030106', semtypes='[chem]', trigger='[\"O3\"-tx-1-\"o3\"-noun-0]', location='TX', pos_info='6/2', tree_codes='D01.362.670.600;x.x.x.x')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='5.18', preferred_name='CDISC ADaM Derivation Type Terminology', cui='C2825527', semtypes='[inpr]', trigger='[\"DTYPE\"-tx-1-\"dtype\"-noun-0]', location='TX', pos_info='29/5', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='5.18', preferred_name='Computer Programming Object', cui='C1704861', semtypes='[cnce]', trigger='[\"Object\"-tx-1-\"object\"-noun-0]', location='TX', pos_info='36/6', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='5.18', preferred_name='Length', cui='C1444754', semtypes='[qnco]', trigger='[\"Length\"-tx-1-\"Length\"-noun-0]', location='TX', pos_info='15/6', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='5.18', preferred_name='Length of Trial', cui='C1706316', semtypes='[resa]', trigger='[\"LENGTH\"-tx-1-\"Length\"-noun-0]', location='TX', pos_info='15/6', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='5.18', preferred_name='Name (property) (qualifier value)', cui='C4522128', semtypes='[qlco]', trigger='[\"Name\"-tx-1-\"Name\"-noun-0]', location='TX', pos_info='0/4', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='5.18', preferred_name='Object Class', cui='C1518526', semtypes='[cnce]', trigger='[\"Object\"-tx-1-\"object\"-noun-0]', location='TX', pos_info='36/6', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='5.18', preferred_name='Person Name', cui='C1547383', semtypes='[inpr]', trigger='[\"Name\"-tx-1-\"Name\"-noun-0]', location='TX', pos_info='0/4', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='5.18', preferred_name='Physical object', cui='C0347997', semtypes='[phob]', trigger='[\"Object\"-tx-1-\"object\"-noun-0]', location='TX', pos_info='36/6', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.68', preferred_name='Term (lexical)', cui='C1705313', semtypes='[idcn]', trigger='[\"Word\"-tx-1-\"word\"-noun-0]', location='TX', pos_info='9/4', tree_codes='')",
"ConceptMMI(index='tmppplgwffe', mm='MMI', score='3.50', preferred_name='United States Military Commissioned Officer O3', cui='C4521399', semtypes='[clas]', trigger='[\"O3\"-tx-1-\"o3\"-noun-0]', location='TX', pos_info='6/2', tree_codes='')"
)), class = "data.frame", row.names = c(NA, -37L))
>Solution :
You could convert each entry into a list (with parse and eval) and then bind them together:
library(tidyverse)

# Turn every `ConceptMMI(name='value', ...)` record in `d$o3.word` into a
# one-row tibble and stack the rows into a single tibble (one column per field).
#
# Each record is written like a function call, so it can be parsed as R code
# and evaluated into a named list. Plain `eval(parse(text = x))` would run in
# the calling environment, where any function named in the data could be
# called. Here the evaluation scope contains exactly one binding, `ConceptMMI`
# (pointing at `base::list`), and its enclosure is `emptyenv()`, so a record
# that calls anything else fails with an error instead of being executed.
# This also makes rewriting "ConceptMMI" to "list" in the text unnecessary.
d$o3.word |>
  map(\(x) {
    fields <- eval(
      parse(text = x),
      list(ConceptMMI = base::list),
      emptyenv()
    )
    as_tibble(fields)
  }) |>
  list_rbind()
Output
# A tibble: 37 × 10
index mm score preferred_name cui semty…¹ trigger locat…² pos_i…³ tree_…⁴
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 tmppplgwffe MMI 3.68 Transurethral cystoscopy (procedure) C0010… [diap] "[\"Cy… TX 8/17 ""
2 tmppplgwffe MMI 3.50 0% C3842… [qnco] "[\"0%… TX 0/1 ""
3 tmppplgwffe MMI 16.30 Insertion Mutation C1512… [genf] "[\"In… TX 16/9 "E05.3…
4 tmppplgwffe MMI 3.68 Clinical act of insertion C0441… [hlca] "[\"In… TX 16/9 ""
5 tmppplgwffe MMI 3.68 Implantation procedure C0021… [topp] "[\"In… TX 16/9 ""
6 tmppplgwffe MMI 3.68 Insert (object) C1883… [ftcn] "[\"In… TX 16/9 ""
7 tmppplgwffe MMI 3.50 1+ C3816… [fndg] "[\"1+… TX 0/1 ""
8 tmppplgwffe MMI 3.50 1+ Score C2827… [qnco] "[\"1+… TX 0/1 ""
9 tmppplgwffe MMI 3.50 1+ Score, WHO C2981… [clas] "[\"1+… TX 0/1 ""
10 tmppplgwffe MMI 3.50 Greater than one C4280… [qnco] "[\">1… TX 0/1 ""
# … with 27 more rows, and abbreviated variable names ¹semtypes, ²location, ³pos_info, ⁴tree_codes
# ℹ Use `print(n = ...)` to see more rows