Follow

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use
Contact

Explode data in a list not separated by comma

Let’s say I have the following database:

{'docdb_family_id': {0: 569328,
  1: 574660,
  2: 1187498,
  3: 1226468,
  4: 1236571,
  5: 1239098,
  6: 1239277,
  7: 1239483,
  8: 1239622,
  9: 1239624,
  10: 1239749,
  11: 1334477,
  12: 1340405,
  13: 1340418,
  14: 1340462,
  15: 1340471,
  16: 1340485,
  17: 1340488,
  18: 1340508,
  19: 1340519,
  20: 1340541},
 'newa_cited_docdb': {0: '[  596005  4321416  5802640  6031690  6043910  8600475  8642629  9203255  9345445 10177065 10455451 13428248 22139349 22591458 24627241 24750476 26261826 26405611 27079105 27096884]',
  1: '[ 5956195 11260528 22181831 22437920 22642946 23278096 23407037 23458128 24244657 24355363 25014714 25115774 25156886 27047688 27089078 27398716]',
  2: '[ 5855196  7755392 11183886 22894980 24648618 27185399]',
  3: '[ 3573464  6279285  6294985  6542463  6981930  7427770 10325811 14970234 16878329 17935009 21811002 22329817 23543436 23907898 24456108 25283772]',
  4: '[ 2777078  2826073  5944733 10484188 11052747 14682645 15688752 22333410 22614097 22646501 22783765 22978728 23231683 24259740 24605606 24839432 25492752 27009992 27044704]',
  5: '[ 5777407 10417156 23463145 23845079 24397163 24426379 24916732 25216234 25296619 27054560 27509152]',
  6: '[ 4136523 12578497 21994155 22418792 22626616 22655464 22694825 22779403 23081767 23309829 23379411 23621952 24130698 24236071 24267003 24790872 24841797 25343500 27006578]',
  7: '[21722194 23841261 23870348 24749080 26713455 26884023 26892256 27123571]',
  8: '[ 3770167  9249538 20340153 21805004 21826650 23074051 23211424 23586695 23664858 24139881 24669345 24951262 25109266 25172355 25351735 26158421 27074633]',
  9: '[ 3773931 10400885 23825854 24863945 24904226 25372210 26673422 27108903]',
  10: '[ 6245732  6270984  6282047  6313094  6323632  6357314 12700997 14934415]',
  11: '[1331950 5937719 5950928 6032897 6737094 8103287]',
  12: '[22536768 23111794 23827356 24148953 24483064 24636228 26369896 26722884]',
  13: '[ 4096597  6452385  9164095 19820980 22468583 23758517 24922228]',
  14: '[ 6273193  6365448  9349940 10531948 13589721 20897840 21818345 22422049 23234586 23722349 24282964 24466601 25476838 26223504 26685774 26756449 26812104 26900843 27088150]',
  15: '[ 3770297  6285357 21272262 21883292 22392025 23100861 23160290 23827496 24060758 25448672 26918320]',
  16: '[21808322 25167492 25401922 26858065]',
  17: '[ 6293130 12621423 12977043 14043576 14524083 22013480 23070753 23360636 23672818 24210016 24396413 24505095 25447453 26335550 27560125]',
  18: '[21923978 23414619 23700077 23916998 23917011 23917023 24227869]',
  19: '[ 3029629  3461742  8589904 10338953 10633369 16254362 22248316 22635394 24392987 25416705 26671842 27391491 27406148]',
  20: None},
 'paperid': {0: nan,
  1: nan,
  2: nan,
  3: nan,
  4: nan,
  5: nan,
  6: nan,
  7: nan,
  8: nan,
  9: nan,
  10: nan,
  11: nan,
  12: nan,
  13: nan,
  14: nan,
  15: nan,
  16: nan,
  17: nan,
  18: nan,
  19: nan,
  20: 1998988989.0},
 'fronteer': {0: 0,
  1: 0,
  2: 0,
  3: 0,
  4: 0,
  5: 0,
  6: 0,
  7: 0,
  8: 0,
  9: 0,
  10: 0,
  11: 0,
  12: 0,
  13: 0,
  14: 0,
  15: 0,
  16: 0,
  17: 0,
  18: 0,
  19: 0,
  20: 1},
 'distance': {0: nan,
  1: nan,
  2: nan,
  3: nan,
  4: nan,
  5: nan,
  6: nan,
  7: nan,
  8: nan,
  9: nan,
  10: nan,
  11: nan,
  12: nan,
  13: nan,
  14: nan,
  15: nan,
  16: nan,
  17: nan,
  18: nan,
  19: nan,
  20: 0.0},
 'cited_docdb_ls': {0: '[  596005  4321416  5802640  6031690  6043910  8600475  8642629  9203255  9345445 10177065 10455451 13428248 22139349 22591458 24627241 24750476 26261826 26405611 27079105 27096884]',
  1: '[ 5956195 11260528 22181831 22437920 22642946 23278096 23407037 23458128 24244657 24355363 25014714 25115774 25156886 27047688 27089078 27398716]',
  2: '[ 5855196  7755392 11183886 22894980 24648618 27185399]',
  3: '[ 3573464  6279285  6294985  6542463  6981930  7427770 10325811 14970234 16878329 17935009 21811002 22329817 23543436 23907898 24456108 25283772]',
  4: '[ 2777078  2826073  5944733 10484188 11052747 14682645 15688752 22333410 22614097 22646501 22783765 22978728 23231683 24259740 24605606 24839432 25492752 27009992 27044704]',
  5: '[ 5777407 10417156 23463145 23845079 24397163 24426379 24916732 25216234 25296619 27054560 27509152]',
  6: '[ 4136523 12578497 21994155 22418792 22626616 22655464 22694825 22779403 23081767 23309829 23379411 23621952 24130698 24236071 24267003 24790872 24841797 25343500 27006578]',
  7: '[21722194 23841261 23870348 24749080 26713455 26884023 26892256 27123571]',
  8: '[ 3770167  9249538 20340153 21805004 21826650 23074051 23211424 23586695 23664858 24139881 24669345 24951262 25109266 25172355 25351735 26158421 27074633]',
  9: '[ 3773931 10400885 23825854 24863945 24904226 25372210 26673422 27108903]',
  10: '[ 6245732  6270984  6282047  6313094  6323632  6357314 12700997 14934415]',
  11: '[1331950 5937719 5950928 6032897 6737094 8103287]',
  12: '[22536768 23111794 23827356 24148953 24483064 24636228 26369896 26722884]',
  13: '[ 4096597  6452385  9164095 19820980 22468583 23758517 24922228]',
  14: '[ 6273193  6365448  9349940 10531948 13589721 20897840 21818345 22422049 23234586 23722349 24282964 24466601 25476838 26223504 26685774 26756449 26812104 26900843 27088150]',
  15: '[ 3770297  6285357 21272262 21883292 22392025 23100861 23160290 23827496 24060758 25448672 26918320]',
  16: '[21808322 25167492 25401922 26858065]',
  17: '[ 6293130 12621423 12977043 14043576 14524083 22013480 23070753 23360636 23672818 24210016 24396413 24505095 25447453 26335550 27560125]',
  18: '[21923978 23414619 23700077 23916998 23917011 23917023 24227869]',
  19: '[ 3029629  3461742  8589904 10338953 10633369 16254362 22248316 22635394 24392987 25416705 26671842 27391491 27406148]',
  20: []}}

What I would like to do is to explode the variable cited_docdb_ls, which contains lists separated by spaces rather than commas.
How can I do that? If it is not possible, is there a way to separate them by comma rather than space and then explode them?
The resulting database should contain either cited_docdb_ls with traditional lists separated by commas rather than spaces, or the exploded database. I have checked the df.explode() documentation but could not find any hint on how to manage situations where the list is separated by spaces.

Thank you

MEDevel.com: Open-source for Healthcare and Education

Collecting and validating open-source software for healthcare, education, enterprise, development, medical imaging, medical records, and digital pathology.

Visit Medevel

>Solution :

I would use str.findall with a (\d+) regex for numbers to convert the strings to lists, then explode:

# Turn each space-separated "[a b c]" string into a list of digit tokens
# (as strings), then explode both list columns together.
# Raw strings are used so that "\d" reaches the regex engine unchanged rather
# than being parsed as an (invalid) string escape sequence.
out = (df.assign(newa_cited_docdb=df['newa_cited_docdb'].str.findall(r'\d+'),
                 cited_docdb_ls=df['cited_docdb_ls'].str.findall(r'\d+'))
         .explode(['newa_cited_docdb', 'cited_docdb_ls'])
     )

NB. If you don’t have only digits, a (\w+) regex will be more generic; however, if the strings also contain [/] anywhere other than as the first and last characters (e.g. [abc 12]3 45d]), then @jezrael’s answer will be an alternative.

output:

    docdb_family_id newa_cited_docdb       paperid  fronteer  distance  \
0            569328           596005           NaN         0       NaN   
0            569328          4321416           NaN         0       NaN   
0            569328          5802640           NaN         0       NaN   
0            569328          6031690           NaN         0       NaN   
0            569328          6043910           NaN         0       NaN   
..              ...              ...           ...       ...       ...   
19          1340519         25416705           NaN         0       NaN   
19          1340519         26671842           NaN         0       NaN   
19          1340519         27391491           NaN         0       NaN   
19          1340519         27406148           NaN         0       NaN   
20          1340541             None  1.998989e+09         1       0.0   

   cited_docdb_ls  
0          596005  
0         4321416  
0         5802640  
0         6031690  
0         6043910  
..            ...  
19       25416705  
19       26671842  
19       27391491  
19       27406148  
20            NaN  

[239 rows x 6 columns]
Add a comment

Leave a Reply

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use

Discover more from Dev solutions

Subscribe now to keep reading and get access to the full archive.

Continue reading