Bash script Find Duplicate Values in CSV file

Advertisements

I have a csv file and bash script but it does not work as needed

The script

#!/bin/bash

#create path to redirect accounts.csv to same directory as accounts_new.csv
path=$(dirname $1)

awk '
BEGIN { FS="\""; OFS="," }                              # input is delimited by double qutoes
NR==1 { print; next }
      { line=""
        for (i=1;i<NF;i+=2) {                           # loop through odd numbered fields
            gsub(/,/,"|",$(i+1))                        # in even numbered double-quote-delimited fields replace commas with pipes
            line=line $i FS $(i+1) FS                   # rebuild the current line
        }
        line=line $NF                                   # add last field to new line

        split(line,a,",")                               # split new line on commas
        split(tolower(a[3]),b,/[[:space:]]+/)           # split tolower(name field) on white space

        # rebuild name with first characters of first/last names uppercased

        name=toupper(substr(b[1],1,1)) substr(b[1],2) " " toupper(substr(b[2],1,1)) substr(b[2],2)

        acct=substr(b[1],1,1) b[2]                      # build email acct name

        lines[NR]=a[1] OFS a[2] OFS name OFS a[4]       # rebuild current line based on first 4 fields
        locid[NR]=a[2]                                  # make note of location_id for current line
        email[NR]=acct                                  # make note of email acct for current line
        count[acct]++                                   # keep count of number of times we see this email acct
      }

END   { for (i=2;i<=NR;i++) {                           # loop through our lines of output
            gsub(/\|/,",",lines[i])                     # replace pipes with original commas

            # print final line of output; if email acct has been seen more than once then append the location_id to the email acct; add the "@abc.com" domain and the trailing comma

            print lines[i] OFS email[i] (count[email[i]] > 1 ? locid[i] : "") "@abc.com" OFS
        }
      }' $1 > $path"/accounts_new.csv"

CSV file content

id,location_id,name,title,email,department
1,1,Susan houston,Director of Services,,
2,1,Christina Gonzalez,Director,,
3,2,Brenda brown,"Director, Second Career Services",,
4,3,Howard Lader,"Manager, Senior Counseling",,
5,4,Kimberly Pesavento,Commercial director,,
6,5,Joe Bloom,Financial Empowerment Programs Program Director,,
7,6,peter Olson,Director,,
8,6,Bart charlow,Executive Director,,
9,7,Bart Charlow,Executive Director,,
10,7,Barbara Kalt,Director,,
11,8,Marilyn Baker-Venturini,Director,,
12,8,Graciela Hernandez,Assistant Manager,,
13,8,Julie avelino,Assessment Specialist,,
14,9,Dave Genesy,Library Director,,
15,9,maria kramer,Library Divisions Manager,,
16,10,Dave Genesy,Tester,,
17,10,Maria kramer,Library Division Manager,,
18,11,Dave Genesy,Head of office,,
19,11,Elizabeth Meeks,Branch Manager,,
20,12,Kathy Endaya,Director,,
21,13,dave genesy,Library Director,,
22,14,Andres Espinoza,"Manager, Commanding Officer",,
23,15,Jack Phillips,Administrator,,
24,16,James Lee,Commanding Officer,,
25,17,Kenneth Gibson,Tester,,
26,18,Sharon Petersen,Administrator,,
27,19,Sharon Petersen,Administrator,,
28,21,Moncef Salah,Tester,,Office of Innovation
29,22,Suzanne Badenhoop,Tester,suzanne@example.com,Referrals
30,20,Sean Houston,Director of new Services,,
31,8,David Genesy,Account Manager,,
32,8,Elizabeth Feeney,CEO,e.feeney@foobar.org,Operations
33,8,Erika Meeks,Tester,e.meeks@foobar.org,Operations

My desired output is this

id,location_id,name,title,email,department
1,1,Susan Houston,Director of Services,shouston1@abc.com,
2,1,Christina Gonzalez,Director,cgonzalez@abc.com,
3,2,Brenda Brown,"Director, Second Career Services",bbrown@abc.com,
4,3,Howard Lader,"Manager, Senior Counseling",hlader@abc.com,
5,4,Kimberly Pesavento,Commercial director,kpesavento@abc.com,
6,5,Joe Bloom,Financial Empowerment Programs Program Director,jbloom@abc.com,
7,6,Peter Olson,Director,polson@abc.com,
8,6,Bart Charlow,Executive Director,bcharlow6@abc.com,
9,7,Bart Charlow,Executive Director,bcharlow7@abc.com,
10,7,Barbara Kalt,Director,bkalt@abc.com,
11,8,Marilyn Baker-venturini,Director,mbaker-venturini@abc.com,
12,8,Graciela Hernandez,Assistant Manager,ghernandez@abc.com,
13,8,Julie Avelino,Assessment Specialist,javelino@abc.com,
14,9,Dave Genesy,Library Director,dgenesy9@abc.com,
15,9,Maria Kramer,Library Divisions Manager,mkramer9@abc.com,
16,10,Dave Genesy,Tester,dgenesy10@abc.com,
17,10,Maria Kramer,Library Division Manager,mkramer10@abc.com,
18,11,Dave Genesy,Head of office,dgenesy11@abc.com,
19,11,Elizabeth Meeks,Branch Manager,emeeks11@abc.com,
20,12,Kathy Endaya,Director,kendaya@abc.com,
21,13,Dave Genesy,Library Director,dgenesy13@abc.com,
22,14,Andres Espinoza,"Manager, Commanding Officer",aespinoza@abc.com,
23,15,Jack Phillips,Administrator,jphillips@abc.com,
24,16,James Lee,Commanding Officer,jlee@abc.com,
25,17,Kenneth Gibson,Tester,kgibson@abc.com,
26,18,Sharon Petersen,Administrator,spetersen18@abc.com,
27,19,Sharon Petersen,Administrator,spetersen19@abc.com,
28,21,Moncef Salah,Tester,msalah@abc.com,Office of Innovation
29,22,Suzanne Badenhoop,Tester,sbadenhoop@abc.com,Referrals
30,20,Sean Houston,Director of new Services,shouston20@abc.com,
31,8,David Genesy,Account Manager,dgenesy8@abc.com,
32,8,Elizabeth Feeney,CEO,efeeney@abc.com,Operations
33,8,Erika Meeks,Tester,emeeks8@abc.com,Operations

But I get this output ignoring the department column content

id,location_id,name,title,email,department
1,1,Susan Houston,Director of Services,shouston1@abc.com,
2,1,Christina Gonzalez,Director,cgonzalez@abc.com,
3,2,Brenda Brown,"Director, Second Career Services",bbrown@abc.com,
4,3,Howard Lader,"Manager, Senior Counseling",hlader@abc.com,
5,4,Kimberly Pesavento,Commercial director,kpesavento@abc.com,
6,5,Joe Bloom,Financial Empowerment Programs Program Director,jbloom@abc.com,
7,6,Peter Olson,Director,polson@abc.com,
8,6,Bart Charlow,Executive Director,bcharlow6@abc.com,
9,7,Bart Charlow,Executive Director,bcharlow7@abc.com,
10,7,Barbara Kalt,Director,bkalt@abc.com,
11,8,Marilyn Baker-venturini,Director,mbaker-venturini@abc.com,
12,8,Graciela Hernandez,Assistant Manager,ghernandez@abc.com,
13,8,Julie Avelino,Assessment Specialist,javelino@abc.com,
14,9,Dave Genesy,Library Director,dgenesy9@abc.com,
15,9,Maria Kramer,Library Divisions Manager,mkramer9@abc.com,
16,10,Dave Genesy,Tester,dgenesy10@abc.com,
17,10,Maria Kramer,Library Division Manager,mkramer10@abc.com,
18,11,Dave Genesy,Head of office,dgenesy11@abc.com,
19,11,Elizabeth Meeks,Branch Manager,emeeks11@abc.com,
20,12,Kathy Endaya,Director,kendaya@abc.com,
21,13,Dave Genesy,Library Director,dgenesy13@abc.com,
22,14,Andres Espinoza,"Manager, Commanding Officer",aespinoza@abc.com,
23,15,Jack Phillips,Administrator,jphillips@abc.com,
24,16,James Lee,Commanding Officer,jlee@abc.com,
25,17,Kenneth Gibson,Tester,kgibson@abc.com,
26,18,Sharon Petersen,Administrator,spetersen18@abc.com,
27,19,Sharon Petersen,Administrator,spetersen19@abc.com,
28,21,Moncef Salah,Tester,msalah@abc.com,
29,22,Suzanne Badenhoop,Tester,sbadenhoop@abc.com,
30,20,Sean Houston,Director of new Services,shouston20@abc.com,
31,8,David Genesy,Account Manager,dgenesy8@abc.com,
32,8,Elizabeth Feeney,CEO,efeeney@abc.com,
33,8,Erika Meeks,Tester,emeeks8@abc.com,

Can you help me achieve my desired output?

I think the problem is that there is no comma after the department values, thats why it doesnt work as intended, what can be changed in code to achieve this?

>Solution :

As I mentioned in a comment to a previous answer:

  • after locid[NR]=a[2] add dept[NR]=a[6]
  • in the final print add dept[i]

The modified code (sans the original comments):

awk '
BEGIN { FS="\""
        OFS=","
      }
NR==1 { print; next }
      { line=""
        for (i=1;i<NF;i+=2) {
            gsub(/,/,"|",$(i+1))
            line=line $i FS $(i+1) FS
        }
        line=line $NF

        split(line,a,",")
        split(tolower(a[3]),b,/[[:space:]]+/)

        name=toupper(substr(b[1],1,1)) substr(b[1],2) " " toupper(substr(b[2],1,1)) substr(b[2],2)

        acct=substr(b[1],1,1) b[2]

        lines[NR]=a[1] OFS a[2] OFS name OFS a[4]
        locid[NR]=a[2]
        dept[NR]=a[6]                                    # save current department
        email[NR]=acct
        count[acct]++
      }
END   { for (i=2;i<=NR;i++) {
            gsub(/\|/,",",lines[i])
            print lines[i] OFS email[i] (count[email[i]] > 1 ? locid[i] : "") "@abc.com" OFS dept[i]
            ######### add ---------------------------------------------------------------->  ^^^^^^^
        }
      }
' accounts.csv

This generates:

id,location_id,name,title,email,department
1,1,Susan Houston,Director of Services,shouston1@abc.com,
2,1,Christina Gonzalez,Director,cgonzalez@abc.com,
3,2,Brenda Brown,"Director, Second Career Services",bbrown@abc.com,
4,3,Howard Lader,"Manager, Senior Counseling",hlader@abc.com,
5,4,Kimberly Pesavento,Commercial director,kpesavento@abc.com,
6,5,Joe Bloom,Financial Empowerment Programs Program Director,jbloom@abc.com,
7,6,Peter Olson,Director,polson@abc.com,
8,6,Bart Charlow,Executive Director,bcharlow6@abc.com,
9,7,Bart Charlow,Executive Director,bcharlow7@abc.com,
10,7,Barbara Kalt,Director,bkalt@abc.com,
11,8,Marilyn Baker-venturini,Director,mbaker-venturini@abc.com,
12,8,Graciela Hernandez,Assistant Manager,ghernandez@abc.com,
13,8,Julie Avelino,Assessment Specialist,javelino@abc.com,
14,9,Dave Genesy,Library Director,dgenesy9@abc.com,
15,9,Maria Kramer,Library Divisions Manager,mkramer9@abc.com,
16,10,Dave Genesy,Tester,dgenesy10@abc.com,
17,10,Maria Kramer,Library Division Manager,mkramer10@abc.com,
18,11,Dave Genesy,Head of office,dgenesy11@abc.com,
19,11,Elizabeth Meeks,Branch Manager,emeeks11@abc.com,
20,12,Kathy Endaya,Director,kendaya@abc.com,
21,13,Dave Genesy,Library Director,dgenesy13@abc.com,
22,14,Andres Espinoza,"Manager, Commanding Officer",aespinoza@abc.com,
23,15,Jack Phillips,Administrator,jphillips@abc.com,
24,16,James Lee,Commanding Officer,jlee@abc.com,
25,17,Kenneth Gibson,Tester,kgibson@abc.com,
26,18,Sharon Petersen,Administrator,spetersen18@abc.com,
27,19,Sharon Petersen,Administrator,spetersen19@abc.com,
28,21,Moncef Salah,Tester,msalah@abc.com,Office of Innovation
29,22,Suzanne Badenhoop,Tester,sbadenhoop@abc.com,Referrals
30,20,Sean Houston,Director of new Services,shouston20@abc.com,
31,8,David Genesy,Account Manager,dgenesy8@abc.com,
32,8,Elizabeth Feeney,CEO,efeeney@abc.com,Operations
33,8,Erika Meeks,Tester,emeeks8@abc.com,Operations

Leave a ReplyCancel reply