Follow

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use
Contact

Regex pattern for exact lengths in delimited or regular strings in R

I want to get all the strings that look like these:

# Strings that the final pattern must accept.
ph <- c(
  # Dot-delimited, second segment is 2 chars long
  "ioL421.63",                # 6 chars . 2 chars
  "jur421.73.0o4435",         # 6 chars . 2 chars . 6 chars
  "koL421.2p.9i4675.k23",     # 6 chars . 2 chars . 6 chars . 3 chars
  "6775po.78.678959.p2p.913", # 6 chars . 2 chars . 6 chars . 3 chars . 3 chars
  "193485.k2.l3.34.67",       # 6 chars . 2 chars . 2 chars . 2 chars . 2 chars

  # Dot-delimited, second segment is 1 char long
  "ioL421.6",                 # 6 chars . 1 char
  "jur421.3.0o4",             # 6 chars . 1 char . 3 chars
  "koL421.2.9i5.k2390",       # 6 chars . 1 char . 3 chars . 5 chars
  "6775po.8.678.p2p91.674e",  # 6 chars . 1 char . 3 chars . 5 chars . 4 chars

  # Undelimited strings: only these exact lengths are valid
  "842f45",                   # 6 chars
  "234567890123567hk",        # 17 chars
  "234567890123567hkiq",      # 19 chars
  "234567890123567hkiq5"      # 20 chars
)

These are invalid strings:

# Strings that the final pattern must reject.
invalid_ph <- c(
  # Undelimited strings whose length is not one of 6, 17, 19 or 20
  "23289jh",                      # 7 chars
  "2382h",                        # 5 chars
  "2934567890123567h8",           # 18 chars
  "234567890123q3",               # 14 chars
  "234567890123567hkiq57878787",  # 27 chars

  # Remaining cases: wrong segment lengths, wrong separators, or wrong
  # overall length
  "ZX3.235.9845.3843924.39403",
  "sjkfuju2rwrrlnmld828384230403208402834fs",
  "TY5648.235.123456",
  "ABC3.235.9845",
  "361 234 4356",
  "a1.02.b3.00",
  "01.01.01",
  "23289jhd",
  "01",
  "01.02",
  "01.01.01",
  "aa.bb",
  "ac.21",
  "aa.01-02",
  "123.2.10.834.18934",
  "a1."
)

# Combine both sets into one test vector: valid strings first, then invalid.
ph <- c(ph, invalid_ph)

I am using regex for the first time and came up with the patterns below. I was wondering how I can consolidate them and correct the ones that are not producing the correct output.

MEDevel.com: Open-source for Healthcare and Education

Collecting and validating open-source software for healthcare, education, enterprise, development, medical imaging, medical records, and digital pathology.

Visit Medevel

library(stringr)  # using the stringr package to extract strings


# One attempt per delimited layout. Every pattern is wrapped in ^( ... )$, so
# a string only matches when the whole string fits one of the alternatives.
# [a-zA-Z0-9]{n} is a segment of exactly n alphanumerics; ([.]) is a literal dot.
# NOTE(review): none of the calls below covers the undelimited 6/17/19/20-char
# strings.

# Two segments: 6.2 or 6.1
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1})$")

# Three segments: 6.2.6 or 6.1.3
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{6}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1}([.])[a-zA-Z0-9]{3})$")

# NOTE(review): exact duplicate of the previous call (6.2.6 or 6.1.3)
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{6}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1}([.])[a-zA-Z0-9]{3})$")

# Four segments: 6.2.6.3 or 6.1.3.5
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{3}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1}([.])[a-zA-Z0-9]{3}([.])[a-zA-Z0-9]{5})$")

# Five segments: 6.2.6.3.3 or 6.1.3.5.4
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{3}([.])[a-zA-Z0-9]{3}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1}([.])[a-zA-Z0-9]{3}([.])[a-zA-Z0-9]{5}([.])[a-zA-Z0-9]{4})$")

# NOTE(review): exact duplicate of the first call (6.2 or 6.1)
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1})$")

# NOTE(review): exact duplicate of the first call (6.2 or 6.1)
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1})$")

# Five segments: 6.2.2.2.2
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{2})$")

>Solution :

Use a function to create all the valid patterns:

#' Build an anchored regex for a delimited string with fixed segment lengths
#'
#' @param len Vector of segment lengths, one entry per segment, in order.
#' @param pat Regex fragment matching a single allowed character.
#' @param sep Regex fragment matching the delimiter between segments.
#' @return A single string of the form `^(pat){len1}sep(pat){len2}...$`.
f <- function(len = 1L, pat = '[A-Za-z0-9]', sep = '\\.') {
  # One "(pat){n}" piece per requested segment length.
  segments <- sprintf('(%s){%s}', pat, len)
  # Glue the pieces together with the delimiter between them.
  joined <- paste(segments, collapse = sep)
  # Anchor both ends so only whole-string matches succeed.
  paste0('^', joined, '$')
}

# Default arguments: a single one-character segment.
f()
# [1] "^([A-Za-z0-9]){1}$"

# Two segments of lengths 1 and 2, separated by a literal dot.
f(len = c(1, 2))
# [1] "^([A-Za-z0-9]){1}\\.([A-Za-z0-9]){2}$"

# Three segments of lengths 6, 2 and 6.
f(len = c(6, 2, 6))
# [1] "^([A-Za-z0-9]){6}\\.([A-Za-z0-9]){2}\\.([A-Za-z0-9]){6}$"

# Every accepted layout, expressed as segment lengths. A vector describes a
# dot-delimited string (one length per segment, in order); a single number
# describes an undelimited string of exactly that many characters.
len <- list(
  c(6, 2),
  c(6, 2, 6),
  c(6, 2, 6, 3),
  c(6, 2, 6, 3, 3),
  c(6, 2, 2, 2, 2),
  c(6, 1),
  c(6, 1, 3),
  c(6, 1, 3, 5),
  c(6, 1, 3, 5, 4),
  6, 17, 19, 20
)

# Turn each layout into its own anchored regex with f() and join them with
# "|", so a string is valid when it matches any one layout in full.
# vapply() is used instead of sapply() so the result is guaranteed to be a
# character vector holding exactly one pattern per layout.
pat <- paste0(vapply(len, f, character(1)), collapse = '|')

# Show each test string next to whether the combined pattern accepts it.
data.frame(string = ph, valid = grepl(pat, ph))

#                                      string valid
# 1                                 ioL421.63  TRUE
# 2                          jur421.73.0o4435  TRUE
# 3                      koL421.2p.9i4675.k23  TRUE
# 4                  6775po.78.678959.p2p.913  TRUE
# 5                        193485.k2.l3.34.67  TRUE
# 6                                  ioL421.6  TRUE
# 7                              jur421.3.0o4  TRUE
# 8                        koL421.2.9i5.k2390  TRUE
# 9                   6775po.8.678.p2p91.674e  TRUE
# 10                                   842f45  TRUE
# 11                        234567890123567hk  TRUE
# 12                      234567890123567hkiq  TRUE
# 13                     234567890123567hkiq5  TRUE
# 14                                  23289jh FALSE
# 15                                    2382h FALSE
# 16                       2934567890123567h8 FALSE
# 17                           234567890123q3 FALSE
# 18              234567890123567hkiq57878787 FALSE
# 19               ZX3.235.9845.3843924.39403 FALSE
# 20 sjkfuju2rwrrlnmld828384230403208402834fs FALSE
# 21                        TY5648.235.123456 FALSE
# 22                            ABC3.235.9845 FALSE
# 23                             361 234 4356 FALSE
# 24                              a1.02.b3.00 FALSE
# 25                                 01.01.01 FALSE
# 26                                 23289jhd FALSE
# 27                                       01 FALSE
# 28                                    01.02 FALSE
# 29                                 01.01.01 FALSE
# 30                                    aa.bb FALSE
# 31                                    ac.21 FALSE
# 32                                 aa.01-02 FALSE
# 33                       123.2.10.834.18934 FALSE
# 34                                      a1. FALSE
Add a comment

Leave a Reply

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use

Discover more from Dev solutions

Subscribe now to keep reading and get access to the full archive.

Continue reading