I want to get all the strings that look like these:
# Strings that should be accepted. The trailing comment on each entry gives
# the length of every dot-separated segment, in order.
ph <- c(
  # Dotted layouts whose second segment has 2 characters
  "ioL421.63",                # 6.2
  "jur421.73.0o4435",         # 6.2.6
  "koL421.2p.9i4675.k23",     # 6.2.6.3
  "6775po.78.678959.p2p.913", # 6.2.6.3.3
  "193485.k2.l3.34.67",       # 6.2.2.2.2
  # Dotted layouts whose second segment has 1 character
  "ioL421.6",                 # 6.1
  "jur421.3.0o4",             # 6.1.3
  "koL421.2.9i5.k2390",       # 6.1.3.5
  "6775po.8.678.p2p91.674e",  # 6.1.3.5.4
  # Undotted strings, valid only at these exact lengths
  "842f45",                   # 6 chars
  "234567890123567hk",        # 17 chars
  "234567890123567hkiq",      # 19 chars
  "234567890123567hkiq5"      # 20 chars
)
These are invalid strings:
# Strings that must be rejected. Order and duplicates are kept as-is because
# the vector is appended to `ph` and checked element by element.
invalid_ph <- c(
  "23289jh",                     # 7 chars
  "2382h",                       # 5 chars
  "2934567890123567h8",          # 18 chars
  "234567890123q3",              # 14 chars
  "234567890123567hkiq57878787", # 27 chars
  "ZX3.235.9845.3843924.39403",
  "sjkfuju2rwrrlnmld828384230403208402834fs",
  "TY5648.235.123456",
  "ABC3.235.9845",
  "361 234 4356",                # contains spaces
  "a1.02.b3.00",
  "01.01.01",
  "23289jhd",                    # 8 chars
  "01",
  "01.02",
  "01.01.01",
  "aa.bb",
  "ac.21",
  "aa.01-02",                    # contains a hyphen
  "123.2.10.834.18934",
  "a1."                          # trailing dot
)
# Put the invalid strings after the valid ones so one vector holds every case.
ph <- c(ph, invalid_ph)
I am using regex for the first time and came up with the patterns below. How can I consolidate them, and correct the ones that are not producing the correct output?
library(stringr) # using the stringr package to extract strings.
# Each call tests every element of `ph` against one anchored (^...$) pattern;
# str_extract() gives back the matched text, or NA where nothing matches.
# Every pattern is built from [a-zA-Z0-9]{n} segments joined by a literal dot,
# written as the single-character class ([.]).
# None of the patterns below covers the undotted 6/17/19/20-character strings.

# Segment lengths 6.2 or 6.1
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1})$")
# Segment lengths 6.2.6 or 6.1.3
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{6}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1}([.])[a-zA-Z0-9]{3})$")
# Exact duplicate of the previous call (6.2.6 or 6.1.3)
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{6}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1}([.])[a-zA-Z0-9]{3})$")
# Segment lengths 6.2.6.3 or 6.1.3.5
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{3}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1}([.])[a-zA-Z0-9]{3}([.])[a-zA-Z0-9]{5})$")
# Segment lengths 6.2.6.3.3 or 6.1.3.5.4
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{3}([.])[a-zA-Z0-9]{3}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1}([.])[a-zA-Z0-9]{3}([.])[a-zA-Z0-9]{5}([.])[a-zA-Z0-9]{4})$")
# Exact duplicates of the first call (6.2 or 6.1)
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1})$")
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1})$")
# Segment lengths 6.2.2.2.2
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{2})$")
>Solution :
Use a function to create all the valid patterns:
#' Build an anchored regex for a sequence of fixed-length segments
#'
#' Each element of `len` becomes one `(pat){n}` group; the groups are joined
#' with `sep` and the result is wrapped in `^...$` so that only whole strings
#' match.
#'
#' @param len Segment lengths, in order. Numeric values must be finite,
#'   non-negative whole numbers. Non-numeric values (e.g. the string "2,3")
#'   are inserted into the quantifier unchanged.
#' @param pat Regex matching one allowed character.
#' @param sep Regex matching the separator between segments.
#' @return A length-one character vector holding the anchored pattern.
f <- function(len = 1L, pat = '[A-Za-z0-9]', sep = '\\.') {
  if (is.numeric(len)) {
    stopifnot(
      is.finite(len),
      len >= 0,
      len == trunc(len),
      len <= .Machine$integer.max
    )
    # '%s' renders large doubles in scientific notation (100000 -> "1e+05"),
    # which is not a valid quantifier; '%d' always writes plain digits.
    len <- sprintf('%d', as.integer(len))
  }
  p <- paste0(sprintf('(%s){%s}', pat, len), collapse = sep)
  sprintf('^%s$', p)
}
f()
# [1] "^([A-Za-z0-9]){1}$"
f(len = c(1, 2))
# [1] "^([A-Za-z0-9]){1}\\.([A-Za-z0-9]){2}$"
f(len = c(6, 2, 6))
# [1] "^([A-Za-z0-9]){6}\\.([A-Za-z0-9]){2}\\.([A-Za-z0-9]){6}$"
# Every accepted layout, one list element per layout. Each element lists the
# length of each dot-separated segment in order; a single number describes an
# undotted string of exactly that many characters.
len <- list(
  # layouts beginning 6.2
  c(6, 2),
  c(6, 2, 6),
  c(6, 2, 6, 3),
  c(6, 2, 6, 3, 3),
  c(6, 2, 2, 2, 2),
  # layouts beginning 6.1
  c(6, 1),
  c(6, 1, 3),
  c(6, 1, 3, 5),
  c(6, 1, 3, 5, 4),
  # undotted strings
  6,
  17,
  19,
  20
)
# Build one pattern per layout and join them with '|'. Each alternative carries
# its own ^...$ anchors, so a string is valid only if it matches one layout in
# full. vapply() (rather than sapply()) guarantees a character vector with
# exactly one pattern per layout.
pat <- paste0(vapply(len, f, character(1)), collapse = '|')
# One row per input string, flagged TRUE when it matches any valid layout.
data.frame(string = ph, valid = grepl(pat, ph))
# string valid
# 1 ioL421.63 TRUE
# 2 jur421.73.0o4435 TRUE
# 3 koL421.2p.9i4675.k23 TRUE
# 4 6775po.78.678959.p2p.913 TRUE
# 5 193485.k2.l3.34.67 TRUE
# 6 ioL421.6 TRUE
# 7 jur421.3.0o4 TRUE
# 8 koL421.2.9i5.k2390 TRUE
# 9 6775po.8.678.p2p91.674e TRUE
# 10 842f45 TRUE
# 11 234567890123567hk TRUE
# 12 234567890123567hkiq TRUE
# 13 234567890123567hkiq5 TRUE
# 14 23289jh FALSE
# 15 2382h FALSE
# 16 2934567890123567h8 FALSE
# 17 234567890123q3 FALSE
# 18 234567890123567hkiq57878787 FALSE
# 19 ZX3.235.9845.3843924.39403 FALSE
# 20 sjkfuju2rwrrlnmld828384230403208402834fs FALSE
# 21 TY5648.235.123456 FALSE
# 22 ABC3.235.9845 FALSE
# 23 361 234 4356 FALSE
# 24 a1.02.b3.00 FALSE
# 25 01.01.01 FALSE
# 26 23289jhd FALSE
# 27 01 FALSE
# 28 01.02 FALSE
# 29 01.01.01 FALSE
# 30 aa.bb FALSE
# 31 ac.21 FALSE
# 32 aa.01-02 FALSE
# 33 123.2.10.834.18934 FALSE
# 34 a1. FALSE