R: extract substrings from a text column and create 4 new variables based on the extracted parts
I have the following data in R:
product_description
can al sol 355ml exp 2014 can al 7up 330ml std vintage 50s 2015 zz_can al heineken light 500ml 473 13 zz_can al tecate 710ml mx 2009 can al sol 355ml carnaval 2012 can al heineken 330ml 4x6 nl 1508 zz_can al carta blanca light 355ml 2010 can al carta blanca 355ml cl/co 2012 can al strongbow red berries 400bg/gr/ro
I would like to create 4 columns, splitting the content into 4 new variables: brand, type, capacity, and description.
I would like to get the following (not sure if it is clear without columns):
brand type capacity description sol 355ml exp 2014 7up 330ml std vintage 50s 2015 heineken light 500ml 473 13 tecate 710ml mx 2009 sol 355ml carnaval 2012 heineken 330ml 4x6 nl 1508 carta blanca light 355ml 2010 carta blanca 355ml cl/co 2012 strongbow red berries 400bg/gr/ro
So far I've used the gsub function, but I'm still not getting what I want.
# Asker's attempt at deriving the new columns (reported as not yet producing
# the desired result). The statements were fused onto one line, which does not
# parse; they are separated here with the logic left as posted.
# NOTE(review): letter case inside the patterns was lost in this copy of the
# post; "CAN AL" and "[a-zA-Z]" are reconstructions -- confirm against the data.

# Replace the "CAN AL" prefix with a blank.
data$descrip <- gsub("CAN AL", " ", data$materialdescription)

# NOTE(review): this reads `materialdescription` again, so the result of the
# previous line is overwritten; presumably `data$descrip` was meant as input.
data$descrip <- gsub("[s]", " ", data$materialdescription)

# Drop every word that contains a lowercase letter, then drop all digits,
# leaving what is hoped to be the brand.
data$brand <- gsub("[a-z][a-zA-Z]+|[a-zA-Z][a-z]+", "", data$descrip)
data$brand <- gsub("\\d", "", data$brand)

# NOTE(review): digits were just removed from `data$brand`, so this search
# cannot find any; `unlist(x)` also need not have one element per row.
x <- regmatches(data$brand, gregexpr("[[:digit:]]+", data$brand))
data$dim <- as.numeric(unlist(x))
Any help is appreciated. Please let me know if you need anything else.
I assumed that the data is contained in a text file, that "brand" is always written in uppercase, and that "type" is never written in uppercase. "Type" and "description" are separated by "capacity". In
can al strongbow red berries 400bg/gr/ro
for example, there is no capacity. For such cases the function generatetable
has the optional argument knowntypes
. If "brand" is followed by a string that matches one of the known types, that string is used as "type". Otherwise a warning is given and the remaining text is put into "description".
# Read the raw product descriptions, one string per line. `path` has to be
# supplied by the caller; loading is skipped when it is not defined so that
# the function below can be sourced on its own.
if (exists("path", inherits = FALSE)) {
  data <- readLines(path)
}

#=====================================================================

#' Split product descriptions into brand, type, capacity and description
#'
#' Every element of `data` is expected to look like
#' `"<anything>CAN AL <BRAND> <type> <capacity> <description>"`, where
#' the brand consists of uppercase/digit words, the type does not start with
#' an uppercase-only word, and the capacity has the form `<digits>ml`.
#' The capacity is what separates type from description. When a line has no
#' capacity, `knowntypes` is consulted: if the text right after the brand
#' starts with one of the known types, that text becomes the type; otherwise
#' a warning is raised and everything after the brand becomes the description.
#'
#' @param data Character vector of product descriptions.
#' @param knowntypes Optional character vector of type names. The entries are
#'   combined into a regular expression, so regex metacharacters in them are
#'   interpreted as such.
#' @return A character matrix with one row per element of `data` and the
#'   columns `brand`, `type`, `capacity` and `description`; missing parts are
#'   empty strings.
generatetable <- function(data, knowntypes = NULL) {
  col_names <- c("brand", "type", "capacity", "description")
  data <- unname(as.character(data))
  n <- length(data)

  # Nothing to parse: return an empty table with the usual columns.
  if (n == 0L) {
    return(
      matrix(
        character(0),
        nrow = 0L,
        ncol = length(col_names),
        dimnames = list(NULL, col_names)
      )
    )
  }

  trim_spaces <- function(x) sub("[ ]+$", "", sub("^[ ]+", "", x))

  # Drop everything up to and including the "CAN AL" prefix.
  rest <- sub("^.*CAN AL[ ]+", "", data)

  # The brand is the leading run of uppercase/digit words, each followed by a
  # blank. "330ml" does not qualify because "m" follows the digits directly.
  brand_match <- regexpr("([A-Z0-9]+( ))+", rest)
  if (!isTRUE(all(brand_match == 1L))) {
    stop("'brand' missing.")
  }
  brand_len <- attr(brand_match, "match.length")
  brand <- substr(rest, 1L, brand_len - 1L)

  # Cut by position rather than re-using the brand text as a pattern.
  rest <- sub("^[ ]+", "", substr(rest, brand_len + 1L, nchar(rest)))

  # Locate the capacity; it separates 'type' (before) and 'description'.
  cap_match <- regexpr("[0-9]+ml", rest)
  cap_start <- as.integer(cap_match)
  has_cap <- cap_start > 0L

  type_end <- integer(n)
  type_end[has_cap] <- cap_start[has_cap] - 1L
  cap_width <- integer(n)
  cap_width[has_cap] <- attr(cap_match, "match.length")[has_cap]

  # Known types are only a fallback for lines without a capacity; applying
  # them to lines that do have one would misplace the capacity.
  use_known <- rep(FALSE, n)
  if (length(knowntypes) > 0L) {
    known_expr <- paste0("^[ ]*(", paste(knowntypes, collapse = "|"), ")[ ]+")
    known_match <- regexpr(known_expr, rest)
    use_known <- !has_cap & known_match == 1L
    type_end[use_known] <- attr(known_match, "match.length")[use_known]
  }

  for (i in which(!has_cap & !use_known)) {
    warning(
      sprintf(
        "no 'capacity' found in line %i. separation of 'type' , 'description' impossible.",
        i
      ),
      call. = FALSE
    )
  }

  # With neither capacity nor known type, type_end and cap_width are 0, so
  # 'type' and 'capacity' come out empty and 'description' gets the rest.
  cap_end <- type_end + cap_width
  type <- substr(rest, 1L, type_end)
  capacity <- substr(rest, type_end + 1L, cap_end)
  description <- substr(rest, cap_end + 1L, nchar(rest))

  #-----------------------------------------------------------------------

  matrix(
    trim_spaces(c(brand, type, capacity, description)),
    nrow = n,
    ncol = length(col_names),
    dimnames = list(NULL, col_names)
  )
}
examples:
> generatetable(data) brand type capacity description [1,] "sol" "" "355ml" "exp 2014" [2,] "7up" "" "330ml" "std vintage 50s 2015" [3,] "heineken" "light" "500ml" "473 13" [4,] "tecate" "" "710ml" "mx 2009" [5,] "sol" "" "355ml" "carnaval 2012" [6,] "heineken" "" "330ml" "4x6 nl 1508" [7,] "carta blanca" "light" "355ml" "2010" [8,] "carta blanca" "" "355ml" "cl/co 2012" [9,] "strongbow" "" "" "red berries 400bg/gr/ro" warning message: no 'capacity' found in line 9. separation of 'type' , 'description' impossible. > generatetable(data,"red berries") brand type capacity description [1,] "sol" "" "355ml" "exp 2014" [2,] "7up" "" "330ml" "std vintage 50s 2015" [3,] "heineken" "light" "500ml" "473 13" [4,] "tecate" "" "710ml" "mx 2009" [5,] "sol" "" "355ml" "carnaval 2012" [6,] "heineken" "" "330ml" "4x6 nl 1508" [7,] "carta blanca" "light" "355ml" "2010" [8,] "carta blanca" "" "355ml" "cl/co 2012" [9,] "strongbow" "red berries" "" "400bg/gr/ro" > generatetable(data,"red") brand type capacity description [1,] "sol" "" "355ml" "exp 2014" [2,] "7up" "" "330ml" "std vintage 50s 2015" [3,] "heineken" "light" "500ml" "473 13" [4,] "tecate" "" "710ml" "mx 2009" [5,] "sol" "" "355ml" "carnaval 2012" [6,] "heineken" "" "330ml" "4x6 nl 1508" [7,] "carta blanca" "light" "355ml" "2010" [8,] "carta blanca" "" "355ml" "cl/co 2012" [9,] "strongbow" "red" "" "berries 400bg/gr/ro" > generatetable(data,"berries") brand type capacity description [1,] "sol" "" "355ml" "exp 2014" [2,] "7up" "" "330ml" "std vintage 50s 2015" [3,] "heineken" "light" "500ml" "473 13" [4,] "tecate" "" "710ml" "mx 2009" [5,] "sol" "" "355ml" "carnaval 2012" [6,] "heineken" "" "330ml" "4x6 nl 1508" [7,] "carta blanca" "light" "355ml" "2010" [8,] "carta blanca" "" "355ml" "cl/co 2012" [9,] "strongbow" "" "" "red berries 400bg/gr/ro" warning message: no 'capacity' found in line 9. separation of 'type' , 'description' impossible. 
> generatetable(data,c("red berries","heavy")) brand type capacity description [1,] "sol" "" "355ml" "exp 2014" [2,] "7up" "" "330ml" "std vintage 50s 2015" [3,] "heineken" "light" "500ml" "473 13" [4,] "tecate" "" "710ml" "mx 2009" [5,] "sol" "" "355ml" "carnaval 2012" [6,] "heineken" "" "330ml" "4x6 nl 1508" [7,] "carta blanca" "light" "355ml" "2010" [8,] "carta blanca" "" "355ml" "cl/co 2012" [9,] "strongbow" "red berries" "" "400bg/gr/ro" > generatetable(data,c("light","heavy")) brand type capacity description [1,] "sol" "" "355ml" "exp 2014" [2,] "7up" "" "330ml" "std vintage 50s 2015" [3,] "heineken" "light" "500ml" "473 13" [4,] "tecate" "" "710ml" "mx 2009" [5,] "sol" "" "355ml" "carnaval 2012" [6,] "heineken" "" "330ml" "4x6 nl 1508" [7,] "carta blanca" "light" "355ml" "2010" [8,] "carta blanca" "" "355ml" "cl/co 2012" [9,] "strongbow" "" "" "red berries 400bg/gr/ro" warning message: no 'capacity' found in line 9. separation of 'type' , 'description' impossible. > generatetable(data,c("heavy")) brand type capacity description [1,] "sol" "" "355ml" "exp 2014" [2,] "7up" "" "330ml" "std vintage 50s 2015" [3,] "heineken" "light" "500ml" "473 13" [4,] "tecate" "" "710ml" "mx 2009" [5,] "sol" "" "355ml" "carnaval 2012" [6,] "heineken" "" "330ml" "4x6 nl 1508" [7,] "carta blanca" "light" "355ml" "2010" [8,] "carta blanca" "" "355ml" "cl/co 2012" [9,] "strongbow" "" "" "red berries 400bg/gr/ro" warning message: no 'capacity' found in line 9. separation of 'type' , 'description' impossible. >
Comments
Post a Comment