R: extract substrings from text and create 4 new variables based on the extracted parts -


I have the following data in R:

product_description

can al sol 355ml exp 2014
can al 7up 330ml std vintage 50s 2015
zz_can al heineken light 500ml 473 13
zz_can al tecate 710ml mx 2009
can al sol 355ml carnaval 2012
can al heineken 330ml 4x6 nl 1508
zz_can al carta blanca light 355ml 2010
can al carta blanca 355ml cl/co 2012
can al strongbow red berries 400bg/gr/ro

I want to create 4 columns, splitting the content into 4 new variables: brand, type, capacity, and description.

I would like to get the following (not sure if it is clear without columns):

brand         type         capacity  description
sol                        355ml     exp 2014
7up                        330ml     std vintage 50s 2015
heineken      light        500ml     473 13
tecate                     710ml     mx 2009
sol                        355ml     carnaval 2012
heineken                   330ml     4x6 nl 1508
carta blanca  light        355ml     2010
carta blanca               355ml     cl/co 2012
strongbow     red berries            400bg/gr/ro

So far I've used the gsub function, but I'm still not getting what I want.

# Asker's attempt so far (reported as not producing the desired result).

# Replace every occurrence of the literal text "can al" with a single space.
data$descrip <- gsub("can al", " ", data$materialdescription)
# This reads data$materialdescription again, so the result of the previous
# line is overwritten. The class "[s]" matches only the letter "s", so every
# "s" becomes a space.
# NOTE(review): presumably "\\s" and/or data$descrip was intended - confirm.
data$descrip <- gsub("[s]", " ", data$materialdescription)
# Delete every run of two or more letters matched by the classes below; both
# alternatives describe the same thing because "[a-za-z]" equals "[a-z]".
# NOTE(review): the classes look case-mangled (possibly "[A-Z]" originally) -
# confirm against the real data.
data$brand <-gsub("[a-z][a-za-z]+|[a-za-z][a-z]+", '', data$descrip)
# Delete all digits from what is left.
data$brand <-gsub("\\d", '', data$brand)
# Collect the digit runs of data$brand. All digits were removed on the previous
# line, so this finds no matches.
# NOTE(review): presumably data$descrip was meant as the input - confirm.
x <- regmatches(data$brand, gregexpr("[[:digit:]]+", data$brand))
# unlist() yields one value per match, not one per row, so the length of the
# result only equals the number of rows when every row has exactly one match.
data$dim <- as.numeric(unlist(x))

Any help is appreciated. Please let me know if you need anything else.

I assumed that the data is contained in a text file, that "brand" is always written in uppercase, and that "type" is never written in uppercase. "type" and "description" are separated by "capacity". In

can al strongbow red berries 400bg/gr/ro

for example, there is no capacity. For such cases the function generatetable has an optional argument knowntypes. If "brand" is followed by a string that matches one of the known types, that string is used as "type". Otherwise a warning is given and everything is put into "description".

# Load the raw product descriptions, one per line. The read is skipped when no
# character variable `path` has been defined, so that the function below can
# also be sourced on its own.
if (exists("path", mode = "character")) {
  data <- readLines(path)
}

#=====================================================================

#' Split product descriptions into brand, type, capacity and description
#'
#' Each input string is expected to look like
#' `"<prefix>CAN AL <BRAND> <type> <capacity> <description>"`, for example
#' `"ZZ_CAN AL CARTA BLANCA light 355ml 2010"`. Everything up to and including
#' the last `"CAN AL"` marker is discarded. The brand is the leading run of
#' words made only of capital letters and digits, each followed by a space.
#' The capacity is the first token matching `[0-9]+ml` after the brand. The
#' text between brand and capacity is the type; the text after the capacity
#' is the description.
#'
#' When a line has no capacity, type and description cannot be told apart.
#' In that case `knowntypes` is consulted: if the text after the brand starts
#' with one of the known types, that part becomes the type and the rest the
#' description. Otherwise a warning is raised and the whole remainder is
#' returned as the description.
#'
#' @param data Character vector of product descriptions. The brand must be
#'   written in capitals; type must not be.
#' @param knowntypes Optional character vector of known type names. The
#'   entries are inserted into a regular expression as they are, so regex
#'   metacharacters keep their special meaning.
#' @return A character matrix with one row per element of `data` and the
#'   columns `brand`, `type`, `capacity` and `description`. Leading and
#'   trailing spaces are removed from every cell.
#' @examples
#' generatetable("CAN AL STRONGBOW red berries 400bg/gr/ro", "red berries")
generatetable <- function(data, knowntypes = NULL) {
  fields <- c("brand", "type", "capacity", "description")
  if (length(data) == 0L) {
    return(matrix(character(0), nrow = 0L, ncol = 4L,
                  dimnames = list(NULL, fields)))
  }

  strip_edges <- function(x) gsub("^[ ]+|[ ]+$", "", x)

  # Greedy match: drops everything up to the last "CAN AL" marker.
  rest <- sub("^.*CAN AL[ ]+", "", data)

  # The brand has to start the remaining text. A token such as "355ml" cannot
  # be part of it, because its lowercase letters end the match.
  brand_hit <- regexpr("([A-Z0-9]+ )+", rest)
  if (!all(brand_hit == 1)) {
    stop("'brand' missing.")
  }
  brand_len <- attr(brand_hit, "match.length")
  # The match ends with the separating space, which is not part of the brand.
  brand <- substr(rest, 1L, brand_len - 1L)
  # Cut by position instead of re-using the brand text as a regex pattern.
  rest <- sub("^[ ]+", "", substring(rest, brand_len + 1L))

  cap_hit <- regexpr("[0-9]+ml", rest)
  cap_start <- as.integer(cap_hit)
  cap_len <- attr(cap_hit, "match.length")
  has_cap <- cap_start > 0L

  # Without a capacity the type is empty and the description starts at 1,
  # unless a known type resolves the split below.
  type_end <- cap_start - 1L
  type_end[!has_cap] <- 0L
  desc_start <- cap_start + cap_len
  desc_start[!has_cap] <- 1L

  use_known <- rep(FALSE, length(rest))
  if (length(knowntypes) > 0L) {
    known_expr <- paste0(
      "^[ ]*(", paste(knowntypes, collapse = "|"), ")([ ]+|$)"
    )
    known_hit <- regexpr(known_expr, rest)
    # Known types are only a fallback: a capacity that was found always wins.
    use_known <- !has_cap & known_hit == 1L
    type_end[use_known] <- attr(known_hit, "match.length")[use_known]
    desc_start[use_known] <- type_end[use_known] + 1L
  }

  for (i in which(!has_cap & !use_known)) {
    warning(
      sprintf("no 'capacity' found in line %i. separation of 'type' , 'description' impossible.", i),
      call. = FALSE
    )
  }

  type <- substr(rest, 1L, type_end)
  capacity <- substr(rest, cap_start, cap_start + cap_len - 1L)
  capacity[!has_cap] <- ""
  description <- substring(rest, desc_start)

  cbind(
    brand       = strip_edges(brand),
    type        = strip_edges(type),
    capacity    = strip_edges(capacity),
    description = strip_edges(description)
  )
}

examples:

> generatetable(data)
      brand          type    capacity description
 [1,] "sol"          ""      "355ml"  "exp 2014"
 [2,] "7up"          ""      "330ml"  "std vintage 50s 2015"
 [3,] "heineken"     "light" "500ml"  "473 13"
 [4,] "tecate"       ""      "710ml"  "mx 2009"
 [5,] "sol"          ""      "355ml"  "carnaval 2012"
 [6,] "heineken"     ""      "330ml"  "4x6 nl 1508"
 [7,] "carta blanca" "light" "355ml"  "2010"
 [8,] "carta blanca" ""      "355ml"  "cl/co 2012"
 [9,] "strongbow"    ""      ""       "red berries 400bg/gr/ro"
Warning message:
no 'capacity' found in line 9. separation of 'type' , 'description' impossible.

> generatetable(data,"red berries")
      brand          type          capacity description
 [1,] "sol"          ""            "355ml"  "exp 2014"
 [2,] "7up"          ""            "330ml"  "std vintage 50s 2015"
 [3,] "heineken"     "light"       "500ml"  "473 13"
 [4,] "tecate"       ""            "710ml"  "mx 2009"
 [5,] "sol"          ""            "355ml"  "carnaval 2012"
 [6,] "heineken"     ""            "330ml"  "4x6 nl 1508"
 [7,] "carta blanca" "light"       "355ml"  "2010"
 [8,] "carta blanca" ""            "355ml"  "cl/co 2012"
 [9,] "strongbow"    "red berries" ""       "400bg/gr/ro"

> generatetable(data,"red")
      brand          type    capacity description
 [1,] "sol"          ""      "355ml"  "exp 2014"
 [2,] "7up"          ""      "330ml"  "std vintage 50s 2015"
 [3,] "heineken"     "light" "500ml"  "473 13"
 [4,] "tecate"       ""      "710ml"  "mx 2009"
 [5,] "sol"          ""      "355ml"  "carnaval 2012"
 [6,] "heineken"     ""      "330ml"  "4x6 nl 1508"
 [7,] "carta blanca" "light" "355ml"  "2010"
 [8,] "carta blanca" ""      "355ml"  "cl/co 2012"
 [9,] "strongbow"    "red"   ""       "berries 400bg/gr/ro"

> generatetable(data,"berries")
      brand          type    capacity description
 [1,] "sol"          ""      "355ml"  "exp 2014"
 [2,] "7up"          ""      "330ml"  "std vintage 50s 2015"
 [3,] "heineken"     "light" "500ml"  "473 13"
 [4,] "tecate"       ""      "710ml"  "mx 2009"
 [5,] "sol"          ""      "355ml"  "carnaval 2012"
 [6,] "heineken"     ""      "330ml"  "4x6 nl 1508"
 [7,] "carta blanca" "light" "355ml"  "2010"
 [8,] "carta blanca" ""      "355ml"  "cl/co 2012"
 [9,] "strongbow"    ""      ""       "red berries 400bg/gr/ro"
Warning message:
no 'capacity' found in line 9. separation of 'type' , 'description' impossible.

> generatetable(data,c("red berries","heavy"))
      brand          type          capacity description
 [1,] "sol"          ""            "355ml"  "exp 2014"
 [2,] "7up"          ""            "330ml"  "std vintage 50s 2015"
 [3,] "heineken"     "light"       "500ml"  "473 13"
 [4,] "tecate"       ""            "710ml"  "mx 2009"
 [5,] "sol"          ""            "355ml"  "carnaval 2012"
 [6,] "heineken"     ""            "330ml"  "4x6 nl 1508"
 [7,] "carta blanca" "light"       "355ml"  "2010"
 [8,] "carta blanca" ""            "355ml"  "cl/co 2012"
 [9,] "strongbow"    "red berries" ""       "400bg/gr/ro"

> generatetable(data,c("light","heavy"))
      brand          type    capacity description
 [1,] "sol"          ""      "355ml"  "exp 2014"
 [2,] "7up"          ""      "330ml"  "std vintage 50s 2015"
 [3,] "heineken"     "light" "500ml"  "473 13"
 [4,] "tecate"       ""      "710ml"  "mx 2009"
 [5,] "sol"          ""      "355ml"  "carnaval 2012"
 [6,] "heineken"     ""      "330ml"  "4x6 nl 1508"
 [7,] "carta blanca" "light" "355ml"  "2010"
 [8,] "carta blanca" ""      "355ml"  "cl/co 2012"
 [9,] "strongbow"    ""      ""       "red berries 400bg/gr/ro"
Warning message:
no 'capacity' found in line 9. separation of 'type' , 'description' impossible.

> generatetable(data,c("heavy"))
      brand          type    capacity description
 [1,] "sol"          ""      "355ml"  "exp 2014"
 [2,] "7up"          ""      "330ml"  "std vintage 50s 2015"
 [3,] "heineken"     "light" "500ml"  "473 13"
 [4,] "tecate"       ""      "710ml"  "mx 2009"
 [5,] "sol"          ""      "355ml"  "carnaval 2012"
 [6,] "heineken"     ""      "330ml"  "4x6 nl 1508"
 [7,] "carta blanca" "light" "355ml"  "2010"
 [8,] "carta blanca" ""      "355ml"  "cl/co 2012"
 [9,] "strongbow"    ""      ""       "red berries 400bg/gr/ro"
Warning message:
no 'capacity' found in line 9. separation of 'type' , 'description' impossible.

>

Comments

Popular posts from this blog

php - Invalid Cofiguration - yii\base\InvalidConfigException - Yii2 -

How to show in django cms breadcrumbs full path? -

ruby on rails - npm error: tunneling socket could not be established, cause=connect ETIMEDOUT -