r - How to create a lag variable within each group? -
i have data.table:
set.seed(1) data <- data.table(time = c(1:3, 1:4), groups = c(rep(c("b", "a"), c(3, 4))), value = rnorm(7)) data # groups time value # 1: b 1 -0.6264538 # 2: b 2 0.1836433 # 3: b 3 -0.8356286 # 4: 1 1.5952808 # 5: 2 0.3295078 # 6: 3 -0.8204684 # 7: 4 0.4874291
i want compute lagged version of "value" column, within each level of "groups".
the result should like
# groups time value lag.value # 1 1 1.5952808 na # 2 2 0.3295078 1.5952808 # 3 3 -0.8204684 0.3295078 # 4 4 0.4874291 -0.8204684 # 5 b 1 -0.6264538 na # 6 b 2 0.1836433 -0.6264538 # 7 b 3 -0.8356286 0.1836433
i have tried use lag
directly:
data$lag.value <- lag(data$value)
...which wouldn't work.
i have tried:
unlist(tapply(data$value, data$groups, lag)) a1 a2 a3 a4 b1 b2 b3 na -0.1162932 0.4420753 2.1505440 na 0.5894583 -0.2890288
which want. vector generated ordered differently ordering in data.table problematic.
what efficient way in base r, plyr, dplyr, , data.table?
you within data.table
library(data.table) data[, lag.value:=c(na, value[-.n]), by=groups] data # time groups value lag.value #1: 1 0.02779005 na #2: 2 0.88029938 0.02779005 #3: 3 -1.69514201 0.88029938 #4: 1 b -1.27560288 na #5: 2 b -0.65976434 -1.27560288 #6: 3 b -1.37804943 -0.65976434 #7: 4 b 0.12041778 -1.37804943
for multiple columns:
nm1 <- grep("^value", colnames(data), value=true) nm2 <- paste("lag", nm1, sep=".") data[, (nm2):=lapply(.sd, function(x) c(na, x[-.n])), by=groups, .sdcols=nm1] data # time groups value value1 value2 lag.value lag.value1 #1: 1 b -0.6264538 0.7383247 1.12493092 na na #2: 2 b 0.1836433 0.5757814 -0.04493361 -0.6264538 0.7383247 #3: 3 b -0.8356286 -0.3053884 -0.01619026 0.1836433 0.5757814 #4: 1 1.5952808 1.5117812 0.94383621 na na #5: 2 0.3295078 0.3898432 0.82122120 1.5952808 1.5117812 #6: 3 -0.8204684 -0.6212406 0.59390132 0.3295078 0.3898432 #7: 4 0.4874291 -2.2146999 0.91897737 -0.8204684 -0.6212406 # lag.value2 #1: na #2: 1.12493092 #3: -0.04493361 #4: na #5: 0.94383621 #6: 0.82122120 #7: 0.59390132
update
from data.table
versions >= v1.9.5
, can use shift
type
lag
or lead
. default, type lag
.
data[, (nm2) := shift(.sd), by=groups, .sdcols=nm1] # time groups value value1 value2 lag.value lag.value1 #1: 1 b -0.6264538 0.7383247 1.12493092 na na #2: 2 b 0.1836433 0.5757814 -0.04493361 -0.6264538 0.7383247 #3: 3 b -0.8356286 -0.3053884 -0.01619026 0.1836433 0.5757814 #4: 1 1.5952808 1.5117812 0.94383621 na na #5: 2 0.3295078 0.3898432 0.82122120 1.5952808 1.5117812 #6: 3 -0.8204684 -0.6212406 0.59390132 0.3295078 0.3898432 #7: 4 0.4874291 -2.2146999 0.91897737 -0.8204684 -0.6212406 # lag.value2 #1: na #2: 1.12493092 #3: -0.04493361 #4: na #5: 0.94383621 #6: 0.82122120 #7: 0.59390132
if need reverse, use type=lead
nm3 <- paste("lead", nm1, sep=".")
using original dataset
data[, (nm3) := shift(.sd, type='lead'), = groups, .sdcols=nm1] # time groups value value1 value2 lead.value lead.value1 #1: 1 b -0.6264538 0.7383247 1.12493092 0.1836433 0.5757814 #2: 2 b 0.1836433 0.5757814 -0.04493361 -0.8356286 -0.3053884 #3: 3 b -0.8356286 -0.3053884 -0.01619026 na na #4: 1 1.5952808 1.5117812 0.94383621 0.3295078 0.3898432 #5: 2 0.3295078 0.3898432 0.82122120 -0.8204684 -0.6212406 #6: 3 -0.8204684 -0.6212406 0.59390132 0.4874291 -2.2146999 #7: 4 0.4874291 -2.2146999 0.91897737 na na # lead.value2 #1: -0.04493361 #2: -0.01619026 #3: na #4: 0.82122120 #5: 0.59390132 #6: 0.91897737 #7: na
data
set.seed(1) data <- data.table(time =c(1:3,1:4),groups = c(rep(c("b","a"),c(3,4))), value = rnorm(7), value1=rnorm(7), value2=rnorm(7))
Comments
Post a Comment