r - 別のテーブルの日付範囲に基づいて、あるテーブルにダミー変数を作成する

Question

私は2つのテーブルを持っています。 table1このように見えます

  date       hour     data
2010-05-01     3        5
2010-05-02     7        7
2010-05-02     10       8
2010-07-03     18       3
2011-12-09     22       1
2012-05-01     3        0

これはdata.tableとにキーが設定さdateれたとして保存されhourます。私はこのように見える別のテーブルを持っています。それは私のoutagesテーブルです。

 resource        date_out                date_back
   joey       2010-04-30 4:00:00      2010-05-02 8:30:00
   billy      2009-04-20 7:00:00      2009-02-02 5:30:00
   bob        2011-11-15 12:20:00     2010-12-09 23:00:00
   joey       2012-04-28 1:00:00      2012-05-02 17:00:00

table1それらの列がテーブルのリソースである場所に列を追加したいと思いoutagesます。これらの列の値は、停止がない場合は常に 0 になり、障害がある場合は 1 になります。

この例の結果は次のようになります。

  date       hour     data     joey      billy      bob
2010-05-01     3        5       1          0         0        
2010-05-02     7        7       1          0         0 
2010-05-02     10       8       0          0         0 
2010-07-03     18       3       0          0         0 
2011-12-09     22       1       0          0         1
2012-05-01     3        0       1          0         0

実際にtable1は、約 2500 行あり、outagesテーブルには 19000 行あります。これを行う唯一の方法は、テーブルの各行をループしてから、正しい場所にoutages1 を挿入することです。table1私のコードは順番に並んでいることに依存しtable1ているので、少なくともそのテーブルの 100% をoutages. ただし、以下は私のデータに4時間以上かかります。

for (out in 1:length(outages$resource)) {
  a<-as.character(outages[out]$resource)
  #if column doesn't exist then create it
  if (a %in% colnames(table1)==FALSE) {
    table1$new<-0
    setnames(table1, "new", a)
    }
  midpoint<-round(length(table1$date)/2,0)
  if (table1$date[midpoint]+table1$hour[midpoint]*60*60>=outages[out]$due_out && table1$date[midpoint]+table1$hour[midpoint]*60*60<=outages    [out]$due_back)
  {
    while(table1$date[midpoint]+table1$hour[midpoint]*60*60>=outages[out]$due_out && midpoint>=1 && midpoint<=length(table1$date)) {
      table1[midpoint,a:=1,with=FALSE]
      midpoint<-midpoint-1
    }
    midpoint<-round(length(table1$date)/2,0)
    while(table1$date[midpoint]+table1$hour[midpoint]*60*60<=outages[out]$due_back && midpoint>=1 && midpoint<=length(table1$date)) {
      table1[midpoint,a:=1,with=FALSE]
      midpoint<-midpoint+1
    }
  } else {
    if (table1$date[midpoint]+table1$hour[midpoint]*60*60>outages[out]$due_back) {
      while(table1$date[midpoint]+table1$hour[midpoint]*60*60>outages[out]$due_back && midpoint>=1 && midpoint<=length(table1$date)) {
        midpoint<-midpoint-1
      }
      while(table1$date[midpoint]+table1$hour[midpoint]*60*60>=outages[out]$due_out && midpoint>=1 && midpoint<=length(table1$date)) {
        table1[midpoint,a:=1,with=FALSE]
        midpoint<-midpoint-1
      }
    } 
    midpoint<-round(length(table1$date)/2,0)
    if (table1$date[midpoint]+table1$hour[midpoint]*60*60<outages[out]$due_out) {
      while(table1$date[midpoint]+table1$hour[midpoint]*60*60<outages[out]$due_out && midpoint>=1 && midpoint<=length(table1$date)) {
        midpoint<-midpoint+1
      }
      while(table1$date[midpoint]+table1$hour[midpoint]*60*60<=outages[out]$due_back && midpoint>=1 && midpoint<=length(table1$date)) {
        table1[midpoint,a:=1,with=FALSE]
        midpoint<-midpoint+1
 }
 }
 }
if (sum(table1[,a,with=FALSE])==0) {
  table1[,a:=NULL,with=FALSE]
}
}

みんなのお気に入りのインフォマーシャルのセリフを引用すると、「もっと良い方法があるはずだ」.

score 2 · Accepted Answer

これがあなたが望むものを達成する方法です。table1これは、の時間精度が 1 時間であることを前提としています。これは任意の精度に変更できますが、date_out-のdate_back範囲で可能な時間の完全なシーケンスを構築するため、より長い時間間隔でパフォーマンスが大幅に向上します。注: OP とはわずかに異なる表を使用して、重複する間隔を示し、OP のいくつかの間違いを修正しました。

table1 = data.table(date = c("2010-05-01", "2010-05-02", "2010-05-02", "2010-07-03", "2011-12-09", "2012-05-01"), hour = c(3,7,10,18,22,3), data = c(5,7,8,3,1,0))
outages = data.table(resource = c("joey", "bob", "billy", "bob", "joey"), date_out = c("2010-04-30 4:00:00", "2010-04-30 4:00:00", "2009-04-20 7:00:00", "2011-11-15 12:20:00", "2012-04-28 1:00:00"), date_back=c("2010-05-02 8:30:00", "2010-05-02 8:30:00", "2009-06-02 5:30:00", "2011-12-09 23:00:00", "2012-05-02 17:00:00"))

# round up date_out and round down date_back
# and create a sequence in-between spaced by 1 hour
outages[, list(datetime = seq(as.POSIXct(round(as.POSIXct(date_out) + 30*60-1, "hours")),
                              as.POSIXct(round(as.POSIXct(date_back) - 30*60, "hours")),
                              60*60)),
          by = list(resource, date_out)] -> outages.expanded
setkey(outages.expanded, datetime)

# merge with the original table, then run "table" to get the frequencies/occurences
# and cbind back with the original table
cbind(table1, unclass(table(
                outages.expanded[table1[, list(datetime=as.POSIXct(paste0(date, " ", hour, ":00:00")))],
                                 resource])))

#         date hour data bob joey
#1: 2010-05-01    3    5   1    1
#2: 2010-05-02    7    7   1    1
#3: 2010-05-02   10    8   0    0
#4: 2010-07-03   18    3   0    0
#5: 2011-12-09   22    1   1    0
#6: 2012-05-01    3    0   0    1

r - 別のテーブルの日付範囲に基づいて、あるテーブルにダミー変数を作成する

1 に答える 1

Related

Reference