2017-06-15 327 views
0

我想使用 tmerge() 函數轉換數據集,以便在 Cox 迴歸框架的 Andersen-Gill 擴展中處理重複事件,參見 Therneau 撰寫的出色 vignette。(標題:在 Cox 迴歸的 Andersen-Gill 計數過程表述中,使用 survival::tmerge() 設置無風險區間)

我想指定:個體在事件發生後的 30 天內不會再次發生事件,也就是說,我希望把該個體暫時從風險集中移出;例如,個體不處於風險期時發生的事件將被忽略。

原始方法是迭代地添加所有事件,然後簡單地將30添加到tstart變量。但是,這可能導致實例爲tstart >= tstop,並且在更大和更復雜的數據集中將會是災難性的。

我曾嘗試利用tmerge()函數與forloop糾正我上面提到的問題。對於這個例子,我將在生存包中使用cgd數據。

編輯:更正後的 for 循環見下文。

library(survival) 
# Copy the cgd0 data set (made available by library(survival)) into the
# workspace.
cgd0 <- cgd0

# Starting data: the first 13 columns of cgd0, with follow-up ending at futime.
newcgd <- tmerge(data1 = cgd0[, 1:13], data2 = cgd0, id = id, tstop = futime)

for (i in seq_len(7)) {
  etime_col <- paste0("etime", i) # "etime1", ..., "etime7"

  # Add the events stored in the i-th event-time column.
  newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[, etime_col]))

  # Feed the rows that end in an event back in to maintain cum_infect, the
  # cumulative event counter.
  newcgd <- tmerge(
    newcgd, subset(newcgd, infect == 1),
    id = id, cum_infect = cumtdc(tstop)
  )

  # Rows whose counter equals i have their start time pushed back by 30 days.
  shifted_rows <- which(newcgd$cum_infect == i)
  newcgd[shifted_rows, "tstart"] <- newcgd[shifted_rows, "tstart"] + 30

  # Discard rows that no longer satisfy tstart < tstop.
  still_valid <- which(newcgd$tstart < newcgd$tstop)
  newcgd <- newcgd[still_valid, ]
}

# Show the bookkeeping table that tmerge attaches to its result.
attr(newcgd, "tcount")
#   early late gap within boundary leading trailing tied 
#infect   0 0 0  44  0  0  0 0 
#cum_infect  0 0 0  0  44  0  0 0 
#infect   0 0 4  11  0  1  1 0 
#cum_infect  0 0 0  0  11  0  45 0 
#infect   0 0 2  6  0  0  0 0 
#cum_infect  0 0 0  0  6  0  56 0 
#infect   0 0 1  2  0  0  0 0 
#cum_infect  0 0 0  0  6  0  58 0 
#infect   0 0 0  2  0  0  0 0 
#cum_infect  0 0 0  0  8  0  58 0 
#infect   0 0 0  1  0  0  0 0 
#cum_infect  0 0 0  0  9  0  58 0 
#infect   0 0 0  1  0  0  0 0 
#cum_infect  0 0 0  0  10  0  58 0 

我相信這個解決方案是正確的。然而,這是生存分析中的一個常見問題,我擔心的是:i)我忽略了某些東西,代碼實際上並沒有做到我以爲它在做的事情。

ii)我忽略了 R 中已有的、經過驗證的實現方式。

iii)如果 i)和 ii)都不成立,我認爲這段代碼效率不高,想知道是否有明顯的方法可以提高執行速度。

--------------------------------------------- -------------------------------------------------- ------------------------------------

編輯:加入了帶註釋的進一步錯誤檢查,希望能澄清我想要做的事情。從概念上講,我明確規定個體在經歷一次事件後的 30 天內沒有再次發生事件的風險。在 Andersen-Gill 計數過程表述中,每一行代表一個觀察區間,包含開始時間 tstart、停止時間 tstop,以及一個指示變量(此例中爲 infect),用來表示該區間是以事件結束(infect == 1)還是以刪失結束(infect == 0)。在這裏,我先手動逐步執行上述 for 循環中的各個步驟,並統計每一步之後的事件總數,以及指定 30 天免疫期與否時的總隨訪時間。爲了完整起見,隨後再用 for 循環實現同樣的代碼。結果顯示在後面的代碼塊中。

# Manual, unrolled walk-through of the event loop.
# Two data sets are built side by side, one etimeK column at a time:
#   immunecgd - rows with cum_infect == K get tstart + 30
#   newcgd    - same steps, but tstart is left unchanged
# After every step the event count (etimeK) and the summed interval length
# (futimeK) of both data sets are stored as c(immunecgd value, newcgd value).
newcgd <- tmerge(data1=cgd0[, 1:13], data2=cgd0, id=id, tstop=futime) 

###1st event 

x <- "etime1" 
# immunecgd is derived from the freshly initialised newcgd for the first event
immunecgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x])) 
# rows ending in an event are fed back in to build the counter cum_infect
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop)) 
# push the start of rows with cum_infect == 1 back by 30 days
immunecgd[which(immunecgd$cum_infect == 1), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 1), "tstart"] + 30 
# keep only rows with tstart < tstop
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),] 

newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x])) 
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop)) 
# self-assignment: tstart stays as it is; the line mirrors the immunecgd step
newcgd[which(newcgd$cum_infect == 1), "tstart"] <- newcgd[which(newcgd$cum_infect == 1), "tstart"] 
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),] 

# event totals and summed (tstop - tstart) after the 1st step
etime1 <- c(sum(immunecgd$infect), sum(newcgd$infect)) 
futime1 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart)) 

###2nd event 
# same sequence as for the 1st event, now with etime2 and cum_infect == 2
x <- "etime2" 
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x])) 
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop)) 
immunecgd[which(immunecgd$cum_infect == 2), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 2), "tstart"] + 30 
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),] 

newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x])) 
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop)) 
newcgd[which(newcgd$cum_infect == 2), "tstart"] <- newcgd[which(newcgd$cum_infect == 2), "tstart"] 
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),] 

etime2 <- c(sum(immunecgd$infect), sum(newcgd$infect)) 
futime2 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart)) 

###3rd event 
# same sequence with etime3 and cum_infect == 3
x <- "etime3" 
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x])) 
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop)) 
immunecgd[which(immunecgd$cum_infect == 3), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 3), "tstart"] + 30 
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),] 

newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x])) 
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop)) 
newcgd[which(newcgd$cum_infect == 3), "tstart"] <- newcgd[which(newcgd$cum_infect == 3), "tstart"] 
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),] 

etime3 <- c(sum(immunecgd$infect), sum(newcgd$infect)) 
futime3 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart)) 

###4th event 
# same sequence with etime4 and cum_infect == 4
x <- "etime4" 
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x])) 
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop)) 
immunecgd[which(immunecgd$cum_infect == 4), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 4), "tstart"] + 30 
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),] 

newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x])) 
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop)) 
newcgd[which(newcgd$cum_infect == 4), "tstart"] <- newcgd[which(newcgd$cum_infect == 4), "tstart"] 
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),] 

etime4 <- c(sum(immunecgd$infect), sum(newcgd$infect)) 
futime4 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart)) 

###5th event 
# same sequence with etime5 and cum_infect == 5
x <- "etime5" 
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x])) 
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop)) 
immunecgd[which(immunecgd$cum_infect == 5), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 5), "tstart"] + 30 
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),] 

newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x])) 
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop)) 
newcgd[which(newcgd$cum_infect == 5), "tstart"] <- newcgd[which(newcgd$cum_infect == 5), "tstart"] 
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),] 

etime5 <- c(sum(immunecgd$infect), sum(newcgd$infect)) 
futime5 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart)) 

###6th event 
# same sequence with etime6 and cum_infect == 6
x <- "etime6" 
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x])) 
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop)) 
immunecgd[which(immunecgd$cum_infect == 6), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 6), "tstart"] + 30 
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),] 

newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x])) 
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop)) 
newcgd[which(newcgd$cum_infect == 6), "tstart"] <- newcgd[which(newcgd$cum_infect == 6), "tstart"] 
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),] 

etime6 <- c(sum(immunecgd$infect), sum(newcgd$infect)) 
futime6 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart)) 

###7th event 
# same sequence with etime7 and cum_infect == 7
x <- "etime7" 
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x])) 
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop)) 
immunecgd[which(immunecgd$cum_infect == 7), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 7), "tstart"] + 30 
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),] 

newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x])) 
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop)) 
newcgd[which(newcgd$cum_infect == 7), "tstart"] <- newcgd[which(newcgd$cum_infect == 7), "tstart"] 
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),] 

etime7 <- c(sum(immunecgd$infect), sum(newcgd$infect)) 
futime7 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart)) 

# Stack the per-step event counts: one row per step, first column from
# immunecgd, second from newcgd; diff = no_immunity - immunity.
df_event <- rbind.data.frame(etime1, etime2, etime3, etime4, etime5, etime6, etime7) 
colnames(df_event) <- c("immunity", "no_immunity") 
df_event$diff <- df_event$no_immunity - df_event$immunity 

# Same layout for the summed interval lengths.
df_futime <- rbind.data.frame(futime1, futime2, futime3, futime4, futime5, futime6, futime7) 
colnames(df_futime) <- c("immunity", "no_immunity") 
df_futime$diff <- df_futime$no_immunity - df_futime$immunity 

下面是用 for 循環實現的相同代碼。

# Both data sets start from the same tmerge() call on cgd0.
newcgd <- tmerge(data1 = cgd0[, 1:13], data2 = cgd0, id = id, tstop = futime)
immunecgd <- tmerge(data1 = cgd0[, 1:13], data2 = cgd0, id = id, tstop = futime)

# One row per event column; column 1 is filled from immunecgd, column 2 from
# newcgd.
event <- matrix(NA, nrow = 7, ncol = 2)
futime <- matrix(NA, nrow = 7, ncol = 2)

for (i in seq_len(7)) {
  etime_col <- paste0("etime", i) # "etime1", ..., "etime7"

  # Add the events stored in the i-th event-time column to both data sets.
  immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[, etime_col]))
  newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[, etime_col]))

  # Feed the rows that end in an event back in to maintain the cumulative
  # event counter cum_infect.
  immunecgd <- tmerge(
    immunecgd, subset(immunecgd, infect == 1),
    id = id, cum_infect = cumtdc(tstop)
  )
  newcgd <- tmerge(
    newcgd, subset(newcgd, infect == 1),
    id = id, cum_infect = cumtdc(tstop)
  )

  # immunecgd: rows whose counter equals i start 30 days later.
  immune_rows <- which(immunecgd$cum_infect == i)
  immunecgd[immune_rows, "tstart"] <- immunecgd[immune_rows, "tstart"] + 30
  # newcgd: the same rows are written back unchanged.
  plain_rows <- which(newcgd$cum_infect == i)
  newcgd[plain_rows, "tstart"] <- newcgd[plain_rows, "tstart"]

  # Discard rows that no longer satisfy tstart < tstop.
  immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop), ]
  newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop), ]

  # Record event totals and summed interval lengths after this step.
  event[i, ] <- c(sum(immunecgd$infect), sum(newcgd$infect))
  futime[i, ] <- c(
    sum(immunecgd$tstop - immunecgd$tstart),
    sum(newcgd$tstop - newcgd$tstart)
  )
}

# Turn both matrices into labelled data frames with a difference column.
event <- data.frame(event)
names(event) <- c("immunity", "no_immunity")
event$diff <- event$no_immunity - event$immunity

futime <- data.frame(futime)
names(futime) <- c("immunity", "no_immunity")
futime$diff <- futime$no_immunity - futime$immunity

上述錯誤檢測碼的結果如下

df_event 
    immunity no_immunity diff 
1  44   44 0 
2  56   61 5 
3  62   69 7 
4  64   72 8 
5  66   74 8 
6  67   75 8 
7  68   76 8 

df_futime 
    immunity no_immunity diff 
1 36202  37477 1275 
2 35935  37477 1542 
3 35875  37477 1602 
4 35875  37477 1602 
5 35875  37477 1602 
6 35875  37477 1602 
7 35875  37477 1602 

------------------------- -------------------------------------------------- -------------------------------------------------- ------

通過對 survival 包中的其他數據集、模擬數據集以及我自己的數據集(即我希望使用此代碼的數據集)進行進一步測試,我發現了一個「小故障」。在上面的代碼版本中,如果一個新事件 etime[i-1] 落在我們已指定該個體免於發生事件的時期內(這正是代碼旨在處理的情形),該事件不會被併入累積事件計數器 cum_infect。在下一輪處理 etime[i] 時,該個體只有 i-1 個累積事件,因此控制是否給開始時間加上 30 天的那部分代碼

# Excerpt from the loop: rows whose cumulative counter equals the loop index i
# get 30 added to tstart.
immunecgd[which(immunecgd$cum_infect == i), "tstart"] <- immunecgd[which(immunecgd$cum_infect == i), "tstart"] + 30 

不會識別出該個體發生了新的事件。這意味着 for 循環只能在第一次有事件落入免疫期之前,正確地在事件之後加上 30 天的免疫期。我寫了一個相當不優雅的解決方案,但它確實有效。

# Both data sets start from the same tmerge() call on cgd0.
newcgd <- tmerge(data1 = cgd0[, 1:13], data2 = cgd0, id = id, tstop = futime)
immunecgd <- tmerge(data1 = cgd0[, 1:13], data2 = cgd0, id = id, tstop = futime)

# Snapshot "before the first event": nobody has a cumulative event yet.
newcgd$cum_infect_0 <- 0
immunecgd$cum_infect_0 <- 0

# One row per event column; column 1 is filled from immunecgd, column 2 from
# newcgd.
event <- matrix(NA, nrow = 7, ncol = 2)
futime <- matrix(NA, nrow = 7, ncol = 2)

for (i in seq_len(7)) {
  etime_col <- paste0("etime", i) # "etime1", ..., "etime7"
  snapshot_col <- paste0("cum_infect_", i) # counter as of this iteration
  previous_col <- paste0("cum_infect_", i - 1) # counter as of the last one

  # Add the events stored in the i-th event-time column to both data sets.
  immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[, etime_col]))
  newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[, etime_col]))

  # Feed the rows that end in an event back in to maintain the cumulative
  # event counter cum_infect.
  immunecgd <- tmerge(
    immunecgd, subset(immunecgd, infect == 1),
    id = id, cum_infect = cumtdc(tstop)
  )
  newcgd <- tmerge(
    newcgd, subset(newcgd, infect == 1),
    id = id, cum_infect = cumtdc(tstop)
  )

  # Store the counter of this iteration so the next one can compare against it.
  immunecgd[, snapshot_col] <- immunecgd[, "cum_infect"]
  newcgd[, snapshot_col] <- newcgd[, "cum_infect"]

  # immunecgd: rows with at least one cumulative event whose counter grew
  # since the previous snapshot start 30 days later.
  immune_rows <- which(
    immunecgd$cum_infect > 0 &
      immunecgd$cum_infect > immunecgd[, previous_col]
  )
  immunecgd[immune_rows, "tstart"] <- immunecgd[immune_rows, "tstart"] + 30
  # newcgd: the same kind of rows are written back unchanged.
  plain_rows <- which(
    newcgd$cum_infect > 0 &
      newcgd$cum_infect > newcgd[, previous_col]
  )
  newcgd[plain_rows, "tstart"] <- newcgd[plain_rows, "tstart"]

  # Discard rows that no longer satisfy tstart < tstop.
  immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop), ]
  newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop), ]

  # Record event totals and summed interval lengths after this step.
  event[i, ] <- c(sum(immunecgd$infect), sum(newcgd$infect))
  futime[i, ] <- c(
    sum(immunecgd$tstop - immunecgd$tstart),
    sum(newcgd$tstop - newcgd$tstart)
  )
}

# Remove the per-iteration snapshot columns again.
immunecgd <- immunecgd[, !grepl("cum_infect_", colnames(immunecgd))]
newcgd <- newcgd[, !grepl("cum_infect_", colnames(newcgd))]

# Turn both matrices into labelled data frames with a difference column.
event <- data.frame(event)
names(event) <- c("immunity", "no_immunity")
event$diff <- event$no_immunity - event$immunity

futime <- data.frame(futime)
names(futime) <- c("immunity", "no_immunity")
futime$diff <- futime$no_immunity - futime$immunity

在這裏我們可以看到差異的事件

immunity no_immunity diff 
1  44   44 0 
2  56   61 5 
3  62   69 7 
4  64   72 8 
5  65   74 9 
6  66   75 9 
7  66   76 10 

修正後的 for 循環總共多發現了 2 個事件落在免疫期內的情況。

+0

我想我可以在每一步添加7到tstart? – user6571411

+0

我想你可以提供一個[MCVE]。 –

+0

如果我能我會回答我自己的問題。我真的不知道如何去做我所要求的。 – user6571411

回答

0

作爲對我評論的跟進,這是我嘗試運行你的代碼時看到的結果:

# Cross-tabulate the interval test against the event indicator.
# NOTE(review): tstart - tstop is negative for every row with tstart < tstop,
# so this condition is TRUE for all such rows; tstop - tstart may have been
# intended - confirm.
with(newcgd, table(tstart-tstop <= 30, infect)) 
#------------- 
     infect 
     0 1 
    TRUE 120 68 

所以,如果我對你目標的理解是正確的,我認爲你還沒有達到目的,原因如下:

# Console transcript: overwrite infect, then repeat the cross-tabulation.
# NOTE(review): for rows with tstart < tstop, tstart - tstop > 30 is FALSE, so
# ifelse() yields 0 for every such row whatever infect was; tstop - tstart may
# have been intended - confirm.
> newcgd$infect <- with(newcgd,ifelse(infect, tstart-tstop > 30, 0)) 
> with(newcgd, table(tstart-tstop <= 30, infect)) 
     infect 
     0 
    TRUE 188 

當我將所有短間隔事件設置爲0時,我根本沒有得到任何事件。但也許我還沒有理解這些問題?

+0

我在上面添加了一些錯誤測試代碼,我認爲它可以更好地解釋我正在嘗試執行的操作。我期待着能夠理解生存分析的那一天,我可以很容易地解釋我在做什麼,直到那時,謝謝你抽出時間。 – user6571411

+0

我可能一直沒有說清楚。我需要確認滿足的標準之一是:'with(newcgd,table(tstop - tstart <= 0,infect))' 只給出 'FALSE'。看起來確實如此。 – user6571411

+0

值爲'0',所以如果你強迫邏輯,那就是你得到的,以及作爲邏輯運算符所作用的「解釋」。 –