按行優化複雜的分組 mutate（查找？）

對於我現在越來越頻繁遇到的一個問題（在 dplyr 的 mutate 中），我有一個偏慢的解決方案。我懷疑存在更有效率的做法，希望能得到一些指點。

我在下面創建的玩具示例並不需要那麼長的時間，但是當我在真實數據上使用好幾個這樣的查找函數時，可能要花上好幾分鐘。基本上，目的是統計符合某些條件的兄弟姐妹數量。因為結果取決於每個人在世的時間段，所以對每個兄弟姐妹來說結果並不相同。

library(dplyr) 
# Sample data: one row per child, with
#   survive1y  1/0 flag (0 = died right away, see dependent_sibs_f5y)
#   byear      birth year
#   dyear      death year
# as_tibble() replaces the deprecated tbl_df().
sibs <- as_tibble(data.frame(
  survive1y = c(
    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
    1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
    1, 1, 0, 0, 1, 0, 1
  ),
  byear = c(
    1717L, 1719L, 1721L, 1723L, 1724L,
    1725L, 1727L, 1728L, 1730L, 1732L, 1733L, 1735L, 1736L, 1738L,
    1740L, 1740L, 1742L, 1738L, 1744L, 1746L, 1748L, 1749L, 1753L,
    1755L, 1757L, 1758L, 1759L, 1761L, 1762L, 1764L, 1767L, 1717L,
    1719L, 1721L, 1786L, 1773L, 1767L, 1768L, 1792L
  ),
  dyear = c(
    1748L,
    1791L, 1760L, 1795L, 1765L, 1756L, 1730L, 1733L, 1733L, 1732L,
    1755L, 1800L, 1736L, 1738L, 1740L, 1740L, 1761L, 1816L, 1744L,
    1748L, 1748L, 1749L, 1754L, 1756L, 1757L, 1759L, 1815L, 1761L,
    1765L, 1783L, 1768L, 1800L, 1750L, 1757L, 1786L, 1773L, 1769L,
    1768L, 1793L
  )
))

# Stack 10000 copies of the 39-row table to get a realistically sized input.
sibs <- bind_rows(replicate(10000, sibs, simplify = FALSE))

# Assign a parent id to every consecutive run of 10 rows (families of 10).
n_rows <- nrow(sibs)
sibs$idParents <- rep(seq_len(n_rows / 10), each = 10, length.out = n_rows)

# Get the number of siblings who were alive and dependent in the first five
# years of each individual's life.
#
# Arguments are parallel vectors with one element per sibling of one family:
#   survive1y  0 marks a sibling who died right away (never counted)
#   byear      birth year
#   dyear      death year; a missing value is treated as "they lived"
#
# Returns an integer vector with one element per sibling: the count of OTHER
# siblings that are not excluded by any of the rules below. An empty family
# yields integer(0).
dependent_sibs_f5y <- function(survive1y, byear, dyear) {
  n_sibs <- length(byear)
  # Loop-invariant: does not depend on which sibling is the focal one.
  died_right_away <- survive1y == 0

  # seq_len() (not 1:n_sibs) so that zero siblings means zero iterations;
  # 1:0 would iterate over c(1, 0) and produce a bogus -1.
  vapply(seq_len(n_sibs), function(i) {
    my_birth <- byear[i]
    infancy_end <- my_birth + 5
    # remove this sib
    other_births <- byear[-i]
    other_deaths <- dyear[-i]

    excluded <- sum(
      other_births > infancy_end |    # born more than 5y later
        (other_births + 5) < my_birth | # finished infancy before birth
        other_deaths <= my_birth |      # died before birth
        died_right_away[-i],            # if they died right away, don't count
      na.rm = TRUE                      # if dyear missing assume they lived
    )

    n_sibs - 1L - excluded # minus self
  }, integer(1))
}

# Time the per-family computation: within each idParents group, call
# dependent_sibs_f5y() on that family's columns and store the result as a
# new column. ungroup() afterwards so sibs2 does not stay grouped.
system.time({
  sibs2 <- sibs %>%
    group_by(idParents) %>%
    mutate(
      dependent_sibs_f5y = dependent_sibs_f5y(
        survive1y = survive1y,
        byear = byear,
        dyear = dyear
      )
    ) %>%
    ungroup()
})

來源

2016-11-08 Ruben