2014-10-17 153 views
7

此問題與我之前詢問的此one有關。但提到這個問題沒有必要回答這個問題。R中的數據平滑處理

數據

我有包含記錄在0.1秒的間隔2169個的車輛速度的數據集。所以,單個車輛有很多排。在這裏,我僅複製車輛#2的數據:

> dput(uma) 
structure(list(Frame.ID = 13:445, Vehicle.velocity = c(40, 40, 
40, 40, 40, 40, 40, 40.02, 40.03, 39.93, 39.61, 39.14, 38.61, 
38.28, 38.42, 38.78, 38.92, 38.54, 37.51, 36.34, 35.5, 35.08, 
34.96, 34.98, 35, 34.99, 34.98, 35.1, 35.49, 36.2, 37.15, 38.12, 
38.76, 38.95, 38.95, 38.99, 39.18, 39.34, 39.2, 38.89, 38.73, 
38.88, 39.28, 39.68, 39.94, 40.02, 40, 39.99, 39.99, 39.65, 38.92, 
38.52, 38.8, 39.72, 40.76, 41.07, 40.8, 40.59, 40.75, 41.38, 
42.37, 43.37, 44.06, 44.29, 44.13, 43.9, 43.92, 44.21, 44.59, 
44.87, 44.99, 45.01, 45.01, 45, 45, 45, 44.79, 44.32, 43.98, 
43.97, 44.29, 44.76, 45.06, 45.36, 45.92, 46.6, 47.05, 47.05, 
46.6, 45.92, 45.36, 45.06, 44.96, 44.97, 44.99, 44.99, 44.99, 
44.99, 45.01, 45.02, 44.9, 44.46, 43.62, 42.47, 41.41, 40.72, 
40.49, 40.6, 40.76, 40.72, 40.5, 40.38, 40.43, 40.38, 39.83, 
38.59, 37.02, 35.73, 35.04, 34.85, 34.91, 34.99, 34.99, 34.97, 
34.96, 34.98, 35.07, 35.29, 35.54, 35.67, 35.63, 35.53, 35.53, 
35.63, 35.68, 35.55, 35.28, 35.06, 35.09, 35.49, 36.22, 37.08, 
37.8, 38.3, 38.73, 39.18, 39.62, 39.83, 39.73, 39.58, 39.57, 
39.71, 39.91, 40, 39.98, 39.97, 40.08, 40.38, 40.81, 41.27, 41.69, 
42.2, 42.92, 43.77, 44.49, 44.9, 45.03, 45.01, 45, 45, 45, 45, 
45, 45, 45, 45, 45, 45, 45, 44.99, 45.03, 45.26, 45.83, 46.83, 
48.2, 49.68, 50.95, 51.83, 52.19, 52, 51.35, 50.38, 49.38, 48.63, 
48.15, 47.87, 47.78, 48.01, 48.63, 49.52, 50.39, 50.9, 50.96, 
50.68, 50.3, 50.05, 49.94, 49.87, 49.82, 49.82, 49.88, 49.96, 
50, 50, 49.98, 49.98, 50.16, 50.64, 51.43, 52.33, 53.01, 53.27, 
53.22, 53.25, 53.75, 54.86, 56.36, 57.64, 58.28, 58.29, 57.94, 
57.51, 57.07, 56.64, 56.43, 56.73, 57.5, 58.27, 58.55, 58.32, 
57.99, 57.89, 57.92, 57.74, 57.12, 56.24, 55.51, 55.1, 54.97, 
54.98, 55.02, 55.03, 54.86, 54.3, 53.25, 51.8, 50.36, 49.41, 
49.06, 49.17, 49.4, 49.51, 49.52, 49.51, 49.45, 49.24, 48.84, 
48.29, 47.74, 47.33, 47.12, 47.06, 47.07, 47.08, 47.05, 47.04, 
47.25, 47.68, 47.93, 47.56, 46.31, 44.43, 42.7, 41.56, 41.03, 
40.92, 40.92, 40.98, 41.19, 41.45, 41.54, 41.32, 40.85, 40.37, 
40.09, 39.99, 39.99, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 
39.98, 39.97, 40.1, 40.53, 41.36, 42.52, 43.71, 44.57, 45.01, 
45.1, 45.04, 45, 45, 45, 45, 45, 45, 44.98, 44.97, 45.08, 45.39, 
45.85, 46.2, 46.28, 46.21, 46.29, 46.74, 47.49, 48.35, 49.11, 
49.63, 49.89, 49.94, 49.97, 50.14, 50.44, 50.78, 51.03, 51.12, 
51.05, 50.85, 50.56, 50.26, 50.06, 50.1, 50.52, 51.36, 52.5, 
53.63, 54.46, 54.9, 55.03, 55.09, 55.23, 55.35, 55.35, 55.23, 
55.07, 54.99, 54.98, 54.97, 55.06, 55.37, 55.91, 56.66, 57.42, 
58.07, 58.7, 59.24, 59.67, 59.95, 60.02, 60, 60, 60, 60, 60, 
60.01, 60.06, 60.23, 60.65, 61.34, 62.17, 62.93, 63.53, 64, 64.41, 
64.75, 65.04, 65.3, 65.57, 65.75, 65.74, 65.66, 65.62, 65.71, 
65.91, 66.1, 66.26, 66.44, 66.61, 66.78, 66.91, 66.99, 66.91, 
66.7, 66.56, 66.6, 66.83, 67.17, 67.45, 67.75, 68.15, 68.64, 
69.15, 69.57, 69.79, 69.79, 69.72, 69.72, 69.81, 69.94, 70, 70.01, 
70.02, 70.03)), .Names = c("Frame.ID", "Vehicle.velocity"), class = "data.frame", row.names = c(NA, 
433L)) 

Frame.ID是觀察Vehicle.velocity的時間範圍。速度變量中有一些噪音,我想平滑它。

方法論

爲了平穩速度我使用下面的等式:

equation

其中,
德爾塔= 10
Nalpha =數據點的(行)的數目
I = 1,...,Nalpha(即行號)
D = {i-1,Nalpha-i,3 * delta = 30}的最小值
xalpha =速度

問題

我已經通過filterconvolution在R.文檔不見了看來我必須瞭解卷積做到這一點。然而,我已經盡力而爲,無法理解卷積是如何工作的!鏈接的問題有一個答案,這有助於我理解函數的一些內部運作,但我仍然不確定。
任何人都可以在這裏解釋這個東西是如何工作的?或者引導我採用另一種方法來達到相同的目的,即應用等式?

我這工作,但當前的代碼是冗長

這裏是uma樣子:

> head(uma) 
    Frame.ID Vehicle.velocity 
1  13    40 
2  14    40 
3  15    40 
4  16    40 
5  17    40 
6  18    40 

uma$i <- 1:nrow(uma)    # this is i 
uma$im1 <- uma$i - 1 
uma$Nai <- nrow(uma) - uma$i  # this is Nalpha 
uma$delta3 <- 30     # this is 3 times delta 
uma$D <- pmin(uma$im1, uma$Nai, uma$delta3) # selecting the minimum of {i-1, Nalpha - i, 3*delta=15} 
uma$imD <- uma$i - uma$D   # i-D 
uma$ipD <- uma$i + uma$D   # i+D 

uma <- ddply(uma, .(Frame.ID), transform, k = imD:ipD) # to include all k in the data frame 
umai <- uma 
umai$imk <- umai$i - umai$k  # i-k 
umai$aimk <- (-1) * abs(umai$imk) # -|i-k| 
umai$delta <- 10     
umai$kernel <- exp(umai$aimk/umai$delta) # The kernel in the equation i.e. EXP^-|i-k|/delta 
umai$p <- umai$Vehicle.velocity[match(umai$k,umai$i)] #observed velocity in kth row as described in equation as t(k) 
umai$kernelp <- umai$p * umai$kernel  # the product of kernel and observed velocity in kth row as described in equation as t(k) 
umair <- ddply(umai, .(Frame.ID), summarize, Z = sum(kernel), prod = sum(kernelp)) # summing the kernel to get Z and summing the product to get the numerator of the equation 
umair$new.Y <- umair$prod/umair$Z # the final step to get the smoothed velocity 

情節

僅供參考,如果我暗算時間框架的觀察和平滑的速度我們可以看到平滑的結果:

ggplot() + 
     geom_point(data=uma,aes(y=Vehicle.velocity, x= Frame.ID)) + 
    geom_point(data=umair,aes(y=new.Y, x= Frame.ID), color="red") 

smooth

請通過指導我使用卷積來幫助我縮短代碼並適用於所有車輛(由Vehicle.ID代表數據集中)。

dplyr

好了,所以就用下面的代碼和它的工作原理,但需要對32 GB RAM 3小時。任何人都可以提出改進措施以加快速度(每個小時由umal,umavumaa拍攝)?

uma <- tbl_df(uma) 
uma <- uma %>%  # take data frame 
    group_by(Vehicle.ID) %>% # group by Vehicle ID 
    mutate(i = 1:length(Frame.ID), im1 = i-1, Nai = length(Frame.ID) - i, 
     Dv = pmin(im1, Nai, 30), 
     Da = pmin(im1, Nai, 120), 
     Dl = pmin(im1, Nai, 15), 

     imDv = i - Dv, 
     ipDv = i + Dv, 
     imDa = i - Da, 
     ipDa = i + Da, 
     imDl = i - Dl, 
     ipDl = i + Dl) %>% # finding i, i-1 and Nalpha-i, D, i-D and i+D for location, velocity and acceleration 
    ungroup() 



umav <- uma %>% 
    group_by(Vehicle.ID, Frame.ID) %>% 
    do(data.frame(kv = .$imDv:.$ipDv)) %>% 
    left_join(x=., y=uma) %>% 
    mutate(imk = i - kv, aimk = (-1) * abs(imk), delta = 10, kernel = exp(aimk/delta)) %>% 
    ungroup() %>% 
    group_by(Vehicle.ID) %>% 
    mutate(p = Vehicle.velocity2[match(kv,i)], kernelp = p * kernel) %>% 
    ungroup() %>% 
    group_by(Vehicle.ID, Frame.ID) %>% 
    summarise(Z = sum(kernel), prod = sum(kernelp)) %>% 
    mutate(svel = prod/Z) %>% 
    ungroup() 



umaa <- uma %>% 
    group_by(Vehicle.ID, Frame.ID) %>% 
    do(data.frame(ka = .$imDa:.$ipDa)) %>% 
    left_join(x=., y=uma) %>% 
    mutate(imk = i - ka, aimk = (-1) * abs(imk), delta = 10, kernel = exp(aimk/delta)) %>% 
    ungroup() %>% 
    group_by(Vehicle.ID) %>% 
    mutate(p = Vehicle.acceleration2[match(ka,i)], kernelp = p * kernel) %>% 
    ungroup() %>% 
    group_by(Vehicle.ID, Frame.ID) %>% 
    summarise(Z = sum(kernel), prod = sum(kernelp)) %>% 
    mutate(sacc = prod/Z) %>% 
    ungroup() 




umal <- uma %>% 
    group_by(Vehicle.ID, Frame.ID) %>% 
    do(data.frame(kl = .$imDl:.$ipDl)) %>% 
    left_join(x=., y=uma) %>% 
    mutate(imk = i - kl, aimk = (-1) * abs(imk), delta = 10, kernel = exp(aimk/delta)) %>% 
    ungroup() %>% 
    group_by(Vehicle.ID) %>% 
    mutate(p = Local.Y[match(kl,i)], kernelp = p * kernel) %>% 
    ungroup() %>% 
    group_by(Vehicle.ID, Frame.ID) %>% 
    summarise(Z = sum(kernel), prod = sum(kernelp)) %>% 
    mutate(ycoord = prod/Z) %>% 
    ungroup() 

umal <- select(umal,c("Vehicle.ID", "Frame.ID", "ycoord")) 
umav <- select(umav, c("Vehicle.ID", "Frame.ID", "svel")) 
umaa <- select(umaa, c("Vehicle.ID", "Frame.ID", "sacc")) 

umair <- left_join(uma, umal) %>% left_join(x=., y=umav) %>% left_join(x=., y=umaa) 
+0

我修改了所有車輛的代碼。 '轉換'的'ddply'已經花費了1個小時,並且仍在運行!我真的需要你的幫助。 – 2014-10-18 01:05:00

+1

當ddply時間過長時,通常的反應是切換到更高效的策略,如packages:data.table或dplyr – 2014-10-18 01:09:52

+0

不確定你的項目目標是什麼,但從氣候的角度來看,我希望你能幫助把運輸到摺疊。所以在我看來,你已經提供了一些類型的離散傅里葉變換。無論如何,我認爲你錯過了更大的觀點。我對此很敏感,但你不需要這種方法來傳達你想要的東西。我當然不是說R一般......我只是認爲你走錯了路。 – miles2know 2014-10-18 02:32:11

回答

4

良好的第一步是採取一個for循環(我將與sapply隱藏),每個指標進行指數平滑:

josilber1 <- function(uma) { 
    delta <- 10 
    sapply(1:nrow(uma), function(i) { 
    D <- min(i-1, nrow(uma)-i, 30) 
    rng <- (i-D):(i+D) 
    rng <- rng[rng >= 1 & rng <= nrow(uma)] 
    expabs <- exp(-abs(i-rng)/delta) 
    return(sum(uma$Vehicle.velocity[rng] * expabs)/sum(expabs)) 
    }) 
} 

一個更復雜的方法是隻計算每個指數的指數平滑函數的增量變化(與每個指數的重新求和相反)。指數平滑函數有一個較低的部分(當前索引之前的數據;我在下面的代碼中包含low中的當前索引)和上部(當前索引之後的數據;下面代碼中的high)。當我們循環遍歷矢量時,下半部分的所有數據的權重會減少(除以mult),而上半部分的所有數據的權重將會加大(我們乘以mult)。最左邊的元素從low中刪除,high中最左邊的元素移動到low,並將一個元素添加到high的右側。

的實際代碼是有點混亂處理的開始和矢量的結束並處理數值穩定性問題(在high錯誤由mult每次迭代相乘):

josilber2 <- function(uma) { 
    delta <- 10 
    x <- uma$Vehicle.velocity 
    ret <- c(x[1], rep(NA, nrow(uma)-1)) 
    low <- x[1] 
    high <- 0 
    norm <- 1 
    old.D <- 0 
    mult <- exp(1/delta) 
    for (i in 2:nrow(uma)) { 
    D <- min(i-1, nrow(uma)-i, 30) 
    if (D == old.D + 1) { 
     low <- low/mult + x[i] 
     high <- high * mult - x[i] + x[i+D-1]/mult^(D-1) + x[i+D]/mult^D 
     norm <- norm + 2/mult^D 
    } else if (D == old.D) { 
     low <- low/mult - x[i-(D+1)]/mult^(D+1) + x[i] 
     high <- high * mult - x[i] + x[i+D]/mult^D 
    } else { 
     low <- low/mult - x[i-(D+2)]/mult^(D+2) - x[i-(D+1)]/mult^(D+1) + x[i] 
     high <- high * mult - x[i] 
     norm <- norm - 2/mult^(D+1) 
    } 

    # For numerical stability, recompute high every so often 
    if (i %% 50 == 0) { 
     rng <- (i+1):(i+D) 
     expabs <- exp(-abs(i-rng)/delta) 
     high <- sum(x[rng] * expabs) 
    } 

    ret[i] <- (low+high)/norm 
    old.D <- D 
    } 
    return(ret) 
} 

R代碼裏面像josilber2往往可以加快大幅度使用Rcpp包:

我們現在可以從基準這三種新方法的改進:

all.equal(umair.fxn(uma), josilber1(uma)) 
# [1] TRUE 
all.equal(umair.fxn(uma), josilber2(uma)) 
# [1] TRUE 
all.equal(umair.fxn(uma), josilber3(uma$Vehicle.velocity)) 
# [1] TRUE 
library(microbenchmark) 
microbenchmark(umair.fxn(uma), josilber1(uma), josilber2(uma), josilber3(uma$Vehicle.velocity)) 
# Unit: microseconds 
#        expr  min   lq   mean  median   uq  max neval 
#     umair.fxn(uma) 370006.728 382327.4115 398554.71080 393495.052 404186.153 572801.355 100 
#     josilber1(uma) 12879.268 13640.1310 15981.82099 14265.610 14805.419 28959.230 100 
#     josilber2(uma) 4324.724 4502.8125 5753.47088 4918.835 5244.309 17328.797 100 
# josilber3(uma$Vehicle.velocity)  41.582  54.5235  57.76919  57.435  60.099  90.998 100 

我們得到了很多的改進(25X)與簡單josilber1josilber2一個70X總加速(優勢會更具有較大的delta值)。通過josilber3,我們實現了6800倍的加速,將運行時間降至54微秒以處理單個車輛!

+0

感謝您的回答+ josilber。我真的必須非常仔細地閱讀它以瞭解正在發生的事情。我也在嘗試'dplyr'。 – 2014-10-18 04:14:57

+0

@umairdurrani我已經更新了一個'Rcpp'解決方案,它比另外兩個解決方案快得多。 – josliber 2014-10-18 04:35:00

+0

啊!我對C++和Rcpp一無所知。當我運行它時,我得到'NULL'。不管怎麼說,多謝拉。 – 2014-10-18 05:40:27