2016-02-26 40 views
0

我在R中有一個名爲'HLS'的數據框,它基本上是訪問者瀏覽網站時逐頁的明細。每一行代表一次頁面訪問,頁碼由page_count表示,從第3頁開始,最多到第10頁(如果他一直瀏覽到第10頁),如下所示。標題:使用R保留直到第一次出現爲止的行

ID page_count purchase_flag prob  hl_flag 
V1  3    1   0.76  1 
V1  4    1   0.65  1 
V1  5    1   0.04  0 
V1  6    1   0.86  1 
V1  7    1   0.04  0 
V1  8    1   0.65  1 
V1  9    1   0.01  0 
V1  10    1   0.00  0 
V2  3    0   0.03  0 
V2  4    0   0.01  0 
V2  5    0   0.02  0 
V2  6    0   0.00  0 
V3  3    1   0.02  0 
V3  4    1   0.001  0 
V3  5    1   0.76  1 
V3  6    1   0.03  0 
V4  3    0   0.04  0 
V4  4    0   0.65  1 
V4  5    0   0.03  0 

我想創建一個表:對於每個ID,如果存在hl_flag = 1的行,則保留到hl_flag = 1第一次出現爲止(含該行)的所有行;如果該ID的hl_flag全部爲0,則保留該ID的所有行。輸出需要如下所示

ID  page_count  purchase_flag prob  hl_flag 
V1   3    1   0.76  1 
V2   3    0   0.03  0 
V2   4    0   0.01  0 
V2   5    0   0.02  0 
V2   6    0   0.00  0 
V3   3    1   0.02  0 
V3   4    1   0.001  0 
V3   5    1   0.76  1 
V4   3    0   0.04  0 
V4   4    0   0.65  1 

感謝您提前給予幫助。

更新: 添加dput的輸出如下

structure(list(ung_id = c("00000f23-1019-4aff-8199-35bd0d032356/1", 
"00000f23-1019-4aff-8199-35bd0d032356/1", "00000f23-1019-4aff-8199-35bd0d032356/1", 
"00000f23-1019-4aff-8199-35bd0d032356/1", "00002b20-82d4-497b-a137-34e3bb4eaf74/1", 
"00002b20-82d4-497b-a137-34e3bb4eaf74/1", "00002b20-82d4-497b-a137-34e3bb4eaf74/1", 
"0000aeff-2d8b-4daa-a084-fb2980f1feed/1", "0000aeff-2d8b-4daa-a084-fb2980f1feed/1", 
"0000b96e-566f-4b6e-925a-b7dcfd4a7208/1", "0000b96e-566f-4b6e-925a-b7dcfd4a7208/1", 
"0000b96e-566f-4b6e-925a-b7dcfd4a7208/1", "0000b96e-566f-4b6e-925a-b7dcfd4a7208/1", 
"0000b96e-566f-4b6e-925a-b7dcfd4a7208/1", "0000b96e-566f-4b6e-925a-b7dcfd4a7208/1", 
"0000b96e-566f-4b6e-925a-b7dcfd4a7208/1", "0000b96e-566f-4b6e-925a-b7dcfd4a7208/1", 
"0000d089-edda-4c8b-8b17-d9def3cae7cf/1", "0000d089-edda-4c8b-8b17-d9def3cae7cf/1", 
"0000d089-edda-4c8b-8b17-d9def3cae7cf/1"), nop_count = c(3L, 
4L, 5L, 6L, 3L, 4L, 5L, 3L, 4L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 
3L, 4L, 5L), purchase_flag = c(1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), prob = c(0.0777615841278747, 
0.0738346887497272, 0.0741130887754292, 0.0785370078084892, 0.0619573259953132, 
0.0516201527986966, 0.0562025814090338, 0.0837301511694211, 0.0579033581198143, 
0.0364358545936557, 0.0329682922619259, 0.0420157964561273, 0.049855260762479, 
0.0500481302257314, 0.0463893143028813, 0.049855260762479, 0.0391886960037603, 
0.0683568422952682, 0.0570168506417919, 0.0661965354597502), 
decile = structure(c(8L, 8L, 8L, 8L, 6L, 4L, 5L, 8L, 5L, 
1L, 1L, 2L, 4L, 4L, 3L, 4L, 2L, 7L, 5L, 7L), .Label = c("(0.0257,0.0364]", 
"(0.0364,0.0428]", "(0.0428,0.0482]", "(0.0482,0.0531]", 
"(0.0531,0.0583]", "(0.0583,0.0645]", "(0.0645,0.0722]", 
"(0.0722,0.0842]"), class = "factor"), hl_Flag = c(1L, 1L, 
1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
1L, 1L, 1L)), .Names = c("ung_id", "nop_count", "purchase_flag", 
"prob", "decile", "hl_Flag"), row.names = c(NA, -20L), .internal.selfref = <pointer: 0x00000000002b0788>, class = c("data.table", 
"data.frame")) 

回答

2

一個選項是使用data.table。我們將'data.frame'轉換爲'data.table'(setDT(HLS)),按'ID'分組,並檢查是否有任何(any)'hl_flag'值爲1。如果有,就用which.max取得hl_flag中第一個1出現的位置,生成序列(1:(which.max..)),並取出對應的行索引(.I);否則(else)返回該組全部的行索引(.I)。最後提取存放行索引的列($V1),並用它來對行進行子集化。

library(data.table) 
# For each ID, collect the row numbers (.I) to keep: all of them when the ID
# has no hl_flag == 1, otherwise rows 1 through which.max(hl_flag) (the
# position of the first maximum, i.e. the first 1 for a 0/1 flag).
# The grouped result stores these row numbers in column V1, which is then
# used to subset HLS.
setDT(HLS)[
  HLS[,
      if (!any(hl_flag == 1)) .I else .I[1:which.max(hl_flag)],
      by = ID]$V1
]
#  ID page_count purchase_flag prob hl_flag 
# 1: V1   3    1 0.760  1 
# 2: V2   3    0 0.030  0 
# 3: V2   4    0 0.010  0 
# 4: V2   5    0 0.020  0 
# 5: V2   6    0 0.000  0 
# 6: V3   3    1 0.020  0 
# 7: V3   4    1 0.001  0 
# 8: V3   5    1 0.760  1 
# 9: V4   3    0 0.040  0 
#10: V4   4    0 0.650  1 

或者,與我上面針對data.table展示的方法類似,一個base R的選擇是

# Split HLS by ID, trim each piece, then stack the pieces back together.
do.call(
  rbind,
  lapply(split(HLS, HLS$ID), function(grp) {
    # No row with hl_flag == 1 for this ID: keep the whole group.
    if (!any(grp$hl_flag == 1)) {
      return(grp)
    }
    # Otherwise keep rows 1 through the position of the first maximum of
    # hl_flag (the first 1 for a 0/1 flag).
    grp[seq_len(which.max(grp$hl_flag)), ]
  })
)

或者使用dplyr

library(dplyr) 
HLS %>%
  group_by(ID) %>%
  filter(
    # Keep the whole group when none of its rows is flagged ...
    all(hl_flag == 0) |
      # ... otherwise keep rows 1 through the position of the first maximum
      # of hl_flag (the first 1 for a 0/1 flag).
      seq_len(n()) %in% seq(which.max(hl_flag))
  )
#  ID page_count purchase_flag prob hl_flag 
# (chr)  (int)   (int) (dbl) (int) 
#1  V1   3    1 0.760  1 
#2  V2   3    0 0.030  0 
#3  V2   4    0 0.010  0 
#4  V2   5    0 0.020  0 
#5  V2   6    0 0.000  0 
#6  V3   3    1 0.020  0 
#7  V3   4    1 0.001  0 
#8  V3   5    1 0.760  1 
#9  V4   3    0 0.040  0 
#10 V4   4    0 0.650  1 
+0

感謝您的答覆akrun。我只是澄清了,如果我的ID是'00000f23-1019-4aff-8199-35bd0d032356/1'而不是V1的形式,那麼應該做什麼改變? – rahuliggu

+0

@rahuliggu我們不需要改變任何東西,因爲我們只使用'ID'進行分組。你有沒有嘗試過這個原始數據集? – akrun

+0

是的,我嘗試了原始數據集,但它給了我一個錯誤:'錯誤:意外的數字常量',所以我想知道這是否是因爲我的ID與代碼'else .I,ID] $ V1]'中使用的ID不同。如果我誤解了代碼的基本內容,請原諒我。我剛開始接觸R,因此感到困惑 – rahuliggu

0

你可以試試,

# Split df by ID; within each ID keep the rows up to and including the first
# hl_flag == 1, or the whole group when no row is flagged.
# The flag is looked up by column name rather than by position, so the code
# does not depend on hl_flag being the fifth column of df.
l <- lapply(split(df, df$ID), function(x) {
  flagged <- x$hl_flag == 1
  # na.rm = TRUE keeps the condition a plain TRUE/FALSE even if hl_flag
  # contains NA values.
  if (any(flagged, na.rm = TRUE)) {
    # which.max() on a logical vector returns the position of the first TRUE.
    x[seq_len(which.max(flagged)), ]
  } else {
    x
  }
})

先按ID拆分df,然後對列表中的每個元素取子集,保留到第一個hl_flag == 1爲止(含該行)的行。它會給你一個按ID組織的列表

#$V1 
# ID page_count purchase_flag prob hl_flag 
#1 V1   3    1 0.76  1 

#$V2 
# ID page_count purchase_flag prob hl_flag 
#9 V2   3    0 0.03  0 
#10 V2   4    0 0.01  0 
#11 V2   5    0 0.02  0 
#12 V2   6    0 0.00  0 

#$V3 
# ID page_count purchase_flag prob hl_flag 
#13 V3   3    1 0.020  0 
#14 V3   4    1 0.001  0 
#15 V3   5    1 0.760  1 

#$V4 
# ID page_count purchase_flag prob hl_flag 
#17 V4   3    0 0.04  0 
#18 V4   4    0 0.65  1 

要獲得期望的結果,你可以使用dplyr的bind_rows

library(dplyr) 
# Stack the per-ID data frames held in l into a single data frame.
l %>% bind_rows()

#ID page_count purchase_flag prob hl_flag 
#(fctr)  (int)   (int) (dbl) (int) 
#1  V1   3    1 0.760  1 
#2  V2   3    0 0.030  0 
#3  V2   4    0 0.010  0 
#4  V2   5    0 0.020  0 
#5  V2   6    0 0.000  0 
#6  V3   3    1 0.020  0 
#7  V3   4    1 0.001  0 
#8  V3   5    1 0.760  1 
#9  V4   3    0 0.040  0 
#10  V4   4    0 0.650  1