2017-03-10 63 views
0

我以XML格式提供了一點sportVU數據(協調籃球運動員的球員跟蹤),並試圖找到一種有效的方式將其解析爲R dataframes。作爲參考,在這裏是一個數據我一起工作的類型:在R中,提高分析長XML文檔的效率/速度

xml_str = '<sequences period="1"> 
      <moment game-clock="720.00" time="1453777809409" game-event-id="" shot-clock="24.00" locations="-1,-1,49.37744,29.70218,4.18768;23,262882,20.60259,20.94833,0;23,266358,44.9396,19.39261,0;23,509450,29.94114,25.65156,0;23,604898,46.12643,31.1069,0;23,699950,65.75548,29.05962,0;5312,263884,63.23074,19.75536,0;5312,329824,46.12643,31.1069,0;5312,340730,65.75548,29.05962,0;5312,462980,65.2449,25.5923,0;5312,609567,44.9396,19.39261,0"/> 
      <moment game-clock="720.00" time="1453777809448" game-event-id="" shot-clock="24.00" locations="-1,-1,49.37601,29.74255,4.25405;23,262882,20.47625,21.10303,0;23,266358,44.94889,19.41382,0;23,509450,29.95246,25.6483,0;23,604898,46.13548,31.11234,0;23,699950,65.54889,29.13574,0;5312,263884,63.19786,19.78119,0;5312,329824,46.13548,31.11234,0;5312,340730,65.54889,29.13574,0;5312,462980,65.22661,25.67978,0;5312,609567,44.94889,19.41382,0"/> 
      <moment game-clock="720.00" time="1453777809489" game-event-id="" shot-clock="24.00" locations="-1,-1,49.27471,30.13754,4.01725;23,262882,20.34594,21.24155,0;23,266358,44.94417,19.49824,0;23,509450,29.972,25.6471,0;23,604898,46.14312,31.10966,0;23,699950,65.33004,29.25136,0;5312,263884,63.16254,19.80861,0;5312,329824,46.14312,31.10966,0;5312,340730,65.33004,29.25136,0;5312,462980,65.2156,25.73148,0;5312,609567,44.94417,19.49824,0"/> 
      <moment game-clock="720.00" time="1453777809529" game-event-id="" shot-clock="24.00" locations="-1,-1,49.37694,29.72015,4.22522;23,262882,20.21595,21.39361,0;23,266358,44.94711,19.5666,0;23,509450,29.99479,25.64543,0;23,604898,46.15027,31.12124,0;23,699950,65.11179,29.36794,0;5312,263884,63.12304,19.83758,0;5312,329824,46.15027,31.12124,0;5312,340730,65.11179,29.36794,0;5312,462980,65.20283,25.7811,0;5312,609567,44.94711,19.5666,0"/> 
      <moment game-clock="720.00" time="1453777809569" game-event-id="" shot-clock="24.00" locations="-1,-1,49.24878,29.90948,4.104;23,262882,20.08169,21.52092,0;23,266358,44.95597,19.55453,0;23,509450,30.01362,25.65796,0;23,604898,46.15381,31.12995,0;23,699950,64.87379,29.54509,0;5312,263884,63.08375,19.86668,0;5312,329824,46.15381,31.12995,0;5312,340730,64.87379,29.54509,0;5312,462980,65.19289,25.82793,0;5312,609567,44.95597,19.55453,0"/> 
      <moment game-clock="720.00" time="1453777809609" game-event-id="" shot-clock="24.00" locations="-1,-1,49.34944,29.53386,4.37756;23,262882,19.93925,21.57641,0;23,266358,44.95367,19.55548,0;23,509450,30.02373,25.68459,0;23,604898,46.15968,31.14234,0;23,699950,64.63944,29.72012,0;5312,263884,63.03494,19.95414,0;5312,329824,46.15968,31.14234,0;5312,340730,64.63944,29.72012,0;5312,462980,65.18154,25.87779,0;5312,609567,44.95367,19.55548,0"/> 
      <moment game-clock="720.00" time="1453777809649" game-event-id="" shot-clock="24.00" locations="-1,-1,49.29842,29.74836,4.29728;23,262882,19.7968,21.62425,0;23,266358,44.97828,19.51615,0;23,509450,30.02255,25.64997,0;23,604898,46.1634,31.14518,0;23,699950,64.44716,29.84182,0;5312,263884,62.98457,20.04141,0;5312,329824,46.1634,31.14518,0;5312,340730,64.44716,29.84182,0;5312,462980,65.15931,25.93168,0;5312,609567,44.97828,19.51615,0"/> 
      <moment game-clock="720.00" time="1453777809689" game-event-id="" shot-clock="24.00" locations="-1,-1,49.37601,29.74255,4.25405;23,262882,19.65811,21.66964,0;23,266358,45.05661,19.46085,0;23,509450,30.01974,25.62911,0;23,604898,46.19779,31.17625,0;23,699950,64.29233,29.91812,0;5312,263884,62.93202,20.1291,0;5312,329824,46.19779,31.17625,0;5312,340730,64.29233,29.91812,0;5312,462980,65.14666,25.96346,0;5312,609567,45.05661,19.46085,0"/> 
      <moment game-clock="720.00" time="1453777809728" game-event-id="" shot-clock="24.00" locations="-1,-1,49.42696,29.528,4.33445;23,262882,19.52976,21.72166,0;23,266358,45.07626,19.45181,0;23,509450,30.01819,25.62626,0;23,604898,46.20207,31.19139,0;23,699950,64.10344,29.96732,0;5312,263884,62.87931,20.21896,0;5312,329824,46.20207,31.19139,0;5312,340730,64.10344,29.96732,0;5312,462980,65.12518,26.01317,0;5312,609567,45.07626,19.45181,0"/> 
      <moment game-clock="720.00" time="1453777809769" game-event-id="" shot-clock="24.00" locations="-1,-1,49.42696,29.528,4.33445;23,262882,19.40259,21.77219,0;23,266358,45.01633,19.41528,0;23,509450,30.01316,25.61478,0;23,604898,46.20748,31.21032,0;23,699950,63.94112,30.00692,0;5312,263884,62.82417,20.31417,0;5312,329824,46.20748,31.21032,0;5312,340730,63.94112,30.00692,0;5312,462980,65.10361,26.05668,0;5312,609567,45.01633,19.41528,0"/> 
      <moment game-clock="720.00" time="1453777809809" game-event-id="" shot-clock="24.00" locations="-1,-1,49.40035,29.31999,4.45764;23,262882,19.26225,21.74403,0;23,266358,45.03757,19.35151,0;23,509450,30.00625,25.60483,0;23,604898,46.21089,31.22762,0;23,699950,63.76999,30.04214,0;5312,263884,62.7722,20.407,0;5312,329824,46.21089,31.22762,0;5312,340730,63.76999,30.04214,0;5312,462980,65.0792,26.10155,0;5312,609567,45.03757,19.35151,0"/> 
      <moment game-clock="720.00" time="1453777809849" game-event-id="" shot-clock="24.00" locations="-1,-1,49.17246,29.8749,4.08078;23,262882,19.13395,21.78725,0;23,266358,45.04715,19.21508,0;23,509450,29.9845,25.51368,0;23,604898,46.26867,31.28159,0;23,699950,63.61645,30.0221,0;5312,263884,62.71828,20.50007,0;5312,329824,46.26867,31.28159,0;5312,340730,63.61645,30.0221,0;5312,462980,65.04425,26.18745,0;5312,609567,45.04715,19.21508,0"/> 
      <moment game-clock="720.00" time="1453777809889" game-event-id="" shot-clock="24.00" locations="-1,-1,49.24667,29.2957,4.46901;23,262882,19.00912,21.82333,0;23,266358,45.02115,19.10097,0;23,509450,29.96265,25.43249,0;23,604898,46.26457,31.32984,0;23,699950,63.46754,29.99107,0;5312,263884,62.66985,20.58952,0;5312,329824,46.26457,31.32984,0;5312,340730,63.46754,29.99107,0;5312,462980,65.01059,26.25959,0;5312,609567,45.02115,19.10097,0"/> 
      <moment game-clock="720.00" time="1453777809928" game-event-id="" shot-clock="24.00" locations="-1,-1,49.01807,29.85051,4.09221;23,262882,18.88202,21.79858,0;23,266358,44.95699,19.07127,0;23,509450,29.93991,25.35426,0;23,604898,46.25301,31.35973,0;23,699950,63.29378,30.00075,0;5312,263884,62.61706,20.68884,0;5312,329824,46.25301,31.35973,0;5312,340730,63.29378,30.00075,0;5312,462980,64.98385,26.29224,0;5312,609567,44.95699,19.07127,0"/> 
      <moment game-clock="720.00" time="1453777809969" game-event-id="" shot-clock="24.00" locations="-1,-1,48.93946,29.87874,4.1644;23,262882,18.75688,21.75509,0;23,266358,44.95388,19.00199,0;23,509450,29.91814,25.27759,0;23,604898,46.23982,31.38857,0;23,699950,63.12091,30.01038,0;5312,263884,62.57866,20.72805,0;5312,329824,46.23982,31.38857,0;5312,340730,63.12091,30.01038,0;5312,462980,64.95801,26.3292,0;5312,609567,44.95388,19.00199,0"/> 
      </sequences>' 

這是實際數據的子集,其中1場比賽籃球,大約有(每秒25)*(10位玩家)*對應(在遊戲中48 * 60秒)=每場720,000個時刻節點。我的目標是將其解析爲一個有組織的數據框 - 前4列是當前節點的前4個屬性(遊戲時鐘,時間,遊戲事件ID,拍子時鐘),然後有幾列來自位置屬性的數據,其存儲10個不同的球員和球的座標數據,全部在一個字符串中(每個位置屬性具有55個數字,10個球員中的每個和5個球)。

這是我的代碼,這是工作好嗎,但需要加快相當多的效率爲我在做什麼。我使用XML包和簡單的函數,但切換到xml2或做其他任何事情都可以加快速度。

sportvu = xmlParse(xml_str) 

# quick load the data into a dataframe 
sportvu_df <- data.frame(
    game_clock = sapply(sportvu["//sequences/moment/@game-clock"], as, "numeric"), 
    time = sapply(sportvu["//sequences/moment/@time"], as, "character"), 
    game_event_id = sapply(sportvu["//sequences/moment/@game-event-id"], as, "integer"), 
    shot_clock = sapply(sportvu["//sequences/moment/@shot-clock"], as, "numeric"), 
    locations = sapply(sportvu["//sequences/moment/@locations"], as, "character") 
) 

# convert locations into what we want, many more columns 
locations = as.character(sportvu_df$locations) 
locations = strsplit(gsub(";", ",", locations), ",") 
reps = sapply(locations, length)/5 
locations = as.data.frame(matrix(unlist(locations), ncol = 5, byrow = TRUE), stringsAsFactors = FALSE) 
colnames(locations) = c("team_id", "global.player.id", "x_loc", "y_loc", "radius") 

locations$global.player.id = as.integer(locations$global.player.id); locations$team_id = as.integer(locations$team_id); locations$x_loc = as.numeric(locations$x_loc); locations$y_loc = as.numeric(locations$y_loc); locations$radius = as.numeric(locations$radius) 

# connect locations back with sportvu_df 
sportvu_df = sportvu_df[!(names(sportvu_df) %in% "locations")] 
sportvu_df = sportvu_df[rep(row.names(sportvu_df), reps), ] 
sportvu_df = cbind(sportvu_df, locations) 
sportvu_df$order = seq(1,nrow(sportvu_df),by=1) 

任何想法的讚賞。謝謝!

回答

2

不需要將類型轉換包裝到sapply函數中。因此,他們向量化的直接調用應該是足夠了:

sportvu_df <- data.frame(
    game_clock = as.numeric(sportvu["//sequences/moment/@game-clock"]), 
    time = as.character(sportvu["//sequences/moment/@time"]), 
    game_event_id = as.integer(sportvu["//sequences/moment/@game-event-id"]), 
    shot_clock = as.numeric(sportvu["//sequences/moment/@shot-clock"]), 
    locations = as.character(sportvu["//sequences/moment/@locations"]) 
) 

通過消除5 sapply(即循環)這應該加快你的程序。

+0

顯着加速,沒有用microbenchmark進行測量,但它似乎是10-100倍的更快。非常感謝! – Canovice