2013-05-10 130 views
2

我已經被符號化的文件清單:轉換標記的列表,以正克

dat <- list(c("texaco", "canada", "lowered", "contract", "price", "pay", 
"crude", "oil", "canadian", "cts", "barrel", "effective", "decrease", 
"brings", "companys", "posted", "price", "benchmark", "grade", 
"edmonton", "swann", "hills", "light", "sweet", "canadian", "dlrs", 
"bbl", "texaco", "canada", "changed", "crude", "oil", "postings", 
"feb", "reuter"), c("argentine", "crude", "oil", "production", 
"pct", "january", "mln", "barrels", "mln", "barrels", "january", 
"yacimientos", "petroliferos", "fiscales", "january", "natural", 
"gas", "output", "totalled", "billion", "cubic", "metrers", "pct", 
"billion", "cubic", "metres", "produced", "january", "yacimientos", 
"petroliferos", "fiscales", "added", "reuter")) 

我想令牌這個名單高效地轉換成n元列表。這是我迄今爲止編寫的功能:

find_ngrams <- function(x, n){ 

    if (n==1){ return(x)} 

    out <- as.list(rep(NA, length(x))) 

    for (i in 1:length(x)){ 
    words <- x[[i]] 
    out[[i]] <- words 

    for (j in 2:n){ 

     phrases <- sapply(1:j, function(k){ 
     words[k:(length(words)-n+k)] 
     }) 

     phrases <- apply(phrases, 1, paste, collapse=" ") 

     out[[i]] <- c(out[[i]], phrases) 

    } 
    } 
    return(out) 
} 

這對於找到n-gram很合適,但它看起來效率不高。與*apply職能將仍然給我留下循環更換for循環嵌套3深:

result <- find_ngrams(dat, 2) 
> result[[2]] 
[1] "argentine"    "crude"     "oil"      
[4] "production"    "pct"      "january"     
[7] "mln"      "barrels"     "mln"      
[10] "barrels"     "january"     "yacimientos"    
[13] "petroliferos"    "fiscales"     "january"     
[16] "natural"     "gas"      "output"     
[19] "totalled"     "billion"     "cubic"     
[22] "metrers"     "pct"      "billion"     
[25] "cubic"     "metres"     "produced"     
[28] "january"     "yacimientos"    "petroliferos"    
[31] "fiscales"     "added"     "reuter"     
[34] "argentine crude"   "crude oil"    "oil production"   
[37] "production pct"   "pct january"    "january mln"    
[40] "mln barrels"    "barrels mln"    "mln barrels"    
[43] "barrels january"   "january yacimientos"  "yacimientos petroliferos" 
[46] "petroliferos fiscales" "fiscales january"   "january natural"   
[49] "natural gas"    "gas output"    "output totalled"   
[52] "totalled billion"   "billion cubic"   "cubic metrers"   
[55] "metrers pct"    "pct billion"    "billion cubic"   
[58] "cubic metres"    "metres produced"   "produced january"   
[61] "january yacimientos"  "yacimientos petroliferos" "petroliferos fiscales" 
[64] "fiscales added"   "added reuter"    

是否有可能被矢量這段代碼的任何顯著的部分?

/編輯:這裏是馬修Plourde的功能,那確實「先進的N-gram」,並在整個列表適用的更新版本:

find_ngrams_base <- function(x, n) { 
    if (n == 1) return(x) 
    out <- lapply(1:n, function(n_i) embed(x, n_i)) 
    out <- sapply(out, function(y) apply(y, 1, function(row) paste(rev(row), collapse=' '))) 
    unlist(out) 
} 

find_ngrams_plourde <- function(x, ...){ 
    lapply(x, find_ngrams_base, ...) 
} 

我們可以對我寫的功能標杆,並認爲它是一個有點慢:

library(rbenchmark) 
benchmark(
    replications=100, 
    a <- find_ngrams(dat, 2), 
    b <- find_ngrams(dat, 3), 
    c <- find_ngrams(dat, 4), 
    d <- find_ngrams(dat, 10), 
    w <- find_ngrams_plourde(dat, 2), 
    x <- find_ngrams_plourde(dat, 3), 
    y <- find_ngrams_plourde(dat, 4), 
    z <- find_ngrams_plourde(dat, 10), 
    columns=c('test', 'elapsed', 'relative'), 
    order='relative' 
) 
           test elapsed relative 
1   a <- find_ngrams(dat, 2) 0.040 1.000 
2   b <- find_ngrams(dat, 3) 0.081 2.025 
3   c <- find_ngrams(dat, 4) 0.117 2.925 
5 w <- find_ngrams_plourde(dat, 2) 0.144 3.600 
6 x <- find_ngrams_plourde(dat, 3) 0.212 5.300 
7 y <- find_ngrams_plourde(dat, 4) 0.277 6.925 
4   d <- find_ngrams(dat, 10) 0.361 9.025 
8 z <- find_ngrams_plourde(dat, 10) 0.669 16.725 

然而,也發現了很多的n-gram的我的功能缺失(哎呦):

for (i in 1:length(dat)){ 
    print(setdiff(w[[i]], a[[i]])) 
    print(setdiff(x[[i]], b[[i]])) 
    print(setdiff(y[[i]], c[[i]])) 
    print(setdiff(z[[i]], d[[i]])) 
} 

我覺得這兩個函數都可以改進,但我想不出任何方法來避免三重循環(循環遍歷向量,遍歷所需的ngrams數,1-n,循環遍歷單詞來構造ngram )

/編輯2: 這裏有一個修訂功能,基於斷馬特的答案:

find_ngrams_2 <- function(x, n){ 
    if (n == 1) return(x) 
    lapply(x, function(y) c(y, unlist(lapply(2:n, function(n_i) do.call(paste, unname(rev(data.frame(embed(y, n_i), stringsAsFactors=FALSE)))))))) 
} 

它似乎返回n元語法的正確列表,它比我原來的功能,更快(在大多數情況下) :

library(rbenchmark) 
benchmark(
    replications=100, 
    a <- find_ngrams(dat, 2), 
    b <- find_ngrams(dat, 3), 
    c <- find_ngrams(dat, 4), 
    d <- find_ngrams(dat, 10), 
    w <- find_ngrams_2(dat, 2), 
    x <- find_ngrams_2(dat, 3), 
    y <- find_ngrams_2(dat, 4), 
    z <- find_ngrams_2(dat, 10), 
    columns=c('test', 'elapsed', 'relative'), 
    order='relative' 
) 

         test elapsed relative 
5 w <- find_ngrams_2(dat, 2) 0.039 1.000 
1 a <- find_ngrams(dat, 2) 0.041 1.051 
6 x <- find_ngrams_2(dat, 3) 0.078 2.000 
2 b <- find_ngrams(dat, 3) 0.081 2.077 
7 y <- find_ngrams_2(dat, 4) 0.119 3.051 
3 c <- find_ngrams(dat, 4) 0.123 3.154 
4 d <- find_ngrams(dat, 10) 0.399 10.231 
8 z <- find_ngrams_2(dat, 10) 0.436 11.179 

回答

3

這是embed的一種方法。

find_ngrams <- function(x, n) { 
    if (n == 1) return(x) 
    c(x, apply(embed(x, n), 1, function(row) paste(rev(row), collapse=' '))) 
} 

在你的函數中似乎有一個錯誤。如果你解決了這個問題,我們可以做一個基準。

+0

+1 - 非常好的使用'embed'。你可以運行它:'lapply(dat,find_ngrams,2)' – Arun 2013-05-10 20:05:06

+3

只是一個問題,也許?當n = 1時,你會返回兩倍的矢量長度。但它應該只返回一個。但是這當然可以用if語句來檢查。 – Arun 2013-05-10 20:06:58

+2

@阿倫好抓。固定。 – 2013-05-10 20:09:46