2
我已經被符號化的文件清單:轉換標記的列表,以正克
dat <- list(c("texaco", "canada", "lowered", "contract", "price", "pay",
"crude", "oil", "canadian", "cts", "barrel", "effective", "decrease",
"brings", "companys", "posted", "price", "benchmark", "grade",
"edmonton", "swann", "hills", "light", "sweet", "canadian", "dlrs",
"bbl", "texaco", "canada", "changed", "crude", "oil", "postings",
"feb", "reuter"), c("argentine", "crude", "oil", "production",
"pct", "january", "mln", "barrels", "mln", "barrels", "january",
"yacimientos", "petroliferos", "fiscales", "january", "natural",
"gas", "output", "totalled", "billion", "cubic", "metrers", "pct",
"billion", "cubic", "metres", "produced", "january", "yacimientos",
"petroliferos", "fiscales", "added", "reuter"))
我想令牌這個名單高效地轉換成n元列表。這是我迄今爲止編寫的功能:
find_ngrams <- function(x, n){
if (n==1){ return(x)}
out <- as.list(rep(NA, length(x)))
for (i in 1:length(x)){
words <- x[[i]]
out[[i]] <- words
for (j in 2:n){
phrases <- sapply(1:j, function(k){
words[k:(length(words)-n+k)]
})
phrases <- apply(phrases, 1, paste, collapse=" ")
out[[i]] <- c(out[[i]], phrases)
}
}
return(out)
}
這對於找到n-gram很合適,但它看起來效率不高。與*apply
職能將仍然給我留下循環更換for循環嵌套3深:
result <- find_ngrams(dat, 2)
> result[[2]]
[1] "argentine" "crude" "oil"
[4] "production" "pct" "january"
[7] "mln" "barrels" "mln"
[10] "barrels" "january" "yacimientos"
[13] "petroliferos" "fiscales" "january"
[16] "natural" "gas" "output"
[19] "totalled" "billion" "cubic"
[22] "metrers" "pct" "billion"
[25] "cubic" "metres" "produced"
[28] "january" "yacimientos" "petroliferos"
[31] "fiscales" "added" "reuter"
[34] "argentine crude" "crude oil" "oil production"
[37] "production pct" "pct january" "january mln"
[40] "mln barrels" "barrels mln" "mln barrels"
[43] "barrels january" "january yacimientos" "yacimientos petroliferos"
[46] "petroliferos fiscales" "fiscales january" "january natural"
[49] "natural gas" "gas output" "output totalled"
[52] "totalled billion" "billion cubic" "cubic metrers"
[55] "metrers pct" "pct billion" "billion cubic"
[58] "cubic metres" "metres produced" "produced january"
[61] "january yacimientos" "yacimientos petroliferos" "petroliferos fiscales"
[64] "fiscales added" "added reuter"
是否有可能被矢量這段代碼的任何顯著的部分?
/編輯:這裏是馬修Plourde的功能,那確實「先進的N-gram」,並在整個列表適用的更新版本:
find_ngrams_base <- function(x, n) {
if (n == 1) return(x)
out <- lapply(1:n, function(n_i) embed(x, n_i))
out <- sapply(out, function(y) apply(y, 1, function(row) paste(rev(row), collapse=' ')))
unlist(out)
}
find_ngrams_plourde <- function(x, ...){
lapply(x, find_ngrams_base, ...)
}
我們可以對我寫的功能標杆,並認爲它是一個有點慢:
library(rbenchmark)
benchmark(
replications=100,
a <- find_ngrams(dat, 2),
b <- find_ngrams(dat, 3),
c <- find_ngrams(dat, 4),
d <- find_ngrams(dat, 10),
w <- find_ngrams_plourde(dat, 2),
x <- find_ngrams_plourde(dat, 3),
y <- find_ngrams_plourde(dat, 4),
z <- find_ngrams_plourde(dat, 10),
columns=c('test', 'elapsed', 'relative'),
order='relative'
)
test elapsed relative
1 a <- find_ngrams(dat, 2) 0.040 1.000
2 b <- find_ngrams(dat, 3) 0.081 2.025
3 c <- find_ngrams(dat, 4) 0.117 2.925
5 w <- find_ngrams_plourde(dat, 2) 0.144 3.600
6 x <- find_ngrams_plourde(dat, 3) 0.212 5.300
7 y <- find_ngrams_plourde(dat, 4) 0.277 6.925
4 d <- find_ngrams(dat, 10) 0.361 9.025
8 z <- find_ngrams_plourde(dat, 10) 0.669 16.725
然而,也發現了很多的n-gram的我的功能缺失(哎呦):
for (i in 1:length(dat)){
print(setdiff(w[[i]], a[[i]]))
print(setdiff(x[[i]], b[[i]]))
print(setdiff(y[[i]], c[[i]]))
print(setdiff(z[[i]], d[[i]]))
}
我覺得這兩個函數都可以改進,但我想不出任何方法來避免三重循環(循環遍歷向量,遍歷所需的ngrams數,1-n,循環遍歷單詞來構造ngram )
/編輯2: 這裏有一個修訂功能,基於斷馬特的答案:
find_ngrams_2 <- function(x, n){
if (n == 1) return(x)
lapply(x, function(y) c(y, unlist(lapply(2:n, function(n_i) do.call(paste, unname(rev(data.frame(embed(y, n_i), stringsAsFactors=FALSE))))))))
}
它似乎返回n元語法的正確列表,它比我原來的功能,更快(在大多數情況下) :
library(rbenchmark)
benchmark(
replications=100,
a <- find_ngrams(dat, 2),
b <- find_ngrams(dat, 3),
c <- find_ngrams(dat, 4),
d <- find_ngrams(dat, 10),
w <- find_ngrams_2(dat, 2),
x <- find_ngrams_2(dat, 3),
y <- find_ngrams_2(dat, 4),
z <- find_ngrams_2(dat, 10),
columns=c('test', 'elapsed', 'relative'),
order='relative'
)
test elapsed relative
5 w <- find_ngrams_2(dat, 2) 0.039 1.000
1 a <- find_ngrams(dat, 2) 0.041 1.051
6 x <- find_ngrams_2(dat, 3) 0.078 2.000
2 b <- find_ngrams(dat, 3) 0.081 2.077
7 y <- find_ngrams_2(dat, 4) 0.119 3.051
3 c <- find_ngrams(dat, 4) 0.123 3.154
4 d <- find_ngrams(dat, 10) 0.399 10.231
8 z <- find_ngrams_2(dat, 10) 0.436 11.179
+1 - 非常好的使用'embed'。你可以運行它:'lapply(dat,find_ngrams,2)' – Arun 2013-05-10 20:05:06
只是一個問題,也許?當n = 1時,你會返回兩倍的矢量長度。但它應該只返回一個。但是這當然可以用if語句來檢查。 – Arun 2013-05-10 20:06:58
@阿倫好抓。固定。 – 2013-05-10 20:09:46