這是一個小的基準測試。結果:使用標準設置,平均約readxl::read_xlsx
跨越不同行數(n
)和列(p
)的openxlsx::read.xlsx
兩倍。
options(scipen=999) # no scientific number format
nn <- c(1, 10, 100, 1000, 5000, 10000, 20000, 30000)
pp <- c(1, 5, 10, 20, 30, 40, 50)
# create some excel files
l <- list() # save results
tmp_dir <- tempdir()
for (n in nn) {
for (p in pp) {
name <-
cat("\n\tn:", n, "p:", p)
flush.console()
m <- matrix(rnorm(n*p), n, p)
file <- paste0(tmp_dir, "/n", n, "_p", p, ".xlsx")
# write
write.xlsx(m, file)
# read
elapsed <- system.time(x <- openxlsx::read.xlsx(file))["elapsed"]
df <- data.frame(fun = "openxlsx::read.xlsx", n = n, p = p,
elapsed = elapsed, stringsAsFactors = F, row.names = NULL)
l <- append(l, list(df))
elapsed <- system.time(x <- readxl::read_xlsx(file))["elapsed"]
df <- data.frame(fun = "readxl::read_xlsx", n = n, p = p,
elapsed = elapsed, stringsAsFactors = F, row.names = NULL)
l <- append(l, list(df))
}
}
# results
d <- do.call(rbind, l)
library(ggplot2)
ggplot(d, aes(n, elapsed, color= fun)) +
geom_line() + geom_point() +
facet_wrap(~ paste("columns:", p)) +
xlab("Number of rows") +
ylab("Seconds")
使用'readxl :: read_excel()',它通常是更快 – scoa
可嘗試在'openxlsx'或'readxl'package。 – Jaap
這是一個完全合理的問題,就像我們在SO中看到其他問題一樣,在R中讀取或寫入'.csv'文件的最快方法是什麼。問題的答案需要一個基準,它可能是有趣的吸引大量觀衆 –