2016-03-04 32 views

回答

1

假設有如下的映射文件(wordMappings):

"from"|"to" 
###Words###### 
"this"|"ThIs" 
"is"|"Is" 
"a"|"A" 
"sample"|"SamPle" 

首先是詞語的移除(removal):

#' Read a delimited text file from the `data` directory.
#'
#' Builds the path `data/<fileName>.txt` relative to the current working
#' directory and parses it with `read.csv()`. Lines starting with `#` are
#' treated as comments, blank lines are skipped, and double quotes delimit
#' quoted fields.
#'
#' @param fileName Base name of the file, without directory or `.txt` suffix.
#' @param seperator Field separator handed to `read.csv(sep = )`,
#'   e.g. `"|"` or `"\t"`.
#' @return The data frame produced by `read.csv()`; string columns stay
#'   character (not factors).
readFile <- function(fileName, seperator) {
    # file.path() keeps the path portable; a hard-coded "data\\" prefix only
    # resolves to a directory on Windows.
    path <- file.path("data", paste0(fileName, ".txt"))
    read.csv(path,
          sep = seperator,
          quote = "\"",
          comment.char = "#",
          blank.lines.skip = TRUE,
          stringsAsFactors = FALSE,
          encoding = "UTF-8")
}

# Sample sentence to transform, wrapped in a one-document corpus.
# NOTE(review): Corpus() and VectorSource() are expected to come from the tm
# package, which must already be attached - confirm.
kelimeler <- "this is a sample"
corpus <- Corpus(VectorSource(kelimeler))
# Single space used as the delimiter between tokens.
seperatorOfTokens <- " "
# Mapping table with `from` / `to` columns, loaded through readFile().
words <- readFile(fileName = "wordMappings", seperator = "|")

# Content transformer that blanks out every whole-token occurrence of `from`.
#   x    : text content of a document
#   from : token to remove; it is interpolated into a regular expression, so
#          regex metacharacters inside it are NOT taken literally
# The token must be delimited by start-of-string / end-of-string or by
# `seperatorOfTokens` (read from the enclosing environment).
toSpace <- content_transformer(function(x, from) {
    # Exactly three placeholders for three values: the previous format passed a
    # surplus argument, which sprintf() warns about on R >= 4.1.
    pattern <- sprintf("(^|%s)%s($|%s)", seperatorOfTokens, from, seperatorOfTokens)
    # The matched token and its delimiters collapse to blanks plus a separator.
    replacement <- sprintf(" %s%s", " ", seperatorOfTokens)
    gsub(pattern, replacement, x)
})
# Strip each word listed in the mapping table from the corpus, one at a time.
for (word in words$from) {
    corpus <- tm_map(corpus, toSpace, word)
}

如果你想要一個更靈活的解決方案,例如不只是移除詞語,而是將其替換:

# Specific transformations
# Content transformer that replaces every whole-token occurrence of `from`
# with `to`.
#   x    : text content of a document
#   from : token to look for; it is interpolated into a regular expression, so
#          regex metacharacters inside it are NOT taken literally
#   to   : replacement token
# The result carries a leading blank and a trailing `seperatorOfTokens` around
# each replaced token.
toMyToken <- content_transformer(function(x, from, to) {
    # Exactly three placeholders for three values: the previous format passed a
    # surplus argument, which sprintf() warns about on R >= 4.1.
    pattern <- sprintf("(^|%s)%s($|%s)", seperatorOfTokens, from, seperatorOfTokens)
    replacement <- sprintf(" %s%s", to, seperatorOfTokens)
    gsub(pattern, replacement, x)
})

# seq_len() gives an empty sequence for a zero-row table, whereas
# seq(1:nrow(words)) would iterate over 1 and 2 in that case.
for (i in seq_len(nrow(words))) {
    print(sprintf("%s -> %s ", words$from[i], words$to[i]))
    corpus <- tm_map(corpus, toMyToken, words$from[i], words$to[i])
}

現在運行示例,輸出如下:

[1] "this -> ThIs " 
[1] "is -> Is " 
[1] "a -> A " 
[1] "sample -> SamPle " 
> content(corpus[[1]]) 
[1] " ThIs Is A SamPle " 
> 
0

我的解決方案,可能既繁瑣又不夠優雅:

# Read in the items to be removed (file has no header row); as.matrix() turns
# the data frame into a matrix so it can be used as a plain set of values.
removalList <- as.matrix(read.csv(listOfWordsAndPunc, header = FALSE))

# Term names are the column names of the document-term matrix.
termListing <- colnames(corpusFileDocs_dtm)

# Terms that appear in both removalList and termListing, and their column
# positions within termListing.
commonWords <- intersect(removalList, termListing)
removalIndxs <- match(commonWords, termListing)

# Matrix m used for term frequency, etc.
# NOTE(review): the column indices above are derived from corpusFileDocs_dtm
# but applied to atsapFileDocs_dtm - confirm both refer to the same
# document-term matrix (same columns, same order).
m <- as.matrix(atsapFileDocs_dtm)

# Drop the irrelevant columns from m.
# seq_along() is safe when termListing is empty (1:0 would yield c(1, 0)).
allColIndxs <- seq_along(termListing)
keepColIndxs <- setdiff(allColIndxs, removalIndxs)
# drop = FALSE keeps m a matrix even when only one column survives.
m <- m[, keepColIndxs, drop = FALSE]

# thence to tf-idf analysis with revised m

非常歡迎任何關於改進程式風格和計算方式的建議。

BSL

相關問題