# 文本清理 (Text cleaning)
# NOTE: this script used to start with rm(list = ls(all.names = TRUE)).
# The call was removed: it wipes the caller's global environment whenever the
# script is sourced, yet it does not detach packages or reset options, so it
# never provided a truly clean session. Restart R when a clean slate is needed.

# Packages ----
# In the recorded run, tm, jiebaRD, jiebaR and wordcloud each warned that they
# were built under R version 3.4.4.
library(NLP)
# tm supplies Corpus(), VectorSource(), tm_map(), content_transformer() and the
# removePunctuation() / removeNumbers() / stripWhitespace() transformations.
library(tm)
library(jiebaRD)
library(jiebaR)
library(RColorBrewer)
library(wordcloud)
# Read every .txt file in the working directory; one corpus document per file.
# list.files() interprets `pattern` as a regular expression, not a glob, so the
# previous "*.txt" was unanchored and could also match names such as
# "notes.txt.bak". Anchor on the literal ".txt" extension instead.
filenames <- list.files(getwd(), pattern = "\\.txt$")
# `files` stays a list with one character vector (the lines) per file.
files <- lapply(filenames, readLines)
# Collapse each file's lines into a single string before building the corpus.
# VectorSource() treats each element as one document; when it is handed a list
# of multi-line character vectors, gsub()-based transformations coerce the list
# with as.character(), which deparses each file into a literal
# 'c("line 1", "line 2", ...)' string instead of the actual text.
texts <- vapply(files, paste, character(1), collapse = "\n")
docs <- Corpus(VectorSource(texts))
# Remove symbols that may cause problems.
# toSpace is a tm content transformer: given the text `x` and a regular
# expression `pattern`, it substitutes a single space for every match.
toSpace <- content_transformer(
  function(x, pattern) {
    gsub(pattern, " ", x)
  }
)
# Blank out unwanted symbols, common words, board boilerplate, Latin letters
# and school names. Patterns are applied one at a time, in this order, each
# match being replaced by a single space.
# In the recorded run every call emitted the warning
# "transformation drops documents" from tm_map.SimpleCorpus().
noise_patterns <- c(
  "※", "◆", "‧",
  "的", "我", "是",
  "看板", "作者", "發信站", "批踢踢實業坊",
  "[a-zA-Z]",
  "台大", "清大", "交大", "正取"
)
for (noise in noise_patterns) {
  docs <- tm_map(docs, toSpace, noise)
}
# Remove punctuation, then digits, then collapse runs of white space.
# In the recorded run each step emitted the warning
# "transformation drops documents" from tm_map.SimpleCorpus().
# NOTE(review): removePunctuation() is called with its defaults; whether that
# also strips full-width CJK punctuation should be confirmed against the tm
# documentation.
for (clean_step in list(removePunctuation, removeNumbers, stripWhitespace)) {
  docs <- tm_map(docs, clean_step)
}
# Display the corpus summary.
docs
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 11