library(tm)
## Warning: package 'tm' was built under R version 3.4.4
## Loading required package: NLP
library(tmcn)
## Warning: package 'tmcn' was built under R version 3.4.4
## # tmcn Version: 0.2-12
library(Matrix)
## Warning: package 'Matrix' was built under R version 3.4.4
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.4
## Loading required package: RColorBrewer
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 3.4.4
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.4.4
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.4
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# Build a TF-IDF weighted term-document matrix from the lines of "txt1.txt".
# This pipeline used to appear twice back to back; the second copy re-read the
# file and recomputed identical objects, so it is kept only once.

# Each line of the input file is treated as one document.
docs <- readLines("txt1.txt")
# Remove bracketed numeric markers such as "[12]".
docs <- gsub("\\[[0-9]+\\]", "", docs)
# Drop lines that are blank after cleaning: a document without any term has a
# zero column sum, which would turn its term frequencies into NaN below.
docs <- docs[nzchar(trimws(docs))]

docs.corpus <- Corpus(VectorSource(docs))
# Word segmentation via tmcn::segmentCN.
# NOTE: this call was observed to emit
#   "Warning in tm_map.SimpleCorpus(docs.corpus, segmentCN): transformation
#    drops documents"
docs.seg <- tm_map(docs.corpus, segmentCN)

# Rows are terms, columns are documents.
# NOTE(review): with an empty control list tm's defaults apply; per the tm
# documentation the default wordLengths is c(3, Inf), which would discard
# terms shorter than three characters -- confirm this is intended here.
docs.tdm <- TermDocumentMatrix(docs.seg, control = list())

# Dense copy, computed once and reused for both TF and IDF.
tdm.matrix <- as.matrix(docs.tdm)

# Term frequency: each document column divided by its total term count.
# Columns whose total is zero are left as all zeros instead of becoming NaN.
docs.tf <- apply(tdm.matrix, 2, function(doc) {
  total <- sum(doc)
  if (total > 0) doc / total else doc
})

# Inverse document frequency of one term.
# word_doc: the counts of that term across all documents (one TDM row).
# Returns log2((number of documents + 1) / number of documents containing it).
idf.function <- function(word_doc) {
  log2((length(word_doc) + 1) / nnzero(word_doc))
}
docs.idf <- apply(tdm.matrix, 1, idf.function)

# docs.idf has one entry per term (row); R recycles it down the columns, so
# every row of docs.tf is scaled by the IDF of its own term.
docs.tfidf <- docs.tf * docs.idf
# Principal component analysis of the TF-IDF matrix.
# prcomp() treats rows as observations and columns as variables; docs.tfidf is
# built from a TermDocumentMatrix, so the "individuals" below are terms and the
# "variables" are documents.
# `scale. = TRUE` standardises every column to unit variance. The argument is
# spelled out in full (no partial matching) and uses TRUE rather than the
# reassignable shorthand T.
docs.pca <- prcomp(docs.tfidf, scale. = TRUE)

# Scree plot with value labels on the bars.
# NOTE(review): the y-axis is fixed to c(0, 10); values above 10 would not be
# shown in full -- confirm this range fits the data.
fviz_eig(docs.pca, addlabels = TRUE, ylim = c(0, 10))

# Individuals drawn as points, coloured by their cos2 value.
fviz_pca_ind(docs.pca, geom.ind = "point", col.ind = "cos2")

# Variables coloured by their contribution, using a three-colour gradient.
fviz_pca_var(
  docs.pca,
  col.var = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)

# Biplot of individuals (as points) and variables together.
fviz_pca_biplot(docs.pca, geom.ind = "point")

# Pull the eigenvalue table and the per-variable / per-individual results out
# of the fitted PCA object.
docs.eig <- get_eig(docs.pca)
docs.var <- get_pca_var(docs.pca)
docs.ind <- get_pca_ind(docs.pca)

# Coordinates of the individuals on the first two principal components.
ind.coord2 <- docs.ind$coord[, 1:2]

# kmeans() starts from randomly chosen centres; fixing the seed makes the
# elbow curve and the final clustering reproducible between runs.
set.seed(123)

# Elbow curve: total within-cluster sum of squares for k = 1, 2, ...
# The largest k is capped at the number of distinct points because kmeans()
# stops with an error when asked for more centres than distinct rows.
max.k <- min(10L, nrow(unique(ind.coord2)))
wss <- vapply(
  seq_len(max.k),
  function(k) kmeans(ind.coord2, centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)
plot(
  seq_len(max.k), wss,
  type = "b",
  xlab = "Number of clusters k",
  ylab = "Total within-cluster sum of squares"
)

# Final clustering. The cluster count is named once so that the colours of
# the centre markers always match the number of clusters actually fitted.
n.clusters <- 3L
km <- kmeans(ind.coord2, centers = n.clusters, nstart = 25)
plot(ind.coord2, col = km$cluster)
# Mark each cluster centre with a large star in the colour of its cluster.
points(km$centers, col = seq_len(n.clusters), pch = 8, cex = 2)
