1500字范文 > R语言迪士尼点评文本挖掘

R语言迪士尼点评文本挖掘

时间：2019-12-08 09:45:22

相关推荐

R语言迪士尼点评文本挖掘

# Point the session at the project folder so that the relative file names
# used by the rest of the script resolve against it.
# (getwd() prints the current working directory.)
setwd(dir = "D:\\迪士尼点评文本挖掘")

#加载包()

library(DBI)

#library(RMySQL)

library(rJava)

library(openxlsx)

library(stringr)

library(xlsxjars)

library(reshape)

library(readxl)

library(xlsx)

library(sqldf)

library(wordcloud)

library(Rwordseg) #加载分词包

library(tm)

library(tmcn)

library(jiebaRD)

library(jiebaR) #里面的segmentC用于分词

#library(wordcloud2)

# ---- Load the input tables ----

# Read sheet 1 of the review workbook and keep only the "点评内容"
# (review text) column.
# NOTE(review): read_excel() returns a tibble, and `[, col]` on a tibble
# yields a one-column tibble rather than a plain character vector.
comment_01 <- read_excel(path = "点评文本挖掘.xlsx", sheet = 1)[, "点评内容"]

# Read sheet 1 of the stop-word workbook.
stopwords_01 <- read_excel(path = "停用词汇总.xlsx", sheet = 1)

# str() ("structure") prints a compact summary of what an object contains,
# similar in purpose to head(). Uncomment to inspect the loaded reviews:
# str(comment_01)

# Register the Sogou dictionaries and the custom word list with the
# segmenter. listDict() shows the installed dictionaries; uninstallDict()
# removes one by name.

# Travel vocabulary (.scel cell dictionary).
installDict(
  dictpath = "旅游词汇大全【官方推荐】.scel",
  dictname = "Vocabulary_books",
  dicttype = "scel"
)

# Disney vocabulary (.scel cell dictionary).
installDict(
  dictpath = "disney.scel",
  dictname = "disney",
  dicttype = "scel"
)

# Project-specific custom dictionary (.txt file; dicttype left at its default).
installDict(
  dictpath = "自定义词典.txt",
  dictname = "dictionary_01"
)

# To remove the dictionaries again:
# uninstallDict("disney")
# uninstallDict("Vocabulary_books")
# uninstallDict("dictionary_01")

listDict()

# Add individual words (sentiment phrases, attraction and area names) so the
# segmenter keeps each of them as a single token. Both spellings
# "飞越地平线" and "飞跃地平线" are listed.
insertWords(c(
  "排队", "不满意", "非常满意", "很好", "不方便", "非常好", "很棒",
  "驴妈妈", "不舒服", "不值",
  "七个小矮人", "飞越地平线", "创极速光轮", "米奇大街", "奇想花园",
  "梦幻世界", "探险岛", "宝藏湾", "明日世界", "巴斯光年星际营救",
  "喷气背包飞行器", "太空幸会史迪奇", "星球大战远征基地",
  "皮克斯玩具总动员", "快速通道", "加勒比海盗", "灯光秀", "飞跃地平线"
))

# ---- Clean and segment the reviews ----

# comment_01 is a one-column data frame at this point (read_excel() returns
# a tibble, and `[, col]` on a tibble does not drop to a vector). Passing it
# straight to gsub() makes gsub() coerce it with as.character(), which
# deparses the whole column into ONE string of the form c("...", "..."), so
# the reviews would no longer be cleaned or segmented one by one. Pull the
# text column out as a character vector first.
comment_02 <- if (is.data.frame(comment_01)) {
  comment_01[["点评内容"]]
} else {
  comment_01
}

# Remove ASCII digits and Latin letters from every review.
comment_02 <- gsub("[0-9a-zA-Z]", "", as.character(comment_02))

# Segment each review separately and pool all tokens into one character
# vector. Blank spreadsheet cells arrive as NA and are skipped here.
segword <- unlist(lapply(comment_02[!is.na(comment_02)], segmentCN))

# ---- Build the stop-word vector ----

# Handy checks while developing:
# head(stopwords_01)   # peek at the data
# class(stopwords_01)  # shows that it is a data frame
# segword[1:10]

# The stop words were read in as a data frame; flatten the first column
# into a plain vector (data frame -> matrix -> first matrix column -> vector).
stopwords_01 <- as.vector(as.matrix(stopwords_01[, 1])[, 1])

#自定义删除停止词的函数