数据集为AG News
1. 清洗数据
1.1.将csv格式数据转为list
import csv

def csv_to_list(filename):
    """Read an AG News CSV file into a list of [label, text] pairs.

    The CSV columns are: label, title, description.  Title and description
    are concatenated with a space, since merging them was measured to
    improve accuracy (0.888289 -> 0.896316).

    Args:
        filename: path to the CSV file; the first row is a header row
            and is skipped.

    Returns:
        A list of [label, title + ' ' + description] pairs.
    """
    res = []
    # newline='' is the documented way to open files for the csv module;
    # it stops newline translation from corrupting quoted fields.
    with open(filename, 'r', newline='') as f:
        reader = csv.reader(f)
        # Skip the header row; next(reader, None) is safe even when the
        # file is empty, unlike the original del res[0].
        next(reader, None)
        for row in reader:
            res.append([row[0], row[1] + ' ' + row[2]])
    return res
1) csv第一行为表头,通过del删去
2) 数据集第二列为title,第三列为description,经过实验发现忽略title的准确率为0.888289,将title并入后准确率为0.896316,拥有了一定的提升,因此在这里将title与description合并
1.2. 语料预处理
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
# Named import instead of the original wildcard "from nltk.stem.porter import *".
from nltk.stem.porter import PorterStemmer

def clean_list(res):
    """Normalise [label, text] pairs into [label, token_list] pairs.

    Pipeline (same order as the original res1..res6 stages):
    1. replace '-' and '\\' separators with spaces,
    2. tokenize with nltk.word_tokenize,
    3. lowercase,
    4. keep only purely alphabetic tokens (digits/punctuation dropped),
    5. drop English stop words,
    6. Porter-stem each token.

    Args:
        res: list of [label, raw_text] pairs (output of csv_to_list).

    Returns:
        list of [label, list_of_stemmed_tokens] pairs.
    """
    # NOTE(review): inside a character class '|' is a literal, so this
    # pattern also replaces '|' characters — presumably only '-' and '\\'
    # were intended; behavior kept as-is.
    sep_re = re.compile(r'[-|\\]')
    word_re = re.compile(r'^[a-z]+$')
    # A set gives O(1) membership tests vs. O(n) on the original list.
    stop_set = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    cleaned = []
    for label, text in res:
        tokens = word_tokenize(sep_re.sub(' ', text))
        tokens = [w.lower() for w in tokens]
        tokens = [w for w in tokens if word_re.search(w)]
        tokens = [w for w in tokens if w not in stop_set]
        tokens = [stemmer.stem(w) for w in tokens]
        cleaned.append([label, tokens])
    return cleaned
res1: 观察语料,注意到其中包含使用'-'以及'\'的分词方式。为了后续处理方便,这里将它们统一替换为空格
res2: 使用nltk的word_tokenize进行分词
res3: 英文小写化
res4: 删去英文以外的字符(这里假设数字对文本相似度的影响力可以忽略,因此对数字也进行了删除)
res5: 删去stop words
res6: stemming
2. 语料向量化
2.1. 创建字典
import gensim
from gensim import corpora

def make_dictionary(res6):
    """Build a gensim Dictionary over the token lists of cleaned samples.

    Args:
        res6: list of [label, token_list] pairs (output of clean_list).

    Returns:
        A corpora.Dictionary mapping each token to an integer id.
    """
    token_lists = [sample[1] for sample in res6]
    return corpora.Dictionary(token_lists)
2.2. 向量化并添加标签
def list_to_vec(res6, dictionary):
    """Convert cleaned samples into (feature_dict, label) pairs for NLTK.

    Each document becomes a binary bag-of-words feature dict
    {token_id: 1}: the term-frequency values returned by doc2bow are
    deliberately discarded, only token presence is kept.

    Args:
        res6: list of [label, token_list] pairs (output of clean_list).
        dictionary: gensim-style dictionary exposing doc2bow(tokens).

    Returns:
        list of (feature_dict, label) tuples, ready for
        nltk.NaiveBayesClassifier.train.
    """
    labels = [sample[0] for sample in res6]
    docs = [sample[1] for sample in res6]
    bows = [dictionary.doc2bow(doc) for doc in docs]
    # 'token_id' avoids shadowing the builtin id(); '_tf' marks the
    # unused term-frequency component explicitly.
    feature_dicts = [{token_id: 1 for (token_id, _tf) in bow} for bow in bows]
    return list(zip(feature_dicts, labels))
3. 创建分类器
import nltk

def classifying(all_labeled_data):
    """Train a Naive Bayes classifier on (feature_dict, label) pairs.

    Args:
        all_labeled_data: list of (feature_dict, label) tuples
            (output of list_to_vec).

    Returns:
        A trained nltk.NaiveBayesClassifier.
    """
    model = nltk.NaiveBayesClassifier.train(all_labeled_data)
    return model
4. 计算准确率
def calculate_acc(classifier, all_test_labeled_data):
    """Return the classifier's accuracy on labelled test data.

    Args:
        classifier: a trained NLTK classifier.
        all_test_labeled_data: list of (feature_dict, label) tuples.

    Returns:
        Accuracy as a float in [0, 1].
    """
    acc = nltk.classify.accuracy(classifier, all_test_labeled_data)
    return acc
5. main
# --- Training phase: clean train.csv, build the dictionary, train the model.
train_rows = csv_to_list('train.csv')
train_clean = clean_list(train_rows)
dictionary = make_dictionary(train_clean)
train_data = list_to_vec(train_clean, dictionary)
classifier = classifying(train_data)

# --- Evaluation phase: vectorise test.csv with the SAME dictionary so the
# test features share the training token ids, then report accuracy.
test_rows = csv_to_list('test.csv')
test_clean = clean_list(test_rows)
test_data = list_to_vec(test_clean, dictionary)
accuracy = calculate_acc(classifier, test_data)
print(accuracy)