import os
from collections import Counter

import jieba
import numpy as np

# Classify a single headline with a naive Bayes model using add-one
# (Laplace) smoothing.  Each sub-directory of ``model`` is one category and
# holds a ``data.txt`` file of space-separated training tokens.

content = '吸金能力爆炸!《宝可梦》系列软件全球销量已突破4.89亿份'

# Stop words, one per line.  A set gives O(1) membership tests.
with open(r'停用词/cn_stopwords.txt', encoding='utf-8') as f:
    tyc = set(f.read().split("\n"))

# Segmented headline with the stop words removed.
wordlist = [word for word in jieba.cut(content) if word not in tyc]

# Category names are the directory names under ``model``.  Entries without a
# data.txt (stray files, empty folders) are skipped instead of crashing open().
listdir = [
    x for x in os.listdir('model')
    if os.path.isfile(f'model/{x}/data.txt')
]

# Read every category exactly once, keeping its token frequencies and its
# total token count; ``total`` accumulates corpus-wide token frequencies.
type_counts = {}
type_sizes = {}
total = Counter()
for x in listdir:
    with open(f'model/{x}/data.txt', encoding='utf-8') as f:
        type1 = f.read().split(" ")
    type_counts[x] = Counter(type1)
    type_sizes[x] = len(type1)
    total.update(type_counts[x])

# Vocabulary size: number of distinct tokens over all categories
# (147908 on the original author's corpus).
total_num = len(total)

# Log-score of the headline for every category.
gl_dict = {}
for x in listdir:
    counts = type_counts[x]
    size = type_sizes[x]
    num = 0
    for word in wordlist:
        # Smoothed likelihood: (count + 1) / (category tokens + vocabulary).
        # Counter returns 0 for unseen words without inserting them.
        num += np.log((counts[word] + 1) / (size + total_num))
    # Prior term.  It is normalised by the vocabulary size rather than by the
    # total token count, which shifts every category's score by the same
    # constant and therefore does not change which category wins.
    gl_dict[x] = num + np.log(size / total_num)

# Print the best-scoring category (the original run printed '游戏').
print(max(gl_dict, key=gl_dict.get))
05.文章分类-贝叶斯模型训练
本节1077字 2025-05-24 12:33:30