Machine Learning Basics: Implementing a Basic Decision Tree
1. The basic workflow of a decision tree
Decision trees are a common machine learning method. While working through Zhou Zhihua's *Machine Learning*, I implemented the most basic algorithm, ID3, in Python. Its core idea comes from the information-theoretic notion of information gain: find the attribute (feature) with the highest information gain over the current samples, partition the samples into subsets by that attribute's values, and then recursively apply the same select-and-split step to each subset. This is a typical divide-and-conquer process; it stops when a subset is pure enough (or no attributes remain), at which point the subset becomes a leaf.
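In formulas: for a sample set D with class proportions p_k, the information entropy is Ent(D) = -Σ_k p_k log2(p_k), and the gain of splitting on attribute a with values v is Gain(D, a) = Ent(D) - Σ_v (|D_v|/|D|) Ent(D_v) (Eq. 4.1 and 4.2 in the book). Below is a minimal standalone sketch of these two quantities; the helpers `ent` and `info_gain` are illustrative only, not part of the implementation that follows:

```python
from collections import Counter
from math import log2

def ent(labels):
    """Ent(D) = -sum_k p_k * log2(p_k)."""
    n = len(labels)
    return -sum((c / n) * log2(c / n) for c in Counter(labels).values())

def info_gain(labels, attr_values):
    """Gain(D, a) = Ent(D) - sum_v (|D_v| / |D|) * Ent(D_v)."""
    n = len(labels)
    subsets = {}
    for label, v in zip(labels, attr_values):
        subsets.setdefault(v, []).append(label)
    return ent(labels) - sum(len(s) / n * ent(s) for s in subsets.values())

# A perfectly separating attribute recovers the full entropy: gain = 1.0.
print(info_gain(['是', '是', '否', '否'], ['清晰', '清晰', '模糊', '模糊']))
```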
2. A minimal decision tree implementation
Below, using the example from Chapter 4 of Zhou Zhihua's *Machine Learning* (also known as the "watermelon book"), I implement a decision tree in Python over data held in a pandas DataFrame, where all attributes are discrete. For continuous attributes, or the general case, you can call sklearn's decision tree package instead; that version is given at the end.
The samples are read from an Excel file (`watermalon_simple.xlsx`): an ID column, several discrete attribute columns such as 色泽 and 纹理, and the label column 好瓜 ('是' for a good melon, '否' for a bad one). The implementation:
```python
# Decision tree
# Exercise 4.3: implement a decision tree that chooses splits by information
# entropy, and build a tree for the data in Table 4.3.
# Table 4.3 contains both discrete and continuous attributes; only the
# discrete ones are handled here. The continuous case is covered by the
# sklearn version at the end of this post.
# _*_ coding:UTF-8 _*_
import os
from math import log2

import pandas as pd

def Entrop(data):
    """Entropy of each group in a grouped label Series.

    `data` comes from groupby(...).sum() on the '好瓜' column, so each value
    is a string of concatenated labels such as '是是否'.
    """
    entropy = dict()
    for i in data.keys():
        good = data[i].count('是')   # good melons in this group
        bad = data[i].count('否')    # bad melons in this group
        length = len(data[i])
        p1 = good / length
        p2 = bad / length
        if p1 == 0.0 or p2 == 0.0:
            entropy[i] = 0           # a pure group has zero entropy
        else:
            entropy[i] = -(p1 * log2(p1) + p2 * log2(p2))
    return entropy
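

# A quick illustration of Entrop's input and output (an assumed toy Series;
# numbers shown for reference only):
#   Entrop(pd.Series({'清晰': '是是是否', '模糊': '否否'}))
#   -> {'清晰': 0.811..., '模糊': 0}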


def DecisionTree(entropy, data, MaxKey, threshold, category):
    for i in entropy:
        sub_data = data[data[MaxKey] == i]   # samples whose MaxKey value is i
        subs_entropy = entropy[i]            # entropy of this subset
        sizes = len(sub_data)                # number of samples in this subset
        if subs_entropy >= threshold and len(category) > 0:
            gains = dict()   # information gain of each remaining attribute
            data_sample = dict()
            for j in category:
                data_sample[j] = sub_data['好瓜'].groupby(sub_data[j]).sum()
                he = 0   # weighted entropy of the subsets after splitting on j
                for group in data_sample[j].values:
                    good = group.count('是')
                    bad = group.count('否')
                    length = len(group)
                    if length == 0 or good == 0 or bad == 0:
                        ent = 0
                    else:
                        p1 = good / length
                        p2 = bad / length
                        ent = -(p1 * log2(p1) + p2 * log2(p2))
                    he += (length * ent) / sizes
                gains[j] = subs_entropy - he
            # Split on the attribute with the largest information gain.
            maxKey = max(gains.items(), key=lambda x: x[1])[0]
            entropys = Entrop(data_sample[maxKey])
            # Exclude the chosen attribute for this branch only; pass a copy
            # so that sibling branches can still split on it.
            sub_category = {k: v for k, v in category.items() if k != maxKey}
            print('Under {0}, the next split attribute is {1}'.format(i, maxKey))
            DecisionTree(entropys, sub_data, maxKey, threshold, sub_category)
        else:
            # The subset is pure enough (or no attributes remain): make it a
            # leaf labelled with its majority class.
            highest_class = sub_data['好瓜'].value_counts().idxmax()
            if highest_class == '否':
                print('All melons under {0} are bad'.format(i))
            else:
                print('All melons under {0} are good'.format(i))


def main():
    """Entry point."""
    dataset_path = './datasets'          # dataset directory
    filename = 'watermalon_simple.xlsx'  # file name
    dataset_filepath = os.path.join(dataset_path, filename)

    # Step 1: read the data.
    data = pd.read_excel(dataset_filepath)
    header = data.columns   # column names
    sums = len(data)        # number of samples

    # Step 2: collect each attribute's set of values (every column between
    # the ID column and the 好瓜 label column).
    category = dict()
    for i in header[1:-1]:
        category[i] = set(data[i])

    # Step 3: compute the entropies.
    # 1) Entropy of the full sample set.
    Ent = dict()
    counts = data['好瓜'].value_counts()
    p1 = counts['是'] / sums   # fraction of good melons
    p2 = counts['否'] / sums   # fraction of bad melons
    Ent['总'] = -(p1 * log2(p1) + p2 * log2(p2))

    # 2) Information gain of each attribute. Each attribute takes a handful
    # of values, e.g. 色泽 has three: 青绿, 乌黑, 浅白.
    Gain = dict()
    data_sample = dict()
    for i in category:
        data_sample[i] = data['好瓜'].groupby(data[i]).sum()
        he = 0   # weighted entropy of the subsets after splitting on i
        for group in data_sample[i].values:
            good = group.count('是')
            bad = group.count('否')
            length = len(group)
            p1 = good / length
            p2 = bad / length
            if p1 == 0.0 or p2 == 0.0:
                ent = 0
            else:
                ent = -(p1 * log2(p1) + p2 * log2(p2))
            he += (length * ent) / sums
        Gain[i] = Ent['总'] - he

    # 3) The attribute with the largest gain becomes the root split;
    # on this dataset it is 纹理.
    MaxKey = max(Gain.items(), key=lambda x: x[1])[0]
    print('The root split attribute is {}'.format(MaxKey))

    # 4) Entropy of each branch under the root split (a dict keyed by the
    # attribute's values).
    entropy = Entrop(data_sample[MaxKey])
    print(entropy)
    # e.g. {'清晰': 0.7642045065086203, '稍糊': 0.7219280948873623, '模糊': 0}

    # 4.1) The chosen attribute cannot be reused below this node.
    category.pop(MaxKey)

    # 5) Branches whose entropy exceeds the threshold are split further;
    # branches below it become leaves labelled with the majority class.
    # Here the 模糊 branch has entropy 0, so it is not split again.
    threshold = 0.05
    DecisionTree(entropy, data, MaxKey, threshold, category)


if __name__ == "__main__":
    main()
```
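One note on the counting trick used above: `groupby(...).sum()` works only because summing strings concatenates the '是'/'否' labels within each group, which are then counted character by character. A more conventional pandas way to get the same per-group counts, shown on a hypothetical mini-frame with the same column layout:

```python
import pandas as pd

# Hypothetical mini-frame with the same column layout as the watermelon data.
df = pd.DataFrame({'纹理': ['清晰', '清晰', '模糊'],
                   '好瓜': ['是', '否', '否']})

# Label counts within each attribute value, without the string trick.
counts = df.groupby('纹理')['好瓜'].value_counts().unstack(fill_value=0)
print(counts)   # rows: attribute values; columns: counts of 否 and 是
```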
3. The general case with sklearn

For continuous attributes, or simply for a more robust implementation, sklearn's `DecisionTreeClassifier` does the job. Below, the same Excel file is read with xlrd, the categorical features are one-hot encoded with `DictVectorizer`, and a tree is trained and exported for visualization:

```python
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in very old versions
import numpy as np
import xlrd  # note: xlrd >= 2.0 no longer reads .xlsx; use xlrd < 2.0 or openpyxl

workbook = xlrd.open_workbook('watermalon_simple.xlsx')
booksheet = workbook.sheet_by_name('Sheet1')

X = list()   # feature dicts, one per row
Y = list()   # raw labels, one per row
cat = ['色泽', '根蒂', '敲声', '纹理', '触感']
for row in range(booksheet.nrows):
    ll = dict()
    mm = list()
    for col in range(1, booksheet.ncols - 1):
        ll[cat[col - 1]] = booksheet.cell(row, col).value
    mm.append(booksheet.cell(row, booksheet.ncols - 1).value)
    print('Row {} read'.format(row))
    X.append(ll)
    Y.append(mm)
X.pop(0); Y.pop(0)   # drop the header row
print(X); print(Y)

# Encode the labels: 1 = good melon ('是'), 0 = bad melon ('否'); this mirrors
# the labels collected in Y above.
y = [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
dummyY = np.array(y)

# One-hot encode the categorical features.
vec = DictVectorizer()
dummyX = vec.fit_transform(X).toarray()

# Split into training and test sets.
x_train, x_test, y_train, y_test = train_test_split(dummyX, dummyY, test_size=0.3)

# Train a decision tree. criterion="entropy" selects information entropy as
# the impurity measure, the closest built-in option to ID3 (sklearn itself
# implements an optimized CART).
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(x_train, y_train)

print('Predictions on the test set:')
answer = clf.predict(x_test)
print(answer)
print(y_test)

# Export the tree for visualization with Graphviz
# (get_feature_names_out needs sklearn >= 1.0; older versions used get_feature_names).
with open("alalala.dot", "w") as f:
    tree.export_graphviz(clf, feature_names=vec.get_feature_names_out(), out_file=f)
```
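The exported `alalala.dot` file can be rendered with Graphviz, e.g. `dot -Tpng alalala.dot -o tree.png`. If Graphviz is not installed, newer sklearn versions can draw the tree directly with matplotlib; a minimal sketch reusing the `clf` and `vec` objects from above:

```python
import matplotlib.pyplot as plt
from sklearn import tree

# Draw the fitted tree without Graphviz (requires sklearn >= 0.21).
plt.figure(figsize=(12, 6))
tree.plot_tree(clf, feature_names=vec.get_feature_names_out(), filled=True)
plt.show()
```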