关联规则算法的实现及其应用 Apriori算法
实验一 关联规则算法的实现及其应用
【实验目的】
- 掌握关联规则挖掘算法Apriori的概念,理解算法的步骤。
- 加深对Apriori算法的理解,逐步培养解决实际问题的能力。
【实验性质】
设计型实验
【实验内容】
实现使用Apriori算法来挖掘关联规则
【实验环境】
Python 2
【实验结果】
- 完整的apriori.py文件
代码:
def loadDataSet():
    """Return the small hard-coded transaction database used by the demo.

    Returns:
        A list of four transactions; each transaction is a list of
        integer item ids.
    """
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions
def createC1(dataSet):
    """Build C1, the list of all candidate 1-itemsets.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        A list of single-item frozensets ordered by item value.
        frozenset is used so each candidate can serve as a dict key.
    """
    # A set de-duplicates in O(1) per item instead of scanning a list.
    items = set()
    for transaction in dataSet:
        items.update(transaction)
    # Return a real list: on Python 3 a bare map() is a one-shot iterator
    # and would be exhausted after the first pass over the candidates.
    return [frozenset([item]) for item in sorted(items)]
def scanD(D, Ck, minSupport):
    """Count candidate support over the transactions and filter by threshold.

    Args:
        D: iterable of transactions; each must be acceptable as the argument
            of ``frozenset.issubset`` (e.g. a set).
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        A ``(retList, supportData)`` tuple. ``retList`` holds the candidates
        whose support is >= ``minSupport``. ``supportData`` maps every
        candidate that occurred in at least one transaction (kept or not)
        to its support.
    """
    # Materialise both inputs so one-shot iterators (e.g. Python 3 map
    # objects) can be scanned repeatedly and measured with len().
    D = list(D)
    Ck = list(Ck)

    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                # dict.has_key() no longer exists on Python 3.
                ssCnt[can] = ssCnt.get(can, 0) + 1

    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    # The original prepended each hit; reversing once yields the same order
    # without the O(n) cost of insert(0, ...) per element.
    retList.reverse()
    return retList, supportData
def aprioriGen(Lk, k):
    """Create the candidate k-itemsets Ck from the frequent (k-1)-itemsets.

    Two itemsets are merged (set union) when their k-2 smallest items are
    identical.

    Args:
        Lk: list of frozensets, all expected to have k-1 items, with
            mutually comparable items.
        k: size of the itemsets to generate.

    Returns:
        A list of frozensets, one per merged pair, in pair order (i < j).
    """
    # Sort BEFORE slicing: set iteration order is arbitrary, so slicing the
    # unsorted list could compare different items and miss valid merges.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]

    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList
def apriori(dataSet, minSupport=0.5):
    """Find all frequent itemsets in ``dataSet`` level by level.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable
            items.
        minSupport: minimum fraction of transactions an itemset must appear
            in to be considered frequent.

    Returns:
        A ``(L, supportData)`` tuple. ``L[i]`` is the list of frequent
        (i+1)-itemsets; the last entry of ``L`` is always an empty list
        because the loop stops only after a level produced nothing.
        ``supportData`` accumulates the support dicts returned by ``scanD``
        for every level.
    """
    # Materialise both collections: they are scanned once per level, which
    # a one-shot iterator (Python 3 map object) cannot support.
    C1 = list(createC1(dataSet))
    D = [set(transaction) for transaction in dataSet]

    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    while L[k - 2]:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData
def generateRules(L, supportData, minConf=0.7):
    """Generate association rules from the frequent itemsets.

    Args:
        L: list of lists of frozensets as returned by ``apriori``; ``L[0]``
            (the 1-itemsets) is skipped because a rule needs at least two
            items.
        supportData: dict mapping itemsets to their support.
        minConf: minimum confidence a rule must reach to be reported.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []
    for i in range(1, len(L)):
        for freqSet in L[i]:
            # Must be a list, not a generator: rulesFromConseq indexes H[0].
            H1 = [frozenset([item]) for item in freqSet]
            if i > 1:
                # The original passed the misspelled name ``freqset`` here,
                # which raised NameError for any itemset of 3+ items.
                rulesFromConseq(freqSet, H1, supportData, bigRuleList,
                                minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Evaluate the rules ``(freqSet - conseq) --> conseq`` for each conseq in H.

    Confidence is ``support(freqSet) / support(freqSet - conseq)``.

    Args:
        freqSet: frozenset, the frequent itemset the rules are drawn from.
        H: iterable of frozensets, the candidate consequents.
        supportData: dict mapping itemsets to support; must contain
            ``freqSet`` and every ``freqSet - conseq`` (KeyError otherwise).
        brl: list that accepted rules are appended to, as
            ``(antecedent, consequent, confidence)`` tuples.
        minConf: minimum confidence for a rule to be accepted.

    Returns:
        The list of consequents whose rule met ``minConf``.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            # One pre-formatted string prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print('%s --> %s conf: %s' % (antecedent, conseq, conf))
            brl.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively grow rule consequents for ``freqSet`` and test them.

    Merges the m-item consequents in ``H`` into (m+1)-item candidates,
    keeps those whose rule reaches ``minConf`` (appending the rules to
    ``brl`` via ``calcConf``), and recurses while more than one survives.

    Args:
        freqSet: frozenset, the frequent itemset the rules are drawn from.
        H: list of frozensets of equal size, the current consequents.
        supportData: dict mapping itemsets to support.
        brl: list that accepted rules are appended to.
        minConf: minimum confidence for a rule to be accepted.

    Returns:
        None; results are delivered through ``brl``.
    """
    # NOTE(review): when generateRules passes the 1-item consequents of a
    # frequent set with 3+ items, those 1-item consequents are never given
    # to calcConf here -- only the merged (m+1)-item ones are. Confirm
    # whether skipping them is intended.
    if not H:
        # Nothing to merge; the original raised IndexError on H[0].
        return
    m = len(H[0])
    if len(freqSet) > (m + 1):
        # The original read ``aprioriGen(H.m+1)`` -- an attribute access
        # typo for the two-argument call below.
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if len(Hmp1) > 1:
            # At least two surviving consequents are needed to merge again.
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
- 编写调用并测试apriori.py的代码
代码:
import apriori
# Driver script exercising apriori.py step by step.
# Written for Python 2: it uses print statements and the builtin reload().

# Load the sample transactions and show them.
dataSet=apriori.loadDataSet()
print dataSet
# Build the candidate 1-itemsets.
C1=apriori.createC1(dataSet)
print C1
# Convert every transaction to a set so subset tests can be applied.
D=map(set,dataSet)
print D
# Keep the 1-itemsets whose support is at least 0.5.
L1,suppData0=apriori.scanD(D,C1,0.5)
print L1
# Re-import the module (presumably after editing apriori.py between steps).
reload (apriori)
# Run the full algorithm with the default minimum support.
L,suppData=apriori.apriori(dataSet)
print L
# Inspect the per-level itemset lists one at a time.
# NOTE(review): indexing up to L[3] assumes this data set yields at least
# four levels -- confirm if the data or the support threshold changes.
print L[0]
print L[1]
print L[2]
print L[3]
# Show the candidate 2-itemsets generated from the frequent 1-itemsets.
print apriori.aprioriGen(L[0],2)
# Repeat with a stricter support threshold.
L,suppData=apriori.apriori(dataSet,minSupport=0.7)
print L
reload(apriori)
# Mine rules from the frequent itemsets at two confidence thresholds.
L,suppData=apriori.apriori(dataSet,minSupport=0.5)
rules=apriori.generateRules(L,suppData,minConf=0.7)
print rules
rules=apriori.generateRules(L,suppData,minConf=0.5)
print rules
- 运行结果:
【实验步骤】
第一步:生成候选项集
程序清单1:Apriori算法中的辅助函数
代码:
def loadDataSet():
    """Return the small hard-coded transaction database used by the demo.

    Returns:
        A list of four transactions; each transaction is a list of
        integer item ids.
    """
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions
def createC1(dataSet):
    """Build C1, the list of all candidate 1-itemsets.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        A list of single-item frozensets ordered by item value.
        frozenset is used so each candidate can serve as a dict key.
    """
    # A set de-duplicates in O(1) per item instead of scanning a list.
    items = set()
    for transaction in dataSet:
        items.update(transaction)
    # Return a real list: on Python 3 a bare map() is a one-shot iterator
    # and would be exhausted after the first pass over the candidates.
    return [frozenset([item]) for item in sorted(items)]
def scanD(D, Ck, minSupport):
    """Count candidate support over the transactions and filter by threshold.

    Args:
        D: iterable of transactions; each must be acceptable as the argument
            of ``frozenset.issubset`` (e.g. a set).
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        A ``(retList, supportData)`` tuple. ``retList`` holds the candidates
        whose support is >= ``minSupport``. ``supportData`` maps every
        candidate that occurred in at least one transaction (kept or not)
        to its support.
    """
    # Materialise both inputs so one-shot iterators (e.g. Python 3 map
    # objects) can be scanned repeatedly and measured with len().
    D = list(D)
    Ck = list(Ck)

    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                # dict.has_key() no longer exists on Python 3.
                ssCnt[can] = ssCnt.get(can, 0) + 1

    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    # The original prepended each hit; reversing once yields the same order
    # without the O(n) cost of insert(0, ...) per element.
    retList.reverse()
    return retList, supportData
运行结果:
第二步:组织完整的Apriori算法
程序清单2:Apriori算法
代码:
def aprioriGen(Lk, k):
    """Create the candidate k-itemsets Ck from the frequent (k-1)-itemsets.

    Two itemsets are merged (set union) when their k-2 smallest items are
    identical.

    Args:
        Lk: list of frozensets, all expected to have k-1 items, with
            mutually comparable items.
        k: size of the itemsets to generate.

    Returns:
        A list of frozensets, one per merged pair, in pair order (i < j).
    """
    # Sort BEFORE slicing: set iteration order is arbitrary, so slicing the
    # unsorted list could compare different items and miss valid merges.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]

    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList
def apriori(dataSet, minSupport=0.5):
    """Find all frequent itemsets in ``dataSet`` level by level.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable
            items.
        minSupport: minimum fraction of transactions an itemset must appear
            in to be considered frequent.

    Returns:
        A ``(L, supportData)`` tuple. ``L[i]`` is the list of frequent
        (i+1)-itemsets; the last entry of ``L`` is always an empty list
        because the loop stops only after a level produced nothing.
        ``supportData`` accumulates the support dicts returned by ``scanD``
        for every level.
    """
    # Materialise both collections: they are scanned once per level, which
    # a one-shot iterator (Python 3 map object) cannot support.
    C1 = list(createC1(dataSet))
    D = [set(transaction) for transaction in dataSet]

    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    while L[k - 2]:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData
运行结果:
第三步:从频繁项集中挖掘关联规则
程序清单3:关联规则生成函数
代码:
def generateRules(L, supportData, minConf=0.7):
    """Generate association rules from the frequent itemsets.

    Args:
        L: list of lists of frozensets as returned by ``apriori``; ``L[0]``
            (the 1-itemsets) is skipped because a rule needs at least two
            items.
        supportData: dict mapping itemsets to their support.
        minConf: minimum confidence a rule must reach to be reported.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []
    for i in range(1, len(L)):
        for freqSet in L[i]:
            # Must be a list, not a generator: rulesFromConseq indexes H[0].
            H1 = [frozenset([item]) for item in freqSet]
            if i > 1:
                # The original passed the misspelled name ``freqset`` here,
                # which raised NameError for any itemset of 3+ items.
                rulesFromConseq(freqSet, H1, supportData, bigRuleList,
                                minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Evaluate the rules ``(freqSet - conseq) --> conseq`` for each conseq in H.

    Confidence is ``support(freqSet) / support(freqSet - conseq)``.

    Args:
        freqSet: frozenset, the frequent itemset the rules are drawn from.
        H: iterable of frozensets, the candidate consequents.
        supportData: dict mapping itemsets to support; must contain
            ``freqSet`` and every ``freqSet - conseq`` (KeyError otherwise).
        brl: list that accepted rules are appended to, as
            ``(antecedent, consequent, confidence)`` tuples.
        minConf: minimum confidence for a rule to be accepted.

    Returns:
        The list of consequents whose rule met ``minConf``.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            # One pre-formatted string prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print('%s --> %s conf: %s' % (antecedent, conseq, conf))
            brl.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively grow rule consequents for ``freqSet`` and test them.

    Merges the m-item consequents in ``H`` into (m+1)-item candidates,
    keeps those whose rule reaches ``minConf`` (appending the rules to
    ``brl`` via ``calcConf``), and recurses while more than one survives.

    Args:
        freqSet: frozenset, the frequent itemset the rules are drawn from.
        H: list of frozensets of equal size, the current consequents.
        supportData: dict mapping itemsets to support.
        brl: list that accepted rules are appended to.
        minConf: minimum confidence for a rule to be accepted.

    Returns:
        None; results are delivered through ``brl``.
    """
    # NOTE(review): when generateRules passes the 1-item consequents of a
    # frequent set with 3+ items, those 1-item consequents are never given
    # to calcConf here -- only the merged (m+1)-item ones are. Confirm
    # whether skipping them is intended.
    if not H:
        # Nothing to merge; the original raised IndexError on H[0].
        return
    m = len(H[0])
    if len(freqSet) > (m + 1):
        # The original read ``aprioriGen(H.m+1)`` -- an attribute access
        # typo for the two-argument call below.
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if len(Hmp1) > 1:
            # At least two surviving consequents are needed to merge again.
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
运行结果:
我想能看到这里的同学,无外乎两种人:来拷贝代码的人 和 来拷贝代码的人。
但,在拷贝走的时候,你要想清楚一件事,把代码拷走之后有个蛋用,搞明白对你来说才是最重要的。
好了,就酱紫。
老铁,这要是都不赞,说不过去吧!!!
最后对自己说:
你现在所遭遇的每一个不幸,都来自一个不肯努力的曾经。