def loadDataSet():
    return [[1, 2, 5], [2, 4], [2, 3], [1, 2, 4], [1, 3], [2, 3], [1, 3], [1, 2, 3, 5], [1, 2, 3]]

# 1. Build the candidate 1-itemsets C1
def createC1(dataSet):
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
    C1.sort()
    return list(map(frozenset, C1))
#
# frozenset is a frozen set: it is immutable and hashable.
# The advantage is that it can be used as a dictionary key or as an element of another set.
# The disadvantage is that it cannot be changed once created: it has no add or remove methods.
#
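To make that concrete, here is a minimal sketch (the variable name counts is just for illustration) of why frozenset, and not set, is what the counting dictionary in scanD below needs:

counts = {}
counts[frozenset([1, 2])] = 1   # works: frozenset is hashable
# counts[{1, 2}] = 1            # would raise TypeError: unhashable type: 'set'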
def apriori(dataSet, minSupport=0.5):
    D = list(map(set, dataSet))  # convert each transaction to a set (as in the second listing below) so issubset can be used
    C1 = createC1(dataSet)
    # scanD counts how often each candidate occurs, which gives the support of each item.
    L1, supportData = scanD(D, C1, minSupport)
    # scanD computes the support; the concept of support was explained in Association Rules (Part 1).
    L = [L1]
    #
    # L wraps the result in one more layer of list, e.g.:
    # [[frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})]]
    # Note that {4} is gone: its support is 25%, below the minimum support of 50%.
    # (These example values come from the 4-transaction dataset of the second
    # listing below; the run further down uses this 9-transaction dataset with
    # minSupport = 0.2, where {4} survives at 2/9, about 22%.)
    #
    k = 2
    while (len(L[k-2]) > 0):
        #
        # keep going as long as the previous pass still produced frequent itemsets
        #
        Lk_1 = L[k-2]
        Ck = aprioriGen(Lk_1, k)
        #
        # Each pass joins the previous itemsets into larger candidates; since the
        # results are sets, duplicates are merged automatically. For example:
        # [frozenset({1, 3}), frozenset({1, 2}), frozenset({1, 5}), frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5})]
        # second pass: [frozenset({1, 2}), frozenset({1, 5}), frozenset({2, 5}), frozenset({2, 4}), frozenset({2, 3}), frozenset({1, 3})]
        # third pass: [frozenset({1, 2, 5}), frozenset({1, 2, 3})]
        #
        print("ck:", Ck)
        Lk, supK = scanD(D, Ck, minSupport)
        #
        # scanD returns the candidates that survive the support threshold,
        # together with the support of every candidate it counted.
        #
        supportData.update(supK)
        print("lk:", Lk)
        L.append(Lk)
        k += 1
    return L, supportData
# Filter out itemsets that do not meet the minimum support.
# Returns the frequent itemset list (Lk) and a support dictionary for all counted candidates.
# support = ssCnt[key] / numItems
# numItems is len(D); D is the transaction data loaded at the start.
# Ck is the list of distinct candidate itemsets.
def scanD(D, Ck, minSupport):
    # candidate-set counts
    ssCnt = {}
    for tid in D:
        #
        # D is the list of transactions (the original data set).
        #
        for can in Ck:
            # Every candidate in Ck is a frozenset: immutable and hashable,
            # so it can be used directly as a dictionary key.
            if can.issubset(tid):
                if can not in ssCnt:
                    ssCnt[can] = 1
                else:
                    ssCnt[can] += 1
    #
    # The loop above counts how many transactions contain each candidate,
    # stored in a dictionary keyed by frozenset. A candidate is counted at
    # most once per transaction, so ssCnt[can] / len(D) is the fraction of
    # transactions in which that candidate occurs.
    #
    numItems = float(len(D))
    Lk = []
    # frequent itemsets Lk generated from the candidate itemsets in Ck
    supportData = {}
    #
    # support dictionary for the candidates in Ck:
    # key = candidate itemset, value = its support
    #
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            Lk.append(key)
        supportData[key] = support
    #
    # A key is kept in Lk only if the fraction of transactions containing it
    # is at least minSupport.
    #
    return Lk, supportData
#
# The returned supportData has the form:
# {frozenset({1}): 0.5,
#  frozenset({3}): 0.75,
#  frozenset({4}): 0.25,
#  frozenset({2}): 0.75,
#  frozenset({5}): 0.75}
# (again, example values from the 4-transaction dataset used later)
#
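As a quick sanity check, scanD can be run on its own (a minimal sketch using the functions above; minSupport = 0.5 here is only for illustration):

D = list(map(set, loadDataSet()))
L1, supp0 = scanD(D, createC1(loadDataSet()), 0.5)
# On the 9-transaction dataset, {4} and {5} each occur in 2/9 of the
# transactions (about 0.22), so only {1}, {2} and {3} survive at 0.5.
print(L1)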
# aprioriGen joins the previous level's frequent itemsets into larger candidates;
# since the results are sets, duplicates are merged automatically. For example:
# first pass: [frozenset({1, 3}), frozenset({1, 2}), frozenset({1, 5}), frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5})]
# second pass: [frozenset({1, 2}), frozenset({1, 5}), frozenset({2, 5}), frozenset({2, 4}), frozenset({2, 3}), frozenset({1, 3})]
# third pass: [frozenset({1, 2, 5}), frozenset({1, 2, 3})]
def aprioriGen(Lk_1, k):
    #
    # Called from the while loop in apriori, which passes in the previous
    # level's frequent itemsets. On the first call the whole list
    # [frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})] comes in
    # and k is 2; k then grows by one on each pass.
    #
    Ck = []
    lenLk = len(Lk_1)
    for i in range(lenLk):
        L1 = list(Lk_1[i])[:k - 2]
        #
        # On the first call, [:k - 2] is [:0], so this slice is empty and
        # every pair of 1-itemsets gets joined.
        #
        L1.sort()
        for j in range(i + 1, lenLk):
            # if the first k-2 items are the same, merge the two sets
            L2 = list(Lk_1[j])[:k - 2]
            L2.sort()
            if L1 == L2:
                Ck.append(Lk_1[i] | Lk_1[j])
                #
                # The union forms a k-item candidate, e.g.
                # [frozenset({1, 3}), frozenset({1, 2}), frozenset({1, 5}), frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5})]
                # Each combination is collected so its support (probability) can be
                # computed later. The outer loop takes each itemset in turn, the inner
                # loop compares it against the itemsets after it, and when their first
                # k-2 sorted items are equal the union is taken.
                #
    return Ck
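To see the prefix-join rule in action, here is a small check (a sketch using the function above; in CPython the iteration order of these small frozensets makes the result deterministic):

L2 = [frozenset({2, 3}), frozenset({2, 5}), frozenset({3, 5})]
print(aprioriGen(L2, 3))
# With k = 3 the first k-2 = 1 sorted items must match: {2, 3} and {2, 5}
# share the prefix [2] and join into {2, 3, 5}; {3, 5} has prefix [3] and
# matches neither, so the output is [frozenset({2, 3, 5})]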
dataset = loadDataSet()
L, supportData = apriori(dataset, minSupport=0.2)
ck: [frozenset({1, 2}), frozenset({1, 5}), frozenset({1, 4}), frozenset({1, 3}), frozenset({2, 5}), frozenset({2, 4}), frozenset({2, 3}), frozenset({4, 5}), frozenset({3, 5}), frozenset({3, 4})]
lk: [frozenset({1, 2}), frozenset({1, 5}), frozenset({2, 5}), frozenset({2, 4}), frozenset({2, 3}), frozenset({1, 3})]
ck: [frozenset({1, 2, 5}), frozenset({1, 2, 3}), frozenset({1, 3, 5}), frozenset({2, 4, 5}), frozenset({2, 3, 5}), frozenset({2, 3, 4})]
lk: [frozenset({1, 2, 5}), frozenset({1, 2, 3})]
ck: [frozenset({1, 2, 3, 5})]
lk: []
Extension 1:

L
[[frozenset({1}), frozenset({2}), frozenset({5}), frozenset({4}), frozenset({3})],
 [frozenset({1, 2}), frozenset({1, 5}), frozenset({2, 5}), frozenset({2, 4}), frozenset({2, 3}), frozenset({1, 3})],
 [frozenset({1, 2, 5}), frozenset({1, 2, 3})],
 []]
supportData
{frozenset({1}): 0.6666666666666666,
 frozenset({2}): 0.7777777777777778,
 frozenset({5}): 0.2222222222222222,
 frozenset({4}): 0.2222222222222222,
 frozenset({3}): 0.6666666666666666,
 frozenset({1, 2}): 0.4444444444444444,
 frozenset({1, 5}): 0.2222222222222222,
 frozenset({2, 5}): 0.2222222222222222,
 frozenset({2, 4}): 0.2222222222222222,
 frozenset({2, 3}): 0.4444444444444444,
 frozenset({1, 4}): 0.1111111111111111,
 frozenset({1, 3}): 0.4444444444444444,
 frozenset({3, 5}): 0.1111111111111111,
 frozenset({1, 2, 5}): 0.2222222222222222,
 frozenset({1, 2, 3}): 0.2222222222222222,
 frozenset({1, 3, 5}): 0.1111111111111111,
 frozenset({2, 3, 5}): 0.1111111111111111,
 frozenset({1, 2, 3, 5}): 0.1111111111111111}
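Each value is simply the count divided by the number of transactions. For example, {1, 2} occurs in four of the nine transactions ([1, 2, 5], [1, 2, 4], [1, 2, 3, 5] and [1, 2, 3]), giving support 4/9, which is the 0.4444444444444444 shown above.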
from numpy import *  # imported in the original source, though nothing below actually uses numpy

# Construct the sample data
def loadDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
# Convert every element to a frozenset and collect them in a list
def createC1(dataSet):
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
    C1.sort()
    # frozenset is used so these values can later serve as dictionary keys
    return list(map(frozenset, C1))  # frozenset is an immutable set; set is mutable
# Filter out itemsets that do not meet the minimum support.
# Returns the frequent itemset list (retList) and a support dictionary for all counted candidates.
# support = ssCnt[key] / numItems
# numItems is len(D); D is the transaction data loaded above.
# Ck is the list of distinct candidate itemsets.
def scanD(D, Ck, minSupport):
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):  # check whether can is a subset of this transaction
                if can not in ssCnt:  # count how many transactions contain each candidate (dictionary keyed by frozenset)
                    ssCnt[can] = 1
                else:
                    ssCnt[can] += 1
    numItems = float(len(D))
    retList = []  # collects the itemsets that satisfy the condition (support above the threshold)
    supportData = {}  # support of every counted candidate
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.insert(0, key)
        supportData[key] = support
    return retList, supportData  # itemsets that meet the support threshold, plus the support of each candidate
# Generate all joinable candidate combinations.
# Lk: list of frequent itemsets; k: number of items per candidate,
# e.g. [frozenset({2, 3}), frozenset({2, 5})] -> [frozenset({2, 3, 5})]
def aprioriGen(Lk, k):
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):  # two nested loops compare each element of Lk with every element after it
        for j in range(i+1, lenLk):
            L1 = list(Lk[i])[:k-2]  # convert the set to a list, then slice
            L2 = list(Lk[j])[:k-2]
            L1.sort(); L2.sort()
            # compare the first k-2 items of the two lists; if they are equal,
            # their union is a candidate with k elements
            if L1 == L2:
                retList.append(Lk[i] | Lk[j])  # set union
    return retList  # returns the candidate itemset list Ck
# Wrapper that runs every step.
# Returns all itemsets whose support exceeds the threshold, plus the support dictionary.
def apriori(dataSet, minSupport=0.5):
    D = list(map(set, dataSet))  # convert each transaction list to a set: [{1, 3, 4}, {2, 3, 5}, {1, 2, 3, 5}, {2, 5}]
    C1 = createC1(dataSet)  # wrap every item in a frozenset: [frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}), frozenset({5})]
    L1, supportData = scanD(D, C1, minSupport)  # filter by support
    L = [L1]
    k = 2
    while (len(L[k-2]) > 0):  # keep going while the previous pass still produced frequent itemsets
        Ck = aprioriGen(L[k-2], k)  # Ck: candidate frequent itemsets
        Lk, supK = scanD(D, Ck, minSupport)  # Lk: frequent itemsets
        supportData.update(supK)  # add the new itemset:support pairs to supportData
        L.append(Lk)
        k += 1  # each pass grows itemsets by exactly one element, so k increases by 1 (k = number of elements)
    return L, supportData
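For this 4-transaction dataset with the default minSupport = 0.5, the frequent itemsets can be verified by hand (the list order in the actual output may differ, since retList is built with insert(0, ...)):

# L[0]: {1} 2/4, {2} 3/4, {3} 3/4, {5} 3/4 all pass; {4} appears in 1/4 and is dropped
# L[1]: {1, 3} 2/4, {2, 3} 2/4, {3, 5} 2/4, {2, 5} 3/4
# L[2]: {2, 3, 5} 2/4
# L[3]: [] (no 4-item candidate can be formed, so the loop ends)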
# Wrapper that extracts the association rules
def generateRules(L, supportData, minConf=0.7):  # supportData is a dictionary
    bigRuleList = []
    for i in range(1, len(L)):  # start from the itemsets with 2 elements
        for freqSet in L[i]:
            # build a list of single-element frozensets from the itemset
            H1 = [frozenset([item]) for item in freqSet]  # frozenset({2, 3}) becomes [frozenset({2}), frozenset({3})]
            if (i > 1):
                # itemsets with more than 2 elements need further splitting to extract rules
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
# Evaluate candidate rules and keep the association rules that satisfy the minimum confidence
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    prunedH = []  # new list of surviving consequents to return
    for conseq in H:
        conf = supportData[freqSet]/supportData[freqSet-conseq]  # confidence = support(itemset) / support(antecedent)
        if conf >= minConf:
            print(freqSet-conseq, '-->', conseq, 'conf:', conf)
            brl.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH
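As a concrete check of the formula, using supports from the 4-transaction dataset above: for the rule {5} --> {2}, confidence = support({2, 5}) / support({5}) = 0.75 / 0.75 = 1.0, which passes minConf = 0.7; for {3} --> {1}, confidence = support({1, 3}) / support({3}) = 0.5 / 0.75, about 0.67, which fails.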
# Generate further candidate rules from larger itemsets
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    m = len(H[0])
    if (len(freqSet) > (m + 1)):  # the consequent can still grow, so try merging further
        Hmp1 = aprioriGen(H, m+1)  # merge the current consequents pairwise
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if (len(Hmp1) > 1):  # need at least two surviving consequents to merge again
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
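Tracing this on freqSet = frozenset({2, 3, 5}): H starts as [frozenset({2}), frozenset({3}), frozenset({5})], so m = 1 and len(freqSet) = 3 > 2; aprioriGen then builds the two-item consequents {2, 3}, {2, 5} and {3, 5}, but each fails the confidence test, e.g. conf({2} --> {3, 5}) = support({2, 3, 5}) / support({2}) = 0.5 / 0.75, about 0.67 < 0.7, so calcConf returns an empty list and the recursion stops.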
dataSet = loadDataSet()
L, suppData = apriori(dataSet, minSupport=0.5)
rules = generateRules(L, suppData, minConf=0.7)
# rules = generateRules(L, suppData, minConf=0.5)
print("The itemsets with strong association rules are:", rules)
Extension 2:
https://github.com/apachecn/AiLearning/blob/master/src/py2.x/ml/11.Apriori/apriori.py