/* * Copyright Botwave.com All right reserved. */ package com.botwave.Apriori;
import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.Map; import java.util.Set;
/** * @author fisher 2010-2-22 下午05:03:13 */ public class MyApriori { /**----------------------------------------------------------------------------------------------** + Pass 1 + Generate the candidate itemsets in C1 + Save the frequent itemsets in L1 + Pass k + 1.Generate the candidate itemsets in Ck from the frequent + itemsets in Lk-1 + 1.Join Lk-1 p with Lk-1q, as follows: + insert into Ck + select p.item1, p.item2, . . . , p.itemk-1, q.itemk-1 + from Lk-1 p, Lk-1q + where p.item1 = q.item1, . . . p.itemk-2 = q.itemk-2, p.itemk-1 < q.itemk-1 + 2.Generate all (k-1)-subsets from the candidate itemsets in Ck + 3.Prune all candidate itemsets from Ck where some (k-1)-subset of the candidate itemset is not in the frequent itemset Lk-1 + 2.Scan the transaction database to determine the support for each candidate itemset in Ck + 3.Save the frequent itemsets in Lk *------------------------------------------------------------------------------------------------**/ /** * 方法说明:关联规则算法 * * @param aprioriss * 原始数据链 * @param k * 产生k项数据 * @param defaultSupoort * 默认需要满足的支持度 * @return * @author fisher * @see ~dbd/cs831/notes/itemsets/itemset_apriori.html */ public AprioriFrequentItemset<String> apriori(int[][] aprioriss, String split, int k, int defaultSupoort) { int hight = aprioriss[0].length; ApriorCandidateItemset<String> aci = new ApriorCandidateItemset<String>(); Set<String> cItemset = new LinkedHashSet<String>(); for(int i = 1; i <= hight; ++i) { cItemset.add("" + i); } aci.setItemsets(cItemset); AprioriFrequentItemset<String> afi = new AprioriFrequentItemset<String>(); for(int i = 1; i <= k; ++i) { afi = genFrequentItermsets(aprioriss, aci, split, defaultSupoort); aci = this.genCandidateItermsets(afi, split, k); } return afi; } /** * 方法说明:产生频繁集 * 例如:aci.iterms = {"1 2 3", "1 2 4", "1 3 4", "1 3 5"..}; * 1 2 3 支持度为:3;1 2 4 支持度为:2;1 3 4 支持度为:4;1 3 4 支持度为:2 * 如果默认支持度为3,则1 2 4 和 1 3 4 不满足,去除;返回剩余对象。 * @param aci * 候选集 * @param k * k项 * @return * @author fisher */ public AprioriFrequentItemset<String> genFrequentItermsets(int[][] aprioriss, ApriorCandidateItemset<String> aci, String split, int defaultSupoort) { /* * 空条件 */ if(aci.isEmpty()) return null; // 候选对象
Set<String> cItemsets = aci.getItemsets(); // 转为数组
Object[] citems = cItemsets.toArray(); int len = citems.length; AprioriFrequentItemset<String> afi = new AprioriFrequentItemset<String>(); Map<String, Integer> fItemsets = new LinkedHashMap<String, Integer>(); /* * 遍历候选对象,获取满足条件的频繁集 */ for(int i = 0; i < len; ++i) { String cIterm = (String) citems[i]; // 计算支持度
int support = calculateSupport(cIterm, split, aprioriss); // 支持度大于默认支持度,为频繁集
if(support >= defaultSupoort) { fItemsets.put(cIterm, support); } } afi.setItemsets(fItemsets); afi.setSupport(defaultSupoort); return afi; } /** * 方法说明:产生候选集 * * Generate the candidate itemsets in Ck from the frequent * itemsets in Lk-1 * 1 Join Lk-1 p with Lk-1q, as follows: * insert into Ck * select p.item1, p.item2, . . . , p.itemk-1, q.itemk-1 * from Lk-1 p, Lk-1q * where p.item1 = q.item1, . . . p.itemk-2 = q.itemk-2, p.itemk-1 < q.itemk-1 * 2 Generate all (k-1)-subsets from the candidate itemsets in Ck * * @param afi * Lk-1 上一个频复集 * @param k * 第K个 * @author fisher */ public ApriorCandidateItemset<String> genCandidateItermsets(AprioriFrequentItemset<String> afi, String split, int k) { /* * 空条件 */ if(afi.isEmpty() || k < 1) return null; // 第k-1个频繁集
Map<String, Integer> fItemsets = afi.getItemsets(); // key为数据存储的数据链,value为支持度
Set<String> fItemsetKey = fItemsets.keySet(); // 转为数组对象
Object[] fitems = fItemsetKey.toArray(); int len = fitems.length; ApriorCandidateItemset<String> aci = new ApriorCandidateItemset<String>(); Set<String> itemsets = new LinkedHashSet<String>(); int i, j; /* * 遍历频繁集,获取满足条件的候选集 */ for(i = 0; i < len; ++i) { for(j = i + 1; j < len; ++j) { String fitem1 = (String) fitems[i]; String fitem2 = (String) fitems[j]; String cIterm = getIntersection(fitem1, fitem2, split); if(!cIterm.isEmpty()) { itemsets.add(cIterm); } } } aci.setK(k); aci.setItemsets(itemsets); return aci; }
/** * 方法说明:计算候选对象支持度 * 例如:aprioriss = {{1,1,1,0,0}, {1,1,1,1,1 }, {1,0,1,1,0}, {1,0,1,1,1}, {1,1,1,1,0}}; * cIterm = "1 2 4"; split = " "; * 则支持度为2。(第2,5项支持) * * @param cIterm * 待计算字符串 * @param split * 分隔符 * @param aprioriss * 原始数据链 * @return * @author fisher */ public int calculateSupport(String cIterm, String split, int[][] aprioriss) { //TODO: 这是普通的计算方法,效率较低。
//TODO:Fast Algorithms for Mining Association Rules
/* * 空条件 */ if(cIterm.isEmpty() || split.isEmpty() || aprioriss.length < 1) return 0; // 分割出来
String[] cs = cIterm.split(split); // 记录cIterm所代表的aprioriss的下标
// 例如cs = {1, 2, 4},则代表的aprioriss下标为:0,1,3
int csLen = cs.length; Integer[] index = new Integer[csLen]; for(int i = 0; i < csLen; ++i) { index[i] = Integer.valueOf(cs[i]) -1; } /* * 开始计算 */ int count = 0, temp = 1; /* * 遍历aprioriss, 计数 */ for(int j = 0; j < aprioriss.length; ++j) { int[] aprioris = aprioriss[j]; /* * 遍历cs,看是否当前aprioris满足 */ for(int k = 0; k < csLen; ++k) { temp = 1; if(aprioris[index[k]] != 1) { // 如果有一项aprioriss下标为index[k]值不为1
// 表示存在一项不满足情况,则temp归0,count不增加
temp = 0; break; } } /* * temp > 0,满足,count加1 */ if(temp > 0) count = count + 1; } return count; } /** * 方法说明:获取两个字符的交集 * 条件:select p.item1, p.item2, . . . , p.itemk-1, q.itemk-1 * from Lk-1 p, Lk-1q * where p.item1 = q.item1, . . . p.itemk-2 = q.itemk-2, p.itemk-1 < q.itemk-1 * * 例1:str1 = "1 2 3"; str2 = "1 2 4"; split = " "; * 则存在交集:"1 2 3 4" * 例2:str1 = "1 3 4"; str2 = "1 2 3"; split = " "; * 则不存在交集 * @param str1 * 字符串1 * @param str2 * 字符串2 * @param split * 分隔符 * @return * 存在返回该交集,不存在返回"" * @author fisher */ private String getIntersection(String str1, String str2, String split) { String intersect= ""; if(isCoalition(str1, str2, split)) { /* * 存在交集 */ //以split分割获取str2最后对象
String last2 = getLast(str2, split); // 组装
intersect = str1 + split + last2; } return intersect; } /** * 方法说明:判断两个字符串是否存在交集 * 条件式:where p.item1 = q.item1, . . . p.itemk-2 = q.itemk-2, p.itemk-1 < q.itemk-1 * 例1:str1 = "1 2 3"; str2 = "1 2 4"; split = " "; * 则存在交集:"1 2 3 4",因为1=1, 2=2, 3 < 4 * 例2:str1 = "1 3 4"; str2 = "1 2 3"; split = " "; * 则不存在交集,因为1=1, 3!=2, 4 > 3 * @param str1 * 字符串1 * @param str2 * 字符串2 * @param split * 字符串分隔符 * @return * @author fisher */ private boolean isCoalition(String str1, String str2, String split) { /* * 空值情况 */ if(str1.isEmpty() || str2.isEmpty() || str1.length() != str2.length()) return false; /* * 分割 */ String[] s1 = str1.split(split); String[] s2 = str2.split(split);
/* * 长度必须相同 */ if(s1.length != s2.length) return false;
/* * p.item1 = q.item1, . . . p.itemk-2 = q.itemk-2 */ for(int i = 0; i < s1.length - 1; ++i) { if(!s1[i].equals(s2[i])) return false; } /* * p.itemk-1 < q.itemk-1 */ if(Integer.valueOf(getLast(str1, split)) >= Integer.valueOf(getLast(str2, split))) return false; /* * 否则返回true */ return true; } /** * 方法说明:通过指定分隔符,获取最后一个对象 * 例如:str = "1 2 3"; split = " "; * 则返回3 * * @param str * 原始字符串 * @param split * 分隔符 * @return * @author fisher */ public String getLast(String str, String split) { if(str.isEmpty()) return null; int index = str.lastIndexOf(split); return str.substring(index + 1); } public static void main(String[] args) {
int DEFAULTSUPOORT = (int) (5 * 0.4); String SPLIT = " "; int[][] APRIORISS = {{1,1,1,0,0}, {1,1,1,1,1 }, {1,0,1,1,0}, {1,0,1,1,1}, {1,1,1,1,0}}; Map<String, Integer> itemsets = new LinkedHashMap<String, Integer>(); itemsets.put("1 2", 3); itemsets.put("1 3", 2); itemsets.put("1 4", 3); itemsets.put("2 3", 4); itemsets.put("2 4", 5); itemsets.put("3 4", 3); AprioriFrequentItemset<String> afi = new AprioriFrequentItemset<String>(); afi.setSupport(1); afi.setItemsets(itemsets); MyApriori myApriori = new MyApriori(); String cIterm = "1"; System.out.println(myApriori.calculateSupport(cIterm, SPLIT, APRIORISS)); System.out.println(myApriori.genCandidateItermsets(afi, SPLIT, 3)); System.out.println(myApriori.genFrequentItermsets(APRIORISS, myApriori.genCandidateItermsets(afi, SPLIT, 3), SPLIT, 3)); System.out.println(myApriori.apriori(APRIORISS, SPLIT, 4, DEFAULTSUPOORT)); } }
|