Chinaunix首页 | 论坛 | 博客
  • 博客访问: 1966834
  • 博文数量: 606
  • 博客积分: 9991
  • 博客等级: 中将
  • 技术积分: 5725
  • 用 户 组: 普通用户
  • 注册时间: 2008-07-17 19:07
文章分类

全部博文(606)

文章存档

2011年(10)

2010年(67)

2009年(155)

2008年(386)

分类: Java

2010-02-23 12:47:06

MyApriori.java
 

/*
 * Copyright Botwave.com All right reserved.
 */

package com.botwave.Apriori;

import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

/**
 * @author fisher 2010-2-22 下午05:03:13
 */

public class MyApriori {
    
    /**----------------------------------------------------------------------------------------------**
     + Pass 1
     +     Generate the candidate itemsets in C1
     +     Save the frequent itemsets in L1
     + Pass k
     +     1.Generate the candidate itemsets in Ck from the frequent
     +         itemsets in Lk-1
     +         1.Join Lk-1 p with Lk-1q, as follows:
     +          insert into Ck
     +          select p.item1, p.item2, . . . , p.itemk-1, q.itemk-1
     +          from Lk-1 p, Lk-1q
     +          where p.item1 = q.item1, . . . p.itemk-2 = q.itemk-2, p.itemk-1 < q.itemk-1
     +         2.Generate all (k-1)-subsets from the candidate itemsets in Ck
     +         3.Prune all candidate itemsets from Ck where some (k-1)-subset of the candidate itemset is not in the frequent itemset Lk-1
     +     2.Scan the transaction database to determine the support for each candidate itemset in Ck
     +     3.Save the frequent itemsets in Lk
     *------------------------------------------------------------------------------------------------**/

    /**
     * 方法说明:关联规则算法
     *
     * @param aprioriss
     *             原始数据链
     * @param k
     *             产生k项数据
     * @param defaultSupoort
     *             默认需要满足的支持度
     * @return
     * @author fisher
     * @see ~dbd/cs831/notes/itemsets/itemset_apriori.html
     */

    public AprioriFrequentItemset<String> apriori(int[][] aprioriss, String split, int k, int defaultSupoort) {
        int hight = aprioriss[0].length;
        ApriorCandidateItemset<String> aci = new ApriorCandidateItemset<String>();
        Set<String> cItemset = new LinkedHashSet<String>();
        for(int i = 1; i <= hight; ++i) {
            cItemset.add("" + i);
        }
        aci.setItemsets(cItemset);
        
        AprioriFrequentItemset<String> afi = new AprioriFrequentItemset<String>();
        for(int i = 1; i <= k; ++i) {
            afi = genFrequentItermsets(aprioriss, aci, split, defaultSupoort);
            aci = this.genCandidateItermsets(afi, split, k);
        }
        return afi;
    }
    
    /**
     * 方法说明:产生频繁集
     *             例如:aci.iterms = {"1 2 3", "1 2 4", "1 3 4", "1 3 5"..};
     *                  1 2 3 支持度为:3;1 2 4 支持度为:2;1 3 4 支持度为:4;1 3 4 支持度为:2
     *                  如果默认支持度为3,则1 2 4 和 1 3 4 不满足,去除;返回剩余对象。
     * @param aci
     *             候选集
     * @param k
     *             k项
     * @return
     * @author fisher
     */

    public AprioriFrequentItemset<String> genFrequentItermsets(int[][] aprioriss,
            ApriorCandidateItemset<String> aci, String split, int defaultSupoort) {
        /*
         * 空条件
         */

        if(aci.isEmpty())
            return null;
        
        // 候选对象

        Set<String> cItemsets = aci.getItemsets();
        // 转为数组

        Object[] citems = cItemsets.toArray();
        
        int len = citems.length;
        AprioriFrequentItemset<String> afi = new AprioriFrequentItemset<String>();
        Map<String, Integer> fItemsets = new LinkedHashMap<String, Integer>();
        
        /*
         * 遍历候选对象,获取满足条件的频繁集
         */

        for(int i = 0; i < len; ++i) {
            String cIterm = (String) citems[i];
            // 计算支持度

            int support = calculateSupport(cIterm, split, aprioriss);
            // 支持度大于默认支持度,为频繁集

            if(support >= defaultSupoort) {
                fItemsets.put(cIterm, support);
            }
                
        }
        
        afi.setItemsets(fItemsets);
        afi.setSupport(defaultSupoort);
        return afi;
    }
    
    /**
     * 方法说明:产生候选集
     *
     *            Generate the candidate itemsets in Ck from the frequent
     *            itemsets in Lk-1
     *            1 Join Lk-1 p with Lk-1q, as follows:
     *             insert into Ck
     *             select p.item1, p.item2, . . . , p.itemk-1, q.itemk-1
     *             from Lk-1 p, Lk-1q
     *             where p.item1 = q.item1, . . . p.itemk-2 = q.itemk-2, p.itemk-1 < q.itemk-1
     * 2 Generate all (k-1)-subsets from the candidate itemsets in Ck
     *
     * @param afi
     *             Lk-1 上一个频复集
     * @param k
     *             第K个
     * @author fisher
     */

    public ApriorCandidateItemset<String> genCandidateItermsets(AprioriFrequentItemset<String> afi,
            String split, int k) {
        /*
         * 空条件
         */

        if(afi.isEmpty() || k < 1)
            return null;
        
        // 第k-1个频繁集

        Map<String, Integer> fItemsets = afi.getItemsets();
        // key为数据存储的数据链,value为支持度

        Set<String> fItemsetKey = fItemsets.keySet();
        // 转为数组对象

        Object[] fitems = fItemsetKey.toArray();
        int len = fitems.length;
        ApriorCandidateItemset<String> aci = new ApriorCandidateItemset<String>();
        Set<String> itemsets = new LinkedHashSet<String>();
        int i, j;
        /*
         * 遍历频繁集,获取满足条件的候选集
         */

        for(i = 0; i < len; ++i) {
            for(j = i + 1; j < len; ++j) {
                String fitem1 = (String) fitems[i];
                String fitem2 = (String) fitems[j];
                String cIterm = getIntersection(fitem1, fitem2, split);
                if(!cIterm.isEmpty()) {
                    itemsets.add(cIterm);
                }
            }
        }
        aci.setK(k);
        aci.setItemsets(itemsets);
        
        return aci;
    }

    /**
     * 方法说明:计算候选对象支持度
     *             例如:aprioriss = {{1,1,1,0,0}, {1,1,1,1,1 }, {1,0,1,1,0}, {1,0,1,1,1}, {1,1,1,1,0}};
     *                  cIterm = "1 2 4"; split = " ";
     *                  则支持度为2。(第2,5项支持)
     *                     
     * @param cIterm
     *             待计算字符串
     * @param split
     *             分隔符
     * @param aprioriss
     *             原始数据链
     * @return
     * @author fisher
     */

    public int calculateSupport(String cIterm, String split, int[][] aprioriss) {
        //TODO: 这是普通的计算方法,效率较低。

        //TODO:Fast Algorithms for Mining Association Rules

        /*
         * 空条件
         */

        if(cIterm.isEmpty() || split.isEmpty()
                || aprioriss.length < 1)
            return 0;
        
        // 分割出来

        String[] cs = cIterm.split(split);    
        
        // 记录cIterm所代表的aprioriss的下标

        // 例如cs = {1, 2, 4},则代表的aprioriss下标为:0,1,3

        int csLen = cs.length;
        Integer[] index = new Integer[csLen];
        for(int i = 0; i < csLen; ++i) {
            index[i] = Integer.valueOf(cs[i]) -1;
        }
        
        /*
         * 开始计算
         */

        int count = 0, temp = 1;
        /*
         * 遍历aprioriss, 计数
         */

        for(int j = 0; j < aprioriss.length; ++j) {
            int[] aprioris = aprioriss[j];
            
            /*
             * 遍历cs,看是否当前aprioris满足
             */

            for(int k = 0; k < csLen; ++k) {
                temp = 1;
                if(aprioris[index[k]] != 1) {
                    // 如果有一项aprioriss下标为index[k]值不为1

                    // 表示存在一项不满足情况,则temp归0,count不增加

                    temp = 0;
                    break;
                }
            }
            
            /*
             * temp > 0,满足,count加1
             */

            if(temp > 0)
                count = count + 1;
        }
        return count;
    }
    
    /**
     * 方法说明:获取两个字符的交集
     *            条件:select p.item1, p.item2, . . . , p.itemk-1, q.itemk-1
     *                 from Lk-1 p, Lk-1q
     *                 where p.item1 = q.item1, . . . p.itemk-2 = q.itemk-2, p.itemk-1 < q.itemk-1
     *
     *            例1:str1 = "1 2 3"; str2 = "1 2 4"; split = " ";
     *                 则存在交集:"1 2 3 4"
     *            例2:str1 = "1 3 4"; str2 = "1 2 3"; split = " ";
     *                 则不存在交集
     * @param str1
     *             字符串1
     * @param str2
     *             字符串2
     * @param split
     *             分隔符
     * @return
     *          存在返回该交集,不存在返回""
     * @author fisher
     */

    private String getIntersection(String str1, String str2, String split) {
        String intersect= "";
        
        if(isCoalition(str1, str2, split)) {
            /*
             * 存在交集
             */

            //以split分割获取str2最后对象

            String last2 = getLast(str2, split);
            // 组装

            intersect = str1 + split + last2;
        }
        
        return intersect;
            
    }
    
    /**
     * 方法说明:判断两个字符串是否存在交集
     *              条件式:where p.item1 = q.item1, . . . p.itemk-2 = q.itemk-2, p.itemk-1 < q.itemk-1
     *            例1:str1 = "1 2 3"; str2 = "1 2 4"; split = " ";
     *                 则存在交集:"1 2 3 4",因为1=1, 2=2, 3 < 4
     *            例2:str1 = "1 3 4"; str2 = "1 2 3"; split = " ";
     *                 则不存在交集,因为1=1, 3!=2, 4 > 3
     * @param str1
     *             字符串1
     * @param str2
     *             字符串2
     * @param split
     *             字符串分隔符
     * @return
     * @author fisher
     */

    private boolean isCoalition(String str1, String str2, String split) {
        
        /*
         * 空值情况
         */

        if(str1.isEmpty() || str2.isEmpty()
                || str1.length() != str2.length())
            return false;
        
        /*
         * 分割
         */

        String[] s1 = str1.split(split);
        String[] s2 = str2.split(split);

        /*
         * 长度必须相同
         */

        if(s1.length != s2.length)
            return false;

        /*
         * p.item1 = q.item1, . . . p.itemk-2 = q.itemk-2
         */

        for(int i = 0; i < s1.length - 1; ++i) {
            if(!s1[i].equals(s2[i]))
                return false;
        }
        /*
         * p.itemk-1 < q.itemk-1
         */

        if(Integer.valueOf(getLast(str1, split)) >= Integer.valueOf(getLast(str2, split)))
            return false;
        
        /*
         * 否则返回true
         */

        return true;
    }
    
    /**
     * 方法说明:通过指定分隔符,获取最后一个对象
     *             例如:str = "1 2 3"; split = " ";
     *                  则返回3
     *
     * @param str
     *             原始字符串
     * @param split
     *             分隔符
     * @return
     * @author fisher
     */

    public String getLast(String str, String split) {
        if(str.isEmpty())
            return null;
        int index = str.lastIndexOf(split);
        return str.substring(index + 1);
    }
    
    public static void main(String[] args) {

        int DEFAULTSUPOORT = (int) (5 * 0.4);
        String SPLIT = " ";
        int[][] APRIORISS = {{1,1,1,0,0}, {1,1,1,1,1 }, {1,0,1,1,0}, {1,0,1,1,1}, {1,1,1,1,0}};
        
        Map<String, Integer> itemsets = new LinkedHashMap<String, Integer>();
        itemsets.put("1 2", 3);
        itemsets.put("1 3", 2);
        itemsets.put("1 4", 3);
        itemsets.put("2 3", 4);
        itemsets.put("2 4", 5);
        itemsets.put("3 4", 3);
        AprioriFrequentItemset<String> afi = new AprioriFrequentItemset<String>();
        afi.setSupport(1);
        afi.setItemsets(itemsets);
        
        MyApriori myApriori = new MyApriori();
        String cIterm = "1";
        System.out.println(myApriori.calculateSupport(cIterm, SPLIT, APRIORISS));
        
        System.out.println(myApriori.genCandidateItermsets(afi, SPLIT, 3));
        
        System.out.println(myApriori.genFrequentItermsets(APRIORISS, myApriori.genCandidateItermsets(afi, SPLIT, 3), SPLIT, 3));
        
        System.out.println(myApriori.apriori(APRIORISS, SPLIT, 4, DEFAULTSUPOORT));
    }
}


相关资料:

1. http://www.cnblogs.com/zgw21cn/archive/2009/05/31/1492809.html

2.

3.

4.

阅读(1936) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~