先驗算法是實現頻繁項挖掘的一種經典算法,利用關聯式規則不斷擴展頻繁項子集以得到所有的頻繁項集合。解釋一下關聯式規則,所謂關聯式是指在大量的數據中找出的項與項之間的關係。例如消費者購買了產品A,通常都會購買產品B,這就是一條關聯式。
先驗算法被設計用來處理包含事務的數據庫,這裏的每個事務都被當成是一組項集,給定一個閾值C,咱們須要找出至少出現C次的事務子集(即子項)。這邊這個C值就是最小支持度,規定了一個項集出現多少次才能被認爲是一個頻繁項。
先驗算法的核心思想基於如下一個事實:一個項集是頻繁項集,其全部子集必定是頻繁項集;一個項集不是頻繁項,其超集必定不是頻繁項集。那麼咱們在尋找事務中的最大頻繁項的過程當中,只須要擴展是頻繁項的子集,這就大大地縮減了搜索空間。下面是該算法的僞代碼實現:
其中T代表的是事務集合,ε表示最小支持度(即上文的閾值C),Li對應的是第i次迭代獲得的頻繁項集,Ci表示第i次迭代的候選項集。僞代碼的第一行遍歷了事務中的全部項,統計每一項出現的頻數,保留全部出現次數不低於最小支持度的項。
優勢:
缺點:
1 import java.util.ArrayList; 2 import java.util.HashMap; 3 import java.util.HashSet; 4 import java.util.Iterator; 5 import java.util.List; 6 import java.util.Map; 7 import java.util.Set; 8 9 /** 10 * 這是一個簡單的先驗算法的實現,包含了以下假設: 11 * 1.事務對應一個TID和一個項集合,每一個項對應一個int類型的數值,保證同一個事務內部的項惟一 12 * 2.算法在實現方式上還存在不少能夠優化剪枝的部分,這裏省略了優化,是結構更加清晰 13 * 3.這只是一個樣例程序 14 * @author LuZhou 15 * @email 448287076@qq.com 16 */ 17 public class Aprior { 18 19 20 public static class TransactionItem { 21 22 private Integer itemValue; 23 24 public TransactionItem( int i ){ 25 itemValue = i; 26 } 27 28 @Override 29 public boolean equals(Object obj) { 30 TransactionItem it = (TransactionItem) obj; 31 return this.itemValue == it.itemValue; 32 } 33 34 35 @Override 36 public String toString() { 37 return itemValue.toString(); 38 } 39 40 @Override 41 public int hashCode() { 42 return itemValue % 31; 43 } 44 } 45 46 47 public static class ItemCollection{ 48 private Set<TransactionItem> items = new HashSet<Aprior.TransactionItem>(); 49 50 public ItemCollection(){ 51 } 52 53 public ItemCollection(int[] items){ 54 for (int i = 0; i < items.length; i++) { 55 this.add(new TransactionItem(items[i])); 56 } 57 } 58 59 public boolean itemContains(Set<TransactionItem> sub){ 60 return items.containsAll(sub); 61 } 62 63 @Override 64 public boolean equals(Object obj) { 65 ItemCollection ic = (ItemCollection) obj; 66 return ic.items.containsAll(items) && 67 items.containsAll(ic.items); 68 } 69 70 @Override 71 public int hashCode() { 72 int v = 0; 73 for (TransactionItem ts : items) { 74 v = ( v + ts.hashCode() ) % 31; 75 } 76 return v; 77 } 78 79 public boolean contains(ItemCollection sub){ 80 return items.containsAll(sub.items); 81 } 82 83 public boolean containItem(TransactionItem item){ 84 return items.contains(item); 85 } 86 87 public Set<TransactionItem> getItems(){ 88 return items; 89 } 90 91 public void add(TransactionItem item){ 92 items.add(item); 93 } 94 95 @Override 96 public String toString() { 97 StringBuilder 
sb = new StringBuilder(); 98 sb.append("{"); 99 for (TransactionItem item : getItems()) { 100 sb.append( item.toString() ); 101 sb.append(","); 102 } 103 sb.append("}"); 104 return sb.toString(); 105 } 106 107 } 108 109 public static class Transaction{ 110 private int id; 111 private ItemCollection ic; 112 113 public Transaction(int id , ItemCollection itemCollection){ 114 this.id = id; 115 this.ic = itemCollection; 116 } 117 118 public boolean itemContains(ItemCollection sub){ 119 return ic.contains(sub); 120 } 121 122 public Set<TransactionItem> getItems(){ 123 return ic.getItems(); 124 } 125 126 } 127 128 private Set<TransactionItem> restItems = new HashSet<Aprior.TransactionItem>(); 129 private List<Transaction> transactions = new ArrayList<Aprior.Transaction>(); 130 131 //統計一項的出現次數 132 private int itemsFrequency( ItemCollection sub ){ 133 int frequency = 0 ; 134 135 for (Transaction t : transactions) { 136 if(t.itemContains(sub)){ 137 frequency ++; 138 } 139 } 140 141 return frequency; 142 } 143 144 //肯定擴展項是否爲頻繁項 145 private Set< ItemCollection > filterWithThreadshold( 146 Set< ItemCollection > items, int threadshold ){ 147 148 Set< ItemCollection > result = new HashSet<ItemCollection>(); 149 for (ItemCollection subItems : items) { 150 if( threadshold <= itemsFrequency(subItems)) 151 result.add(subItems); 152 } 153 154 return result; 155 } 156 157 //根據現有的子項集獲取全部的可能擴展 158 private Set< ItemCollection > extendItems( Set< ItemCollection > current){ 159 Set< ItemCollection > extend = new HashSet<Aprior.ItemCollection>(); 160 161 //找出剩餘的項集 162 restItems.clear(); 163 for (ItemCollection ic : current) { 164 for (TransactionItem item : ic.getItems()) { 165 restItems.add(item); 166 } 167 } 168 169 //須要找到當前頻繁項子集 170 for (ItemCollection itemCollection : current) { 171 for (TransactionItem rest : restItems) { 172 if( ! 
itemCollection.containItem(rest) ){ 173 //找到一個b 不屬於 頻繁項集 A 可是存在於剩餘項中 用其構造其他的子項 174 extend.add( buildCollection( itemCollection, rest )); 175 } 176 } 177 178 } 179 180 return extend; 181 } 182 183 //result = ic.items & item 184 private ItemCollection buildCollection(ItemCollection ic,TransactionItem item){ 185 ItemCollection ex = new ItemCollection(); 186 187 Iterator<TransactionItem> it = ic.getItems().iterator(); 188 while(it.hasNext()){ 189 ex.add( it.next() ); 190 } 191 192 ex.add(item); 193 194 return ex; 195 } 196 197 198 //初始化剩餘項 199 private Set<ItemCollection> initCollection( int threadshold ){ 200 201 Map<TransactionItem , Integer> tc = new HashMap<Aprior.TransactionItem, Integer>(); 202 203 for ( Transaction t : transactions ) { 204 205 for ( TransactionItem item : t.getItems() ) { 206 207 if( !restItems.contains(item) ){ 208 restItems.add(item); 209 tc.put(item, 1); 210 }else 211 tc.put( item, 1 + tc.get(item) ); 212 213 } 214 } 215 216 217 Set<ItemCollection> collection = new HashSet<Aprior.ItemCollection>(); 218 219 Iterator< TransactionItem > it = tc.keySet().iterator(); 220 while( it.hasNext() ){ 221 TransactionItem item = it.next(); 222 if( threadshold <= tc.get( item ) ){ 223 ItemCollection ic = new ItemCollection(); 224 ic.add( item ); 225 collection.add( ic ); 226 } 227 } 228 229 return collection; 230 } 231 232 233 /** 234 * 找出頻繁項 235 * @param threadshold 最小支持度 236 * @return 237 */ 238 public Set< ItemCollection > frequentItem( int threadshold ){ 239 240 Set< ItemCollection > current = initCollection( threadshold ); 241 Set< ItemCollection > result = new HashSet<Aprior.ItemCollection>(); 242 243 while( current.size() > 0 ){ 244 245 result.addAll( current ); 246 247 Set<ItemCollection> extendList = extendItems( current ); 248 249 current = filterWithThreadshold( extendList, threadshold ); 250 251 } 252 253 254 return result; 255 } 256 257 public void addTransaction(Transaction ts){ 258 transactions.add(ts); 259 } 260 261 262 public static void 
main(String args[]){ 263 264 final int MINSUPPROT = 2; 265 Aprior aprior = new Aprior(); 266 /** 267 * 數據集包括 268 * T1 {1,3,4} 269 * T2 {2,3,5} 270 * T3 {1,2,3,5} 271 * T4 {2,5} 272 * 最小支持度爲2,請最大頻繁項 273 */ 274 aprior.addTransaction(new Transaction(1, new ItemCollection(new int[]{1,3,4}))); 275 aprior.addTransaction(new Transaction(2, new ItemCollection(new int[]{2,3,5}))); 276 aprior.addTransaction(new Transaction(3, new ItemCollection(new int[]{1,2,3,5}))); 277 aprior.addTransaction(new Transaction(4, new ItemCollection(new int[]{2,5}))); 278 279 System.out.println(aprior.frequentItem( MINSUPPROT )); 280 281 } 282 283 284 }
http://en.wikipedia.org/wiki/Apriori_algorithm
http://rakesh.agrawal-family.com/papers/vldb94apriori.pdf
http://www.cnblogs.com/gaizai/archive/2010/03/31/1701573.html