with association1 as ( -- 合計購入者数 select count(distinct userId) as allPu from dataset.table ) , association2 as ( -- productId別購入数と合計購入者数のまとめ select a.userId, a.productId, count(distinct userId) over(partition by a.productId) as product_cnt, b.allPu from dataset.table as a cross join (select allPu from association1) as b ) , associationBase as ( -- 買い合わせproductId別でのアソシエーションのベースデータ select a.productId as A_productId, b.productId as B_productId, a.product_cnt as A_count, b.product_cnt as B_count, count(*) as A_B_count, a.allPu from association2 as a inner join association2 as b on a.userId = b.userId where a <> b group by 1, 2, 3, 4, 6 ) , reports as ( select A_productId, B_productId, -- 支持度=同時購入者数/購入者全体数 round(1.0 * A_B_count / allPu, 4) as support, -- 信頼度=同時購入者数/商品A購入者数 round(1.0 * A_B_count / A_count, 4) as confidence, -- 期待信頼度=商品B購入者数/購入者全体数 round(1.0 * B_count / allPu), 4) as expected_confidence, -- リフト値=信頼度/期待信頼度 round(1.0 * (A_B_count / A_count) / B_count , 4) as lift, A_count, B_count, A_B_count, allPu from associationBase ) select * from reports
▼出力結果例
A_ productId | B_ productId | sup port | confi dence | expected_ confidence | lift | A_ count | B_ count | A_B_ count | allPu |
---|---|---|---|---|---|---|---|---|---|
A01 | B01 | 0.02 | 0.2 | 0.2 | 0.002 | 50 | 100 | 10 | 500 |
A01 | B02 | 0.04 | 0.5 | 0.6 | 0.0017 | 40 | 300 | 20 | 500 |
B01 | B02 | 0.14 | 0.7 | 0.6 | 0.0023 | 100 | 300 | 70 | 500 |
アソシエーション分析に関しては、以下のblogがすごくわかりやすかったので、詳細知りたい方はリンク先を参照ください。