博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
Hadoop/MapReduce购物篮分析:关联规则挖掘
阅读量:2490 次
发布时间:2019-05-11

本文共 8345 字,大约阅读时间需要 27 分钟。

购物篮分析目的:查找一个给定超市或者网店购物篮中最常出现的商品对(阶数为1,2...)。
例如:如果有5个商品{A,B,C,D,E},对应以下6个交易:
Transaction 1:A,C
Transaction 2:B,D
Transaction 3:A,C,E
Transaction 4:C,E
Transaction 5:A,B,E
Transaction 6:B,E
我们的目标是构建项集F1(大小=1)和F2(大小=2):
F1={[C,3],[A,3],[B,3],[E,4]}
F2={[<A,C>,2],[<A,E>,2],[<C,E>,2],[<B,E>,2]}
那么问题来了:为什么没有D呢?在这个例子中,我们使用的最小支持度为2。支持度是一个模式在整个交易集中出现的次数,因此要去除[D,1](同理,F2中也不包含只出现1次的<A,B>和<B,D>)。
项集F1和F2可以用来生成交易的关联规则。
关联规则形式:LHS(左件) => RHS(右件)
例如:可乐 => 薯片,即如果顾客购买可乐,他们也会购买薯片。
关联规则的两个度量标准:
支持度,是一个模式在整个交易集中出现的次数;
置信度,关联规则中左件与右件同时出现的频繁程度。
输入:
crackers,bread,banana
crackers,coke,butter,coffee
crackers,bread
crackers,bread,coffee
butter,coke
butter,coke,bread,crackers
思路:每个map接受一个交易,这是一个顾客购买的一个商品集{I1,I2..In}。映射器首先对这些商品排序(升序或者降序),生成{S1,S2...Sn},然后发出(key,1)对,这里key=Tuple2(Si,Sj),Si<=Sj,而且value为1,表示这个键已经见过一次。组合器和规约器的任务是聚集和统计频度。
为什么要对商品进行排序?为了避免类似(crackers,bread)和(bread,crackers)的重复键。
新建一个工具类Combination,定义一个findSortedCombinations方法,可以为任意阶数创建一个唯一组合,eg:List
// Example usage of Combination.findSortedCombinations:
//
//   List<String> elements = Arrays.asList("a", "b", "c", "d", "e");
//   List<List<String>> combinations = findSortedCombinations(elements, 2);
//   System.out.println(combinations);
//
// Output:
//   [[a, b], [a, c], [a, d], [a, e], [b, c], [b, d], [b, e], [c, d], [c, e], [d, e]]

// ====================== MBA/Combination.java ======================
package MBA;

import java.util.List;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

/**
 * Utility that enumerates unique, sorted combinations of a collection of items.
 */
public class Combination {

    /**
     * Builds the unique sorted combinations of every size from 0 up to
     * {@code elements.size()}, concatenated in increasing size order.
     *
     * @param elements the items to combine
     * @return all combinations; the first entry is always the empty combination
     */
    public static <T extends Comparable<? super T>> List<List<T>> findSortedCombinations(Collection<T> elements) {
        List<List<T>> result = new ArrayList<List<T>>();
        for (int i = 0; i <= elements.size(); i++) {
            result.addAll(findSortedCombinations(elements, i));
        }
        return result;
    }

    /**
     * Builds the unique combinations of exactly {@code n} items. Each combination is
     * sorted in natural order so that, for example, (a, c, b) and (a, b, c) are counted
     * as the same combination.
     *
     * @param elements the items to combine
     * @param n        the size of each combination; must not be negative
     * @return the combinations in the order they are first generated; a single empty
     *         combination when {@code n == 0}
     * @throws IllegalArgumentException if {@code n} is negative
     */
    public static <T extends Comparable<? super T>> List<List<T>> findSortedCombinations(Collection<T> elements, int n) {
        if (n < 0) {
            // Without this guard the recursion below would never reach the n == 0 base case.
            throw new IllegalArgumentException("n must not be negative: " + n);
        }
        List<List<T>> result = new ArrayList<List<T>>();
        if (n == 0) {
            result.add(new ArrayList<T>());
            return result;
        }

        // Start from the unique combinations of size n-1 and extend each by one item.
        List<List<T>> combinations = findSortedCombinations(elements, n - 1);
        // Tracks combinations already emitted; replaces a linear result.contains() scan.
        Set<List<T>> seen = new HashSet<List<T>>();
        for (List<T> combination : combinations) {
            for (T element : elements) {
                if (combination.contains(element)) {
                    continue; // the item is already part of this combination
                }
                List<T> list = new ArrayList<T>(combination);
                list.add(element);
                // Sort so that permutations of the same items collapse into one key.
                Collections.sort(list);
                if (seen.add(list)) {
                    result.add(list);
                }
            }
        }
        return result;
    }

    /**
     * Basic test of findSortedCombinations().
     *
     * @param args not used
     */
    public static void main(String[] args) {
        List<String> elements = Arrays.asList("a", "b", "c", "d", "e");
        List<List<String>> combinations = findSortedCombinations(elements, 2);
        System.out.println(combinations);
    }
}

// ====================== MBA/MBAMapper.java ======================
package MBA;

import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
import org.apache.commons.lang3.StringUtils;

/**
 * Mapper for market basket analysis: for every transaction line it emits
 * (sorted item combination, 1) for each combination of the configured size.
 */
public class MBAMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    public static final Logger THE_LOGGER = Logger.getLogger(MBAMapper.class);

    /** Combination size used when "number.of.pairs" is not configured. */
    public static final int DEFAULT_NUMBER_OF_PAIRS = 2;

    private static final IntWritable NUMBER_ONE = new IntWritable(1);

    // Reused output key; kept per mapper instance because its content is mutated on every write.
    private final Text reducerKey = new Text();

    /** Combination size read from the job configuration in setup(). */
    int numberOfPairs;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        this.numberOfPairs = context.getConfiguration().getInt("number.of.pairs", DEFAULT_NUMBER_OF_PAIRS);
        THE_LOGGER.info("setup() numberOfPairs = " + numberOfPairs);
    }

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Turn the comma-separated item names of one transaction into a list.
        List<String> items = convertItemsToList(value.toString());
        if (items.isEmpty()) {
            return;
        }
        generateMapperOutput(numberOfPairs, items, context);
    }

    /**
     * Splits a transaction line on commas and trims each item.
     *
     * @param line one input line; may be null or empty
     * @return the non-blank items of the line, or an empty list when there are none
     */
    private static List<String> convertItemsToList(String line) {
        List<String> items = new ArrayList<String>();
        if ((line == null) || (line.length() == 0)) {
            return items;
        }
        String[] tokens = StringUtils.split(line, ",");
        if (tokens == null) {
            return items;
        }
        for (String token : tokens) {
            if (token == null) {
                continue;
            }
            String item = token.trim();
            if (!item.isEmpty()) { // "a, ,b" must not yield a blank item
                items.add(item);
            }
        }
        return items;
    }

    /**
     * Emits the mapper output: one (combination, 1) pair per unique sorted combination.
     *
     * @param numberOfPairs size of each combination
     * @param items         items of the current transaction
     * @param context       Hadoop context the pairs are written to
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    private void generateMapperOutput(int numberOfPairs, List<String> items, Context context)
            throws IOException, InterruptedException {
        List<List<String>> sortedCombinations = Combination.findSortedCombinations(items, numberOfPairs);
        for (List<String> itemList : sortedCombinations) {
            if (THE_LOGGER.isDebugEnabled()) {
                THE_LOGGER.debug("itemlist=" + itemList.toString());
            }
            reducerKey.set(itemList.toString());
            context.write(reducerKey, NUMBER_ONE);
        }
    }
}

// ====================== MBA/MBAReducer.java ======================
package MBA;

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * The reduce task is simple: sum the counts emitted for the same key.
 * The driver also registers this class as the combiner.
 *
 * @author chenjie
 */
public class MBAReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0; // total items paired
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

// ====================== MBA/MBADriver.java ======================
package MBA;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.log4j.Logger;

/**
 * Driver that configures and submits the market basket analysis job.
 */
public class MBADriver extends Configured implements Tool {

    private static final String INPATH = "input/mba.txt"; // default input file path
    private static final String OUTPATH = "output/mba";   // default output path
    private static final String DEFAULT_NUMBER_OF_PAIRS = "2";

    public static final Logger THE_LOGGER = Logger.getLogger(MBADriver.class);

    /**
     * Command-line entry point. When started without arguments (e.g. from an IDE) the
     * built-in default paths and a combination size of 2 are used; otherwise the given
     * arguments are passed on unchanged.
     */
    public static void main(String args[]) throws Exception {
        if (args.length == 0) {
            args = new String[] { INPATH, OUTPATH, DEFAULT_NUMBER_OF_PAIRS };
        }
        // The argument count is validated in run(), after ToolRunner has consumed
        // any generic Hadoop options.
        int exitStatus = ToolRunner.run(new MBADriver(), args);
        THE_LOGGER.info("exitStatus=" + exitStatus);
        System.exit(exitStatus);
    }

    private static int printUsage() {
        System.out.println("USAGE: [input-path] [output-path] [number-of-pairs]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    /**
     * Configures the job, deletes a pre-existing output directory, and runs the job.
     *
     * @param args input path, output path, and combination size
     * @return 0 when the job succeeds, 1 when it fails or the arguments are invalid
     */
    @Override
    public int run(String args[]) throws Exception {
        if (args.length != 3) {
            printUsage();
            return 1;
        }
        String inputPath = args[0];
        String outputPath = args[1];
        int numberOfPairs;
        try {
            numberOfPairs = Integer.parseInt(args[2]);
        } catch (NumberFormatException e) {
            THE_LOGGER.error("number-of-pairs is not an integer: " + args[2], e);
            printUsage();
            return 1;
        }
        if (numberOfPairs < 1) {
            THE_LOGGER.error("number-of-pairs must be at least 1: " + numberOfPairs);
            printUsage();
            return 1;
        }

        THE_LOGGER.info("inputPath: " + inputPath);
        THE_LOGGER.info("outputPath: " + outputPath);
        THE_LOGGER.info("numberOfPairs: " + numberOfPairs);

        // job configuration
        Job job = Job.getInstance(getConf());
        job.setJobName("MBADriver");
        job.getConfiguration().setInt("number.of.pairs", numberOfPairs);
        // Lets Hadoop locate the jar containing the job classes when run on a cluster.
        job.setJarByClass(MBADriver.class);

        // add jars to distributed cache
        // HadoopUtil.addJarsToDistributedCache(job, "/lib/");

        // input/output path
        Path outputDir = new Path(outputPath);
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, outputDir);

        // Mapper K, V output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // output format
        job.setOutputFormatClass(TextOutputFormat.class);

        // Reducer K, V output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // set mapper/reducer; summing is associative, so the reducer doubles as combiner
        job.setMapperClass(MBAMapper.class);
        job.setCombinerClass(MBAReducer.class);
        job.setReducerClass(MBAReducer.class);

        // delete the output path if exists to avoid "existing dir/file" error;
        // resolve the filesystem from the path itself so non-default schemes work too
        FileSystem fs = outputDir.getFileSystem(job.getConfiguration());
        fs.delete(outputDir, true);

        long startTime = System.currentTimeMillis();
        boolean status = job.waitForCompletion(true);
        THE_LOGGER.info("job status=" + status);
        long endTime = System.currentTimeMillis();
        long elapsedTime = endTime - startTime;
        THE_LOGGER.info("Elapsed time: " + elapsedTime + " milliseconds");
        return status ? 0 : 1;
    }
}

// Result (job output for the sample input with number-of-pairs = 2):
//   [banana, bread]     1
//   [banana, crackers]  1
//   [bread, butter]     1
//   [bread, coffee]     1
//   [bread, coke]       1
//   [bread, crackers]   4
//   [butter, coffee]    1
//   [butter, coke]      3
//   [butter, crackers]  2
//   [coffee, coke]      1
//   [coffee, crackers]  2
//   [coke, crackers]    2

转载地址:http://vkqrb.baihongyu.com/

你可能感兴趣的文章
看完此文再不懂区块链算我输,用Python从零开始创建区块链
查看>>
C/S框架-WebService架构用户凭证(令牌)解决方案
查看>>
UVA 11149.Power of Matrix-矩阵快速幂倍增
查看>>
ajax post 请求415\ 400 错误
查看>>
使用 CSS 用户选择控制选择
查看>>
PHP程序性能优化的50种方法
查看>>
css3 动画的播放、暂停和重新开始
查看>>
IOS 上传ipa文件失败
查看>>
eclipse Android 开发基础 Activity 窗体 界面
查看>>
怎样玩转千万级别的数据
查看>>
input输入框修改后自动跳到最后一个字符
查看>>
Windows与Linux之间海量文件的传输与Linux下大小写敏感问题
查看>>
HDU 3948 不同回文子串个数
查看>>
Leetcode: Valid Parentheses
查看>>
Python
查看>>
自己动手开发调试器 01
查看>>
Python基础-包
查看>>
POJ 2696 计算表达式的值
查看>>
都江堰很美-佩服古人_Crmhf的一天
查看>>
Linux系统资源查询命令(cpu、io、mem)
查看>>