A MapReduce job can take its input not only from files but also from an HBase table. When reading data from HBase, pay attention to the following details:
1. The Mapper class must extend TableMapper and implement the method
void map(ImmutableBytesWritable key, Result columns, Context context)
Here ImmutableBytesWritable key is the rowkey of the HBase record, and Result columns is the set of cells of that record. A cell value is read as shown below, where "RR" is the column family and "createDate" is the column qualifier to fetch:
String createDate = Bytes.toString(columns.getValue(Bytes.toBytes("RR"), Bytes.toBytes("createDate")));
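Note that getValue returns null when the row has no cell for that column, so values that may be missing are safer to null-check before use. A minimal sketch (the fallback string "unknown" is just an illustration, not part of the original job):

byte[] raw = columns.getValue(Bytes.toBytes("RR"), Bytes.toBytes("createDate"));
// fall back to a placeholder when this row has no createDate cell
String createDate = (raw != null) ? Bytes.toString(raw) : "unknown";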
2. The driver program needs to do the following:
A) Set the HBase ZooKeeper address and port:
Configuration conf = HBaseConfiguration.create();
// HBase ZooKeeper quorum address
conf.set("hbase.zookeeper.quorum", "192.168.10.100");
// HBase ZooKeeper client port
conf.set("hbase.zookeeper.property.clientPort", String.valueOf(2181));
B) Create the scanner and set scanner caching (the number of rows fetched per RPC); a sensible caching value greatly reduces the round trips the map tasks make to the region servers and helps keep the job from failing:
Scan scan = new Scan();
String startRowKey = "ROM.01.";
scan.setStartRow(Bytes.toBytes(startRowKey));
scan.setCaching(3000);
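When the table is large it also helps to bound the scan with a stop row and to restrict it to the column family the mapper actually reads; a sketch along the same lines (the start and stop keys are illustrative):

Scan scan = new Scan();
scan.setStartRow(Bytes.toBytes("ROM.01.20150601"));
scan.setStopRow(Bytes.toBytes("ROM.01.20150602"));  // exclusive upper bound
scan.addFamily(Bytes.toBytes("RR"));                // only fetch the "RR" family
scan.setCaching(3000);                              // rows returned per scanner RPC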
C) Tell the job which HBase table to read, and register the scanner, the Mapper class and its map output key/value classes:
TableMapReduceUtil.initTableMapperJob("report", scan, DetailMapper.class, Text.class, Text.class, job);
The complete implementation is as follows:
import org.apache.commons.cli.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;

public class DetailJob {

    private static final Log log = LogFactory.getLog(DetailJob.class);
    private static final String NAME = "DetailJob";

    public static class DetailMapper extends TableMapper<Text, Text> {

        // value of the "key" property set on the configuration by the driver
        private String key = "";

        @Override
        protected void setup(Context context) {
            key = context.getConfiguration().get("key");
            System.out.println("setup key:" + key);
        }

        @Override
        protected void map(ImmutableBytesWritable key, Result columns, Context context)
                throws IOException, InterruptedException {
            // read the cells of interest from the "RR" column family
            String emmcId = Bytes.toString(columns.getValue(Bytes.toBytes("RR"), Bytes.toBytes("emmcid")));
            String cpuId = Bytes.toString(columns.getValue(Bytes.toBytes("RR"), Bytes.toBytes("cpuid")));
            String osVersion = Bytes.toString(columns.getValue(Bytes.toBytes("RR"), Bytes.toBytes("rom")));
            String createDate = Bytes.toString(columns.getValue(Bytes.toBytes("RR"), Bytes.toBytes("createDate")));
            String ram = Bytes.toString(columns.getValue(Bytes.toBytes("RR"), Bytes.toBytes("ram")));

            // emit one tab-separated record per row, keyed by cpuId
            StringBuilder sb = new StringBuilder();
            sb.append(emmcId).append("\t");
            sb.append(cpuId).append("\t");
            sb.append(osVersion).append("\t");
            sb.append(ram).append("\t");
            sb.append(createDate);
            context.write(new Text(cpuId), new Text(sb.toString()));
        }
    }

    public static class DetailReducer extends Reducer<Text, Text, NullWritable, Text> {

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // write every record to HDFS as-is, dropping the key
            for (Text val : values) {
                context.write(NullWritable.get(), new Text(val.toString()));
            }
        }
    }

    private static CommandLine parseArgs(String[] args) {
        Options options = new Options();
        Option o = new Option("d", "date", true,
                "date used to build the scan start row, e.g. 2015-06-01 (dashes are stripped)");
        o.setArgName("date");
        o.setRequired(true);
        options.addOption(o);

        o = new Option("o", "output", true,
                "the directory to write to");
        o.setArgName("path-in-HDFS");
        o.setRequired(true);
        options.addOption(o);

        o = new Option("k", "key", true,
                "a key value passed to the mapper through the job configuration");
        o.setArgName("key");
        o.setRequired(true);
        options.addOption(o);

        CommandLineParser parser = new PosixParser();
        CommandLine cmd = null;
        try {
            cmd = parser.parse(options, args);
        } catch (Exception e) {
            System.err.println("ERROR: " + e.getMessage() + "\n");
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(NAME + " ", options, true);
            System.exit(-1);
        }
        return cmd;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // HBase ZooKeeper quorum address
        conf.set("hbase.zookeeper.quorum", "192.168.10.100");
        // HBase ZooKeeper client port
        conf.set("hbase.zookeeper.property.clientPort", String.valueOf(2181));

        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        CommandLine cmd = parseArgs(otherArgs);
        String date = cmd.getOptionValue("d");
        String output = cmd.getOptionValue("o");

        String key = cmd.getOptionValue("k");
        conf.set("key", key);

        // set the start row of the HBase scan
        Scan scan = new Scan();
        if (date != null && date.trim().length() > 0) {
            String startRowKey = "ROM.01." + date.replace("-", "");
            scan.setStartRow(Bytes.toBytes(startRowKey));
            // optionally bound the scan by also setting a stop row, e.g. the rowkey for the next day:
            // scan.setStopRow(Bytes.toBytes(endRowKey));
        }
        scan.setCaching(3000);

        Job job = new Job(conf, NAME);
        job.setJarByClass(DetailJob.class);
        // set the HBase table to read, the scanner, the Mapper class and its map output key/value classes
        TableMapReduceUtil.initTableMapperJob("report", scan, DetailMapper.class, Text.class, Text.class, job);
        job.setReducerClass(DetailReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(1);

        FileOutputFormat.setOutputPath(job, new Path(output));
        job.waitForCompletion(true);
    }
}
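For reference, a job built from this class might be launched roughly as follows (the jar name, output path and option values are hypothetical):

hadoop jar detail-job.jar DetailJob -d 2015-06-01 -o /user/hadoop/detail-output -k cpuid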