<返回更多

Hadoop数据去重

2022-06-16    陈群53016
加入收藏

package cn.mr.dedup;

import JAVA.io.IOException;

import org.Apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.NullWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.MApper;

public class DedupMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

private static Text field = new Text();

// <0,2018-3-3 c><11,2018-3-4 d>

@Override

protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

field = value;

context.write(field, NullWritable.get());

}

// <2018-3-3 c,null> <2018-3-4 d,null>

}

package cn.mr.dedup;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Reducer;

public class DedupReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

// <2018-3-3 c,null> <2018-3-4 d,null><2018-3-4 d,null>

@Override

protected void reduce(Text key, Iterable<NullWritable> values, Context context)

throws IOException, InterruptedException {

context.write(key, NullWritable.get());

}

}

package cn.mr.dedup;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.NullWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DedupRunner {

public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

Configuration conf = new Configuration();

Job job = Job.getInstance(conf);

job.setJarByClass(DedupRunner.class);

job.setMapperClass(DedupMapper.class);

job.setReducerClass(DedupReducer.class);

job.setOutputKeyClass(Text.class);

job.setOutputValueClass(NullWritable.class);

FileInputFormat.setInputPaths(job, new Path("D:\Dedup\input"));

// 指定处理完成之后的结果所保存的位置

FileOutputFormat.setOutputPath(job, new Path("D:\Dedup\output"));

job.waitForCompletion(true);

}

}

声明:本站部分内容来自互联网,如有版权侵犯或其他问题请与我们联系,我们将立即删除或处理。
▍相关推荐
更多资讯 >>>