日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

當前位置: 首頁 > 编程资源 > 编程问答 > 内容正文

编程问答

Hadoop MapReduce的一些相关代码Code

發布時間:2025/3/21 编程问答 21 豆豆
生活随笔 收集整理的這篇文章主要介紹了 Hadoop MapReduce的一些相关代码Code 小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
MapReduce是一種分布式計算模型(distributed programming model),由Google于2004年左右提出,主要用于搜索領域,解決海量數據的計算問題。

MapReduce由兩個階段組成:即Map階段和Reduce階段,用戶需要實現map()函數和reduce()函數,用于實現分布式計算。


1、WordCountApp.java

package cmd;import java.net.URI;import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner;public class WordCountApp extends Configured implements Tool{static String INPUT_PATH = "";static String OUT_PATH = "";@Overridepublic int run(String[] arg0) throws Exception {INPUT_PATH = arg0[0];OUT_PATH = arg0[1];Configuration conf = new Configuration();final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);final Path outPath = new Path(OUT_PATH);if(fileSystem.exists(outPath)){fileSystem.delete(outPath, true);}final Job job = new Job(conf , WordCountApp.class.getSimpleName());//打包運行必須執行的秘密方法job.setJarByClass(WordCountApp.class);//1.1指定讀取的文件位于哪里FileInputFormat.setInputPaths(job, INPUT_PATH);//指定如何對輸入文件進行格式化,把輸入文件每一行解析成鍵值對//job.setInputFormatClass(TextInputFormat.class);//1.2 指定自定義的map類job.setMapperClass(MyMapper.class);//map輸出的<k,v>類型。如果<k3,v3>的類型與<k2,v2>類型一致,則可以省略//job.setMapOutputKeyClass(Text.class);//job.setMapOutputValueClass(LongWritable.class);//1.3 分區//job.setPartitionerClass(HashPartitioner.class);//有一個reduce任務運行//job.setNumReduceTasks(1);//1.4 TODO 排序、分組//1.5 TODO 規約//2.2 指定自定義reduce類job.setReducerClass(MyReducer.class);//指定reduce的輸出類型job.setOutputKeyClass(Text.class);job.setOutputValueClass(LongWritable.class);//2.3 指定寫出到哪里FileOutputFormat.setOutputPath(job, outPath);//指定輸出文件的格式化類//job.setOutputFormatClass(TextOutputFormat.class);//把job提交給JobTracker運行job.waitForCompletion(true);return 0;}public static void main(String[] args) throws Exception 
{ToolRunner.run(new WordCountApp(), args);}/*** KEYIN 即k1 表示行的偏移量* VALUEIN 即v1 表示行文本內容* KEYOUT 即k2 表示行中出現的單詞* VALUEOUT 即v2 表示行中出現的單詞的次數,固定值1*/static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable>{protected void map(LongWritable k1, Text v1, Context context) throws java.io.IOException ,InterruptedException {final String[] splited = v1.toString().split("\t");for (String word : splited) {context.write(new Text(word), new LongWritable(1));}};}/*** KEYIN 即k2 表示行中出現的單詞* VALUEIN 即v2 表示行中出現的單詞的次數* KEYOUT 即k3 表示文本中出現的不同單詞* VALUEOUT 即v3 表示文本中出現的不同單詞的總次數**/static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable>{protected void reduce(Text k2, java.lang.Iterable<LongWritable> v2s, Context ctx) throws java.io.IOException ,InterruptedException {long times = 0L;for (LongWritable count : v2s) {times += count.get();}ctx.write(k2, new LongWritable(times));};}}


2、GroupApp.java

package group;import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.net.URI;import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.RawComparator; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparator; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;public class GroupApp {static final String INPUT_PATH = "hdfs://cloud4:9000/input";static final String OUT_PATH = "hdfs://cloud4:9000/out";public static void main(String[] args) throws Exception{final Configuration configuration = new Configuration();final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), configuration);if(fileSystem.exists(new Path(OUT_PATH))){fileSystem.delete(new Path(OUT_PATH), true);}final Job job = new Job(configuration, GroupApp.class.getSimpleName());//1.1 指定輸入文件路徑FileInputFormat.setInputPaths(job, INPUT_PATH);//指定哪個類用來格式化輸入文件job.setInputFormatClass(TextInputFormat.class);//1.2指定自定義的Mapper類job.setMapperClass(MyMapper.class);//指定輸出<k2,v2>的類型job.setMapOutputKeyClass(NewK2.class);job.setMapOutputValueClass(LongWritable.class);//1.3 指定分區類job.setPartitionerClass(HashPartitioner.class);job.setNumReduceTasks(1);//1.4 TODO 排序、分區job.setGroupingComparatorClass(MyGroupingComparator.class);//1.5 TODO (可選)合并//2.2 指定自定義的reduce類job.setReducerClass(MyReducer.class);//指定輸出<k3,v3>的類型job.setOutputKeyClass(LongWritable.class);job.setOutputValueClass(LongWritable.class);//2.3 
指定輸出到哪里FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));//設定輸出文件的格式化類job.setOutputFormatClass(TextOutputFormat.class);//把代碼提交給JobTracker執行job.waitForCompletion(true);}static class MyMapper extends Mapper<LongWritable, Text, NewK2, LongWritable>{protected void map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper<LongWritable,Text,NewK2,LongWritable>.Context context) throws java.io.IOException ,InterruptedException {final String[] splited = value.toString().split("\t");final NewK2 k2 = new NewK2(Long.parseLong(splited[0]), Long.parseLong(splited[1]));final LongWritable v2 = new LongWritable(Long.parseLong(splited[1]));context.write(k2, v2);};}static class MyReducer extends Reducer<NewK2, LongWritable, LongWritable, LongWritable>{protected void reduce(NewK2 k2, java.lang.Iterable<LongWritable> v2s, org.apache.hadoop.mapreduce.Reducer<NewK2,LongWritable,LongWritable,LongWritable>.Context context) throws java.io.IOException ,InterruptedException {long min = Long.MAX_VALUE;for (LongWritable v2 : v2s) {if(v2.get()<min){min = v2.get();}}context.write(new LongWritable(k2.first), new LongWritable(min));};}/*** 問:為什么實現該類?* 答:因為原來的v2不能參與排序,把原來的k2和v2封裝到一個類中,作為新的k2**/static class NewK2 implements WritableComparable<NewK2>{Long first;Long second;public NewK2(){}public NewK2(long first, long second){this.first = first;this.second = second;}@Overridepublic void readFields(DataInput in) throws IOException {this.first = in.readLong();this.second = in.readLong();}@Overridepublic void write(DataOutput out) throws IOException {out.writeLong(first);out.writeLong(second);}/*** 當k2進行排序時,會調用該方法.* 當第一列不同時,升序;當第一列相同時,第二列升序*/@Overridepublic int compareTo(NewK2 o) {final long minus = this.first - o.first;if(minus !=0){return (int)minus;}return (int)(this.second - o.second);}@Overridepublic int hashCode() {return this.first.hashCode()+this.second.hashCode();}@Overridepublic boolean equals(Object obj) {if(!(obj instanceof NewK2)){return false;}NewK2 oK2 = (NewK2)obj;return 
(this.first==oK2.first)&&(this.second==oK2.second);}}/*** 問:為什么自定義該類?* 答:業務要求分組是按照第一列分組,但是NewK2的比較規則決定了不能按照第一列分。只能自定義分組比較器。*/static class MyGroupingComparator implements RawComparator<NewK2>{@Overridepublic int compare(NewK2 o1, NewK2 o2) {return (int)(o1.first - o2.first);}/*** @param arg0 表示第一個參與比較的字節數組* @param arg1 表示第一個參與比較的字節數組的起始位置* @param arg2 表示第一個參與比較的字節數組的偏移量* * @param arg3 表示第二個參與比較的字節數組* @param arg4 表示第二個參與比較的字節數組的起始位置* @param arg5 表示第二個參與比較的字節數組的偏移量*/@Overridepublic int compare(byte[] arg0, int arg1, int arg2, byte[] arg3,int arg4, int arg5) {return WritableComparator.compareBytes(arg0, arg1, 8, arg3, arg4, 8);}} }


3、data

#當第一列相同時,求出第二列的最小值 3 3 3 2 3 1 2 2 2 1 1 1 ------------------- 3 1 2 1 1 1




4、KpiApp.java

package mapreduce;import java.io.DataInput; import java.io.DataOutput; import java.io.IOException;import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;public class KpiApp {static final String INPUT_PATH = "hdfs://cloud4:9000/wlan";static final String OUT_PATH = "hdfs://cloud4:9000/out";public static void main(String[] args) throws Exception{final Job job = new Job(new Configuration(), KpiApp.class.getSimpleName());//1.1 指定輸入文件路徑FileInputFormat.setInputPaths(job, INPUT_PATH);//指定哪個類用來格式化輸入文件job.setInputFormatClass(TextInputFormat.class);//1.2指定自定義的Mapper類job.setMapperClass(MyMapper.class);//指定輸出<k2,v2>的類型job.setMapOutputKeyClass(Text.class);job.setMapOutputValueClass(KpiWritable.class);//1.3 指定分區類job.setPartitionerClass(HashPartitioner.class);job.setNumReduceTasks(1);//1.4 TODO 排序、分區//1.5 TODO (可選)合并//2.2 指定自定義的reduce類job.setReducerClass(MyReducer.class);//指定輸出<k3,v3>的類型job.setOutputKeyClass(Text.class);job.setOutputValueClass(KpiWritable.class);//2.3 指定輸出到哪里FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));//設定輸出文件的格式化類job.setOutputFormatClass(TextOutputFormat.class);//把代碼提交給JobTracker執行job.waitForCompletion(true);}static class MyMapper extends Mapper<LongWritable, Text, Text, KpiWritable>{protected void map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper<LongWritable,Text,Text,KpiWritable>.Context context) throws IOException ,InterruptedException {final String[] 
splited = value.toString().split("\t");final String msisdn = splited[1];final Text k2 = new Text(msisdn);final KpiWritable v2 = new KpiWritable(splited[6],splited[7],splited[8],splited[9]);context.write(k2, v2);};}static class MyReducer extends Reducer<Text, KpiWritable, Text, KpiWritable>{/*** @param k2 表示整個文件中不同的手機號碼 * @param v2s 表示該手機號在不同時段的流量的集合*/protected void reduce(Text k2, java.lang.Iterable<KpiWritable> v2s, org.apache.hadoop.mapreduce.Reducer<Text,KpiWritable,Text,KpiWritable>.Context context) throws IOException ,InterruptedException {long upPackNum = 0L;long downPackNum = 0L;long upPayLoad = 0L;long downPayLoad = 0L;for (KpiWritable kpiWritable : v2s) {upPackNum += kpiWritable.upPackNum;downPackNum += kpiWritable.downPackNum;upPayLoad += kpiWritable.upPayLoad;downPayLoad += kpiWritable.downPayLoad;}final KpiWritable v3 = new KpiWritable(upPackNum+"", downPackNum+"", upPayLoad+"", downPayLoad+"");context.write(k2, v3);};} }class KpiWritable implements Writable{long upPackNum;long downPackNum;long upPayLoad;long downPayLoad;public KpiWritable(){}public KpiWritable(String upPackNum, String downPackNum, String upPayLoad, String downPayLoad){this.upPackNum = Long.parseLong(upPackNum);this.downPackNum = Long.parseLong(downPackNum);this.upPayLoad = Long.parseLong(upPayLoad);this.downPayLoad = Long.parseLong(downPayLoad);}@Overridepublic void readFields(DataInput in) throws IOException {this.upPackNum = in.readLong();this.downPackNum = in.readLong();this.upPayLoad = in.readLong();this.downPayLoad = in.readLong();}@Overridepublic void write(DataOutput out) throws IOException {out.writeLong(upPackNum);out.writeLong(downPackNum);out.writeLong(upPayLoad);out.writeLong(downPayLoad);}@Overridepublic String toString() {return upPackNum + "\t" + downPackNum + "\t" + upPayLoad + "\t" + downPayLoad;} }



5、SortApp.java

package sort;import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.net.URI;import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;public class SortApp {static final String INPUT_PATH = "hdfs://cloud4:9000/input";static final String OUT_PATH = "hdfs://cloud4:9000/out";public static void main(String[] args) throws Exception{final Configuration configuration = new Configuration();final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), configuration);if(fileSystem.exists(new Path(OUT_PATH))){fileSystem.delete(new Path(OUT_PATH), true);}final Job job = new Job(configuration, SortApp.class.getSimpleName());//1.1 指定輸入文件路徑FileInputFormat.setInputPaths(job, INPUT_PATH);//指定哪個類用來格式化輸入文件job.setInputFormatClass(TextInputFormat.class);//1.2指定自定義的Mapper類job.setMapperClass(MyMapper.class);//指定輸出<k2,v2>的類型job.setMapOutputKeyClass(NewK2.class);job.setMapOutputValueClass(LongWritable.class);//1.3 指定分區類job.setPartitionerClass(HashPartitioner.class);job.setNumReduceTasks(1);//1.4 TODO 排序、分區//1.5 TODO (可選)合并//2.2 指定自定義的reduce類job.setReducerClass(MyReducer.class);//指定輸出<k3,v3>的類型job.setOutputKeyClass(LongWritable.class);job.setOutputValueClass(LongWritable.class);//2.3 指定輸出到哪里FileOutputFormat.setOutputPath(job, new 
Path(OUT_PATH));//設定輸出文件的格式化類job.setOutputFormatClass(TextOutputFormat.class);//把代碼提交給JobTracker執行job.waitForCompletion(true);}static class MyMapper extends Mapper<LongWritable, Text, NewK2, LongWritable>{protected void map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper<LongWritable,Text,NewK2,LongWritable>.Context context) throws java.io.IOException ,InterruptedException {final String[] splited = value.toString().split("\t");final NewK2 k2 = new NewK2(Long.parseLong(splited[0]), Long.parseLong(splited[1]));final LongWritable v2 = new LongWritable(Long.parseLong(splited[1]));context.write(k2, v2);};}static class MyReducer extends Reducer<NewK2, LongWritable, LongWritable, LongWritable>{protected void reduce(NewK2 k2, java.lang.Iterable<LongWritable> v2s, org.apache.hadoop.mapreduce.Reducer<NewK2,LongWritable,LongWritable,LongWritable>.Context context) throws java.io.IOException ,InterruptedException {context.write(new LongWritable(k2.first), new LongWritable(k2.second));};}/*** 問:為什么實現該類?* 答:因為原來的v2不能參與排序,把原來的k2和v2封裝到一個類中,作為新的k2**/static class NewK2 implements WritableComparable<NewK2>{Long first;Long second;public NewK2(){}public NewK2(long first, long second){this.first = first;this.second = second;}@Overridepublic void readFields(DataInput in) throws IOException {this.first = in.readLong();this.second = in.readLong();}@Overridepublic void write(DataOutput out) throws IOException {out.writeLong(first);out.writeLong(second);}/*** 當k2進行排序時,會調用該方法.* 當第一列不同時,升序;當第一列相同時,第二列升序*/@Overridepublic int compareTo(NewK2 o) {final long minus = this.first - o.first;if(minus !=0){return (int)minus;}return (int)(this.second - o.second);}@Overridepublic int hashCode() {return this.first.hashCode()+this.second.hashCode();}@Overridepublic boolean equals(Object obj) {if(!(obj instanceof NewK2)){return false;}NewK2 oK2 = (NewK2)obj;return (this.first==oK2.first)&&(this.second==oK2.second);}}}


6、TopKApp.java

package suanfa;import java.net.URI;import mapreduce.WordCountApp;import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; /*** 作業:求最大的100個值*/ public class TopKApp {static final String INPUT_PATH = "hdfs://cloud4:9000/input";static final String OUT_PATH = "hdfs://cloud4:9000/out";public static void main(String[] args) throws Exception {Configuration conf = new Configuration();final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);final Path outPath = new Path(OUT_PATH);if(fileSystem.exists(outPath)){fileSystem.delete(outPath, true);}final Job job = new Job(conf , WordCountApp.class.getSimpleName());FileInputFormat.setInputPaths(job, INPUT_PATH);job.setMapperClass(MyMapper.class);job.setReducerClass(MyReducer.class);job.setOutputKeyClass(LongWritable.class);job.setOutputValueClass(NullWritable.class);FileOutputFormat.setOutputPath(job, outPath);job.waitForCompletion(true);}static class MyMapper extends Mapper<LongWritable, Text, LongWritable, NullWritable>{long max = Long.MIN_VALUE;protected void map(LongWritable k1, Text v1, Context context) throws java.io.IOException ,InterruptedException {final long temp = Long.parseLong(v1.toString());if(temp>max){max = temp;}};protected void cleanup(org.apache.hadoop.mapreduce.Mapper<LongWritable,Text,LongWritable, NullWritable>.Context context) throws java.io.IOException ,InterruptedException {context.write(new LongWritable(max), NullWritable.get());};}static class MyReducer extends Reducer<LongWritable, NullWritable, LongWritable, 
NullWritable>{long max = Long.MIN_VALUE;protected void reduce(LongWritable k2, java.lang.Iterable<NullWritable> arg1, org.apache.hadoop.mapreduce.Reducer<LongWritable,NullWritable,LongWritable,NullWritable>.Context arg2) throws java.io.IOException ,InterruptedException {final long temp = k2.get();if(temp>max){max = temp;}};protected void cleanup(org.apache.hadoop.mapreduce.Reducer<LongWritable,NullWritable,LongWritable,NullWritable>.Context context) throws java.io.IOException ,InterruptedException {context.write(new LongWritable(max), NullWritable.get());};} }

總結

以上是生活随笔為你收集整理的Hadoop MapReduce的一些相关代码Code的全部內容,希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。