当前位置：首页 > news >正文

头哥实践平台之MapReduce基础实战

news 2025/12/10 15:17:24

一. 第1关：成绩统计

编程要求

使用MapReduce计算班级每个学生的最好成绩，输入文件路径为/user/test/input，请将计算后的结果输出到/user/test/output/目录下。

先写命令行,如下:
一行就是一个命令

touch file01
echo Hello World Bye World
cat file01
echo Hello World Bye World >file01
cat file01
touch file02
echo Hello Hadoop Goodbye Hadoop >file02
cat file02
start-dfs.sh
hadoop fs -mkdir /usr
hadoop fs -mkdir /usr/input
hadoop fs -ls /usr/output
hadoop fs -ls /
hadoop fs -ls /usr
hadoop fs -put file01 /usr/input
hadoop fs -put file02 /usr/input
hadoop fs -ls /usr/input

代码段部分:

import java.util.StringTokenizer;import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;public class WordCount {/********** Begin **********///Mapper函数public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {private final static IntWritable one = new IntWritable(1);private Text word = new Text();private int maxValue = 0;public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {StringTokenizer itr = new StringTokenizer(value.toString(),"\n");while (itr.hasMoreTokens()) {String[] str = itr.nextToken().split(" ");String name = str[0];one.set(Integer.parseInt(str[1]));word.set(name);context.write(word,one);}//context.write(word,one);}}public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {private IntWritable result = new IntWritable();public void reduce(Text key, Iterable<IntWritable> values, Context context)throws IOException, InterruptedException {int maxAge = 0;int age = 0;for (IntWritable intWritable : values) {maxAge = Math.max(maxAge, intWritable.get());}result.set(maxAge);context.write(key, result);}}public static void main(String[] args) throws Exception {Configuration conf = new Configuration();Job job = new Job(conf, "word count");job.setJarByClass(WordCount.class);job.setMapperClass(TokenizerMapper.class);job.setCombinerClass(IntSumReducer.class);job.setReducerClass(IntSumReducer.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(IntWritable.class);String inputfile = "/user/test/input";String outputFile = "/user/test/output/";FileInputFormat.addInputPath(job, new Path(inputfile));FileOutputFormat.setOutputPath(job, new Path(outputFile));job.waitForCompletion(true);/********** End **********/}
}

二. 第2关：文件内容合并去重

编程要求

接下来我们通过一个练习来巩固学习到的MapReduce知识吧。

对于两个输入文件，即文件file1和文件file2，请编写MapReduce程序，对两个文件进行合并，并剔除其中重复的内容，得到一个新的输出文件file3。
为了完成文件合并去重的任务，你编写的程序要能将含有重复内容的不同文件合并到一个没有重复的整合文件，规则如下：

第一列按学号排列；
学号相同，按x,y,z排列；
输入文件路径为：/user/tmp/input/；
输出路径为：/user/tmp/output/。
注意：输入文件后台已经帮你创建好了，不需要你再重复创建。

请先启动Hadoop再点击评测！
所以要先在命令行输入下面启动命令


start-dfs.sh


import java.io.IOException;import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;public class Merge {/*** @param args* 对A,B两个文件进行合并，并剔除其中重复的内容，得到一个新的输出文件C*///在这重载map函数，直接将输入中的value复制到输出数据的key上 注意在map方法中要抛出异常：throws IOException,InterruptedExceptionpublic static class Map  extends Mapper<Object, Text, Text, Text>{/********** Begin **********/public void map(Object key, Text value, Context content) throws IOException, InterruptedException {  Text text1 = new Text();Text text2 = new Text();StringTokenizer itr = new StringTokenizer(value.toString());while (itr.hasMoreTokens()) {text1.set(itr.nextToken());text2.set(itr.nextToken());content.write(text1, text2);}}  /********** End **********/} //在这重载reduce函数，直接将输入中的key复制到输出数据的key上  注意在reduce方法上要抛出异常：throws IOException,InterruptedExceptionpublic static class  Reduce extends Reducer<Text, Text, Text, Text> {/********** Begin **********/public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {Set<String> set = new TreeSet<String>();for(Text tex : values){set.add(tex.toString());}for(String tex : set){context.write(key, new Text(tex));}}  /********** End **********/}public static void main(String[] args) throws Exception{// TODO Auto-generated method stubConfiguration conf = new Configuration();conf.set("fs.default.name","hdfs://localhost:9000");Job job = Job.getInstance(conf,"Merge and duplicate removal");job.setJarByClass(Merge.class);job.setMapperClass(Map.class);job.setCombinerClass(Reduce.class);job.setReducerClass(Reduce.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(Text.class);String inputPath = "/user/tmp/input/";  //在这里设置输入路径String outputPath = "/user/tmp/output/";  //在这里设置输出路径FileInputFormat.addInputPath(job, new Path(inputPath));FileOutputFormat.setOutputPath(job, new Path(outputPath));System.exit(job.waitForCompletion(true) ? 0 : 1);}}

三. 第3关：信息挖掘 - 挖掘父子关系

编程要求

你编写的程序要能挖掘父子辈关系，给出祖孙辈关系的表格。规则如下：

孙子在前，祖父在后；
输入文件路径：/user/reduce/input；
输出文件路径：/user/reduce/output。

请先启动Hadoop再点击评测！
所以要先在命令行输入下面启动命令


start-dfs.sh


import java.io.IOException;
import java.util.*;import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;public class simple_data_mining {public static int time = 0;/*** @param args* 输入一个child-parent的表格* 输出一个体现grandchild-grandparent关系的表格*///Map将输入文件按照空格分割成child和parent，然后正序输出一次作为右表，反序输出一次作为左表，需要注意的是在输出的value中必须加上左右表区别标志public static class Map extends Mapper<Object, Text, Text, Text>{public void map(Object key, Text value, Context context) throws IOException,InterruptedException{/********** Begin **********/String line = value.toString();String[] childAndParent = line.split(" ");List<String> list = new ArrayList<>(2);for (String childOrParent : childAndParent) {if (!"".equals(childOrParent)) {list.add(childOrParent);} } if (!"child".equals(list.get(0))) {String childName = list.get(0);String parentName = list.get(1);String relationType = "1";context.write(new Text(parentName), new Text(relationType + "+"+ childName + "+" + parentName));relationType = "2";context.write(new Text(childName), new Text(relationType + "+"+ childName + "+" + parentName));}/********** End **********/}}public static class Reduce extends Reducer<Text, Text, Text, Text>{public void reduce(Text key, Iterable<Text> values,Context context) throws IOException,InterruptedException{/********** Begin **********///输出表头if (time == 0) {context.write(new Text("grand_child"), new Text("grand_parent"));time++;}//获取value-list中value的child
List<String> grandChild = new ArrayList<>();//获取value-list中value的parentList<String> grandParent = new ArrayList<>();//左表，取出child放入grand_childfor (Text text : values) {String s = text.toString();String[] relation = s.split("\\+");String relationType = relation[0];String childName = relation[1];String parentName = relation[2];if ("1".equals(relationType)) {grandChild.add(childName);} else {grandParent.add(parentName);}}//右表，取出parent放入grand_parentint grandParentNum = grandParent.size();int grandChildNum = grandChild.size();if (grandParentNum != 0 && grandChildNum != 0) {for (int m = 0; m < grandChildNum; m++) {for (int n = 0; n < grandParentNum; n++) {//输出结果context.write(new Text(grandChild.get(m)), new Text(grandParent.get(n)));}}}/********** End **********/}}public static void main(String[] args) throws Exception{// TODO Auto-generated method stubConfiguration conf = new Configuration();Job job = Job.getInstance(conf,"Single table join");job.setJarByClass(simple_data_mining.class);job.setMapperClass(Map.class);job.setReducerClass(Reduce.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(Text.class);String inputPath = "/user/reduce/input";   //设置输入路径String outputPath = "/user/reduce/output";   //设置输出路径FileInputFormat.addInputPath(job, new Path(inputPath));FileOutputFormat.setOutputPath(job, new Path(outputPath));System.exit(job.waitForCompletion(true) ? 0 : 1);}
}

头哥实践平台之MapReduce基础实战

一. 第1关：成绩统计

编程要求

二. 第2关：文件内容合并去重

编程要求

三. 第3关：信息挖掘 - 挖掘父子关系

编程要求

相关文章：

头哥实践平台之MapReduce基础实战

Linux基础知识——tmux和vim

Java Web——TomcatWeb服务器

Zookeeper 命令使用和数据说明

索尼RSV文件怎么恢复为MP4视频

pytorch-gpu（Anaconda3+cuda+cudnn）

解析数据洁净之道：BI中数据清理对见解的深远影响

efcore反向共工程,单元测试

利用IP风险画像强化金融行业网络安全防御

1334. 阈值距离内邻居最少的城市

Live800:客服行业的发展历程及未来前景

exsi的安装和配置

基于springboot实现校园医疗保险管理系统【项目源码】

Python 如何实现组合（Composite）设计模式？什么是组合设计模式？

编辑器vim和编译器gcc/g++

linux 系统下文本编辑常用的命令

3D Gaussian Splatting文件的压缩【3D高斯泼溅】

Spring Boot 整合xxl-job实现分布式定时任务

16.最接近的三数之和

php 插入排序算法实现

AI-调查研究-01-正念冥想有用吗？对健康的影响及科学指南

stm32G473的flash模式是单bank还是双bank？

java 实现excel文件转pdf | 无水印 | 无限制

LeetCode - 394. 字符串解码

【2025年】解决Burpsuite抓不到https包的问题

解决本地部署 SmolVLM2 大语言模型运行 flash-attn 报错

Element Plus 表单(el-form)中关于正整数输入的校验规则

蓝桥杯冶炼金属

C/C++ 中附加包含目录、附加库目录与附加依赖项详解

C++.OpenGL （20/64）混合（Blending）