Characteristics of the Patent Data
First, obtain the patent data from http://data.nber.org/patents/

This article uses cite75_99.txt, which covers citations among US patents from 1975 to 1999 and contains more than 16 million records. The first few lines look like this:

The first column is the citing patent number; the second column is the patent number cited by the first.
| CITING  | CITED  |
|---------|--------|
| 3858241 | 956203 |
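The first two jobs below read this file with KeyValueTextInputFormat and a comma separator, so each line reaches the mapper already split into key = citing patent, value = cited patent. A minimal standalone sketch of that split (class name hypothetical), using the sample row from the table above:

```java
// A minimal standalone sketch (class name hypothetical) of the split that
// KeyValueTextInputFormat performs when its separator is set to ",".
public class LineSplitDemo {
    public static void main(String[] args) {
        String line = "3858241,956203";           // sample row from the table above
        int sep = line.indexOf(',');
        String citing = line.substring(0, sep);   // becomes the mapper's input key
        String cited = line.substring(sep + 1);   // becomes the mapper's input value
        System.out.println(citing + " cites " + cited);
    }
}
```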
```java
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.Iterator;

// Read the patent citation data; for each patent, find the patents that
// cite it and merge them into one comma-separated list.
public class FindCitedPatentsAndOrder extends Configured implements Tool {

    public static class MapClass extends MapReduceBase
            implements Mapper<Text, Text, Text, Text> {
        @Override
        public void map(Text key, Text value, OutputCollector<Text, Text> output,
                        Reporter reporter) throws IOException {
            output.collect(value, key); // key point: invert (citing, cited) to (cited, citing)
        }
    }

    public static class ReduceClass extends MapReduceBase
            implements Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterator<Text> values,
                           OutputCollector<Text, Text> output,
                           Reporter reporter) throws IOException {
            String csv = "";
            while (values.hasNext()) {
                if (csv.length() > 0) csv += ",";
                csv += values.next().toString();
            }
            output.collect(key, new Text(csv));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf(), getClass());

        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);

        job.setJobName("FindCitedPatentsAndOrder");
        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);

        job.setInputFormat(KeyValueTextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.set("key.value.separator.in.input.line", ",");

        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new FindCitedPatentsAndOrder(), args);
        System.exit(exitCode);
    }
}
```
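One practical note on the reducer above: repeated String concatenation copies the accumulated CSV on every append, which gets expensive for heavily cited patents. A hedged alternative sketch (class and helper names hypothetical) of the same concatenation done with StringBuilder:

```java
import java.util.Arrays;
import java.util.Iterator;

// A minimal sketch (class and helper names hypothetical): the same CSV
// concatenation as the reducer above, but StringBuilder avoids the
// quadratic copying of repeated String "+=".
public class CsvJoinDemo {
    static String join(Iterator<String> values) {
        StringBuilder csv = new StringBuilder();
        while (values.hasNext()) {
            if (csv.length() > 0) csv.append(',');
            csv.append(values.next());
        }
        return csv.toString();
    }

    public static void main(String[] args) {
        System.out.println(join(Arrays.asList("3858241", "3858242").iterator()));
        // prints: 3858241,3858242
    }
}
```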
```java
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.Iterator;

// Count how many citations each patent receives.
public class CitedPatentsNumberCounter extends Configured implements Tool {

    public static class MapClass extends MapReduceBase
            implements Mapper<Text, Text, Text, Text> {
        @Override
        public void map(Text key, Text value, OutputCollector<Text, Text> output,
                        Reporter reporter) throws IOException {
            output.collect(value, key); // key point: invert (citing, cited) to (cited, citing)
        }
    }

    public static class ReduceClass extends MapReduceBase
            implements Reducer<Text, Text, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterator<Text> values,
                           OutputCollector<Text, IntWritable> output,
                           Reporter reporter) throws IOException {
            int count = 0;
            while (values.hasNext()) {
                values.next();
                count++;
            }
            output.collect(key, new IntWritable(count));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf(), getClass());

        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);

        job.setJobName("CitedPatentsNumberCounter");
        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);

        job.setInputFormat(KeyValueTextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setMapOutputValueClass(Text.class);     // V2 (Text) differs from V3, so set it explicitly
        job.setOutputKeyClass(Text.class);          // K2 and K3 are both Text
        job.setOutputValueClass(IntWritable.class); // V3
        job.set("key.value.separator.in.input.line", ","); // the raw citation file is comma-separated

        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new CitedPatentsNumberCounter(), args);
        System.exit(exitCode);
    }
}
```
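The output of CitedPatentsNumberCounter is one "patent&lt;TAB&gt;count" line per cited patent, and that is exactly what the next job consumes; since the tab is KeyValueTextInputFormat's default separator, no separator needs to be configured there. A small sketch (class name and sample record hypothetical) of how such a line decomposes:

```java
// A small standalone sketch (class name and sample record hypothetical) of how
// one output line of CitedPatentsNumberCounter is parsed by the next job.
public class CountLineDemo {
    public static void main(String[] args) {
        String line = "956203\t1";                   // "patent<TAB>citation count"
        String[] kv = line.split("\t");
        int citationCount = Integer.parseInt(kv[1]); // what the next mapper parses
        System.out.println("patent " + kv[0] + " was cited " + citationCount + " time(s)");
    }
}
```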
```java
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.Iterator;

// Count how many patents share each citation count
// (i.e. a histogram of citation frequencies).
public class CitationFrequencyStatistics extends Configured implements Tool {

    public static class MapClass extends MapReduceBase
            implements Mapper<Text, Text, IntWritable, IntWritable> {
        private final static IntWritable UNO = new IntWritable(1); // the constant 1
        private IntWritable citationCount = new IntWritable();

        @Override
        public void map(Text key, Text value,
                        OutputCollector<IntWritable, IntWritable> output,
                        Reporter reporter) throws IOException {
            citationCount.set(Integer.parseInt(value.toString()));
            output.collect(citationCount, UNO); // key point: group patents by citation count
        }
    }

    public static class ReduceClass extends MapReduceBase
            implements Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
        @Override
        public void reduce(IntWritable key, Iterator<IntWritable> values,
                           OutputCollector<IntWritable, IntWritable> output,
                           Reporter reporter) throws IOException {
            int count = 0;
            while (values.hasNext()) {
                values.next();
                count++;
            }
            output.collect(key, new IntWritable(count));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf(), getClass());

        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);

        job.setJobName("CitationFrequencyStatistics");
        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);

        // Input is the previous job's tab-separated output, so the default
        // KeyValueTextInputFormat separator works as-is.
        job.setInputFormat(KeyValueTextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class); // sets the types for both K2,V2 and K3,V3

        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new CitationFrequencyStatistics(), args);
        System.exit(exitCode);
    }
}
```
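To produce the citation-frequency histogram end to end, the counting job's output directory is fed to CitationFrequencyStatistics as input. A minimal driver sketch under that assumption (class name and argument layout hypothetical):

```java
import org.apache.hadoop.util.ToolRunner;

// A minimal driver sketch (class name and argument layout hypothetical):
// run the citation counter, then feed its output to the frequency job.
public class PatentPipeline {
    public static void main(String[] args) throws Exception {
        // args[0] = cite75_99.txt, args[1] = counts dir, args[2] = histogram dir
        int rc = ToolRunner.run(new CitedPatentsNumberCounter(),
                new String[]{args[0], args[1]});
        if (rc == 0) {
            rc = ToolRunner.run(new CitationFrequencyStatistics(),
                    new String[]{args[1], args[2]});
        }
        System.exit(rc);
    }
}
```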
The latest Hadoop release at the time, MapReduce Release 0.20.0, includes a brand-new MapReduce Java API, sometimes referred to as the "context object" API.
The new API is not type-compatible with the old one, so existing applications must be rewritten before they can take advantage of it.
There are several obvious differences between the new API and the old one, all of them visible in the rewritten citation job below:

- Mapper and Reducer are abstract classes in the org.apache.hadoop.mapreduce package, rather than interfaces in org.apache.hadoop.mapred.
- The OutputCollector and Reporter arguments are replaced by a single Context object, and results are emitted with Context.write().
- reduce() receives an Iterable instead of an Iterator, so values can be traversed with a for-each loop.
- Jobs are configured and submitted through the Job class instead of JobConf and JobClient.
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

// The first job (invert citations, then concatenate) rewritten with the new API.
public class MyJob extends Configured implements Tool {

    public static class MapClass extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // TextInputFormat delivers the whole line; split it ourselves.
            String[] citation = value.toString().split(",");
            context.write(new Text(citation[1]), new Text(citation[0]));
        }
    }

    public static class ReduceClass extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String csv = "";
            for (Text val : values) { // the new API iterates with Iterable, not Iterator
                if (csv.length() > 0) csv += ",";
                csv += val.toString();
            }
            context.write(key, new Text(csv));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = new Job(conf, "MyJob");
        job.setJarByClass(MyJob.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Submit the job and wait for it to finish.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new MyJob(), args);
        System.exit(exitCode);
    }
}
```
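One detail worth noting, to the best of my knowledge: the 0.20.0 release of the new API did not yet ship a KeyValueTextInputFormat counterpart, which is presumably why MyJob falls back to TextInputFormat and splits each line on the comma inside the mapper itself.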
References
- Patent data: http://data.nber.org/patents/
- Hadoop實戰之專利數據處理 (Hadoop in Action: patent data processing)