hadoop筆記五:對象的序列化和比較

1.序列化和反序列化

1)序列化相關的接口和類

java中類能夠序列化是實現接口Serializable。

hadoop中類能夠序列化是實現接口Writable。

hadoop對應java基本數據類型實現序列化類:

Writable接口中定義了兩個方法:

readFields(DataInput in)是反序列化方法,write(DataOutput out)是序列化方法。

官網例子:

public class MyWritable implements Writable{

	// Payload carried by this Writable.
    private int counter;
    private long timestamp;

    /** Convenience factory: builds a new instance by deserializing it from {@code in}. */
    public static MyWritable read(DataInput in) throws IOException {
        MyWritable instance = new MyWritable();
        instance.readFields(in);
        return instance;
      }

	/** Serialization: writes this object's fields to the output stream. */
	public void write(DataOutput out) throws IOException {
		out.writeInt(counter);
		out.writeLong(timestamp);
	}

	/** Deserialization: reads this object's fields back, in the order written. */
	public void readFields(DataInput in) throws IOException {
		counter = in.readInt();
		timestamp = in.readLong();
	}

}

2)經過實例比較java和hadoop序列化差異

經過hadoop的IntWritable和java的Integer對比。

package com.jf.hdfs;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.ObjectOutputStream;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;

public class SerializationCompare {

	/**
	 * Serializes a Hadoop Writable (the parent type of Hadoop's value classes)
	 * into a byte array.
	 *
	 * Bug fix: Writable.write() takes a DataOutput, and wrapping the buffer in
	 * an ObjectOutputStream silently prepended the 4-byte Java
	 * serialization-stream header, inflating the measured size. A plain
	 * DataOutputStream emits only the payload (4 bytes for an IntWritable).
	 */
	public static byte[] serialize(Writable writable) throws Exception {
		// Serialization is just converting the object into a byte array.
		ByteArrayOutputStream baos = new ByteArrayOutputStream();
		DataOutputStream dos = new DataOutputStream(baos);
		writable.write(dos);
		dos.close();
		return baos.toByteArray();
	}

	/**
	 * Serializes an Integer with standard Java object serialization.
	 *
	 * Bug fix: the original called writeInt(integer), which writes 4 raw bytes
	 * and bypasses object serialization entirely, so the comparison measured
	 * nothing. writeObject() performs real Java serialization (stream header,
	 * class descriptor, field data), which is what this demo is meant to
	 * contrast with Hadoop's compact encoding.
	 */
	public static byte[] serialize(Integer integer) throws Exception {
		ByteArrayOutputStream baos = new ByteArrayOutputStream();
		ObjectOutputStream oos = new ObjectOutputStream(baos);
		oos.writeObject(integer);
		oos.close();
		return baos.toByteArray();
	}

	public static void main(String[] args) throws Exception {
		IntWritable intWritable = new IntWritable(200);
		byte[] bytes = serialize(intWritable);
		System.out.println("hadoop序列化:" + bytes.length);

		Integer integer = new Integer(200);
		byte[] bytes2 = serialize(integer);
		System.out.println("java序列化:" + bytes2.length);

	}
}

執行結果:雖然同樣,其實在大數據裏面hadoop更佔優式。

hadoop序列化:10
java序列化:10

4)hadoop中複雜對象類型序列化

package com.jf.hdfs;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class ObjecSerialize {

	public static void main(String[] args) throws Exception {
		// Build a sample student with three list entries.
		Student student = new Student();
		student.setId(new IntWritable(10001));
		student.setName(new Text("sean"));
		student.setGender(true);
		List<Text> entries = new ArrayList<Text>();
		entries.add(new Text("學校"));
		entries.add(new Text("年紀"));
		entries.add(new Text("班級"));
		student.setList(entries);

		// Serialize: write the object into an in-memory byte buffer.
		ByteArrayOutputStream buffer = new ByteArrayOutputStream();
		DataOutputStream output = new DataOutputStream(buffer);
		student.write(output);

		byte[] serialized = buffer.toByteArray();
		System.out.println("序列化以後結果:" + Arrays.toString(serialized) + ",字節數組長度:" + serialized.length);

		// Deserialize: rebuild a second object from the same bytes.
		ByteArrayInputStream source = new ByteArrayInputStream(serialized);
		DataInputStream input = new DataInputStream(source);

		Student restored = new Student();
		restored.readFields(input);

		String listText = restored.getList().get(0).toString() + "," + restored.getList().get(1).toString()
				+ "," + restored.getList().get(2).toString();
		System.out.println("反序列化ID=" + restored.getId().get() + ",name=" + restored.getName().toString()
				+ ",gender=" + restored.isGender() + ",list=[" + listText + "]");

	}
}

class Student implements Writable {

	// Hadoop wrapper types for the serializable fields.
	private IntWritable id;

	private Text name;

	// Plain Java boolean; wrapped in a BooleanWritable only while (de)serializing.
	private boolean gender;

	private List<Text> list = new ArrayList<Text>();

	Student() {
		// Writable fields must be non-null before readFields() is called on them.
		id = new IntWritable();
		name = new Text();
	}

	/**
	 * Copy constructor. Copies VALUES, not references: Hadoop reuses Writable
	 * instances, so sharing references would let one object's state silently
	 * overwrite another's.
	 * Bug fix: the original copied only id and name, dropping gender and list;
	 * now every field is copied.
	 */
	Student(Student student) {
		id = new IntWritable(student.id.get());
		name = new Text(student.name.toString());
		gender = student.gender;
		for (Text text : student.list) {
			list.add(new Text(text.toString()));
		}
	}

	/** Serialization: writes every field, in a fixed order, to the stream. */
	public void write(DataOutput out) throws IOException {
		id.write(out);
		name.write(out);
		BooleanWritable genderWritable = new BooleanWritable(gender);
		genderWritable.write(out);

		// When serializing a collection, write its size first so readFields()
		// knows how many elements to read back.
		int size = list.size();
		new IntWritable(size).write(out);
		// Then serialize each element of the collection.
		for (int i = 0; i < size; i++) {
			list.get(i).write(out);
		}
	}

	/** Deserialization: reads the fields back in exactly the order written. */
	public void readFields(DataInput in) throws IOException {
		id.readFields(in);
		name.readFields(in);
		// Read the Writable wrapper from the stream, then unwrap to the Java primitive.
		BooleanWritable bw = new BooleanWritable();
		bw.readFields(in);
		gender = bw.get();

		// Read the collection size first, then each element.
		IntWritable size = new IntWritable();
		size.readFields(in);
		list.clear(); // this instance may be reused; drop any stale elements
		for (int i = 0; i < size.get(); i++) {
			Text text = new Text();
			text.readFields(in);
			list.add(text);
		}
	}

	public IntWritable getId() {
		return id;
	}

	public void setId(IntWritable id) {
		this.id = id;
	}

	public Text getName() {
		return name;
	}

	public void setName(Text name) {
		this.name = name;
	}

	public boolean isGender() {
		return gender;
	}

	public void setGender(boolean gender) {
		this.gender = gender;
	}

	public List<Text> getList() {
		return list;
	}

	public void setList(List<Text> list) {
		this.list = list;
	}
}

執行結果:

序列化以後結果:[0, 0, 39, 17, 4, 115, 101, 97, 110, 1, 0, 0, 0, 3, 6, -27, -83, -90, -26, -96, -95, 6, -27, -71, -76, -25, -70, -86, 6, -25, -113, -83, -25, -70, -89],字節數組長度:35
反序列化ID=10001,name=sean,gender=true,list=[學校,年紀,班級]

2.對象比較

1)WritableComparable

WritableComparable<T>接口繼承Comparable<T>和Writable接口,繼承過來三個方法:從Writable繼承過來readFields和write,從Comparable<T>繼承過來compareTo。

官網提供例子:

package com.jf.hdfs;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

public class MyWritableComparable implements WritableComparable<MyWritableComparable> {

	private int counter;
	private long timestamp;

	/** Serialization: writes the fields to the output stream. */
	public void write(DataOutput out) throws IOException {
		out.writeInt(counter);
		out.writeLong(timestamp);
	}

	/** Deserialization: reads the fields back in the order written. */
	public void readFields(DataInput in) throws IOException {
		counter = in.readInt();
		timestamp = in.readLong();
	}

	/**
	 * Orders by counter only (timestamp is ignored, as in the original).
	 * Implementing the generic WritableComparable&lt;MyWritableComparable&gt;
	 * removes the raw type and the cast; Integer.compare replaces the manual
	 * three-way comparison (and avoids the subtraction-overflow pitfall).
	 */
	public int compareTo(MyWritableComparable o) {
		return Integer.compare(this.counter, o.counter);
	}

	/**
	 * Added because overriding hashCode() without equals() breaks the
	 * equals/hashCode contract. Equality covers both fields, matching
	 * hashCode(); note compareTo() orders by counter only, so compareTo == 0
	 * does not imply equals — unchanged from the original ordering.
	 */
	@Override
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		if (!(obj instanceof MyWritableComparable)) {
			return false;
		}
		MyWritableComparable other = (MyWritableComparable) obj;
		return counter == other.counter && timestamp == other.timestamp;
	}

	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + counter;
		result = prime * result + (int) (timestamp ^ (timestamp >>> 32));
		return result;
	}

}

2)RawComparator

RawComparator<T>接口繼承了java.util.Comparator<T>接口,除了從Comparator<T>繼承過來的兩個方法compare和equals以外,它本身也定義了一個方法compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2),有6個參數。該方法是在字節流的層面上去作比較,每一個字節數組對應三個參數——第一個參數:指定字節數組,第二個參數:從哪裏開始比較,第三個參數:比較多長。

3)WritableComparator

WritableComparator類,實現了Comparator、Configurable和RawComparator三個接口。

構造方法

部分實現方法

4)hadoop中已經實現了一些能夠序列化又能夠比較的類

5)比較兩個對象大小

有兩種方式,一種是該類實現WritableComparable接口,另外一種是經過實現一個比較器去進行比較。

這裏經過WritableComparable接口實現一個自定義類的比較方法。

package com.jf.hdfs;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

public class AccountWritable implements WritableComparable<AccountWritable> {

	private IntWritable code;
	private Text name;
	private BooleanWritable gender;

	AccountWritable() {
		// Writable fields must be non-null before readFields() is called on them.
		code = new IntWritable();
		name = new Text();
		gender = new BooleanWritable();
	}

	// Copy constructor (a constructor whose parameter type is the class itself):
	// copies values, not references, because Hadoop reuses Writable instances.
	AccountWritable(AccountWritable accountWritable) {
		code = new IntWritable(accountWritable.code.get());
		name = new Text(accountWritable.name.toString());
		gender = new BooleanWritable(accountWritable.gender.get());
	}

	// Assign by value, not by reference, for the same reason as above.
	public void set(IntWritable code, Text name, BooleanWritable gender) {
		this.code = new IntWritable(code.get());
		this.name = new Text(name.toString());
		this.gender = new BooleanWritable(gender.get());
	}

	/** Serialization: writes every field to the output stream. */
	public void write(DataOutput out) throws IOException {
		code.write(out);
		name.write(out);
		gender.write(out);
	}

	/** Deserialization: reads the fields back in the order written. */
	public void readFields(DataInput in) throws IOException {
		code.readFields(in);
		name.readFields(in);
		gender.readFields(in);
	}

	/** Orders by code, then name, then gender. */
	public int compareTo(AccountWritable o) {

		int result = this.code.compareTo(o.code);
		if (result == 0) {
			result = this.name.compareTo(o.name);
			if (result == 0) {
				result = this.gender.compareTo(o.gender);
			}
		}
		return result;
	}

	/**
	 * Equality on the same fields compareTo() uses, so equals() is consistent
	 * with compareTo() == 0. Added because overriding hashCode() without
	 * equals() breaks the equals/hashCode contract.
	 */
	@Override
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		if (!(obj instanceof AccountWritable)) {
			return false;
		}
		return compareTo((AccountWritable) obj) == 0;
	}

	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + code.get();
		// Bug fix: the original computed h ^ (h >>> 32) on an int hash. An int
		// shifted by 32 is unchanged (shift distance is taken mod 32), so
		// h ^ h == 0 and the name never contributed to the hash. Use the hash
		// directly, and include gender so all equals() fields participate.
		result = prime * result + name.toString().hashCode();
		result = prime * result + (gender.get() ? 1231 : 1237);
		return result;
	}

	public IntWritable getCode() {
		return code;
	}

	public void setCode(IntWritable code) {
		this.code = code;
	}

	public Text getName() {
		return name;
	}

	public void setName(Text name) {
		this.name = name;
	}

	public BooleanWritable getGender() {
		return gender;
	}

	public void setGender(BooleanWritable gender) {
		this.gender = gender;
	}
}

測試:

public static void main(String[] args) {
		// Two accounts built from identical field values.
		AccountWritable first = new AccountWritable();
		first.set(new IntWritable(30), new Text("sean"), new BooleanWritable(true));

		AccountWritable second = new AccountWritable();
		second.set(new IntWritable(30), new Text("sean"), new BooleanWritable(true));

		// compareTo prints 0: code, name and gender all match.
		System.out.println(first.compareTo(second));

	}
相關文章
相關標籤/搜索