Java開發中各類集合框架簡介

時間 2019-12-14

標籤 java 開發各類集合框架簡介欄目 Java 简体版

原文原文鏈接

在大數據MapReduce作業開發中，我們經常會遇到一些大小表的join。這時如果這個小表足夠「小」的話，我們可以使用「map-side join」，這樣就能夠有效地降低reduce端的壓力。但是常用的JDK集合在這種場景下有些雞肋，因此，各路大神們針對這個問題開發出了不同的集合框架，用以替換原始集合。下面我們具體介紹幾種常用的集合框架：
首先，我們設想一個場景——計算不同事業部2015、2016年老客、新客-轉化、新客-新增的用戶數量，這三種類型用戶的定義如下：
老客：前一年和當前年均購買過服百事業部商品
新客-轉化：前一年購買過圖書，當前年購買了服百事業部商品
新客-新增：前一年什麼也沒買，當前年購買了服百事業部商品
所以，根據上述定義，舉例：2016年老客就是根據cust_id（用戶ID）在服百分類（fubaiArrayList ）和服百總和（fubaiAllArrayList ）兩個集合查看2016年和2015年均存在的用戶。2016年新客-轉化就是根據cust_id（用戶ID）在圖書（bookArrayList ）存在2015年購買記錄，在服百分類（fubaiArrayList ）和服百總和（fubaiAllArrayList ）兩個集合查看2016年存在的用戶。2016年新客-新增就是根據cust_id（用戶ID）在全部用戶（allArrayList ）不存在2015年購買記錄，但在服百分類（fubaiArrayList ）和服百總和（fubaiAllArrayList ）兩個集合查看2016年存在的用戶。
因此，根據上述解釋，我們構造的原始實現代碼如下：

/**
 * Map-side-join mapper: classifies the customer of every 2015/2016 order-detail
 * record as a returning user, a converted new user, or a brand-new user, based on
 * what that customer bought in the previous year.
 *
 * <p>The previous-year purchase lists are small side files that {@link #setup}
 * loads into memory once per task; {@link #map} then probes them per record.
 * Every lookup entry has the form {@code year + TAB + cust_id}.
 *
 * <p>Output key: {@code year + TAB + department + TAB + userType}; output value:
 * {@code cust_id}.
 *
 * <p>{@code TAB} is a field-separator constant defined outside this class.
 */
public static class Map extends Mapper<LongWritable, Text, Text, Text> {

    /** "year TAB cust_id" entries of customers who bought books (2014/2015 side file). */
    public static ArrayList<String> bookArrayList = null;
    /** "year TAB cust_id" entries of customers who bought from the Fubai division total. */
    public static ArrayList<String> fubaiAllArrayList = null;
    /** "year TAB cust_id" entries of customers who bought from an individual Fubai division. */
    public static ArrayList<String> fubaiArrayList = null;
    /** "year TAB cust_id" entries of every customer who placed any order. */
    public static ArrayList<String> allArrayList = null;

    /**
     * Loads the four side files into the static lookup lists.
     *
     * <p>A side file that does not exist yields an empty list rather than an error.
     *
     * @param context task context, used only to obtain the job configuration
     * @throws IOException if a side file cannot be opened or read
     */
    @Override
    protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
            throws IOException, InterruptedException {
        Configuration configuration = context.getConfiguration();
        // fs itself is not closed here, matching the original listing.
        FileSystem fs = FileSystem.get(configuration);

        // Customers who bought books during 2014 and 2015.
        bookArrayList = loadYearCustomerKeys(fs, new Path("/personal/zhoujie/recommend/book.csv"));
        // All customers who bought from the Fubai divisions as a whole during 2014 and 2015.
        fubaiAllArrayList = loadYearCustomerKeys(fs, new Path("/personal/zhoujie/recommend/fubaiall.csv"));
        // Customers who bought from each individual Fubai division during 2014 and 2015.
        // NOTE(review): this is the same path as the division-total list above, which looks
        // like a copy-paste slip; a per-division file was probably intended. Confirm the
        // correct file name before relying on the per-division numbers.
        fubaiArrayList = loadYearCustomerKeys(fs, new Path("/personal/zhoujie/recommend/fubaiall.csv"));
        // Customers who placed any order during 2014 and 2015.
        allArrayList = loadYearCustomerKeys(fs, new Path("/personal/zhoujie/recommend/all_order.csv"));
    }

    /**
     * Reads one side file and returns its rows as {@code year + TAB + cust_id} keys.
     *
     * <p>Each row is expected to have exactly three TAB-separated columns
     * (year, cust_id, division); rows with any other column count are skipped.
     * The reader is always closed, even when reading fails.
     *
     * @param fs   file system to read from
     * @param path location of the side file
     * @return the keys in file order; empty when the file does not exist
     * @throws IOException if the file cannot be opened or read
     */
    private static ArrayList<String> loadYearCustomerKeys(FileSystem fs, Path path) throws IOException {
        ArrayList<String> keys = new ArrayList<String>();
        if (!fs.exists(path)) {
            return keys;
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(path), "utf-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] parts = line.split(TAB, -1);
                if (parts.length != 3) {
                    continue;
                }
                keys.add(parts[0] + TAB + parts[1]);
            }
        } finally {
            // Closing the reader also closes the underlying stream it wraps.
            reader.close();
        }
        return keys;
    }

    /**
     * Classifies one order-detail record and emits at most one output pair.
     *
     * <p>Only records from the order-detail input directory, with exactly ten
     * TAB-separated fields and a year of 2015 or 2016, are considered. The customer
     * is looked up under the previous year:
     * <ol>
     *   <li>found in the matching Fubai list: returning user;</li>
     *   <li>otherwise found in the book list: converted new user;</li>
     *   <li>otherwise absent from the all-orders list: brand-new user;</li>
     *   <li>otherwise nothing is emitted.</li>
     * </ol>
     *
     * @param key     input offset (unused)
     * @param value   one TAB-separated order-detail line
     * @param context task context used to emit the result
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
            throws IOException, InterruptedException {
        InputSplit inputSplit = context.getInputSplit();
        String fileName = ((FileSplit) inputSplit).getPath().toString();
        if (!fileName.contains("/personal/zhoujie/recommend/orderdetail/")) {
            return;
        }

        // Field layout per the original note: date, app_id, permanentid, toProductid,
        // channel, department, order_id, unit price, quantity, cust_id.
        String[] splited = value.toString().split(TAB, -1);
        if (splited.length != 10) {
            return;
        }
        // Skip rows whose date field is too short to hold a year, instead of failing
        // the whole task with a StringIndexOutOfBoundsException.
        if (splited[0].length() < 4) {
            return;
        }

        String year = splited[0].substring(0, 4);
        String previousYear;
        if ("2015".equals(year)) {
            previousYear = "2014";
        } else if ("2016".equals(year)) {
            previousYear = "2015";
        } else {
            return;
        }

        String custId = splited[9];
        String department = splited[5];

        // The division-total rows are matched against the total list, every other
        // department against the per-division list.
        ArrayList<String> fubaiBuyers =
                "服百事業部總和".equals(department) ? fubaiAllArrayList : fubaiArrayList;

        String previousYearKey = previousYear + TAB + custId;
        String userType;
        if (fubaiBuyers.contains(previousYearKey)) {
            // Bought from Fubai in the previous year: returning user.
            userType = "老用戶";
        } else if (bookArrayList.contains(previousYearKey)) {
            // Bought books in the previous year: new user converted from books.
            userType = "新用戶-轉化用戶";
        } else if (!allArrayList.contains(previousYearKey)) {
            // Bought nothing at all in the previous year: brand-new user.
            userType = "新用戶-新增用戶";
        } else {
            // Bought something else in the previous year: not counted.
            return;
        }

        context.write(new Text(year + TAB + department + TAB + userType), new Text(custId));
    }
}

1、JDK集合類
不用說，這不是我們今天介紹的重點；正是因爲原始集合的效率低下，才有了這篇文章。上述代碼就是JDK集合類的實現，經過多次測試，作業耗時大概在三個小時左右。
2、FastUtil集合框架
經過測試，用FastUtil的集合類替換原始集合後，用時兩小時：
// FastUtil variant of the list construction in setup().
// NOTE(review): the field is declared as ArrayList<String> in the listing above, so its
// declared type has to change as well; presumably the other three lists were swapped the same way.
bookArrayList = new ObjectBigArrayBigList<String>()
3、HPPC集合框架
經過測試，用HPPC的集合類替換原始集合後，用時三分鐘：
// HPPC variant: a hash set replaces the list used in setup().
// NOTE(review): the field is declared as ArrayList<String> in the listing above, so its
// declared type has to change as well; presumably the other three lists were swapped the same way.
bookArrayList = new ObjectHashSet<String>()
好快！
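經過這三個集合類的測試，發現HPPC集合框架的查詢效率是最高的。需要注意的是，這裏的差距主要來自資料結構的不同：ObjectHashSet是基於哈希的集合，contains平均爲O(1)；而ArrayList與ObjectBigArrayBigList的contains都是線性掃描，每條記錄都要遍歷整個列表。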