從給定的 list 中做數據抽樣,需要保證抽樣結果的數據分佈均衡。Java 實現如下:
/** * 從min - max之間取出總數爲items的隨機數 * @param min * @param items * @param max * @return */ private static List<Integer> getRandomId(int min,int items,int max){ List<Integer> ids = Lists.newArrayList(); while(ids.size()<items){ int randomId = ThreadLocalRandom.current().nextInt(max)+min; if(!ids.contains(randomId))ids.add(randomId); } ids.sort((x,y)->x-y); return ids; } /** * 從給定的list中作數據抽樣,須要保證採樣數據的均勻分佈 * 步驟: * 1:將原數據分 m 份, * 2:取得每份的最小index和最大index * 3:從最小index到最大index之間取 n 個list id * 4:從原list中取出對應的id數據 * @param srcDatas 源數據 * @param sampleTotal 抽樣的數據總數 * @param splitCopies 將原數據拆分的份數 */ public static List<String> sampleData(List<String> srcDatas,int sampleTotal,int splitCopies){ if(splitCopies<=0)splitCopies = 1; int items = sampleTotal/splitCopies; //從每份中抽取的數據,數據總數將等於sampleTotal List<String> filterRes = Lists.newArrayList();//用於保存最終的抽樣結果 ListSplit<String> listSplit = new ListSplit<>();//對list作拆分算法,源碼:https://my.oschina.net/u/2391658/blog/703032 List<List<String>> splitRes = listSplit.split(srcDatas,splitCopies); int preListSize = 0;//初始化第一份List的最小下標 for(int i=0;i<splitCopies;i++){ List<String> listBlock = splitRes.get(i);//取出拆分後的list單元 System.out.println(preListSize+"-->"+(preListSize+listBlock.size()-1)); List<Integer> ids = getRandomId(preListSize,items,listBlock.size());//取到排序後的抽樣數據id System.out.println(Arrays.toString(ids.toArray())); ids.forEach(id->filterRes.add(srcDatas.get(id)));//取出list 下標id對應的值 preListSize = preListSize+listBlock.size(); //從新初始化list的最小下標 } System.out.println("抽樣結果:"); System.out.println(Arrays.toString(filterRes.toArray())); return filterRes; } public static void main(String args[]){ List<String> datas = Lists.newArrayList(); for(int y=0;y<1004;y++)datas.add(y+"");//構造數據 sampleData(datas,100,10); }
執行結果: 算法
0-->99 [5, 6, 25, 28, 29, 38, 48, 69, 72, 81] //從每份list中抽樣結果 100-->199 [104, 142, 144, 145, 159, 164, 172, 174, 180, 188] 200-->299 [207, 212, 219, 228, 239, 250, 264, 281, 298, 299] 300-->399 [309, 313, 316, 324, 329, 331, 363, 364, 368, 377] 400-->499 [401, 417, 429, 441, 442, 448, 453, 484, 490, 493] 500-->599 [527, 532, 537, 544, 555, 556, 575, 584, 593, 596] 600-->699 [601, 628, 649, 655, 656, 659, 662, 675, 684, 696] 700-->799 [707, 709, 729, 734, 752, 763, 767, 770, 773, 774] 800-->899 [804, 820, 828, 831, 837, 848, 858, 865, 887, 893] 900-->999 [914, 926, 929, 940, 943, 954, 964, 979, 981, 998] 抽樣結果: [5, 6, 25, 28, 29, 38, 48, 69, 72, 81, 104, 142, 144, 145, 159, 164, 172, 174, 180, 188, 207, 212, 219, 228, 239, 250, 264, 281, 298, 299, 309, 313, 316, 324, 329, 331, 363, 364, 368, 377, 401, 417, 429, 441, 442, 448, 453, 484, 490, 493, 527, 532, 537, 544, 555, 556, 575, 584, 593, 596, 601, 628, 649, 655, 656, 659, 662, 675, 684, 696, 707, 709, 729, 734, 752, 763, 767, 770, 773, 774, 804, 820, 828, 831, 837, 848, 858, 865, 887, 893, 914, 926, 929, 940, 943, 954, 964, 979, 981, 998]