hive 開窗函數

測試數據

-- Create the demo table: per-student scores in language, math and english,
-- together with the class id and department id of the student.
create table student_scores (
    id            int,
    studentId     int,
    language      int,
    math          int,
    english       int,
    classId       string,
    departmentId  string
);
-- Load the 20 sample rows. Values are positional, in table column order:
-- (id, studentId, language, math, english, classId, departmentId).
insert into table student_scores
values
    ( 1, 111, 68, 69, 90, 'class1', 'department1'),
    ( 2, 112, 73, 80, 96, 'class1', 'department1'),
    ( 3, 113, 90, 74, 75, 'class1', 'department1'),
    ( 4, 114, 89, 94, 93, 'class1', 'department1'),
    ( 5, 115, 99, 93, 89, 'class1', 'department1'),
    ( 6, 121, 96, 74, 79, 'class2', 'department1'),
    ( 7, 122, 89, 86, 85, 'class2', 'department1'),
    ( 8, 123, 70, 78, 61, 'class2', 'department1'),
    ( 9, 124, 76, 70, 76, 'class2', 'department1'),
    (10, 211, 89, 93, 60, 'class1', 'department2'),
    (11, 212, 76, 83, 75, 'class1', 'department2'),
    (12, 213, 71, 94, 90, 'class1', 'department2'),
    (13, 214, 94, 94, 66, 'class1', 'department2'),
    (14, 215, 84, 82, 73, 'class1', 'department2'),
    (15, 216, 85, 74, 93, 'class1', 'department2'),
    (16, 221, 77, 99, 61, 'class2', 'department2'),
    (17, 222, 80, 78, 96, 'class2', 'department2'),
    (18, 223, 79, 74, 96, 'class2', 'department2'),
    (19, 224, 75, 80, 78, 'class2', 'department2'),
    (20, 225, 82, 85, 63, 'class2', 'department2');

聚合開窗函數

count

-- count() as a window function, evaluated on department1 rows only.
select
    studentId,
    math,
    departmentId,
    classId,
    -- empty over(): the window is every row that passes the where filter
    count(math) over () as count1,
    -- one window per classId, covering the whole partition
    count(math) over (partition by classId) as count2,
    -- per classId, ordered by math, no explicit frame: counts the rows from
    -- the start of the partition up to and including the current row
    count(math) over (partition by classId order by math) as count3,
    -- per classId, ordered by math, frame = 1 row before + current row + 2 rows after
    count(math) over (
        partition by classId
        order by math
        rows between 1 preceding and 2 following
    ) as count4
from student_scores
where departmentId = 'department1';

結果
studentid   math    departmentid    classid count1  count2  count3  count4
111         69      department1     class1  9       5       1       3
113         74      department1     class1  9       5       2       4
112         80      department1     class1  9       5       3       4
115         93      department1     class1  9       5       4       3
114         94      department1     class1  9       5       5       2
124         70      department1     class2  9       4       1       3
121         74      department1     class2  9       4       2       4
123         78      department1     class2  9       4       3       3
122         86      department1     class2  9       4       4       2

結果解釋:
studentid=115,count1爲全部的行數9,count2爲分區class1中的行數5,count3爲分區class1中math值<=93的行數4,
count4爲分區class1中math值向前+1行向後+2行(實際只有1行)的總行數3。

sum

-- sum() as a window function, evaluated on department1 rows only.
select
    studentId,
    math,
    departmentId,
    classId,
    -- empty over(): total over every row that passes the where filter
    sum(math) over () as sum1,
    -- total over the whole classId partition
    sum(math) over (partition by classId) as sum2,
    -- per classId, ordered by math: running total up to and including the current row
    sum(math) over (partition by classId order by math) as sum3,
    -- per classId, ordered by math, frame = 1 row before + current row + 2 rows after
    sum(math) over (
        partition by classId
        order by math
        rows between 1 preceding and 2 following
    ) as sum4
from student_scores
where departmentId = 'department1';

結果
studentid   math    departmentid    classid sum1    sum2    sum3    sum4
111         69      department1     class1  718     410     69      223
113         74      department1     class1  718     410     143     316
112         80      department1     class1  718     410     223     341
115         93      department1     class1  718     410     316     267
114         94      department1     class1  718     410     410     187
124         70      department1     class2  718     308     70      222
121         74      department1     class2  718     308     144     308
123         78      department1     class2  718     308     222     238
122         86      department1     class2  718     308     308     164

結果解釋:
    同count開窗函數

min

-- min() as a window function, evaluated on department1 rows only.
select
    studentId,
    math,
    departmentId,
    classId,
    -- empty over(): minimum over every row that passes the where filter
    min(math) over () as min1,
    -- minimum over the whole classId partition
    min(math) over (partition by classId) as min2,
    -- per classId, ordered by math: minimum up to and including the current row
    min(math) over (partition by classId order by math) as min3,
    -- per classId, ordered by math, frame = 1 row before + current row + 2 rows after
    min(math) over (
        partition by classId
        order by math
        rows between 1 preceding and 2 following
    ) as min4
from student_scores
where departmentId = 'department1';

結果
studentid   math    departmentid    classid min1    min2    min3    min4
111         69      department1     class1  69      69      69      69
113         74      department1     class1  69      69      69      69
112         80      department1     class1  69      69      69      74
115         93      department1     class1  69      69      69      80
114         94      department1     class1  69      69      69      93
124         70      department1     class2  69      70      70      70
121         74      department1     class2  69      70      70      70
123         78      department1     class2  69      70      70      74
122         86      department1     class2  69      70      70      78

結果解釋:
    同count開窗函數

max

-- max() as a window function, evaluated on department1 rows only.
select
    studentId,
    math,
    departmentId,
    classId,
    -- empty over(): maximum over every row that passes the where filter
    max(math) over () as max1,
    -- maximum over the whole classId partition
    max(math) over (partition by classId) as max2,
    -- per classId, ordered by math: maximum up to and including the current row
    max(math) over (partition by classId order by math) as max3,
    -- per classId, ordered by math, frame = 1 row before + current row + 2 rows after
    max(math) over (
        partition by classId
        order by math
        rows between 1 preceding and 2 following
    ) as max4
from student_scores
where departmentId = 'department1';

結果
studentid   math    departmentid    classid max1    max2    max3    max4
111         69      department1     class1  94      94      69      80
113         74      department1     class1  94      94      74      93
112         80      department1     class1  94      94      80      94
115         93      department1     class1  94      94      93      94
114         94      department1     class1  94      94      94      94
124         70      department1     class2  94      86      70      78
121         74      department1     class2  94      86      74      86
123         78      department1     class2  94      86      78      86
122         86      department1     class2  94      86      86      86

結果解釋:
    同count開窗函數

avg

-- avg() as a window function, evaluated on department1 rows only.
select
    studentId,
    math,
    departmentId,
    classId,
    -- empty over(): average over every row that passes the where filter
    avg(math) over () as avg1,
    -- average over the whole classId partition
    avg(math) over (partition by classId) as avg2,
    -- per classId, ordered by math: running average up to and including the current row
    avg(math) over (partition by classId order by math) as avg3,
    -- per classId, ordered by math, frame = 1 row before + current row + 2 rows after
    avg(math) over (
        partition by classId
        order by math
        rows between 1 preceding and 2 following
    ) as avg4
from student_scores
where departmentId = 'department1';

結果
studentid   math    departmentid    classid avg1                avg2    avg3                avg4
111         69      department1     class1  79.77777777777777   82.0    69.0                74.33333333333333
113         74      department1     class1  79.77777777777777   82.0    71.5                79.0
112         80      department1     class1  79.77777777777777   82.0    74.33333333333333   85.25
115         93      department1     class1  79.77777777777777   82.0    79.0                89.0
114         94      department1     class1  79.77777777777777   82.0    82.0                93.5
124         70      department1     class2  79.77777777777777   77.0    70.0                74.0
121         74      department1     class2  79.77777777777777   77.0    72.0                77.0
123         78      department1     class2  79.77777777777777   77.0    74.0                79.33333333333333
122         86      department1     class2  79.77777777777777   77.0    77.0                82.0

結果解釋:
    同count開窗函數

first_value

-- first_value() as a window function, evaluated on department1 rows only.
select
    studentId,
    math,
    departmentId,
    classId,
    -- empty over(): first math value among all rows that pass the where filter
    -- NOTE(review): no order by in this window, so which row counts as "first"
    -- presumably depends on the order the engine delivers rows in -- confirm
    first_value(math) over () as first_value1,
    -- first math value of the classId partition (same ordering caveat as above)
    first_value(math) over (partition by classId) as first_value2,
    -- per classId, ordered by math: first value of the rows up to the current row
    first_value(math) over (partition by classId order by math) as first_value3,
    -- per classId, ordered by math, frame = 1 row before + current row + 2 rows after
    first_value(math) over (
        partition by classId
        order by math
        rows between 1 preceding and 2 following
    ) as first_value4
from student_scores
where departmentId = 'department1';

結果
studentid   math    departmentid    classid first_value1    first_value2    first_value3    first_value4
111         69      department1     class1  69              69              69              69
113         74      department1     class1  69              69              69              69
112         80      department1     class1  69              69              69              74
115         93      department1     class1  69              69              69              80
114         94      department1     class1  69              69              69              93
124         70      department1     class2  69              74              70              70
121         74      department1     class2  69              74              70              70
123         78      department1     class2  69              74              70              74
122         86      department1     class2  69              74              70              78

結果解釋:
    studentid=124 first_value1:全部行中的第一個值是69,first_value2:classId=class2分區 math的第一個值是74(窗口未指定order by,第一個值取決於數據讀取順序)。

last_value

-- last_value() as a window function, evaluated on department1 rows only.
select
    studentId,
    math,
    departmentId,
    classId,
    -- empty over(): last math value among all rows that pass the where filter
    -- NOTE(review): no order by in this window, so which row counts as "last"
    -- presumably depends on the order the engine delivers rows in -- confirm
    last_value(math) over () as last_value1,
    -- last math value of the classId partition (same ordering caveat as above)
    last_value(math) over (partition by classId) as last_value2,
    -- per classId, ordered by math: the window stops at the current row, so the
    -- last value is taken from the rows up to and including the current row
    last_value(math) over (partition by classId order by math) as last_value3,
    -- per classId, ordered by math, frame = 1 row before + current row + 2 rows after
    last_value(math) over (
        partition by classId
        order by math
        rows between 1 preceding and 2 following
    ) as last_value4
from student_scores
where departmentId = 'department1';

結果
studentid   math    departmentid    classid last_value1 last_value2 last_value3 last_value4
111         69      department1     class1  70          93          69          80
113         74      department1     class1  70          93          74          93
112         80      department1     class1  70          93          80          94
115         93      department1     class1  70          93          93          94
114         94      department1     class1  70          93          94          94
124         70      department1     class2  70          70          70          78
121         74      department1     class2  70          70          74          86
123         78      department1     class2  70          70          78          86
122         86      department1     class2  70          70          86          86

lag

lag(col,n,default) 用於統計窗口內往上第n個值。
    col:列名
    n:往上第n行
    default:往上第n行爲NULL時候,取默認值,不指定則取NULL



-- lag() as a window function, evaluated on department1 rows only.
select
    studentId,
    math,
    departmentId,
    classId,
    -- math from 2 rows earlier in the window; 60 when there is no such row
    lag(math, 2, 60) over (partition by classId order by math) as lag1,
    -- same offset without a default: NULL when there is no such row
    lag(math, 2) over (partition by classId order by math) as lag2
from student_scores
where departmentId = 'department1';

結果
studentid   math    departmentid    classid lag1    lag2
111         69      department1     class1  60      NULL
113         74      department1     class1  60      NULL
112         80      department1     class1  69      69
115         93      department1     class1  74      74
114         94      department1     class1  80      80
124         70      department1     class2  60      NULL
121         74      department1     class2  60      NULL
123         78      department1     class2  70      70
122         86      department1     class2  74      74

結果解釋:
    第3行 lag1:窗口內(69 74 80) 當前行80 向上取第二個值爲69
    倒數第3行 lag2:窗口內(70 74) 當前行74 向上取第二個值爲NULL

lead

lead(col,n,default) 用於統計窗口內往下第n個值。
    col:列名
    n:往下第n行
    default:往下第n行爲NULL時候,取默認值,不指定則取NULL


-- lead() as a window function, evaluated on department1 rows only.
select
    studentId,
    math,
    departmentId,
    classId,
    -- math from 2 rows later in the window; 60 when there is no such row
    lead(math, 2, 60) over (partition by classId order by math) as lead1,
    -- same offset without a default: NULL when there is no such row
    lead(math, 2) over (partition by classId order by math) as lead2
from student_scores
where departmentId = 'department1';

結果
studentid   math    departmentid    classid lead1   lead2
111         69      department1     class1  80      80
113         74      department1     class1  93      93
112         80      department1     class1  94      94
115         93      department1     class1  60      NULL
114         94      department1     class1  60      NULL
124         70      department1     class2  78      78
121         74      department1     class2  86      86
123         78      department1     class2  60      NULL
122         86      department1     class2  60      NULL

結果解釋:
    第4行lead1 窗口內向下第二個值爲空,賦值60

cume_dist

計算某個窗口或分區中某個值的累積分佈。假定升序排序,則使用如下公式確定累積分佈:
小於等於當前值x的行數 / 窗口或partition分區內的總行數。其中,x 等於 order by 子句中指定的列的當前行中的值。



-- cume_dist() as a window function, evaluated on department1 rows only.
select
    studentId,
    math,
    departmentId,
    classId,
    -- share of all rows whose math is <= the current math
    cume_dist() over (order by math) as cume_dist1,
    -- descending order: share of all rows whose math is >= the current math
    cume_dist() over (order by math desc) as cume_dist2,
    -- share of rows in the same classId whose math is <= the current math
    cume_dist() over (partition by classId order by math) as cume_dist3
from student_scores
where departmentId = 'department1';

結果
studentid   math    departmentid    classid cume_dist1              cume_dist2          cume_dist3
111         69      department1     class1  0.1111111111111111      1.0                 0.2
113         74      department1     class1  0.4444444444444444      0.7777777777777778  0.4
112         80      department1     class1  0.6666666666666666      0.4444444444444444  0.6
115         93      department1     class1  0.8888888888888888      0.2222222222222222  0.8
114         94      department1     class1  1.0                     0.1111111111111111  1.0
124         70      department1     class2  0.2222222222222222      0.8888888888888888  0.25
121         74      department1     class2  0.4444444444444444      0.7777777777777778  0.5
123         78      department1     class2  0.5555555555555556      0.5555555555555556  0.75
122         86      department1     class2  0.7777777777777778      0.3333333333333333  1.0

結果解釋:
    第三行:
        cume_dist1=小於等於80的人數爲6/總人數9=0.6666666666666666
        cume_dist2=大於等於80的人數爲4/總人數9=0.4444444444444444
        cume_dist3=分區內小於等於80的人數爲3/分區內總人數5=0.6

排序開窗函數

rank

確定一組值中一個值的排名。如果存在partition by ,則爲每個分區組中的每個值排名。排名可能不是連續的。例如,如果兩個行的排名爲 1,則下一個排名爲 3。


-- rank() as a window function, over every row of student_scores.
select
    *,
    -- rank by math across all students
    rank() over (order by math) as rank1,
    -- rank by math within each department
    rank() over (partition by departmentId order by math) as rank2,
    -- rank by math within each class of each department
    rank() over (partition by departmentId, classId order by math) as rank3
from student_scores;

結果

id  studentid   language    math    english     classid departmentid    rank1   rank2   rank3
1   111         68          69      90          class1  department1     1       1       1
3   113         90          74      75          class1  department1     3       3       2
2   112         73          80      96          class1  department1     9       6       3
5   115         99          93      89          class1  department1     15      8       4
4   114         89          94      93          class1  department1     17      9       5
9   124         76          70      76          class2  department1     2       2       1
6   121         96          74      79          class2  department1     3       3       2
8   123         70          78      61          class2  department1     7       5       3
7   122         89          86      85          class2  department1     14      7       4
15  216         85          74      93          class1  department2     3       1       1
14  215         84          82      73          class1  department2     11      5       2
11  212         76          83      75          class1  department2     12      6       3
10  211         89          93      60          class1  department2     15      8       4
12  213         71          94      90          class1  department2     17      9       5
13  214         94          94      66          class1  department2     17      9       5
18  223         79          74      96          class2  department2     3       1       1
17  222         80          78      96          class2  department2     7       3       2
19  224         75          80      78          class2  department2     9       4       3
20  225         82          85      63          class2  department2     13      7       4
16  221         77          99      61          class2  department2     20      11      5

dense_rank

dense_rank與rank有一點不同:當排名相同的時候,接下來的排名是連續的。如兩個行的排名爲 1,則下一個排名爲 2。


-- dense_rank() as a window function, over every row of student_scores.
select
    *,
    -- dense rank by math across all students
    dense_rank() over (order by math) as dense_rank1,
    -- dense rank by math within each department
    dense_rank() over (partition by departmentId order by math) as dense_rank2,
    -- dense rank by math within each class of each department
    dense_rank() over (partition by departmentId, classId order by math) as dense_rank3
from student_scores;

結果:
id  studentid   language    math    english classid departmentid    dense_rank1 dense_rank2 dense_rank3
1   111         68          69      90      class1  department1     1           1           1
3   113         90          74      75      class1  department1     3           3           2
2   112         73          80      96      class1  department1     5           5           3
5   115         99          93      89      class1  department1     10          7           4
4   114         89          94      93      class1  department1     11          8           5
9   124         76          70      76      class2  department1     2           2           1
6   121         96          74      79      class2  department1     3           3           2
8   123         70          78      61      class2  department1     4           4           3
7   122         89          86      85      class2  department1     9           6           4
15  216         85          74      93      class1  department2     3           1           1
14  215         84          82      73      class1  department2     6           4           2
11  212         76          83      75      class1  department2     7           5           3
10  211         89          93      60      class1  department2     10          7           4
12  213         71          94      90      class1  department2     11          8           5
13  214         94          94      66      class1  department2     11          8           5
18  223         79          74      96      class2  department2     3           1           1
17  222         80          78      96      class2  department2     4           2           2
19  224         75          80      78      class2  department2     5           3           3
20  225         82          85      63      class2  department2     8           6           4
16  221         77          99      61      class2  department2     12          9           5

ntile

將分區中已排序的行劃分爲大小盡量相等的指定數量的排名的組,並返回給定行所在的組的排名。

-- ntile() as a window function, over every row of student_scores.
select
    *,
    -- split each department, ordered by math, into 2 buckets
    ntile(2) over (partition by departmentid order by math) as ntile1,
    -- split each department, ordered by math, into 3 buckets
    ntile(3) over (partition by departmentid order by math) as ntile2
from student_scores;

結果
id  studentid   language    math    english classid departmentid    ntile1  ntile2
1   111         68          69      90      class1  department1     1       1
9   124         76          70      76      class2  department1     1       1
6   121         96          74      79      class2  department1     1       1
3   113         90          74      75      class1  department1     1       2
8   123         70          78      61      class2  department1     1       2
2   112         73          80      96      class1  department1     2       2
7   122         89          86      85      class2  department1     2       3
5   115         99          93      89      class1  department1     2       3
4   114         89          94      93      class1  department1     2       3
18  223         79          74      96      class2  department2     1       1
15  216         85          74      93      class1  department2     1       1
17  222         80          78      96      class2  department2     1       1
19  224         75          80      78      class2  department2     1       1
14  215         84          82      73      class1  department2     1       2
11  212         76          83      75      class1  department2     1       2
20  225         82          85      63      class2  department2     2       2
10  211         89          93      60      class1  department2     2       2
12  213         71          94      90      class1  department2     2       3
13  214         94          94      66      class1  department2     2       3
16  221         77          99      61      class2  department2     2       3

結果解釋:
    第8行
        ntile1:對分區的數據均勻分紅2組後,當前行的組排名爲2
        ntile2:對分區的數據均勻分紅3組後,當前行的組排名爲3

row_number

從1開始對分區內的數據排序。


-- row_number() as a window function, over every row of student_scores.

select
    studentid,
    departmentid,
    classid,
    math,
    -- number the rows of each (departmentid, classid) partition from 1, in math order
    row_number() over (partition by departmentid, classid order by math) as row_number
from student_scores;

結果
studentid   departmentid    classid math    row_number
111         department1     class1  69      1
113         department1     class1  74      2
112         department1     class1  80      3
115         department1     class1  93      4
114         department1     class1  94      5
124         department1     class2  70      1
121         department1     class2  74      2
123         department1     class2  78      3
122         department1     class2  86      4
216         department2     class1  74      1
215         department2     class1  82      2
212         department2     class1  83      3
211         department2     class1  93      4
213         department2     class1  94      5
214         department2     class1  94      6
223         department2     class2  74      1
222         department2     class2  78      2
224         department2     class2  80      3
225         department2     class2  85      4
221         department2     class2  99      5

結果解釋:
    同一分區,相同值,不一樣序。如studentid=213 studentid=214 值都爲94 排序爲5,6。

percent_rank

計算給定行的百分比排名。可以用來計算超過了百分之多少的人。如360小助手開機速度超過了百分之多少的人。計算公式爲:
(當前行的rank值-1)/(分組內的總行數-1)

-- percent_rank() as a window function, over every row of student_scores.

select
    studentid,
    departmentid,
    classid,
    math,
    -- position of the row inside its (departmentid, classid) partition, in math order
    row_number() over (partition by departmentid, classid order by math) as row_number,
    -- percentage rank inside the same partition and ordering
    percent_rank() over (partition by departmentid, classid order by math) as percent_rank
from student_scores;

結果
studentid   departmentid    classid math    row_number  percent_rank
111         department1     class1  69      1           0.0
113         department1     class1  74      2           0.25
112         department1     class1  80      3           0.5
115         department1     class1  93      4           0.75
114         department1     class1  94      5           1.0
124         department1     class2  70      1           0.0
121         department1     class2  74      2           0.3333333333333333
123         department1     class2  78      3           0.6666666666666666
122         department1     class2  86      4           1.0
216         department2     class1  74      1           0.0
215         department2     class1  82      2           0.2
212         department2     class1  83      3           0.4
211         department2     class1  93      4           0.6
213         department2     class1  94      5           0.8
214         department2     class1  94      6           0.8
223         department2     class2  74      1           0.0
222         department2     class2  78      2           0.25
224         department2     class2  80      3           0.5
225         department2     class2  85      4           0.75
221         department2     class2  99      5           1.0

結果解釋:
    studentid=115,percent_rank=(4-1)/(5-1)=0.75
    studentid=123,percent_rank=(3-1)/(4-1)=0.6666666666666666
相關文章
相關標籤/搜索