O3都是怪物,這裏分析的是CLANG怪物。示例程序遍歷數組的每一個元素,並給每個元素加上elem*1024。
/*
 * Add elem * 1024 to each of the first elem entries of arr, in place.
 *
 * arr  - array to update
 * elem - number of entries to process; it also determines the addend.
 *        Nothing is read or written when elem <= 0.
 */
void foreach_scale(int arr[], int elem)
{
    for (int i = 0; i < elem; i++) {
        arr[i] = arr[i] + elem * 1024;
    }
}
這裏刪去了用處不大的內容,只保留了關鍵的LLVM IR。經過分析能夠看到,若是elem不小於8,LLVM IR會使用vector,vector使用SIMD指令高效進行計算,每次迭代處理8個元素;若是elem小於8,或者向量化循環結束後還有不足8個的剩餘元素,則走普通的for形式。
; Optimized IR for foreach_scale(int arr[], int elem): arr[i] += elem * 1024
; for every i in [0, elem).
; The attribute group #0 and the metadata nodes !3, !7 and !10 referenced
; below are not part of this excerpt.
;
; Control flow overview:
;   entry           - return immediately when elem <= 0
;   for.body.lr.ph  - elem < 8 goes to the scalar loop, otherwise to the vector loop
;   vector.body     - processes 8 elements (two <4 x i32> vectors) per iteration
;   middle.block    - falls through to the scalar loop when elements are left over
;   for.body        - scalar loop, one element per iteration
;
; Function Attrs: norecurse nounwind
define void @"\01?foreach_scale@@YAXQAHH@Z"(i32* nocapture %arr, i32 %elem) local_unnamed_addr #0 {
entry:
  ; Enter the loop only when elem > 0; otherwise the function is done.
  %cmp5 = icmp sgt i32 %elem, 0
  br i1 %cmp5, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph:
  ; Loop-invariant addend: elem * 1024, computed as elem << 10.
  %mul = shl i32 %elem, 10
  ; Compare elem with 8 (ult = unsigned less than).
  ; elem < 8 jumps to the plain scalar loop via %for.body.preheader,
  ; otherwise control goes to %vector.ph.
  %min.iters.check = icmp ult i32 %elem, 8
  br i1 %min.iters.check, label %for.body.preheader, label %vector.ph

for.body.preheader:
  ; phi is the SSA phi function (see the LLVM documentation).
  ; The scalar loop starts at 0 when the vector loop was skipped, or at
  ; %n.vec when it has to finish the elements the vector loop left over.
  %i.06.ph = phi i32 [ 0, %for.body.lr.ph ], [ %n.vec, %middle.block ]
  br label %for.body

vector.ph:
  ; %n.vec = elem with the low three bits cleared, i.e. the number of
  ; elements the vector loop handles (a multiple of 8).
  %n.vec = and i32 %elem, -8
  ; Put %mul into lane 0 of an otherwise undef vector, then use
  ; shufflevector with an all-zero mask to copy lane 0 into every lane:
  ; %broadcast.splat10 == <%mul %mul %mul %mul>
  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %mul, i32 0
  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Same again: %broadcast.splat12 == <%mul %mul %mul %mul>
  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %mul, i32 0
  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Load from the memory arr points to.
  ; %0 is the address of arr[index];
  ; %wide.load == <arr[index] arr[index+1] arr[index+2] arr[index+3]>
  %0 = getelementptr inbounds i32, i32* %arr, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4, !tbaa !3
  ; %2 is the address of arr[index+4];
  ; %wide.load8 == <arr[index+4] arr[index+5] arr[index+6] arr[index+7]>
  %2 = getelementptr i32, i32* %0, i32 4
  %3 = bitcast i32* %2 to <4 x i32>*
  %wide.load8 = load <4 x i32>, <4 x i32>* %3, align 4, !tbaa !3
  ; Add <%mul %mul %mul %mul> to each loaded vector, giving %4 and %5.
  %4 = add nsw <4 x i32> %wide.load, %broadcast.splat10
  %5 = add nsw <4 x i32> %wide.load8, %broadcast.splat12
  ; Write %4 and %5 back to the addresses they were loaded from.
  %6 = bitcast i32* %0 to <4 x i32>*
  store <4 x i32> %4, <4 x i32>* %6, align 4, !tbaa !3
  %7 = bitcast i32* %2 to <4 x i32>*
  store <4 x i32> %5, <4 x i32>* %7, align 4, !tbaa !3
  ; Advance by 8 elements; leave the vector loop once index reaches %n.vec.
  %index.next = add i32 %index, 8
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %middle.block, label %vector.body, !llvm.loop !7

middle.block:
  ; If %n.vec == elem every element has been handled; otherwise the
  ; scalar loop finishes the remaining elements starting at %n.vec.
  %cmp.n = icmp eq i32 %n.vec, %elem
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader

; Function return.
for.cond.cleanup:
  ret void

for.body:
  %i.06 = phi i32 [ %inc, %for.body ], [ %i.06.ph, %for.body.preheader ]
  ; arr[i] = arr[i] + (elem * 1024), where %mul == elem * 1024
  %arrayidx = getelementptr inbounds i32, i32* %arr, i32 %i.06
  %9 = load i32, i32* %arrayidx, align 4, !tbaa !3
  %add = add nsw i32 %9, %mul
  store i32 %add, i32* %arrayidx, align 4, !tbaa !3
  ; i++
  %inc = add nuw nsw i32 %i.06, 1
  ; Loop exit test: stop once the incremented i equals elem.
  %exitcond = icmp eq i32 %inc, %elem
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
}