數值變量卡方分箱-R版本

小編近期接的項目中不少要求要用R來作（小編但是Python高手！），因此基本上將Python代碼用R重寫（翻譯）了一遍。小編差很少三年沒正兒八經地用R了，但R是小編最喜歡也是最先使用的統計工具。關於工具：Python、R和SAS，小編都能用一點點（生活所迫！），因此有關工具使用、建模、分析的，歡迎交流；有項目合做的，也歡迎交流！
算法

卡方分箱的概念，請參考數值變量-卡方分箱。固然能夠將其中的分箱標準-卡方改寫成基尼、信息熵等。微信

算法app

這裏把具體的實現算法再重述一遍：函數

（1）將數值變量按照等距方式分紅SplitNum段(好比100段)，此爲初始分箱；工具

（2）計算每段的總樣本數、好樣本數、壞樣本數、樣本佔比等統計值；學習

（3）計算相鄰兩段的卡方值，合併卡方值最小的相鄰兩段；測試

（4）重複步驟（2）和（3），直至分段個數<=BinMax；大數據

（5）檢查每段是否同時含有壞樣本和好樣本，若某段只含有壞樣本或好樣本，則將與該段卡方值最小的相鄰一段和該段進行合併；ui

（6）重複步驟（5），直至每段同時含有壞樣本和好樣本；url

（7）檢查每段的樣本佔比是否>=BinPcntMin，若某段的樣本佔比<BinPcntMin，則將與該段卡方值最小的相鄰一段和該段進行合併；

（8）重複步驟（7），直至每段的樣本佔比>=BinPcntMin。

R代碼

一、數據列等距分割函數：

splitCol <- function(col, SplitNum, exclude_attr=NULL){
    # Copyright by 小石頭(bigdata_0819@163.com)
    # col: 數據列
    # SplitNum: 等距分割的段數
    # exclude_attri: 不參與分割的值
    # return: 分割節點值列表
    
    col <- setdiff(col, exclude_attr)
    splitPoint <- seq(min(col), max(col), length=SplitNum+1)
    splitPoint[length(splitPoint)] <- Inf
    splitPoint <- splitPoint[-1]
    
    return(splitPoint)
}

assignSplit <- function(x, splitPoint){
    # Copyright by 小石頭(bigdata_0819@163.com)
    # x: 標量數值
    # splitPoint：分割節點向量
    # return：落入的分割段
    
    if(x <= splitPoint[1]){
        sP <- splitPoint[1]
    }else{
        for(i in 1:(length(splitPoint)-1)){
            if(x>splitPoint[i] && x<=splitPoint[i+1]){
                sP <- splitPoint[i+1]
            }
        }
    }
    return(sP)
}

二、計算變量總樣本、好樣本、壞樣本、壞樣本率的函數：

BinBadRate <- function(df, col, target, BadRateIndicator){
    # Copyright by 小石頭(bigdata_0819@163.com)
    # df: 須要計算好壞比率的數據集
    # col: 須要計算好壞比率的變量
    # target: 好壞標籤
    # BadRateIndicator: 是否計算好壞比
    
    group1 <- aggregate(df[, target], list(df[, col]), sum)
    colnames(group1) <- c(col, 'bad')
    group2 <- aggregate(df[, target], list(df[, col]), length)
    colnames(group2) <- c(col, 'total')
    group <- merge(group1, group2, by=col)
    group$good <- group$total-group$bad
    group <- group[c(col, 'bad', 'good', 'total')]
    
    if(BadRateIndicator){
        group$BadRate <- group$bad/group$total
    }
    
    return(group)
}

三、計算卡方值函數：

calcChi2 <- function(df, total_col, bad_col, good_col){
    # Copyright by 小石頭(bigdata_0819@163.com)
    # df: 包含各屬性的所有樣本個數、壞樣本個數、好樣本個數的數據框
    # total_col: 所有樣本的個數
    # bad_col: 壞樣本的個數
    # good_col：好樣本的個數
    
    badRate <- sum(df[, bad_col])/sum(df[, total_col])
    goodRate <- sum(df[, good_col])/sum(df[, total_col])
    
    if(badRate %in% c(0,1)){
        return(0)
    }
    
    df$badExp <- df[, total_col]*badRate
    df$goodExp <- df[, total_col]*goodRate
    
    badChi2 <- sum((df[, bad_col]-df$badExp)^2/df$badExp)
    goodChi2 <- sum((df[, good_col]-df$goodExp)^2/df$goodExp)
    
    Chi2 <- badChi2 + goodChi2
    
    return(Chi2)
}

四、接下來實現單變量分箱的函數，其中會調用上面的幾個函數，返回單變量分箱的結果。分箱函數分三個部分，(1)合併相鄰兩個分組、(2)檢查是否每一個分組同時含有好和壞、(3)檢查每一個分組的佔比是否大於等於BinPcntMin。其中spe_attri是特殊屬性值，初始分箱時將各特殊屬性值分別單獨做爲一組，singleIndicator是特殊屬性值在接下來的合併過程當中是否參與合併的標識，取值T，不參與合併，取值F，則參與合併：

ContVarChi2Bin <- function(df, col, target, BinMax, BinPcntMin, SplitNum, spe_attri=NULL, singleIndicator){
    # Copyright by 小石頭(bigdata_0819@163.com)
    # df: 包含目標變量與分箱變量的數據框
    # col: 須要分箱的變量
    # target: 目標變量，取值0或1
    # BinMax: 最大分箱數
    # BinPcntMin：每箱的最小佔比
    # SplitNum：數值變量初始切分的段數，初始將變量等距切分紅SplitNum段
    # spe_attri：特殊屬性
    # singleIndicator: T:特殊屬性單獨做爲一組不參與卡方分箱，F:特殊屬性做爲一組參與卡方分箱
    
    if(length(spe_attri)>=1){
        df1 <- df[df[, col] %in% spe_attri, ]
        df2 <- df[!df[, col] %in% spe_attri, ]
    }else{
        df2 <- df
    }
    
    split_col <- splitCol(df2[, col], SplitNum)
    df2$temp <- apply(df2[col], 1, assignSplit, split_col)
    binBadRate <- BinBadRate(df2, 'temp', target, BadRateIndicator=F)
    
    if(length(spe_attri)>=1 && singleIndicator==F && nrow(df1)>0){
        df1$temp <- df1[, col]
        binBadRate1 <- BinBadRate(df1, 'temp', target, BadRateIndicator=F)
        binBadRate <- rbind(binBadRate1, binBadRate)
    }
    if(length(spe_attri)>=1 && singleIndicator==T){
        BinMax <- BinMax-length(unique(df1[, col]))
    }
        
    # 一、迭代合併相鄰兩個組，直至分箱數<=BinMax
    while(nrow(binBadRate)>BinMax){
        chi2_ <- NULL
        for(i in 1:(nrow(binBadRate)-1)){
            temp_binBadRate <- binBadRate[i:(i+1), ]
            chi2 <- calcChi2(temp_binBadRate, 'total', 'bad', 'good')
            chi2_ <- c(chi2_, chi2)
        }

        combineIndex <- which.min(chi2_)
        combine_binBadRate = binBadRate[combineIndex:(combineIndex+1), ]
        
        binBadRate[combineIndex+1, 'total'] <- sum(combine_binBadRate$total)
        binBadRate[combineIndex+1, 'bad'] <- sum(combine_binBadRate$bad)
        binBadRate[combineIndex+1, 'good'] <- sum(combine_binBadRate$good)
        
        binBadRate = binBadRate[-combineIndex, ]
        rownames(binBadRate) <- NULL
    }
    
    # 二、檢查每組是否同時含有好和壞
    binBadRate$BadRate <- binBadRate$bad/binBadRate$total
    minBadRate <- min(binBadRate$BadRate)
    maxBadRate <- max(binBadRate$BadRate) 
    while(minBadRate==0 || minBadRate==1){
        BadRate_01 <-binBadRate[binBadRate$BadRate %in% c(0,1), ]
        index_01 <- as.numeric(rownames(BadRate_01)[1])
        
        if(index_01==1){
            combineIndex <- 1
            combine_binBadRate = binBadRate[combineIndex:(combineIndex+1), ]
            
            binBadRate[combineIndex+1, 'total'] <- sum(combine_binBadRate$total)
            binBadRate[combineIndex+1, 'bad'] <- sum(combine_binBadRate$bad)
            binBadRate[combineIndex+1, 'good'] <- sum(combine_binBadRate$good)
            
            binBadRate = binBadRate[-combineIndex, ]
            rownames(binBadRate) <- NULL
        }else if(index_01==nrow(binBadRate)){
            combineIndex <- nrow(binBadRate)-1
            combine_binBadRate = binBadRate[combineIndex:(combineIndex+1), ]
            
            binBadRate[combineIndex+1, 'total'] <- sum(combine_binBadRate$total)
            binBadRate[combineIndex+1, 'bad'] <- sum(combine_binBadRate$bad)
            binBadRate[combineIndex+1, 'good'] <- sum(combine_binBadRate$good)
            
            binBadRate = binBadRate[-combineIndex, ]
            rownames(binBadRate) <- NULL
        }else{
            temp1_binBadRate <- binBadRate[(index_01-1):index_01, ]
            chi2_1 <- calcChi2(temp1_binBadRate, 'total', 'bad', 'good')
            
            temp2_binBadRate <- binBadRate[index_01:(index_01+1), ]
            chi2_2 = calcChi2(temp2_binBadRate, 'total', 'bad', 'good')
            
            if(chi2_1 < chi2_2){
                combineIndex <- index_01-1
            }else{
                combineIndex <- index_01
            }
            
            combine_binBadRate <- binBadRate[combineIndex:(combineIndex+1), ]
            
            binBadRate[combineIndex+1, 'total'] <- sum(combine_binBadRate$total)
            binBadRate[combineIndex+1, 'bad'] <- sum(combine_binBadRate$bad)
            binBadRate[combineIndex+1, 'good'] <- sum(combine_binBadRate$good)
            
            binBadRate <- binBadRate[-combineIndex, ]
            rownames(binBadRate) <- NULL
        }
        binBadRate$BadRate <- binBadRate$bad/binBadRate$total
        minBadRate <- min(binBadRate$BadRate)
        maxBadRate <- max(binBadRate$BadRate)
    }
        
    # 三、檢查每組的佔比是否大於等於BinPcntMin 
    binBadRate$Percent <- binBadRate$total/sum(binBadRate$total) 
    minPercent <- min(binBadRate$Percent)
    while(minPercent<BinPcntMin){
        BadRate_minPercent <- binBadRate[binBadRate$Percent==minPercent, ]
        index_minPercent <- as.numeric(rownames(BadRate_minPercent)[1])
        
        if(index_minPercent==1){
            combineIndex <- 1
            combine_binBadRate = binBadRate[combineIndex:(combineIndex+1), ]
            
            binBadRate[combineIndex+1, 'total'] <- sum(combine_binBadRate$total)
            binBadRate[combineIndex+1, 'bad'] <- sum(combine_binBadRate$bad)
            binBadRate[combineIndex+1, 'good'] <- sum(combine_binBadRate$good)
            
            binBadRate = binBadRate[-combineIndex, ]
            rownames(binBadRate) <- NULL
        }else if(index_minPercent==nrow(binBadRate)){
            combineIndex <- nrow(binBadRate)-1
            combine_binBadRate = binBadRate[combineIndex:(combineIndex+1), ]
            
            binBadRate[combineIndex+1, 'total'] <- sum(combine_binBadRate$total)
            binBadRate[combineIndex+1, 'bad'] <- sum(combine_binBadRate$bad)
            binBadRate[combineIndex+1, 'good'] <- sum(combine_binBadRate$good)
            
            binBadRate = binBadRate[-combineIndex, ]
            rownames(binBadRate) <- NULL
        }else{
            temp1_binBadRate <- binBadRate[(index_minPercent-1):index_minPercent, ]
            chi2_1 <- calcChi2(temp1_binBadRate, 'total', 'bad', 'good')
            
            temp2_binBadRate <- binBadRate[index_minPercent:(index_minPercent+1), ]
            chi2_2 = calcChi2(temp2_binBadRate, 'total', 'bad', 'good')
            
            if(chi2_1 < chi2_2){
                combineIndex <- index_minPercent-1
            }else{
                combineIndex <- index_minPercent
            }
            
            combine_binBadRate <- binBadRate[combineIndex:(combineIndex+1), ]
            
            binBadRate[combineIndex+1, 'total'] <- sum(combine_binBadRate$total)
            binBadRate[combineIndex+1, 'bad'] <- sum(combine_binBadRate$bad)
            binBadRate[combineIndex+1, 'good'] <- sum(combine_binBadRate$good)
            
            binBadRate <- binBadRate[-combineIndex, ]
            rownames(binBadRate) <- NULL
        }
        
        binBadRate$Percent <- binBadRate$total/sum(binBadRate$total) 
        minPercent <- min(binBadRate$Percent)
    }
    
    binBadRate <- dplyr::select(binBadRate, -c('BadRate', 'Percent'))
    
    if(length(spe_attri)>=1 && singleIndicator==T && nrow(df1)>0){
        binBadRate_single <- BinBadRate(df1, col, target, BadRateIndicator=F) 
        colnames(binBadRate_single)[1] <- 'temp'
        bindf <- rbind(binBadRate_single, binBadRate)
    }else{
        bindf <- binBadRate
    }
    colnames(bindf)[1] <- 'upper'
    
    bindf$Percent <- bindf$total/sum(bindf$total)
    bindf$BadRate <- bindf$bad/bindf$total
    
    bindf0 <- data.frame(bin=1:nrow(bindf), lower=c(-Inf, bindf$upper[-nrow(bindf)]))
    bindf <- cbind(bindf0, bindf) 
    
    return(bindf)
}

五、批量分箱函數，將全部要分箱的數值變量進行批量分箱處理，函數返回的是存放每一個變量分箱結果的列表list：

ContVarChi2BinBatch <- function(df, key, target, BinMax, BinPcntMin, SplitNum, spe_attri=NULL, singleIndicator){
    # Copyright by 小石頭(bigdata_0819@163.com)
    # df: 數據框
    # key: 主鍵
    # target: 目標變量，取值0或1
    # return: 存放每一個變量分箱結果的列表
    
    Xvars <- setdiff(colnames(df), c(key, target))
    
    list_bin <- list()
    for(col in Xvars){
        list_bin[[col]] <- ContVarChi2Bin(df, col, target, BinMax, BinPcntMin, SplitNum, spe_attri, singleIndicator)
    }
    
    return(list_bin)
}

六、將變量值替換爲分箱值的函數：

txtContVarBin <- function(df, key, target, list_bin, testIndicator){
    # Copyright by 小石頭(bigdata_0819@163.com)
    # df: 須要將變量值替換爲分箱值的數據框
    # key：主鍵
    # target：目標變量
    # list_bin：包含各變量分箱結果的列表
    # testIndicator：是否爲測試數據框，T：計算測試數據分箱後的佔比、壞樣本率等，並存放在列表中
    
    df_bin <- df[, c(key, target)]
    Xvars <- setdiff(colnames(df), c(key, target))
    ListBin <- list()
    
    for(col in Xvars){
        
        Bin <- list_bin[[col]]
        vec <- NULL
        for(i in Bin$bin){
            vec[df[, col]>Bin$lower[i] & df[, col]<=Bin$upper[i]] <- i
        }
        df_bin[, col] <- vec
        
        if(testIndicator){
            
            col_bin_BadRate <- BinBadRate(df_bin, col, target, BadRateIndicator=F) 
            col_bin_BadRate$Percent <- col_bin_BadRate$total/sum(col_bin_BadRate$total)
            col_bin_BadRate$BadRate <- col_bin_BadRate$bad/col_bin_BadRate$total
            colnames(col_bin_BadRate)[1] <- 'bin'
            col_bin <- merge(Bin[c('bin', 'lower', 'upper')], col_bin_BadRate, by='bin', all.x=T)
            ListBin[[col]] <- col_bin
            
        }
    }
    
    if(testIndicator){
        return(list(df_bin=df_bin, ListBin=ListBin))
    }else{
        return(df_bin)
    }
}

代碼的使用方法與文章數值變量-卡方分箱的方法徹底同樣，可參考。查看某變量的分箱結果以下：

在幾個不一樣數據集上運行驗證都沒出現bug，若是你們運行代碼出現bug，歡迎交流學習。

傳送門：

數值變量-卡方分箱

決策樹分箱-R

本文分享自微信公衆號 - 大數據建模的一點一滴（bigdatamodeling）。
若有侵權，請聯繫 support@oschina.cn 刪除。
本文參與「OSC源創計劃」，歡迎正在閱讀的你也加入，一塊兒分享。