DataFrameNaFunctions用來對DataFrame中值爲null或NaN的列作處理,處理分爲三種類型:
drop:根據條件丟棄含有null或NaN的行
fill:根據條件使用指定值填充值爲null或NaN的列,相當於設置默認值
replace:根據條件替換列值
下面是針對每種處理方式的詳細解釋:
package cc11001100.spark.dataset.DataFrameNaFunctionsDemo; import com.google.common.collect.ImmutableMap; import org.apache.spark.sql.DataFrameNaFunctions; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.catalyst.encoders.RowEncoder; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructField; import java.util.ArrayList; import java.util.List; /** * DataFrameNaFunctions對空值的處理主要有三種: * drop * fill * replace * * @author CC11001100 */ public class DataFrameNaFunctionsDemo { private static Integer randomValue(int n) { if (Math.random() < 0.5) { return n; } else { return null; } } public static void main(String[] args) { SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate(); List<Row> rowList = new ArrayList<>(); for (int i = 0; i < 100; i++) { Row row = RowFactory.create(randomValue(i), randomValue(i)); rowList.add(row); } Dataset<Row> nums = spark.createDataset(rowList, RowEncoder.apply(DataTypes.createStructType(new StructField[]{ DataTypes.createStructField("col_1", DataTypes.IntegerType, true), DataTypes.createStructField("col_2", DataTypes.IntegerType, true), }))); nums.show(false); DataFrameNaFunctions dataFrameNaFunctions = nums.na(); /*----------------------------- drop -------------------------------*/ // 只要某行中有一列是null或NaN即丟掉此行數據,內部調用了drop("any") dataFrameNaFunctions.drop().show(); // 指定丟棄行的方式,any表示行中任意一列是null或NaN即丟棄此行,all表示此行中全部列都是null或NaN才丟棄此行 dataFrameNaFunctions.drop("any").show(); // 當某行中的全部列爲null或NaN時丟棄掉此行 dataFrameNaFunctions.drop("all").show(); // 當某行的指定列爲null或any時丟棄掉此行 dataFrameNaFunctions.drop(new String[]{"col_1", "col_2"}).show(); // 當某行的指定列任意一個爲null或NaN時丟棄掉此行 dataFrameNaFunctions.drop("any", new String[]{"col_1", "col_2"}).show(); // 當某行的指定列所有爲null或NaN時丟棄掉此行 dataFrameNaFunctions.drop("all", new String[]{"col_1", "col_2"}).show(); // 當某行中指定列爲null或NaN的數量大於指定值時丟棄掉此行 
dataFrameNaFunctions.drop(1).show(); dataFrameNaFunctions.drop(1, new String[]{"col_1", "col_2"}).show(); /*----------------------------- fill -------------------------------*/ // 使用指定的值填充全部爲null或NaN的列s,至關於爲全部null或NaN設置默認值 dataFrameNaFunctions.fill(1L).show(); dataFrameNaFunctions.fill(0.1).show(); dataFrameNaFunctions.fill("").show(); dataFrameNaFunctions.fill(true).show(); // 當給定的列出現null或NaN值時使用對應值填充,至關於爲指定的列設置默認值 dataFrameNaFunctions.fill(1L, new String[]{"col_1, col_2"}).show(); dataFrameNaFunctions.fill(0.1, new String[]{"col_1, col_2"}).show(); dataFrameNaFunctions.fill("", new String[]{"col_1, col_2"}).show(); dataFrameNaFunctions.fill(true, new String[]{"col_1, col_2"}).show(); // 傳入Map能夠爲每一列設置不一樣的值,map的key爲列名,值爲當key列爲null或NaN時要填充的值 // 要填充的值必須是下列類型之一: `Integer`, `Long`, `Float`, `Double`, `String`, `Boolean`. dataFrameNaFunctions.fill(ImmutableMap.of("col_1", "unknown", "col_2", 1.0)).show(); /*----------------------------- replace -------------------------------*/ // 當指定列的值爲key時,將其替換爲value dataFrameNaFunctions.replace("col_1", ImmutableMap.of("UNKNOWN", "unnamed")).show(); dataFrameNaFunctions.replace(new String[]{"col_1", "col_2"}, ImmutableMap.of("UNKNOWN", "unnamed")).show(); } }
相關資料:
1. Class DataFrameNaFunctions - Spark Javadoc