单列离散化:
// Bucket the "id" column into up to 20 quantile-based bins, written to "id_discretizer".
// handleInvalid = "keep": per the Spark ML docs, invalid values are kept in an extra
// bucket instead of raising an error or dropping the row.
val discretizer = new QuantileDiscretizer()
  .setInputCol("id")
  .setOutputCol("id_discretizer")
  .setNumBuckets(20)
  .setHandleInvalid("keep")
// Learn the bucket boundaries from itemdf, then apply them to the same DataFrame.
val result = {
  val fittedBucketizer = discretizer.fit(itemdf)
  fittedBucketizer.transform(itemdf)
}
// truncate = false: print cell contents in full.
result.show(false)
多列同时离散化:
// Columns to discretize; each one gets a companion "<column>_discretizer" output column.
// NOTE(review): QuantileDiscretizer works on numeric input — confirm that "name" and
// "gender" are numeric in itemdf.
val itemDiscretizerCols = Array("id", "name", "score", "gender")
// One QuantileDiscretizer stage per column, so a single Pipeline can transform them all at once.
val indexers = for (columnName <- itemDiscretizerCols) yield {
  new QuantileDiscretizer()
    .setInputCol(columnName)
    .setOutputCol(s"${columnName}_discretizer")
    .setNumBuckets(20)
    .setHandleInvalid("keep")
}
// Fit every stage on itemdf, apply the fitted pipeline to the same data, and cache the result.
val itemDiscretizationDF = {
  val discretizationPipeline = new Pipeline().setStages(indexers)
  discretizationPipeline
    .fit(itemdf)
    .transform(itemdf)
    .cache()
}
itemDiscretizationDF.show()