# Supports multinomial logistic (softmax) and binomial logistic regression.
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

# Example walkthrough: fit a binomial and a multinomial LogisticRegression on
# tiny weighted DataFrames, score two test rows with the binomial model, show
# that setParams rejects positional arguments, and round-trip both the
# estimator and the fitted model through save/load.
#
# NOTE: `sc`, `temp_path`, `LogisticRegression` and `LogisticRegressionModel`
# are not defined in this excerpt; they must be supplied by the surrounding
# environment. The `# ...` comments after an expression show the value the
# original example expected (with `...` eliding trailing digits).

# --- Binomial model: two weighted rows, one feature -------------------------
bdf = sc.parallelize([
    Row(label=1.0, weight=2.0, features=Vectors.dense(1.0)),
    Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], [])),
]).toDF()
blor = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight")
blorModel = blor.fit(bdf)
blorModel.coefficients
# DenseVector([5.5...])
blorModel.intercept
# -2.68...

# --- Multinomial model: same rows plus a third class ------------------------
mdf = sc.parallelize([
    Row(label=1.0, weight=2.0, features=Vectors.dense(1.0)),
    Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], [])),
    Row(label=2.0, weight=2.0, features=Vectors.dense(3.0)),
]).toDF()
mlor = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight",
                          family="multinomial")
mlorModel = mlor.fit(mdf)
print(mlorModel.coefficientMatrix)
# DenseMatrix([[-2.3...],
# [ 0.2...],
# [ 2.1... ]])
mlorModel.interceptVector
# DenseVector([2.0..., 0.8..., -2.8...])

# --- Scoring with the binomial model ----------------------------------------
test0 = sc.parallelize([Row(features=Vectors.dense(-1.0))]).toDF()
result = blorModel.transform(test0).head()
result.prediction
# 0.0
result.probability
# DenseVector([0.99..., 0.00...])
result.rawPrediction
# Raw margins, not probabilities. With the coefficient and intercept shown
# above and feature -1.0 this is roughly DenseVector([8.2, -8.2])
# -- TODO confirm the exact digits against a real run.
test1 = sc.parallelize([Row(features=Vectors.sparse(1, [0], [1.0]))]).toDF()
blorModel.transform(test1).head().prediction
# 1.0

# --- setParams only accepts keyword arguments -------------------------------
# The positional call is expected to fail; catch the error so the save/load
# section below still runs when this file is executed as a script.
try:
    blor.setParams("vector")
except TypeError as err:
    print(err)
# Method setParams forces keyword arguments.

# --- Persistence round trip: estimator --------------------------------------
lr_path = temp_path + "/lr"
blor.save(lr_path)
lr2 = LogisticRegression.load(lr_path)
lr2.getMaxIter()
# 5

# --- Persistence round trip: fitted model -----------------------------------
# Stored in its own directory, separate from the estimator saved above.
model_path = temp_path + "/lr_path"
blorModel.save(model_path)
model2 = LogisticRegressionModel.load(model_path)
blorModel.coefficients[0] == model2.coefficients[0]
# True
blorModel.intercept == model2.intercept
# True