# VectorIndexer (PySpark)
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated
# as continuous.
# NOTE(review): `transformed` is defined outside this snippet; presumably a
# DataFrame with a vector column named "features" -- confirm against the
# code that builds it.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(transformed)

# Apply the fitted indexer to the same dataset; its output is written to the
# "indexedFeatures" column configured above.
data = featureIndexer.transform(transformed)