A tokenizer that converts the input string to lowercase and then splits it on whitespace.
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
import tempfile

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([("a b c",)], ["text"])
tokenizer = Tokenizer(inputCol="text", outputCol="words")
tokenizer.transform(df).head()
# Row(text='a b c', words=['a', 'b', 'c'])

# Change a parameter.
tokenizer.setParams(outputCol="tokens").transform(df).head()
# Row(text='a b c', tokens=['a', 'b', 'c'])

# Temporarily modify a parameter for a single call.
tokenizer.transform(df, {tokenizer.outputCol: "words"}).head()
# Row(text='a b c', words=['a', 'b', 'c'])

# Params must be specified as keyword arguments; positional args raise.
try:
    tokenizer.setParams("text")
except TypeError as e:
    print(e)  # Method setParams forces keyword arguments.

# Save the tokenizer and reload it; the loaded copy behaves identically.
temp_path = tempfile.mkdtemp()
tokenizerPath = temp_path + "/tokenizer"
tokenizer.save(tokenizerPath)
loadedTokenizer = Tokenizer.load(tokenizerPath)
loadedTokenizer.transform(df).head().tokens == tokenizer.transform(df).head().tokens
# True
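For intuition, the per-row transformation is close to lowercasing a string and splitting it on whitespace in plain Python. The sketch below is an illustration only, not Spark's implementation; in particular, Spark's Tokenizer splits on individual whitespace characters, so runs of consecutive spaces can yield empty tokens, whereas Python's str.split() collapses them.

# Rough per-row equivalent of Tokenizer, for illustration only.
# (Assumes single spaces between words; not Spark's actual implementation.)
def tokenize(text):
    return text.lower().split()

print(tokenize("Hello Spark ML"))  # ['hello', 'spark', 'ml']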