# A tokenizer that converts the input string to lowercase and then splits it by white spaces.
# Demonstrates pyspark.ml.feature.Tokenizer usage: lowercase the input column
# and split it on whitespace. Assumes `spark` (a SparkSession), `Tokenizer`,
# and `temp_path` are provided by the surrounding context (doctest globals).
df = spark.createDataFrame([("a b c",)], ["text"])
tokenizer = Tokenizer(inputCol="text", outputCol="words")
tokenizer.transform(df).head()
# Row(text=u'a b c', words=[u'a', u'b', u'c'])
# Change a parameter: setParams mutates the instance, so the output field is now "tokens".
tokenizer.setParams(outputCol="tokens").transform(df).head()
# Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
# Temporarily modify a parameter: the param map overrides outputCol for this call only,
# so this Row's field is "words", while the instance default stays "tokens".
tokenizer.transform(df, {tokenizer.outputCol: "words"}).head()
# Row(text=u'a b c', words=[u'a', u'b', u'c'])
# Must use keyword arguments to specify params; positional args raise TypeError.
# Wrapped in try/except so the script continues past the deliberate error demo.
try:
    tokenizer.setParams("text")
except TypeError as e:
    # TypeError: Method setParams forces keyword arguments.
    print(e)
# Save the transformer and reload it; the reloaded instance behaves identically
# (outputCol is still "tokens", so the Row field accessed below is .tokens).
tokenizerPath = temp_path + "/tokenizer"
tokenizer.save(tokenizerPath)
loadedTokenizer = Tokenizer.load(tokenizerPath)
loadedTokenizer.transform(df).head().tokens == tokenizer.transform(df).head().tokens
# True