# A regex-based tokenizer that extracts tokens either by using the provided
# pattern to split the text (default) or by repeatedly matching the pattern
# (if gaps is false).
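# The snippet below assumes an active SparkSession bound to `spark` and a
# writable directory bound to `temp_path`; a minimal setup sketch:
import tempfile

from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer

spark = SparkSession.builder.getOrCreate()
temp_path = tempfile.mkdtemp()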
df = spark.createDataFrame([("A B c",)], ["text"])
reTokenizer = RegexTokenizer(inputCol="text", outputCol="words")
reTokenizer.transform(df).head()
# Row(text='A B c', words=['a', 'b', 'c'])
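# By default the tokenizer splits on the pattern "\\s+" (gaps=True) and
# lowercases its output (toLowercase=True), which is why 'A B c' yields
# ['a', 'b', 'c'].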
# Change a parameter.
reTokenizer.setParams(outputCol="tokens").transform(df).head()
# Row(text='A B c', tokens=['a', 'b', 'c'])
# Temporarily modify a parameter.
reTokenizer.transform(df, {reTokenizer.outputCol: "words"}).head()
# Row(text='A B c', words=['a', 'b', 'c'])
reTokenizer.transform(df).head()
# Row(text='A B c', tokens=['a', 'b', 'c'])
# Must use keyword arguments to specify params.
reTokenizer.setParams("text")
# Traceback (most recent call last):
# ...
# TypeError: Method setParams forces keyword arguments.
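# For contrast, the keyword form is accepted; setParams returns the tokenizer
# itself, so calls can be chained (illustrative, value unchanged):
reTokenizer.setParams(inputCol="text")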
regexTokenizerPath = temp_path + "/regex-tokenizer"
reTokenizer.save(regexTokenizerPath)
loadedReTokenizer = RegexTokenizer.load(regexTokenizerPath)
loadedReTokenizer.getMinTokenLength() == reTokenizer.getMinTokenLength()
# True
loadedReTokenizer.getGaps() == reTokenizer.getGaps()
# True
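# The getters compared above map to params that can also be set explicitly at
# construction time; a brief sketch with illustrative values (the name
# customTokenizer is hypothetical):
customTokenizer = RegexTokenizer(inputCol="text", outputCol="words",
                                 pattern="\\s+", gaps=True, minTokenLength=2)
customTokenizer.getMinTokenLength()
# 2
customTokenizer.getGaps()
# True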