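Word2Vec trains a model of Map(String, Vector), i.e. it maps each word to a vector
# Word2Vec trains a model of Map(String, Vector), i.e. it maps each word to a vector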
from pyspark.sql.functions import format_number as fmt

# --- Training data -----------------------------------------------------------
# Same token list as ("a b " * 100 + "a c " * 10).split(" "): 100 "a b" pairs,
# 10 "a c" pairs, and the empty-string token that splitting on " " leaves
# behind after the trailing space. The empty token is kept on purpose so the
# input stays identical to the one the expected outputs below were produced
# from.
sent = ["a", "b"] * 100 + ["a", "c"] * 10 + [""]

# Two identical rows, one column ("sentence") holding the token list.
doc = spark.createDataFrame([(sent,)] * 2, schema=["sentence"])

# --- Fit ---------------------------------------------------------------------
# A fixed seed keeps the example reproducible; the transformed output is
# written to a column named "model".
word2Vec = Word2Vec(
    inputCol="sentence",
    outputCol="model",
    vectorSize=5,
    seed=42,
)
model = word2Vec.fit(doc)

# Show the learned word -> vector table.
vectors = model.getVectors()
vectors.show()
# +----+--------------------+
# |word| vector|
# +----+--------------------+
# | a|[0.09461779892444...|
# | b|[1.15474212169647...|
# | c|[-0.3794820010662...|
# +----+--------------------+
# ...

# --- Synonyms ----------------------------------------------------------------
# Ask for 2 synonyms of "a"; the similarity column is formatted to 5 decimal
# places so the printed table is stable.
synonyms = model.findSynonyms("a", 2)
synonyms.select("word", fmt("similarity", 5).alias("similarity")).show()
# +----+----------+
# |word|similarity|
# +----+----------+
# | b| 0.25053|
# | c| -0.69805|
# +----+----------+
# ...

# --- Transform ---------------------------------------------------------------
# Value of the "model" output column for the first transformed row.
first_row = model.transform(doc).head()
first_row.model
# DenseVector([0.5524, -0.4995, -0.3599, 0.0241, 0.3461])

# --- Estimator persistence ---------------------------------------------------
# Save the (unfitted) estimator, load it back, and compare its parameters with
# the original's.
word2vecPath = temp_path + "/word2vec"
word2Vec.save(word2vecPath)
loadedWord2Vec = Word2Vec.load(word2vecPath)

loadedWord2Vec.getVectorSize() == word2Vec.getVectorSize()
# True
loadedWord2Vec.getNumPartitions() == word2Vec.getNumPartitions()
# True
loadedWord2Vec.getMinCount() == word2Vec.getMinCount()
# True

# --- Model persistence -------------------------------------------------------
# Save the fitted model, load it back, and compare the first (word, vector)
# row of the reloaded model with the first row of the original.
modelPath = temp_path + "/word2vec-model"
model.save(modelPath)
loadedModel = Word2VecModel.load(modelPath)

restored_first = loadedModel.getVectors().first()
original_first = model.getVectors().first()

restored_first.word == original_first.word
# True
restored_first.vector == original_first.vector
# True