Word2Vec trains a model of Map(String, Vector), i.e. a mapping from each vocabulary word to its vector representation.
# Word2Vec trains a model of Map(String, Vector): each word kept in the
# vocabulary is associated with a vector (length 5 here, via vectorSize).

# Toy corpus: "a b" repeated 100 times, then "a c" repeated 10 times.
# NOTE: the text ends with a space, so split(" ") also produces one trailing
# empty-string token at the end of `sent`.
sent = ("a b " * 100 + "a c " * 10).split(" ")

# Two identical rows, each carrying the whole token list in column "sentence".
doc = spark.createDataFrame(
    [(sent,), (sent,)],
    ["sentence"],
)

# Estimator configuration; the fixed seed is what makes the outputs quoted in
# the comments below reproducible.
word2Vec = Word2Vec(
    vectorSize=5,
    seed=42,
    inputCol="sentence",
    outputCol="model",
)
model = word2Vec.fit(doc)

# Show the learned word -> vector table.
model.getVectors().show()
# +----+--------------------+
# |word|              vector|
# +----+--------------------+
# |   a|[0.09461779892444...|
# |   b|[1.15474212169647...|
# |   c|[-0.3794820010662...|
# +----+--------------------+
# ...

from pyspark.sql.functions import format_number as fmt

# The two nearest words to "a", with the similarity formatted to 5 decimals
# so the printed table is stable.
(
    model.findSynonyms("a", 2)
    .select("word", fmt("similarity", 5).alias("similarity"))
    .show()
)
# +----+----------+
# |word|similarity|
# +----+----------+
# |   b|   0.25053|
# |   c|  -0.69805|
# +----+----------+
# ...

# Transform the input and read the output column (named "model" via outputCol
# above) from the first row.
model.transform(doc).head().model
# DenseVector([0.5524, -0.4995, -0.3599, 0.0241, 0.3461])

# Save and reload the estimator, then compare two of its params.
word2vecPath = temp_path + "/word2vec"
word2Vec.save(word2vecPath)
loadedWord2Vec = Word2Vec.load(word2vecPath)
loadedWord2Vec.getVectorSize() == word2Vec.getVectorSize()
# True
loadedWord2Vec.getNumPartitions() == word2Vec.getNumPartitions()
# True

# Save and reload the fitted model, then compare the first row of its
# vector table (word and vector) against the in-memory model.
modelPath = temp_path + "/word2vec-model"
model.save(modelPath)
loadedModel = Word2VecModel.load(modelPath)
loadedModel.getVectors().first().word == model.getVectors().first().word
# True
loadedModel.getVectors().first().vector == model.getVectors().first().vector
# True