# CountVectorizer implements both tokenization and occurrence counting in a single class:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer
# CountVectorizer()
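# The tokenization and counting behaviour is configurable through the
# constructor; as an illustrative sketch (the parameter values below are
# assumptions, not part of the original example), English stop words
# could be removed and rare terms dropped:
filtered_vectorizer = CountVectorizer(stop_words='english', min_df=2)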
corpus = [
'This is the first document.',
'This is the second second document.',
'And the third one.',
'Is this the first document?',
]
X = vectorizer.fit_transform(corpus)
X
# <4x9 sparse matrix of type '<class 'numpy.int64'>'
#   with 19 stored elements in Compressed Sparse Row format>
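# The result stays sparse to save memory; a quick sketch of inspecting
# it with standard scipy.sparse attributes:
X.shape
# (4, 9)
X.nnz
# 19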
analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.") == (
['this', 'is', 'text', 'document', 'to', 'analyze'])
# True
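# Note that the single-character token 'a' was dropped: the default
# token_pattern, r'(?u)\b\w\w+\b', only keeps tokens of two or more
# word characters.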
vectorizer.get_feature_names_out().tolist() == (
    ['and', 'document', 'first', 'is', 'one',
     'second', 'the', 'third', 'this'])
# True
X.toarray()
# array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
# [0, 1, 0, 1, 0, 2, 1, 0, 1],
# [1, 0, 0, 0, 1, 0, 1, 1, 0],
# [0, 1, 1, 1, 0, 0, 1, 0, 1]])
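# As an illustrative sketch (not part of the original example), the
# feature names can be paired with the counts of a single document:
{name: int(count) for name, count in
 zip(vectorizer.get_feature_names_out(), X.toarray()[0])}
# {'and': 0, 'document': 1, 'first': 1, 'is': 1, 'one': 0,
#  'second': 0, 'the': 1, 'third': 0, 'this': 1}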
vectorizer.vocabulary_.get('document')
# 1
vectorizer.transform(['Something completely new.']).toarray()
# array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])
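# The vocabulary learned during fit is fixed, so previously seen words
# in otherwise new text are still counted (a small sketch; the sentence
# is illustrative):
vectorizer.transform(['The first document again.']).toarray()
# array([[0, 1, 1, 0, 0, 0, 1, 0, 0]])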
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                    token_pattern=r'\b\w+\b', min_df=1)
analyze = bigram_vectorizer.build_analyzer()
analyze('Bi-grams are cool!') == (
    ['bi', 'grams', 'are', 'cool', 'bi grams', 'grams are', 'are cool'])
# True
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
X_2
# array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
# [0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0],
# [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
# [0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1]])
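# The expanded vocabulary now mixes unigrams and bigrams; its size
# matches the 21 columns above:
len(bigram_vectorizer.vocabulary_)
# 21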
feature_index = bigram_vectorizer.vocabulary_.get('is this')
X_2[:, feature_index]
# array([0, 0, 0, 1])
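# The bigram 'is this' occurs only in the interrogative last document,
# so bigram features can separate it from the declarative 'this is'
# (a small contrasting sketch):
X_2[:, bigram_vectorizer.vocabulary_.get('this is')]
# array([1, 1, 0, 0])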