# CountVectorizer implements both tokenization and occurrence counting
# in a single class.
from sklearn.feature_extraction.text import CountVectorizer
# Build a vectorizer with default settings (word-level unigrams).
vectorizer = CountVectorizer()
vectorizer
# CountVectorizer()

# Small example corpus: one string per document.
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

# Learn the vocabulary from the corpus and return the document-term matrix.
X = vectorizer.fit_transform(corpus)
X
# <4x9 sparse matrix of type '<... 'numpy.int64'>'
#     with 19 stored elements in Compressed Sparse ... format>

# The analyzer is the callable the vectorizer uses to turn a raw string
# into a list of tokens.
analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.") == (
    ['this', 'is', 'text', 'document', 'to', 'analyze'])
# True

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (which returns an ndarray rather than a list);
# use whichever accessor this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
feature_names == (
    ['and', 'document', 'first', 'is', 'one',
     'second', 'the', 'third', 'this'])
# True

# Dense view of the counts: one row per document, one column per feature.
X.toarray()
# array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
#        [0, 1, 0, 1, 0, 2, 1, 0, 1],
#        [1, 0, 0, 0, 1, 0, 1, 1, 0],
#        [0, 1, 1, 1, 0, 0, 1, 0, 1]]...)

# vocabulary_ maps each feature name to its column index in the matrix.
vectorizer.vocabulary_.get('document')
# 1

# Words that were not seen during fitting are ignored by transform().
vectorizer.transform(['Something completely new.']).toarray()
# array([[0, 0, 0, 0, 0, 0, 0, 0, 0]]...)
# Extract 1-grams and 2-grams. The token pattern keeps every run of word
# characters delimited by word boundaries on both sides.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                    token_pattern=r'\b\w+\b', min_df=1)

# Build the analyzer from the bigram vectorizer, then check its output.
analyze = bigram_vectorizer.build_analyzer()
analyze('Bi-grams are cool!') == (
    ['bi', 'grams', 'are', 'cool', 'bi grams', 'grams are', 'are cool'])
# True

# Fit on the same corpus; the vocabulary now contains bigrams as well.
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
X_2
# array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
#        [0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0],
#        [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
#        [0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1]]...)

# Select the column for the bigram 'is this'.
feature_index = bigram_vectorizer.vocabulary_.get('is this')
X_2[:, feature_index]
# array([0, 0, 0, 1]...)
