# CountVectorizer implements both tokenization and occurrence counting in a single class:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer
# CountVectorizer()
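# The tokenization and counting behaviour is configurable through the
# constructor; as an illustrative sketch (the parameter values below are
# assumptions, not part of the original example), English stop words
# could be removed and rare terms dropped:
filtered_vectorizer = CountVectorizer(stop_words='english', min_df=2)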
corpus = [
'This is the first document.',
'This is the second second document.',
'And the third one.',
'Is this the first document?',
]
X = vectorizer.fit_transform(corpus)
X
# <4x9 sparse matrix of type '<class 'numpy.int64'>'
#   with 19 stored elements in Compressed Sparse Row format>
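# The result stays sparse to save memory; a quick sketch of inspecting
# it with standard scipy.sparse attributes:
X.shape
# (4, 9)
X.nnz
# 19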
analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.") == (
['this', 'is', 'text', 'document', 'to', 'analyze'])
# True
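# Note that the single-character token 'a' was dropped: the default
# token_pattern, r'(?u)\b\w\w+\b', only keeps tokens of two or more
# word characters.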
vectorizer.get_feature_names_out().tolist() == (
    ['and', 'document', 'first', 'is', 'one',
     'second', 'the', 'third', 'this'])
# True
X.toarray()
# array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
# [0, 1, 0, 1, 0, 2, 1, 0, 1],
# [1, 0, 0, 0, 1, 0, 1, 1, 0],
# [0, 1, 1, 1, 0, 0, 1, 0, 1]])
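# As an illustrative sketch (not part of the original example), the
# feature names can be paired with the counts of a single document:
{name: int(count) for name, count in
 zip(vectorizer.get_feature_names_out(), X.toarray()[0])}
# {'and': 0, 'document': 1, 'first': 1, 'is': 1, 'one': 0,
#  'second': 0, 'the': 1, 'third': 0, 'this': 1}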
vectorizer.vocabulary_.get('document')
# 1
vectorizer.transform(['Something completely new.']).toarray()
# array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])
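# The vocabulary learned during fit is fixed, so previously seen words
# in otherwise new text are still counted (a small sketch; the sentence
# is illustrative):
vectorizer.transform(['The first document again.']).toarray()
# array([[0, 1, 1, 0, 0, 0, 1, 0, 0]])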
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                    token_pattern=r'\b\w+\b', min_df=1)
analyze = bigram_vectorizer.build_analyzer()
analyze('Bi-grams are cool!') == (
    ['bi', 'grams', 'are', 'cool', 'bi grams', 'grams are', 'are cool'])
# True
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
X_2
# array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
# [0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0],
# [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
# [0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1]])
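# The expanded vocabulary now mixes unigrams and bigrams; its size
# matches the 21 columns above:
len(bigram_vectorizer.vocabulary_)
# 21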
feature_index = bigram_vectorizer.vocabulary_.get('is this')
X_2[:, feature_index]
# array([0, 0, 0, 1])
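# The bigram 'is this' occurs only in the interrogative last document,
# so bigram features can separate it from the declarative 'this is'
# (a small contrasting sketch):
X_2[:, bigram_vectorizer.vocabulary_.get('this is')]
# array([1, 1, 0, 0])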