Answers for "CountVectorizer"

0

countvectorizer with list of lists

corpus = [["this is spam, 'SPAM'"],["this is ham, 'HAM'"],["this is nothing, 'NOTHING'"]]

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# tokenizer=lambda doc: doc treats each inner list as an already-tokenized document
bag_of_words = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False).fit_transform(corpus)
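A minimal sketch of the same idea with a hypothetical list of token lists: passing an identity `tokenizer` makes CountVectorizer skip its own string tokenization and count the tokens you hand it directly.

```python
from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical pre-tokenized documents: each inner list is one document.
docs = [["this", "is", "spam"], ["this", "is", "ham"]]

# The identity tokenizer passes each token list through unchanged;
# lowercase=False avoids calling .lower() on a list.
vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
X = vectorizer.fit_transform(docs)

print(sorted(vectorizer.vocabulary_))  # ['ham', 'is', 'spam', 'this']
print(X.toarray())                     # [[0 1 1 1]
                                       #  [1 1 0 1]]
```

Columns are assigned in sorted vocabulary order, so 'ham' is column 0 even though it never appears in the first document.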
Posted by: Guest on September-12-2020
0

tf-idf weighting for a word of the vocabulary in scikit-learn?

>>> from sklearn.feature_extraction.text import TfidfTransformer
>>> from sklearn.feature_extraction.text import CountVectorizer
>>> from sklearn.pipeline import Pipeline
>>> import numpy as np
>>> corpus = ['this is the first document',
...           'this document is the second document',
...           'and this is the third one',
...           'is this the first document']
>>> vocabulary = ['this', 'document', 'first', 'is', 'second', 'the',
...               'and', 'one']
>>> pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),
...                  ('tfid', TfidfTransformer())]).fit(corpus)
>>> pipe['count'].transform(corpus).toarray()
array([[1, 1, 1, 1, 0, 1, 0, 0],
       [1, 2, 0, 1, 1, 1, 0, 0],
       [1, 0, 0, 1, 0, 1, 1, 1],
       [1, 1, 1, 1, 0, 1, 0, 0]])
>>> pipe['tfid'].idf_
array([1.        , 1.22314355, 1.51082562, 1.        , 1.91629073,
       1.        , 1.91629073, 1.91629073])
>>> pipe.transform(corpus).shape
(4, 8)
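The two-step pipeline above can also be collapsed into one estimator: TfidfVectorizer is equivalent to CountVectorizer followed by TfidfTransformer. A short sketch on the same corpus and vocabulary:

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ['this is the first document',
          'this document is the second document',
          'and this is the third one',
          'is this the first document']
vocabulary = ['this', 'document', 'first', 'is', 'second', 'the',
              'and', 'one']

# One estimator doing both counting and tf-idf weighting.
vect = TfidfVectorizer(vocabulary=vocabulary)
X = vect.fit_transform(corpus)

print(X.shape)                 # (4, 8)
print(np.round(vect.idf_, 4)) # same idf values as the pipeline above
```

Because the same smoothing defaults apply, `vect.idf_` matches `pipe['tfid'].idf_` from the pipeline answer.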
Posted by: Guest on November-13-2020
0

countvectorizer in nlp

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Create our vectorizer
vectorizer = CountVectorizer()

# Let's fetch all the possible text data
newsgroups_data = fetch_20newsgroups()

# Why not inspect a sample of the text data?
print('Sample 0: ')
print(newsgroups_data.data[0])
print()

# Create the vectorizer
vectorizer.fit(newsgroups_data.data)

# Let's look at the vocabulary:
print('Vocabulary: ')
print(vectorizer.vocabulary_)
print()

# Converting our first sample into a vector
v0 = vectorizer.transform([newsgroups_data.data[0]]).toarray()[0]
print('Sample 0 (vectorized): ')
print(v0)
print()

# It's too big to even see...
# What's the length?
print('Sample 0 (vectorized) length: ')
print(len(v0))
print()

# How many words does it have?
print('Sample 0 (vectorized) sum: ')
print(np.sum(v0))
print()

# What if we wanted to go back to the source?
print('To the source:')
print(vectorizer.inverse_transform(v0))
print()

# So all this data has a lot of extra garbage... Why not strip it away?
newsgroups_data = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))

# Why not inspect a sample of the text data?
print('Sample 0: ')
print(newsgroups_data.data[0])
print()

# Create the vectorizer
vectorizer.fit(newsgroups_data.data)

# Let's look at the vocabulary:
print('Vocabulary: ')
print(vectorizer.vocabulary_)
print()

# Converting our first sample into a vector
v0 = vectorizer.transform([newsgroups_data.data[0]]).toarray()[0]
print('Sample 0 (vectorized): ')
print(v0)
print()

# It's too big to even see...
# What's the length?
print('Sample 0 (vectorized) length: ')
print(len(v0))
print()

# How many words does it have?
print('Sample 0 (vectorized) sum: ')
print(np.sum(v0))
print()

# What if we wanted to go back to the source?
print('To the source:')
print(vectorizer.inverse_transform(v0))
print()
Posted by: Guest on November-12-2020
-2

CountVectorizer

# get_feature_names() was removed in scikit-learn 1.2;
# use get_feature_names_out() on newer versions
vectorizer2.get_feature_names()
Posted by: Guest on January-02-2021
