1

I would like to cluster comments using an NLP technique (TF-IDF). I managed to compute the clusters, but I would also like to visualize them graphically (histogram, scatter plot, ...).

import collections
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import pandas as pd
import string
# Load the spreadsheet of comments; the entry point below reads its
# ``Comment`` column.
data = pd.read_excel(
    r'C:\Users\cra\One\intern\Book2.xlsx',
)
def word_tokenizer(text):
    """Tokenize *text*, drop English stop words, and stem what remains.

    Args:
        text: The string to split into tokens with NLTK's ``word_tokenize``.

    Returns:
        A list of Porter-stemmed tokens, in their original order, excluding
        every token that appears in NLTK's English stop-word list.
    """
    # Build the stop-word collection once per call. Evaluating
    # ``stopwords.words('english')`` inside the comprehension condition would
    # rebuild the whole list for every single token, and a set also turns the
    # membership test into a constant-time lookup.
    english_stopwords = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    return [
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token not in english_stopwords
    ]

# TF-IDF converts the text data to vectors

def cluster_sentences(sentences, nb_of_clusters=5, random_state=None):
    """Group sentences by running k-means on their TF-IDF vectors.

    Args:
        sentences: Iterable of strings. Each one is vectorized with TF-IDF,
            using ``word_tokenizer`` as the tokenizer.
        nb_of_clusters: Number of clusters requested from ``KMeans``.
        random_state: Optional seed forwarded to ``KMeans`` so that repeated
            runs give the same clusters. The default ``None`` keeps the
            previous, unseeded behaviour.

    Returns:
        A dict mapping each cluster label (a plain ``int``) to the list of
        positions, within ``sentences``, of the sentences assigned to it.
        A label that received no sentence does not appear in the dict.
    """
    tfidf_vectorizer = TfidfVectorizer(
        tokenizer=word_tokenizer,
        stop_words=stopwords.words('english'),  # remove stop words
        max_df=0.95,
        min_df=0.05,
        lowercase=True,
    )

    # TF-IDF converts the text data to vectors.
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

    kmeans = KMeans(n_clusters=nb_of_clusters, random_state=random_state)
    kmeans.fit(tfidf_matrix)

    clusters = collections.defaultdict(list)
    for position, label in enumerate(kmeans.labels_):
        # Use built-in ints as keys so callers get ordinary dict keys that
        # print and serialize cleanly; lookups by int keep working as before.
        clusters[int(label)].append(position)
    return dict(clusters)
if __name__ == "__main__":
    # NOTE: every line of this block must share one indentation level; the
    # previous mix of 9 and 8 spaces was an IndentationError.
    sentences = data.Comment
    nclusters = 20
    # Dictionary mapping each cluster label to the positions of its comments
    # in ``sentences``.
    clusters = cluster_sentences(sentences, nclusters)
    for cluster in range(nclusters):
        print("cluster ", cluster, ":")
        # A label that received no comment is absent from the dict, so fall
        # back to an empty list instead of raising KeyError.
        for i, sentence in enumerate(clusters.get(cluster, [])):
            # The stored values are positions (enumeration order), so look the
            # comment up by position rather than by index label.
            print("\tsentence ", i, ": ", sentences.iloc[sentence])

Example of the result I got: cluster 6 : sentence 0 : 26 RIH DP std sentence 1 : 32 RIH DP std sentence 2 : 68 RIH Liner with DP std in hole sentence 3 : 105 RIH DP std sentence 4 : 118 RIH std no of DP in hole sentence 5 : 154 RIH DP std

Could you help me, please? Thank you.

1 Answer 1

1

You will need to use t-SNE to visualize the clusters - this article on visualizing and clustering US Laws using tf-idf can get you started.

Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.