I am trying to get the word frequencies for terms within each tweet contained in a dataframe. This is my code:
import pandas as pd
import numpy as np
import nltk
import string
import collections
from collections import Counter
nltk.download('stopwords')
sw= set(nltk.corpus.stopwords.words ('english'))
punctuation = set (string.punctuation)
data= pd.read_csv('~/Desktop/tweets.csv.zip', compression='zip')
print (data.columns)
print(data.text)
data['text'] = [str.lower () for str in data.text if str.lower () not in sw and str.lower () not in punctuation]
print(data.text)
data["text"] = data["text"].str.split()
data['text'] = data['text'].apply(lambda x: [item for item in x if item not in sw])
print(data.text)
data['text'] = data.text.astype(str)
print(type(data.text))
tweets=data.text
data['words']= tweets.apply(nltk.FreqDist(tweets))
print(data.words)
And this is my error and traceback:
Name: text, Length: 14640, dtype: object Traceback (most recent call last):
File "", line 1, in runfile('C:/Users/leska/.spyder-py3/untitled1.py', wdir='C:/Users/leska/.spyder-py3')
File "C:\Users\leska\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 827, in runfile execfile(filename, namespace)
File "C:\Users\leska\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/leska/.spyder-py3/untitled1.py", line 30, in data['words']= tweets.apply(nltk.FreqDist(tweets))
File "C:\Users\leska\Anaconda3\lib\site-packages\pandas\core\series.py", line 4018, in apply return self.aggregate(func, *args, **kwds)
File "C:\Users\leska\Anaconda3\lib\site-packages\pandas\core\series.py", line 3883, in aggregate result, how = self._aggregate(func, *args, **kwargs)
File "C:\Users\leska\Anaconda3\lib\site-packages\pandas\core\base.py", line 506, in _aggregate result = _agg(arg, _agg_1dim)
File "C:\Users\leska\Anaconda3\lib\site-packages\pandas\core\base.py", line 456, in _agg result[fname] = func(fname, agg_how)
File "C:\Users\leska\Anaconda3\lib\site-packages\pandas\core\base.py", line 440, in _agg_1dim return colg.aggregate(how, _level=(_level or 0) + 1)
File "C:\Users\leska\Anaconda3\lib\site-packages\pandas\core\series.py", line 3902, in aggregate result = func(self, *args, **kwargs)
TypeError: 'int' object is not callable
I have verified that the type of data.text is a Pandas series.
I had tried a solution earlier that I thought worked that used tokenizing and creating a wordlist to get the word counts, but it resulted in a frequency distribution for all the tweeets rather than each one. This was the code I had tried based on my earlier question:
import pandas as pd
import numpy as np
import nltk
import string
import collections
from collections import Counter
nltk.download('stopwords')
sw= set(nltk.corpus.stopwords.words ('english'))
punctuation = set (string.punctuation)
data= pd.read_csv('~/Desktop/tweets.csv.zip', compression='zip')
print (data.columns)
print (len(data.tweet_id))
tweets = data.text
test = pd.DataFrame(data)
test.column = ["text"]
# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
test['tweet_without_stopwords'] = test['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (sw) and word for word in x.split() if word not in punctuation]))
print(test)
chirps = test.text
splitwords = [ nltk.word_tokenize( str(c) ) for c in chirps ]
allWords = []
for wordList in splitwords:
allWords += wordList
allWords_clean = [w.lower () for w in allWords if w.lower () not in sw and w.lower () not in punctuation]
tweets2 = pd.Series(allWords)
words = nltk.FreqDist(tweets2)
I really need to the term and counts for each tweet and I am stumped as to what I am doing wrong.