I have written the following code to count the number of sentences, words and characters in the input file sample.txt, which contains a paragraph of text. It works fine in giving the number of sentences and words, but does not give the precise and correct number of characters ( without whitespaces and punctuation marks)
lines,blanklines,sentences,words=0,0,0,0
num_chars=0
print '-'*50
try:
filename = 'sample.txt'
textf = open(filename,'r')c
except IOError:
print 'cannot open file %s for reading' % filename
import sys
sys.exit(0)
for line in textf:
print line
lines += 1
if line.startswith('\n'):
blanklines += 1
else:
sentences += line.count('.')+ line.count ('!')+ line.count('?')
tempwords = line.split(None)
print tempwords
words += len(tempwords)
textf.close()
print '-'*50
print "Lines:", lines
print "blank lines:",blanklines
print "sentences:",sentences
print "words:",words
import nltk
import nltk.data
import nltk.tokenize
with open('sample.txt' , 'r') as f:
for line in f:
num_chars += len(line)
num_chars = num_chars - (words +1 )
pcount = 0
from nltk.tokenize import TreebankWordTokenizer
with open('sample.txt','r') as f1:
for line in f1:
#tokenised_words = nltk.tokenize.word_tokenize(line)
tokenizer = TreebankWordTokenizer()
tokenised_words = tokenizer.tokenize(line)
for w in tokenised_words:
if ((w=='.')|(w==';')|(w=='!')|(w=='?')):
pcount = pcount + 1
print "pcount:",pcount
num_chars = num_chars - pcount
print "chars:",num_chars
pcount is the number of punctuation marks. Can some suggest the changes I need to make in order to find out the exact number of characters without spaces and punctuation marks?