code for counting number of sentences, words and characters in an input file

Question

I have written the following code to count the number of sentences, words and characters in the input file sample.txt, which contains a paragraph of text. It gives the correct number of sentences and words, but it does not give the correct number of characters (excluding whitespace and punctuation marks).

# Running totals for the whole file.
lines,blanklines,sentences,words=0,0,0,0
num_chars=0

print '-'*50

try:
    filename = 'sample.txt'
    # NOTE(review): the trailing `c` after the call below is not valid
    # syntax -- it looks like a stray character.
    textf = open(filename,'r')c
except IOError:
    print 'cannot open file %s for reading' % filename
    import sys
    sys.exit(0)

# First pass: echo each line and update the line, sentence and word totals.
for line in textf:
    print line
    lines += 1
    if line.startswith('\n'):
        blanklines += 1
    else:
        # NOTE(review): this `else:` has no indented body. The statements
        # below sit at loop level; presumably they were meant to be nested
        # here so that blank lines are skipped -- confirm.

    # Every '.', '!' or '?' on the line is counted as one sentence end.
    sentences += line.count('.')+ line.count ('!')+ line.count('?')

    # split(None) splits on runs of whitespace, so repeated spaces do not
    # produce empty words.
    tempwords = line.split(None)
    print tempwords 
    words += len(tempwords)


textf.close()

print '-'*50
print "Lines:", lines
print "blank lines:",blanklines
print "sentences:",sentences
print "words:",words

import nltk
import nltk.data
import nltk.tokenize

# Second pass: add up the length of every line, newline characters included.
with open('sample.txt' , 'r') as f:
    for line in f:
        num_chars += len(line)

# Presumably meant to remove the whitespace between words.
# NOTE(review): words + 1 only matches the whitespace count when the whole
# file has exactly one whitespace character between consecutive words; extra
# spaces and blank lines are not accounted for.
num_chars = num_chars - (words +1 )

# Third pass: count punctuation tokens so they can be subtracted as well.
pcount = 0
from nltk.tokenize import TreebankWordTokenizer
with open('sample.txt','r') as f1:
    for line in f1:
        #tokenised_words = nltk.tokenize.word_tokenize(line)
        tokenizer = TreebankWordTokenizer()
        tokenised_words = tokenizer.tokenize(line)
    # NOTE(review): this loop is outside the `for line` loop above, so it
    # only sees the tokens of the last line that was read.
    for w in tokenised_words:
        # Only '.', ';', '!' and '?' tokens are counted; other punctuation
        # such as ',' is not.
        if ((w=='.')|(w==';')|(w=='!')|(w=='?')):
            pcount = pcount + 1
print "pcount:",pcount
num_chars = num_chars - pcount
print "chars:",num_chars

pcount is the number of punctuation marks. Can someone suggest the changes I need to make in order to find the exact number of characters, excluding spaces and punctuation marks?

Is this homework? If not, I'm pretty sure you can get this answer with just a couple of lines of shell script. — Mike Sherrill 'Cat Recall'
– Mike Sherrill 'Cat Recall', Commented Feb 23, 2011 at 17:56

Hugh Bothwell · Accepted Answer · 2011-02-23 18:21:17Z

import string

#
# Per-line counting functions
#
def countLines(ln):
    """Return 1 for any line, so summing over a file gives its line count."""
    return 1
def countBlankLines(ln):
    """Return 1 when the line holds nothing but whitespace, otherwise 0."""
    if ln.strip():
        return 0
    return 1
def countWords(ln):
    """Return the number of whitespace-separated tokens on the line."""
    tokens = ln.split()
    return len(tokens)

def charCounter(validChars):
    """Build a per-line counter for the characters in ``validChars``.

    Args:
        validChars: iterable of single characters that should be counted.

    Returns:
        A function taking one line (a string) and returning how many of
        its characters appear in ``validChars``.
    """
    # Build the lookup set once so each membership test is O(1).
    vc = frozenset(validChars)

    def counter(ln):
        return sum(1 for ch in ln if ch in vc)

    return counter


countSentences = charCounter('.!?')
# string.letters was removed in Python 3 and is locale-dependent in Python 2;
# string.ascii_letters exists in both and always means a-z plus A-Z.
countLetters   = charCounter(string.ascii_letters)
countPunct     = charCounter(string.punctuation)

#
# do counting
#
class FileStats(object):
    """Accumulate per-line counts over one or more files.

    Every function in ``countFns`` is called with each line read by
    ``doFile``; the value it returns is added to that function's running
    total in ``counts``.
    """

    def __init__(self, countFns, labels=None):
        """Store the counting functions and the labels used when printing.

        When ``labels`` is omitted or empty, each function's ``__name__``
        serves as its label.
        """
        super(FileStats, self).__init__()
        self.fns = countFns
        self.labels = labels or [counter.__name__ for counter in countFns]
        self.reset()

    def reset(self):
        """Set every running total back to zero."""
        self.counts = [0 for _ in range(len(self.fns))]

    def doFile(self, fname):
        """Feed each line of ``fname`` to every counting function.

        Totals accumulate across calls until ``reset`` is called. An
        ``IOError`` is reported on standard output rather than raised.
        """
        try:
            with open(fname) as source:
                for text_line in source:
                    for slot, counter in enumerate(self.fns):
                        self.counts[slot] += counter(text_line)
        except IOError:
            print('Could not open file {0} for reading'.format(fname))

    def __str__(self):
        """Return one ``label  count`` row per counting function."""
        rows = []
        for label, count in zip(self.labels, self.counts):
            rows.append('{0:20} {1:>6}'.format(label, count))
        return '\n'.join(rows)

# Pair each counting function with the label shown for it in the report,
# run them over the sample file, and print the totals.
fs = FileStats(
    countFns=(countLines, countBlankLines, countSentences, countWords, countLetters, countPunct),
    labels=("Lines", "Blank Lines", "Sentences", "Words", "Letters", "Punctuation"),
)
fs.doFile('sample.txt')
print(fs)

results in

Lines                   101
Blank Lines              12
Sentences                48
Words                   339
Letters                1604
Punctuation             455

eapen · Accepted Answer · 2011-02-23 18:04:47Z

1

You can also use a regex to replace all non-alphanumeric characters and then count the number of characters in each line.

answered Feb 23, 2011 at 18:04

eapen

5796 silver badges15 bronze badges

Comments

Asterisk · Accepted Answer · 2011-02-23 17:53:22Z

0

One thing you could do is, when you read each line, iterate through it and increment the number of characters:

# Count only letters and digits: str.isalnum() is False for whitespace and
# punctuation, so those characters are skipped.
for character in line:
    if character.isalnum():
        num_chars += 1

P.S. You might want to change the condition of the if statement to suit your particular needs, e.g. if you also want to count $.

edited Feb 23, 2011 at 17:53

answered Feb 23, 2011 at 17:47

Asterisk

3,5742 gold badges37 silver badges55 bronze badges

Comments

Nadun Priyankarage · Accepted Answer · 2016-11-18 09:35:18Z

Try this to count the number of words and the number of sentences, and to get the probability of a given word:

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize


# Read every line up front; the context manager closes the file even if
# reading fails.
with open("..//..//static//output.txt", "r") as text_file:
    lines = text_file.readlines()

x = 0             # occurrences of the target token 'words'
total_tokens = 0  # tokens seen across the whole file
tokenized_words = [word_tokenize(i) for i in lines]
for i in tokenized_words:

    print(i)  # list of tokens on this line
    print(str(len(i)))  # token count for this line
    total_tokens += len(i)

    for j in i:
        if j == 'words':  # count occurrences of the token 'words'
            x = x + 1

tokenized_sents = [sent_tokenize(k) for k in lines]

for k in tokenized_sents:
    print("Sentences"+str(k))  # list of sentences on this line
    print("number of sentences "+str(len(k)))  # sentence count for this line

print("number of word"+str(x))
# Divide by the token count of the whole file, not just the last line, and
# avoid dividing by zero when the file has no tokens. float() keeps the
# division exact on Python 2 as well.
probability = float(x) / total_tokens if total_tokens else 0.0
print("Probability of 'word' in text file "+str(probability))

Collectives™ on Stack Overflow

code for counting number of sentences, words and characters in an input file

4 Answers 4

Comments

Comments

Comments

Comments

Your Answer

Linked

Hot Network Questions

Collectives™ on Stack Overflow

4 Answers 4

Comments

Comments

Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Linked

Related