
I have written an algorithm that splits text into n-grams (collocations) and computes probabilities and other statistics for these collocations. When the input file is larger than 50 megabytes it takes a very long time to run. Maybe someone can help me improve it (a rough speed-up sketch follows the code and timing below).

import math
import re
import csv
from itertools import zip_longest
from datetime import datetime


def tokenize(input_file, encoding):
    lst = []
    with open(input_file, 'r', encoding=encoding) as f:
        for sent in f:
            sent = sent.lower()
            sent = re.sub(r"[A-z0-9'\"`|/+#,)(?!:=;.«»—@-]", '', sent)
            sent = re.findall(r'\w+', sent)
            for word in sent:
                lst.append(word)
    return lst


def ngrams_split(lst, n):
    counts = dict()
    grams = [' '.join(lst[i:i+n]) for i in range(len(lst)-n)]
    for gram in grams:
        if gram not in counts:
            counts[gram] = 1
        else:
            counts[gram] += 1
    return counts


def list_add(counts):
    ngrams = []
    for key, val in counts.items():
        ngrams.append((val, key))
    return ngrams


def gram_add(lst, n):
    ng = []
    grams = [' '.join(lst[i:i+n]) for i in range(len(lst)-n)]
    for gram in grams:
        ng.append(gram)
    return ng


def two_gram_count(input_file, encoding, n_filter, n):
    output_file = []
    lst = tokenize(input_file, encoding) #tokenize
    n_words = len(lst)
    counts = ngrams_split(lst, n)  # split into n-grams
    ngrams = list_add(counts)  # add n-grams to a list
    for key, val in ngrams:
        if int(key) >= n_filter:
            ngram_freq = math.log(key/n_words, 10)
            num = key*n_words
            f1 = lst.count(val.split()[0])
            f2 = lst.count(val.split()[1])
            mi = math.pow(math.log(num/(f1*f2), 10), 2)
            ngram_prob = math.log(key/f1, 10)
            output_file.append((ngram_freq, mi, ngram_prob, key, val))
    return output_file


def three_gram_count(input_file, encoding, n_filter, n):
    output_file = []
    lst = tokenize(input_file, encoding) #tokenize
    n_words = len(lst)
    counts = ngrams_split(lst, n)  # split into n-grams
    ngrams = list_add(counts)  # add n-grams to a list
    ng = gram_add(lst, 2)
    for key, val in ngrams:
        if int(key) >= n_filter:
            ngram_freq = math.log(key/n_words, 10)
            num = key*n_words
            c2gram = ng.count(val.split()[0] + " " + val.split()[1])
            f1 = lst.count(val.split()[0])
            f2 = lst.count(val.split()[1])
            f3 = lst.count(val.split()[2])
            mi = math.pow(math.log(num/(f1*f2*f3), 10), 2)
            ngram_prob = math.log(key/c2gram, 10)
            output_file.append((ngram_freq, mi, ngram_prob, key, val))
    return output_file


def four_grams_count(input_file, encoding, n_filter, n):
    output_file = []
    lst = tokenize(input_file, encoding) #tokenize
    n_words = len(lst)
    counts = ngrams_split(lst, n)  # split into n-grams
    ngrams = list_add(counts)  # add n-grams to a list
    ng2 = gram_add(lst, 2)
    for key, val in ngrams:
        if int(key) >= n_filter:
            ngram_freq = math.log(key/n_words, 10)
            num = key*n_words
            c1gram = ng2.count(val.split()[0] + " " + val.split()[1])
            c2gram = ng2.count(val.split()[1] + " " + val.split()[2])
            c3gram = ng2.count(val.split()[2] + " " + val.split()[3])
            f1 = lst.count(val.split()[0])
            f2 = lst.count(val.split()[1])
            f3 = lst.count(val.split()[2])
            f4 = lst.count(val.split()[3])
            mi = math.pow(math.log(num/(f1*f2*f3*f4), 10), 2)
            prob1 = c1gram/f1
            prob2 = c2gram/f2
            prob3 = c3gram/f3
            ngram_prob = math.log(prob1, 10) + math.log(prob2, 10) + math.log(prob3, 10)
            output_file.append((ngram_freq, mi, ngram_prob, key, val))
    return output_file


def n_grams_stat(input_file, encoding, n_filter, n):
    output_file = []
    if n == 2:
        for i in two_gram_count(input_file, encoding, n_filter, n):
            output_file.append(i)
    elif n == 3:
        for i in three_gram_count(input_file, encoding, n_filter, n):
            output_file.append(i)
    elif n == 4:
        for i in four_grams_count(input_file, encoding, n_filter, n):
            output_file.append(i)
    return output_file

start_time = datetime.now()
with open("/home/yan/PycharmProjects/vk/piidsluhano/men_4grams", 'w') as f:
    for a, b, c, d, e in n_grams_stat("/home/yan/PycharmProjects/vk/piidsluhano/men_pidsluhano.txt", 'utf-8', n_filter=3, n=4):
        print(a, b, c, d, e)
        f.write(str(a) + ", " + str(b) + ', ' + str(c) + ", " + str(d) + ", " + str(e) + '\n')
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))
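
One likely reason the script slows down on large files is that lst.count(...) and ng.count(...) rescan the whole token list for every n-gram that passes the filter, making the loop roughly quadratic. Below is a minimal sketch of the 4-gram case that pre-counts word and bigram frequencies once with collections.Counter and works on the counts dict directly; the name four_grams_count_fast is just illustrative, not part of the original code.

from collections import Counter

def four_grams_count_fast(input_file, encoding, n_filter, n=4):
    # Same statistics as four_grams_count above, but unigram and bigram
    # frequencies are counted once up front instead of calling
    # lst.count()/ng.count() (full scans) inside the loop.
    lst = tokenize(input_file, encoding)
    n_words = len(lst)
    counts = ngrams_split(lst, n)
    word_freq = Counter(lst)  # unigram counts in a single pass
    bigram_freq = Counter(' '.join(lst[i:i+2]) for i in range(len(lst) - 2))
    output = []
    for gram, cnt in counts.items():  # use the dict directly, no list_add needed
        if cnt >= n_filter:
            w = gram.split()
            f1, f2, f3, f4 = (word_freq[x] for x in w)
            ngram_freq = math.log(cnt / n_words, 10)
            mi = math.pow(math.log(cnt * n_words / (f1 * f2 * f3 * f4), 10), 2)
            prob1 = bigram_freq[w[0] + " " + w[1]] / f1
            prob2 = bigram_freq[w[1] + " " + w[2]] / f2
            prob3 = bigram_freq[w[2] + " " + w[3]] / f3
            ngram_prob = math.log(prob1, 10) + math.log(prob2, 10) + math.log(prob3, 10)
            output.append((ngram_freq, mi, ngram_prob, cnt, gram))
    return output

The same idea carries over to the 2-gram and 3-gram functions, and it also makes list_add unnecessary, as one of the comments below suggests.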
  • If the code works and you understand what you are doing and what is going on, maybe the question would fit better on codereview.stackexchange.com? Commented Mar 30, 2016 at 8:03
  • For better performance you should use set() instead of list(), and on Python 2 xrange() instead of range(). Why do you need list_add? You can work with the dict directly too. Commented Mar 30, 2016 at 8:11
  • I suppose it has a smaller community, but as it is targeted closer to your needs it might give you better quality replies. Commented Mar 30, 2016 at 8:11
  • You should try running the Python profiler to find which parts of your code take the most computation time (see the sketch after these comments). Relevant question Commented Mar 30, 2016 at 8:36
  • Take a look at stackoverflow.com/questions/35857519/… Commented Mar 30, 2016 at 10:05
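
Following the profiler suggestion above, a minimal way to see which functions dominate the runtime; cProfile is in the standard library, and the file name and parameters below are only placeholders.

import cProfile

# Sort by cumulative time so the most expensive call chains appear first.
cProfile.run(
    "n_grams_stat('men_pidsluhano.txt', 'utf-8', n_filter=3, n=2)",
    sort='cumulative')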
