I wrote a script to calculate document distance. It seems to work (I tried it on small strings and the results looked correct), but I can't be sure. I am also not sure whether it is fast enough for large texts.
Here is the Document Distance formula: the similarity is the cosine of the angle between the two word-frequency vectors, $$\text{sim}(D_1, D_2) = \frac{D_1 \cdot D_2}{\|D_1\|\,\|D_2\|}.$$
The frequency is the number of occurrences of each word. For example, for "D1: be or not to be" the frequencies are $$\mathrm{freq}(D_1) = \{be=2,\ not=1,\ or=1,\ to=2\}$$ and the magnitude is $$\|D_1\| = \sqrt{2^2 + 1^2 + 1^2 + 2^2}.$$
from collections import Counter
from math import sqrt
from string import ascii_lowercase
alphanumerics = ascii_lowercase + "0123456789"  # characters allowed inside a word


def _read_document(path):
    """Read the text file at *path* and return its full contents as one string.

    Raises:
        ValueError: if the file is empty — an empty document has a
            zero-magnitude frequency vector, so the similarity formula
            would divide by zero later on.
    """
    # "with" guarantees the handle is closed even if reading fails;
    # read() in one call replaces the quadratic line-by-line "+=" loop.
    with open(path, "r") as handle:
        text = handle.read()
    if not text:
        raise ValueError("--- The {} file is empty! ---".format(path))
    return text


document1 = _read_document("document1.txt")
document2 = _read_document("document2.txt")
def word_processor(document):
    """Return the list of lowercase alphanumeric words in *document*.

    A word is a maximal run of ASCII letters/digits. Every other
    character (space, newline, punctuation, ...) acts as a separator,
    so words split across lines are no longer merged together, and
    empty strings are never emitted (the original appended "" for a
    leading or trailing space, polluting the frequency vectors).

    Args:
        document: the raw text of one document.

    Returns:
        A list of word tokens, duplicates preserved (the caller counts them).
    """
    # frozenset gives O(1) membership tests inside the hot loop.
    allowed = frozenset(ascii_lowercase + "0123456789")
    words = []
    current = ""
    for char in document.lower():
        if char in allowed:
            current += char
        elif current:  # any separator closes the word in progress
            words.append(current)
            current = ""
    if current:  # flush a word that runs to the end of the text
        words.append(current)
    return words
doc1_words = word_processor(document1)
doc2_words = word_processor(document2)

# Word-frequency vectors. Counter builds each one in a single O(n) pass,
# replacing the dict comprehension that called list.count() once per word
# (O(n^2) on large texts — the performance worry is justified).
doc1_word_freq = Counter(doc1_words)  # the doc1 vector
doc2_word_freq = Counter(doc2_words)  # the doc2 vector

# Dot product of the two vectors: only words present in doc1 can
# contribute; .get() does one O(1) lookup instead of "in" + indexing.
dot_product = sum(value * doc2_word_freq.get(key, 0)
                  for key, value in doc1_word_freq.items())

# Euclidean magnitude of each frequency vector.
doc1_mag = sqrt(sum(value ** 2 for value in doc1_word_freq.values()))
doc2_mag = sqrt(sum(value ** 2 for value in doc2_word_freq.values()))

# Cosine similarity, expressed as a percentage.
similarity = dot_product / (doc1_mag * doc2_mag) * 100
print("The similarity between 2 document is", similarity, "percent")
Any ideas?
I did not use the acos function, since I don't think I need it: the cosine itself already serves as the similarity measure, and taking the angle would only rescale it.
