With this code in Python, I calculate the entropy value of a txt file based on trigrams, but something goes wrong, because the output value is 110.51908986855025 (which is way too high considering that the maximum entropy value of a file is 8 bits based on a logarithmic base of 2). Does anybody can find the mistake (or mistakes)?
import math
file = open("C:/Users/Mik/Documents/import pandas as pd.txt", "r")
content = file.read()
def generate_trigrams(string):
trigrams = []
for i in range(len(string) - 2):
trigram = string[i:i + 3]
trigrams.append(trigram)
return trigrams
trigrams_content = generate_trigrams(content)
n_trigrams = len(trigrams_content)
print(n_trigrams)
def entropy(trigrams_content):
trigram_freqs = {}
for trigram in trigrams_content:
if trigram in trigram_freqs:
trigram_freqs[trigram] += 1
else:
trigram_freqs[trigram] = 1
probs = [trigram_freqs[trigram] / len(trigrams_content) for trigram in trigrams_content]
entropy = 0
for prob in probs:
entropy -= prob * math.log2(prob)
return entropy
print(entropy(trigrams_content))