How to edit a graph in Python (Zipf's Law)

564 Views Asked by At

I need help making a bar chart showing the frequency of the ten most common words in the file. Next to each bar should be a second bar whose height is the frequency predicted by Zipf’s Law. (For example, suppose the most common word appears 100 times. Zipf’s Law predicts that the second most common word should appear about 50 times (half as often as the most common), the third most common word should appear about 33 times (a third as often as the most common), the fourth most common word should appear about 25 times (a fourth as often as the most common), and so on.)

The function takes the name of a text file (as a string) as input.

def zipf_graph(text_file):
    """Plot the ten most common words of a text file against Zipf's Law.

    The file is read as UTF-8, punctuation is removed, the text is
    lower-cased and split on whitespace. The ten most common
    ``(word, count)`` pairs are printed, then a grouped bar chart is shown:
    for each word, the observed count sits next to the count predicted by
    Zipf's Law (the count of the most common word divided by the word's
    rank).

    Args:
        text_file: Name of the text file to analyse, as a string.

    Returns:
        None. The result is printed and displayed as a matplotlib figure.
    """
    import string
    from collections import Counter

    import numpy as np
    import matplotlib.pyplot as plt

    # The context manager closes the file even if reading fails.
    with open(text_file, encoding='utf8') as file:
        text = file.read()

    # Strip every punctuation character in a single pass and lower-case once,
    # instead of one replace()/lower() call per punctuation character.
    punc = string.punctuation + '’”—⎬⎪“⎫'
    new_text = text.translate(str.maketrans('', '', punc)).lower()
    text_split = new_text.split()

    # Determines how many times each word appears in the file.
    word_and_freq = Counter(text_split)
    top_ten_words = word_and_freq.most_common(10)

    print(top_ten_words)

    # A file with no words leaves nothing to anchor the Zipf prediction on.
    if not top_ten_words:
        return

    # Graph info: observed counts and the matching Zipf's Law predictions.
    words = [word for word, _ in top_ten_words]
    bars1 = [count for _, count in top_ten_words]
    # Zipf's Law: the word of rank k appears about 1/k as often as the
    # most common word.
    bars2 = [bars1[0] / rank for rank in range(1, len(bars1) + 1)]

    barWidth = 0.25
    r1 = np.arange(len(bars1))
    r2 = r1 + barWidth

    plt.bar(r1, bars1, color='#7f6d5f', width=barWidth, edgecolor='white', label='Word')
    plt.bar(r2, bars2, color='#2d7f5e', width=barWidth, edgecolor='white', label='Zipf Law')
    plt.xlabel('group', fontweight='bold')
    # Centre each word label under its pair of bars.
    plt.xticks(r1 + barWidth / 2, words)
    plt.legend()
    plt.show()

# Run the analysis on the text file 'gatsby.txt'.
zipf_graph('gatsby.txt')

The code prints the top ten words and their frequency in this format (as an example, I used the Great Gatsby book):

[('the', 2573), ('and', 1594), ('a', 1451), ('of', 1233), ('to', 1209), ('i', 1178), ('in', 861), ('he', 797), ('was', 766), ('that', 596)]
2

There are 2 best solutions below

0
On

Matplotlib. Here’s a demo

import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt

# Categories shown along the x-axis and the value plotted for each of them.
languages = ('Python', 'C++', 'Java', 'Perl', 'Scala', 'Lisp')
usage = [10, 8, 6, 4, 2, 1]

# One integer slot per category; used both for the bars and the tick labels.
positions = np.arange(len(languages))

plt.bar(positions, usage, align='center', alpha=0.5)
plt.xticks(positions, languages)
plt.title('Programming language usage')
plt.ylabel('Usage')

plt.show()
0
On

This solution works for me. Some notes:

  • I prefer to use Pandas for gathering my datasets.
  • You need a function that returns the expected frequencies under Zipf's law. I anchored on the count of the most frequent word, but an alternative would be to anchor on the total count of the top 10 words.
import pandas as pd

def zipf_frequency(most_common_count, n=10):
    """Return the counts Zipf's law predicts for ranks 1 through ``n``.

    The prediction for rank ``k`` is ``most_common_count / k``, so the first
    entry equals ``most_common_count`` (as a float) and the list has ``n``
    entries.
    """
    return [most_common_count / rank for rank in range(1, n + 1)]

# Tabulate the (word, count) pairs; `top_ten_words` is expected to come from
# the word-counting code in the question.
top_ten_words_df = pd.DataFrame(
    top_ten_words,
    columns=['word', 'actual count'],
)

# Anchor the Zipf prediction on the count in the first row (the most common
# word, assuming `top_ten_words` is ordered by descending count).
anchor_count = top_ten_words_df.loc[0, 'actual count']
top_ten_words_df['expected zipf frequency'] = zipf_frequency(anchor_count)

# Draw both numeric columns as grouped bars and label each group by its word.
fig, ax = plt.subplots()
top_ten_words_df.plot(kind='bar', ax=ax)
ax.set_xticklabels(top_ten_words_df['word'])
fig.tight_layout()

Bar plot