Counting the occurrences of all letters in a text file

69 Views Asked by At

I'm trying to open a file and count the occurrences of letters.

So far this is where I'm at:

def frequencies(filename):
    """Print how often each character occurs in a text file, ignoring case.

    The whole file is read and lower-cased, then every character that is not
    listed in ``invalid`` is tallied. One line per distinct character is
    printed, in order of first appearance in the file.

    Args:
        filename: Path of the text file to read.
    """
    # Punctuation and whitespace characters that must not be counted.
    invalid = set("‘'`,.?!:;-_\n—' '")

    # The context manager closes the file even if reading raises.
    with open(filename, 'r') as infile:
        # Lower-case once so that 'A' and 'a' share a single tally.
        content = infile.read().lower()

    # Single pass over the text instead of calling str.count per character.
    counter = {}
    for letter in content:
        if letter not in invalid:
            counter[letter] = counter.get(letter, 0) + 1

    # Dicts keep insertion order, so output follows first-seen order.
    for letter, count in counter.items():
        print('{:8} appears {} times.'.format(letter, count))

Any help would be greatly appreciated.

3

There are 3 best solutions below

0
On

The best way is to use the numpy package; an example would look like this:

import numpy

# Split the sample string into a list of single characters.
text = list("xvasdavawdazczxfawaczxcaweac")

# numpy.unique returns the distinct characters together with their counts.
letters, counts = numpy.unique(text, return_counts=True)

# Pair each count with its character and order the pairs from the highest
# (count, character) tuple to the lowest.
x = list(zip(counts, letters))
x.sort(reverse=True)
print(x)

In your case, you can combine all your words into a single string and then convert that string into a list of characters. If you want to remove everything except letters, you can use a regex to clean it first:

# Count letter frequencies in `content` with numpy.
# NOTE(review): assumes `content` already holds the file's text as one string
# and that `numpy` has been imported as in the earlier snippet -- confirm.
import re

# Drop everything except ASCII letters, then lower-case so that 'A' and 'a'
# are counted together (the question counts letters case-insensitively).
content = re.sub(r'[^a-zA-Z]', r'', content).lower()
# Convert the cleaned string to a list of single characters.
content = list(content)
# a: the distinct letters, b: how many times each one occurs.
a,b = numpy.unique(content, return_counts=True)
# Order the (count, letter) pairs from most to least frequent.
x = sorted(zip(b,a), reverse=True)
print(x)
0
On

If you are looking for a solution not using numpy:

# Characters (punctuation and whitespace) that are excluded from the tally.
invalid = set("‘'`,.?!:;-_\n—' '")


def frequencies(filename):
    """Return sorted (count, character) pairs for the text in *filename*.

    Every character is lower-cased before counting and characters found in
    ``invalid`` are skipped. The resulting list is sorted ascending, first by
    count and then by character.
    """
    with open(filename, 'r') as handle:
        tally = {}
        for symbol in handle.read():
            lowered = symbol.lower()
            if lowered in invalid:
                continue
            # Start at zero the first time a character is seen.
            tally[lowered] = tally.get(lowered, 0) + 1

        pairs = [(count, symbol) for symbol, count in tally.items()]
        pairs.sort()
        return pairs

# Show the (count, character) pairs from most to least frequent.
# NOTE(review): `filename` must be set to the path of the input file first.
for result in reversed(frequencies(filename)):
    # print() as a function works on Python 3 and prints the same on Python 2.
    print(result)
0
On

I would suggest using collections.Counter instead.

Compact Solution

from collections import Counter
from string import ascii_lowercase # a-z string

# Only the lowercase ASCII letters a-z are counted.
VALID = set(ascii_lowercase)

with open('in.txt', 'r') as fin:
    # Lazily lower-case every character of every line.
    lowered = (char.lower() for line in fin for char in line)
    counter = Counter(letter for letter in lowered if letter in VALID)
    # Print the (letter, count) pairs from most common to least common.
    print(counter.most_common())

More readable solution.

from collections import Counter
from string import ascii_lowercase # a-z string

# Only the lowercase ASCII letters a-z are counted.
VALID = set(ascii_lowercase)

with open('in.txt', 'r') as fin:
    counter = Counter()
    for line in fin:
        for raw in line:
            letter = raw.lower()
            # Skip digits, punctuation, whitespace and anything else not a-z.
            if letter not in VALID:
                continue
            counter[letter] += 1
    print(counter)

If you don't want to use a Counter then you can just use a dict.

from string import ascii_lowercase # a-z string

# Only the lowercase ASCII letters a-z are counted.
VALID = set(ascii_lowercase)

with open('test.txt', 'r') as fin:
    counter = {}
    for line in fin:
        for raw in line:
            char = raw.lower()
            if char not in VALID:
                continue
            # dict.get yields the current count, or 0 for a letter not seen
            # yet, so no separate membership check is needed.
            counter[char] = counter.get(char, 0) + 1
    # Order the (letter, count) pairs by count, highest first.
    data = list(counter.items())
    data.sort(key=lambda pair: pair[1], reverse=True)
    print(data)