I'm trying to read files from the Enron dataset. However, my code is reading only 6000 emails, when there are more than 33 thousand emails in the database. I'm not doing what is the problem in the code. Is this the correct reading of the file path?
mails_list = []
enron_list = ['enron1', 'enron2', 'enron3', 'enron4', 'enron5', 'enron6']
def getEmail():
print("Processing directories...")
for directory in enron_list:
print("...processing " + str(directory) + "...")
ham_folder = ('/xxxxxxxxx/xxx/xxxx/xxxx/SpamCode/rawdata/' + directory + '/ham')
spam_folder = ('/xxxxxxxxxx/xxxx/xxxxx/xxx/SpamCode/rawdata/' + directory + '/spam')
i = 0
# Process ham messages in directory
for entry in os.scandir(ham_folder):
# This should be encoded in Latin_1 but catch encoding errors just to be sure
try:
file = open(entry, encoding="latin_1")
content = file.read().split("\n", 1)
except (UnicodeDecodeError):
print("COULD NOT DECODE")
print("Problem with file:" + str(entry))
print("Error message:", sys.exc_info()[1])
subject = content[0].replace("Subject: ", "")
message = content[1]
# date is contained in filename - parsed using regex pattern
pattern = r"\d+\.(\d+-\d+-\d+)"
date = re.search(pattern, str(entry)).group(1)
date = dt.datetime.strptime(date, '%Y-%m-%d')
file.close()
mails_list.append([subject, message, "ham", date])
# Process spam messages in directory
for entry in os.scandir(spam_folder):
try:
file = open(entry, encoding="latin_1")
content = file.read().split("\n", 1)
except (UnicodeDecodeError):
print("COULD NOT DECODE")
print("Problem with file:" + str(entry))
print("Error message:", sys.exc_info()[1])
subject = content[0].replace("Subject: ", "")
message = content[1]
# date is contained in filename - parsed using regex pattern
pattern = r"\d+\.(\d+-\d+-\d+)"
date = re.search(pattern, str(entry)).group(1)
date = dt.datetime.strptime(date, '%Y-%m-%d')
file.close()
mails_list.append([subject, message, "spam", date])
return mails_list
emails = getEmail()
print("All directories processed. Writing to Dataframe...")
mails = pd.DataFrame(emails, columns=[
"Subject", "Message", "Spam/Ham", "Date"])
mails.to_csv('mails.csv')
print("\nData processed and saved to file.\nMails contained in data:")
print("\nTotal:\t" + str(mails.shape[0]))
print(mails["Spam/Ham"].value_counts())
Processing directories...
...processing enron1...
...processing enron2...
...processing enron3...
...processing enron4...
...processing enron5...
...processing enron6...
All directories processed. Writing to Dataframe...
Data processed and saved to file.
Mails contained in data:
Total: 5999
spam 4499
ham 1500
Name: Spam/Ham, dtype: int64
the correct amount is 33 thousand emails
I tried to change the reading path but it didn't work