how to run naive bayes algorithm for all the text file in a dataset instead of few text files

580 Views Asked by At

Hi I have written a python code for naïve bayes algorithm using bigram.My dataset contains bunch of text file.This code is working fine with some of the text files but not for all of them.How to improve in my code so that it will successful for all the text files.I have posted my code below.

from __future__ import division
import os
import nltk.classify.util
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
import random
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.metrics.scores import precision , recall , f_measure
import itertools ,collections
negfilenames = os.listdir("C:/Users/Sharmili/Desktop/movie_reviews/train/neg")
print('train negative:' ,negfilenames)
neg_reviews = []
for filename in negfilenames:
    f = open(os.path.join("C:/Users/Sharmili/Desktop/movie_reviews/train/neg",filename),'r',encoding='UTF8')
    neg_reviews = f.read().split()
print('train negative:' ,neg_reviews)
print('train negative:',len(neg_reviews))
posfilenames = os.listdir("C:/Users/Sharmili/Desktop/movie_reviews/train/pos")
print('train positive:' ,posfilenames)
pos_reviews = []
for filename in posfilenames:
    f = open(os.path.join("C:/Users/Sharmili/Desktop/movie_reviews/train/pos",filename),'r',encoding='UTF8')
    pos_reviews = f.read().split()
print('train positive:' ,pos_reviews)
print('train positive:' ,len(pos_reviews))
testposfilenames = os.listdir("C:/Users/Sharmili/Desktop/movie_reviews/test/pos")
print('test positive:' ,testposfilenames)
test_pos_reviews = []
for filename in testposfilenames:
    f = open(os.path.join("C:/Users/Sharmili/Desktop/movie_reviews/test/pos",filename),'r',encoding='UTF8')
    test_pos_reviews = f.read().split()
print('test positive:' ,test_pos_reviews)
print('test positive:' ,len(test_pos_reviews))
testnegfilenames = os.listdir("C:/Users/Sharmili/Desktop/movie_reviews/test/neg")
print('test negative:' ,testnegfilenames)
test_neg_reviews = []
for filename in testnegfilenames:
    f = open(os.path.join("C:/Users/Sharmili/Desktop/movie_reviews/test/neg",filename),'r',encoding='UTF8')
test_neg_reviews = f.read().split()
print('test negative:' ,test_neg_reviews)
print('test negative:' ,len(test_neg_reviews))

def word_split(data):
    data_new = []
    for word in data:
        word_filter = [i.lower() for i in word.split()]
        data_new.append(word_filter)
    return data_new
stopset = set(stopwords.words('english')) - set(('over', 'under', 'below', 'more', 'most', 'no', 'not', 'only', 'such', 'few', 'so', 'too', 'very', 'just', 'any', 'once'))

def bigram_word_feats(words , score_fn = BigramAssocMeasures.chi_sq , n= 200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn , n)
    return (dict([(ngram,True) for ngram in itertools.chain(words,bigrams) if ngram not in stopset]))
def evaluate_classifier(bigram_word_feats):
    negtrnfeats = [(bigram_word_feats(f),'negative') for f in neg_reviews]
    postrnfeats = [(bigram_word_feats(f),'positive')for f in pos_reviews]
    negtestfeats = [(bigram_word_feats(f),'negative')for f in test_neg_reviews]
    postestfeats = [(bigram_word_feats(f),'positive')for f in test_pos_reviews]
    trainfeats = postrnfeats + negtrnfeats
    testfeats = negtestfeats + postestfeats
    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (features,label) in enumerate(testfeats):
       refsets[label].add(i)
       predicted = classifier.classify(features)
       testsets[predicted].add(i)
    accuracy = nltk.classify.util.accuracy(classifier, testfeats)
    pos_precision = precision(refsets['positive'], testsets['positive'])
    pos_recall = recall(refsets['positive'], testsets['positive'])
    pos_fmeasure = f_measure(refsets['positive'], testsets['positive'])
    neg_precision = precision(refsets['negative'], testsets['negative'])
    neg_recall = recall(refsets['negative'], testsets['negative'])
    neg_fmeasure = f_measure(refsets['negative'], testsets['negative'])
    print('accuracy:', accuracy)
    print('precision', (pos_precision + neg_precision) / 2)
    print('recall', (pos_recall + neg_recall) / 2)
    print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)
evaluate_classifier(bigram_word_feats)
0

There are 0 best solutions below