How to use word embedding as features for CRF (sklearn-crfsuite) model training

3.5k Views Asked by At

I want to develop an NER model in which I use word-embedding features to train a CRF model. The code works perfectly without the word-embedding features, but when I add the embeddings as features for CRF training, I get an error message. Here is the relevant part of my code:

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
#from sklearn.cross_validation import cross_val_score
#from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import pickle
from gensim.models import KeyedVectors
import numpy as np
# Load the pre-trained word2vec vectors directly from the binary file.
# `model1` is the module-level lookup that get_features() reads.
model1 = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True) ### Loading pre-trained word2vec model
### Embedding function 
def get_features(word):
    """Look up the embedding of *word* in ``model1`` (case-insensitive).

    Args:
        word: the token text; it is lower-cased before the lookup.

    Returns:
        A list holding the single embedding vector, or an empty list when
        the lower-cased word is not in the model's vocabulary.
    """
    key = word.lower()
    try:
        vector = model1[key]
    except KeyError:
        # Out-of-vocabulary word: report "no embedding" as an empty list.
        # Only KeyError is caught so that unrelated failures stay visible.
        return []
    return [vector]

def _neighbor_features(row, prefix):
    """Build the features that a neighbouring token contributes.

    Args:
        row: the neighbour's sentence row; fields 0, 1, 2, 4 and 5 are read
            as word, POS tag, tag1, tag2 and tag3.
        prefix: string prepended to every key ('-1:' for the previous
            token, '+1:' for the next one).

    Returns:
        dict of feature name -> value, all computed from *row* itself.
    """
    word = row[0]
    postag = row[1]
    tag1, tag2, tag3 = row[2], row[4], row[5]
    return {
        prefix + 'word.lower()': word.lower(),
        prefix + 'word.istitle()': word.istitle(),
        prefix + 'word.isupper()': word.isupper(),
        # Array-valued, like the current token's 'wordembdding' feature.
        prefix + 'wordembdding': np.array(get_features(word)),
        prefix + 'postag': postag,
        prefix + 'postag[:2]': postag[:2],
        prefix + 'tag1': tag1,
        prefix + 'tag1[:2]': tag1[:2],
        prefix + 'tag2': tag2,
        prefix + 'tag2[:2]': tag2[:2],
        prefix + 'tag3': tag3,
        prefix + 'tag3[:2]': tag3[:2],
        prefix + 'wordlength': len(word),
        # word[:1] instead of word[0] so an empty token does not raise.
        prefix + 'wordinitialcap': word[:1].isupper(),
        prefix + 'wordmixedcap': any(ch.isupper() for ch in word[1:]),
        prefix + 'wordallcap': all(ch.isupper() for ch in word),
    }


def word2features(sent, i):
    """Build the feature dict for token *i* of *sent*.

    Args:
        sent: sequence of rows; fields 0, 1, 2, 4 and 5 of each row are read
            as word, POS tag, tag1, tag2 and tag3.
        i: index of the token to describe.

    Returns:
        dict with the token's own features, the '-1:' / '+1:' features of
        its neighbours (each computed from the neighbour's own row), and a
        'BOS' / 'EOS' flag at the sentence boundaries.
    """
    row = sent[i]
    word = row[0]
    postag = row[1]
    tag1, tag2, tag3 = row[2], row[4], row[5]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        # NOTE: this stores a whole array as a single feature value. It is
        # kept as posted because it is the construct the question asks
        # about; crf.fit rejects it with the TypeError quoted further down.
        'wordembdding': np.array(get_features(word)),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'tag1': tag1,
        'tag1[:2]': tag1[:2],
        'tag2': tag2,
        'tag2[:2]': tag2[:2],
        'tag3': tag3,
        'tag3[:2]': tag3[:2],
        'wordlength': len(word),
        # word[:1] instead of word[0] so an empty token does not raise.
        'wordinitialcap': word[:1].isupper(),
        'wordmixedcap': any(ch.isupper() for ch in word[1:]),
        'wordallcap': all(ch.isupper() for ch in word),
        'distfromsentbegin': i
    }

    if i > 0:
        features.update(_neighbor_features(sent[i - 1], '-1:'))
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        features.update(_neighbor_features(sent[i + 1], '+1:'))
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    """Return one feature dict per token of *sent*, in sentence order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the label of every row in *sent*, in order.

    Each row must unpack into exactly six fields; the label is the fourth.
    """
    labels = []
    for _token, _postag, _tag1, label, _tag2, _tag3 in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token text (first field) of every row in *sent*.

    The row is indexed instead of unpacked so the function works with the
    six-field rows used by sent2labels and word2features; the previous
    eight-name unpacking raised ValueError on those rows.
    """
    return [row[0] for row in sent]



# Turn every sentence into its per-token feature dicts and its label list.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))


%%time
# Build the CRF with L-BFGS training; c1/c2 are presumably the L1/L2
# regularisation coefficients (confirm against the sklearn-crfsuite docs).
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)   ### This call raises the TypeError quoted below when a feature value is an array

When I try to train the CRF model, I get this error message:

TypeError: only size-1 arrays can be converted to Python scalars

Can anyone suggest how to use word-embedding vectors to train a CRF model?

1

There are 1 best solutions below

0
On BEST ANSWER

As you can read here, currently python-crfsuite and sklearn-crfsuite don't support array features, like word embeddings.

Instead, you can pass every vector component as a feature.

{...
 'v0': 1.81583762e-02,
 'v1': 2.83553465e-02,
  ...
 'v299': -4.26079705e-02,
 ...}

I suggest replacing your get_features function:

def get_features(word):
    """Return the embedding vector of *word* from ``model1``.

    Args:
        word: the token text; it is lower-cased before the lookup.

    Returns:
        The model's 1-D vector for the word, or an all-zero vector of the
        model's dimensionality when the word is not in the vocabulary.
    """
    key = word.lower()
    try:
        return model1[key]
    except KeyError:
        # Out-of-vocabulary word: fall back to zeros. The length comes from
        # the loaded model rather than a hard-coded 300, so models with a
        # different dimensionality work too.
        return np.zeros(model1.vector_size)

Then modify the word2features function to add a new feature for every component of the vector:

def _embedding_features(word, prefix=''):
    """Expand the embedding of *word* into one scalar feature per component.

    Keys are ``prefix + 'v<index>'`` (for example 'v0' or '-1:v17'). Values
    are converted to plain Python floats, because python-crfsuite cannot
    take an array as a feature value.
    """
    return {
        '{}v{}'.format(prefix, index): float(value)
        for index, value in enumerate(get_features(word))
    }


def _neighbor_features(row, prefix):
    """Build the features that a neighbouring token contributes.

    Args:
        row: the neighbour's sentence row; fields 0, 1, 2, 4 and 5 are read
            as word, POS tag, tag1, tag2 and tag3.
        prefix: string prepended to every key ('-1:' or '+1:').

    Returns:
        dict of feature name -> value, all computed from *row* itself,
        including one prefixed feature per embedding component.
    """
    word = row[0]
    postag = row[1]
    tag1, tag2, tag3 = row[2], row[4], row[5]
    features = {
        prefix + 'word.lower()': word.lower(),
        prefix + 'word.istitle()': word.istitle(),
        prefix + 'word.isupper()': word.isupper(),
        prefix + 'postag': postag,
        prefix + 'postag[:2]': postag[:2],
        prefix + 'tag1': tag1,
        prefix + 'tag1[:2]': tag1[:2],
        prefix + 'tag2': tag2,
        prefix + 'tag2[:2]': tag2[:2],
        prefix + 'tag3': tag3,
        prefix + 'tag3[:2]': tag3[:2],
        prefix + 'wordlength': len(word),
        # word[:1] instead of word[0] so an empty token does not raise.
        prefix + 'wordinitialcap': word[:1].isupper(),
        prefix + 'wordmixedcap': any(ch.isupper() for ch in word[1:]),
        prefix + 'wordallcap': all(ch.isupper() for ch in word),
    }
    features.update(_embedding_features(word, prefix))
    return features


def word2features(sent, i):
    """Build the feature dict for token *i* of *sent*.

    Args:
        sent: sequence of rows; fields 0, 1, 2, 4 and 5 of each row are read
            as word, POS tag, tag1, tag2 and tag3.
        i: index of the token to describe.

    Returns:
        dict with the token's own features, one 'v<index>' feature per
        embedding component, the '-1:' / '+1:' features of its neighbours,
        and a 'BOS' / 'EOS' flag at the sentence boundaries.
    """
    row = sent[i]
    word = row[0]
    postag = row[1]
    tag1, tag2, tag3 = row[2], row[4], row[5]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'tag1': tag1,
        'tag1[:2]': tag1[:2],
        'tag2': tag2,
        'tag2[:2]': tag2[:2],
        'tag3': tag3,
        'tag3[:2]': tag3[:2],
        'wordlength': len(word),
        # word[:1] instead of word[0] so an empty token does not raise.
        'wordinitialcap': word[:1].isupper(),
        'wordmixedcap': any(ch.isupper() for ch in word[1:]),
        'wordallcap': all(ch.isupper() for ch in word),
        'distfromsentbegin': i
    }

    # One scalar feature per embedding component of the current word.
    features.update(_embedding_features(word))

    if i > 0:
        features.update(_neighbor_features(sent[i - 1], '-1:'))
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        features.update(_neighbor_features(sent[i + 1], '+1:'))
    else:
        features['EOS'] = True

    return features

Two small notes:

  • If many words in your text are not in the vocabulary, word embeddings cannot improve your NER model much. Maybe you can use fastText (also integrated in Gensim), which can properly handle unseen words.
  • Even if it is useful, adding embedding vectors for each word can make your training set very big, lead to long training times, and produce a very big classifier.