Capture words and rewrite

359 Views Asked by At

I made a word classifier with nlpnet (http://nilc.icmc.usp.br/nlpnet/index.html). The goal is to extract, individually, only the words that carry a given tag.

response code

import nlpnet
import codecs
import itertools

TAGGER = nlpnet.POSTagger('pos-pt', language='pt')


def TAGGER_txt(text):
    """Tag `text` with the module-level TAGGER and return the result as a list."""
    tagged = TAGGER.tag(text)
    return list(tagged)

# Print each line of 'document.txt' followed by the nouns found in it.
with codecs.open('document.txt', encoding='utf8') as original_file:
    # NOTE: the output file is opened (and therefore truncated) but nothing
    # is written to it by this script.
    with codecs.open('document_teste.txt', 'w') as output_file:
        # The loop body must be indented under the `for`; otherwise Python
        # rejects the script with an IndentationError.
        for line in original_file:
            print(line)
            words = TAGGER_txt(line)
            # Flatten the per-sentence lists into one list of (word, tag) pairs.
            all_words = list(itertools.chain.from_iterable(words))
            # Nouns carry the tag 'N'; filtering on 'V' would select verbs.
            nouns = [word for word, tag in all_words if tag == 'N']
            print(nouns)

Result

O gato esta querendo comer o ratão 
['gato', 'ratão']
2

There are 2 best solutions below

1
On BEST ANSWER

I think this could be the essence of what you need. Please see edited version.

As you say in your question, the result of tagging Sentence would be something like tagged. If you wanted just the nouns from Sentence you could recover them using the expression after nouns =.

# Example sentence and the (word, tag) pairs a tagger might produce for it.
Sentence = " O gato esta querendo comer o rato "
tagged = [
    ('O', 'ADJ'),
    ('gato', 'N'),
    ('esta', 'V'),
    ('querendo', 'V'),
    ('comer', 'V'),
    ('o', 'ADJ'),
    ('rato', 'N'),
]

# Keep only the words whose tag is 'N' (noun).
nouns = []
for word, tag in tagged:
    if tag == 'N':
        nouns.append(word)

print(nouns)

Output:

['gato', 'rato']

Edit: It's not clear to me what you want. Here's another possibility.

  • I haven't installed nlpnet because that would be quite a bit of work and I wouldn't use it myself.
  • I simulate the call to TAGGER.tag with a stub function, TAGGER_txt, that returns a fixed result.
  • I've changed the encoding to Latin-1. It's used in the header and in codecs.open.

.

# -*- coding: Latin-1 -*-
import codecs
import itertools

def TAGGER_txt(text):
    """Stand-in for the real tagger: ignore `text` and return a fixed tagging.

    The result is a list holding one sentence, itself a list of
    (word, tag) pairs.
    """
    sentence = [
        (u'O', u'ART'),
        (u'gato', u'N'),
        (u'esta', u'PROADJ'),
        (u'querendo', u'V'),
        (u'comer', u'V'),
        (u'o', u'ART'),
        (u'ratão', u'N'),
    ]
    return [sentence]

# Echo every line of the input file, then the nouns the tagger reports for it.
# The output file is opened for writing but nothing is written to it here.
with codecs.open('document.txt', encoding='Latin-1') as original_file, \
        codecs.open('document_test.txt', 'w') as output_file:
    for line in original_file.readlines():
        print(line)
        # Walk all (word, tag) pairs of all sentences returned for this line.
        tagged_pairs = itertools.chain.from_iterable(TAGGER_txt(line))
        nouns = []
        for pair in tagged_pairs:
            if pair[1] == 'N':
                nouns.append(pair[0])
        print(nouns)

Output:

 O gato esta querendo comer o ratão 
['gato', 'ratão']
3
On

Question: ... dump to a file the sentences that contain more than N occurrences of a particular POS


Note: Assuming 'document.txt' contains one Sentence per Line!

def is_worth_saving(tags, pos, pos_count):
    """
    Decide whether one tagged sentence has enough words of a given POS.

    :param tags:        nlpnet tags from ONE Sentence, as (word, POS) pairs
    :param pos:         The POS to count
    :param pos_count:   Minimum number of occurrences of 'pos' required
    :return:
        True if 'tags' contain at least 'pos_count' occurrences of 'pos'
        False otherwise
    """
    # Count the pairs whose POS equals the requested one.
    pos_found = sum(1 for _word, _pos in tags if _pos == pos)

    # The threshold is inclusive: exactly 'pos_count' matches is enough.
    return pos_found >= pos_count

if __name__ == '__main__':
    # Copy to 'document_test.txt' every line of 'document.txt' whose first
    # tagged sentence contains at least two verbs ('V').
    # Relies on a TAGGER object defined elsewhere in the module.
    with open('document.txt') as in_fh, open('document_test.txt', 'w') as out_fh:
        for sentence in in_fh:
            # Strip only the line terminator; slicing with [:-1] would also
            # drop the last character of a final line that has no newline.
            print('Sentence:{}'.format(sentence.rstrip('\n')))
            tags = TAGGER.tag(sentence)

            # NOTE(review): presumably a blank line yields no tagged sentence;
            # skip it instead of failing on tags[0].
            if not tags:
                continue

            # As your Example Sentence has only **2** Verbs,
            # pass 'pos_count=2'
            if is_worth_saving(tags[0], 'V', 2):
                out_fh.write(sentence)
                print(tags[0])

Output:

Sentence:O gato esta querendo comer o ratão
[(u'O', u'ART'), (u'gato', u'N'), (u'esta', u'PROADJ'), (u'querendo', u'V'), (u'comer', u'V'), (u'o', u'ART'), (u'rat', u'N')]

Tested with Python: 3.4.2 and 2.7.9