Parsing nested dictionary (allen nlp hierplane_tree)

298 Views Asked by At

I am trying to parse the JSON object returned by the AllenNLP dependency-parser predictor. I found a helpful function that collects all of the child values, but what I really want is this: given an entity such as "man", how can I get its associated attributes from the JSON object?

Example sentence: "When I was walking to the park yesterday, I saw a man wearing a blue shirt."

The dependency tree contains wearing, blue, shirt, etc., which are associated with the entity. How can I get the associated JSON block for man back from that structure? I am not sure how to modify my helper function, or write another one, to pull that block out of the JSON output. Any help or suggestions would be greatly appreciated.

AllenNLP Code:

# Sentence to run through the dependency parser.
text = "When I was walking to the park yesterday, I saw a man wearing a blue shirt."
from allennlp.predictors.predictor import Predictor
# Imported for its side effects only; the name is not referenced below.
# NOTE(review): presumably this registers the parser model/predictor classes — confirm.
import allennlp_models.structured_prediction
# Load the pretrained biaffine dependency parser archive from the given URL.
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/biaffine-dependency-parser-ptb-2020.04.06.tar.gz")
# The prediction is a dict; the nested tree lives under 'hierplane_tree'.
tree = predictor.predict(sentence=text)

# Keep only the hierplane tree ('text', 'root', 'nodeTypeToStyle', 'linkToPosition').
tree = tree['hierplane_tree']
# Bare expression: echoes the value in an interactive session, no effect in a script.
tree

Helper Function that will let me get the children values:

"""Extract nested values from a JSON tree."""


def json_extract(obj, key):
    """Return every leaf value stored under ``key`` in a nested JSON structure.

    The structure is walked depth-first in iteration order. Dicts and lists
    are descended into; anything else is ignored.

    Only non-container values are collected: when the value stored under
    ``key`` is itself a dict or a list it is searched, not returned.

    Args:
        obj: The decoded JSON value (dict, list, or scalar) to search.
        key: The dictionary key whose values should be collected.

    Returns:
        A list of the matching values in the order they were encountered.
    """

    def walk(node):
        # Yield matches lazily so the traversal order is the result order.
        if isinstance(node, dict):
            for name, child in node.items():
                if isinstance(child, (dict, list)):
                    yield from walk(child)
                elif name == key:
                    yield child
        elif isinstance(node, list):
            for element in node:
                yield from walk(element)

    return list(walk(obj))

The function can give me the values:

# Collect every value stored under the `word` key anywhere in the tree.
children = json_extract(tree, 'word')
print(children)

['walking', 'When', 'I', 'was', 'to', 'park', 'the', 'yesterday', ',', 'saw', 'I', 'man', 'a', 'wearing', 'shirt', 'a', 'blue', '.']

JSON Extract (what I want to get when I provide "man"):

{'word': 'man',
        'nodeType': 'dep',
        'attributes': ['NOUN'],
        'link': 'dep',
        'spans': [{'start': 51, 'end': 55}],
        'children': [{'word': 'a',
          'nodeType': 'det',
          'attributes': ['DET'],
          'link': 'det',
          'spans': [{'start': 49, 'end': 51}]},
         {'word': 'wearing',
          'nodeType': 'dep',
          'attributes': ['VERB'],
          'link': 'dep',
          'spans': [{'start': 55, 'end': 63}],
          'children': [{'word': 'shirt',
            'nodeType': 'dep',
            'attributes': ['NOUN'],
            'link': 'dep',
            'spans': [{'start': 70, 'end': 76}],
            'children': [{'word': 'a',
              'nodeType': 'dep',
              'attributes': ['DET'],
              'link': 'dep',
              'spans': [{'start': 63, 'end': 65}]},
             {'word': 'blue',
              'nodeType': 'dep',
              'attributes': ['ADJ'],
              'link': 'dep',
              'spans': [{'start': 65, 'end': 70}]}]}]}]}

JSON Output:

{'text': 'When I was walking to the park yesterday , I saw a man wearing a blue shirt .',
 'root': {'word': 'walking',
  'nodeType': 'root',
  'attributes': ['VERB'],
  'link': 'root',
  'spans': [{'start': 11, 'end': 19}],
  'children': [{'word': 'When',
    'nodeType': 'dep',
    'attributes': ['ADV'],
    'link': 'dep',
    'spans': [{'start': 0, 'end': 5}]},
   {'word': 'I',
    'nodeType': 'nsubj',
    'attributes': ['PRON'],
    'link': 'nsubj',
    'spans': [{'start': 5, 'end': 7}]},
   {'word': 'was',
    'nodeType': 'aux',
    'attributes': ['AUX'],
    'link': 'aux',
    'spans': [{'start': 7, 'end': 11}]},
   {'word': 'to',
    'nodeType': 'prep',
    'attributes': ['ADP'],
    'link': 'prep',
    'spans': [{'start': 19, 'end': 22}],
    'children': [{'word': 'park',
      'nodeType': 'pobj',
      'attributes': ['NOUN'],
      'link': 'pobj',
      'spans': [{'start': 26, 'end': 31}],
      'children': [{'word': 'the',
        'nodeType': 'det',
        'attributes': ['DET'],
        'link': 'det',
        'spans': [{'start': 22, 'end': 26}]}]}]},
   {'word': 'yesterday',
    'nodeType': 'tmod',
    'attributes': ['NOUN'],
    'link': 'tmod',
    'spans': [{'start': 31, 'end': 41}]},
   {'word': ',',
    'nodeType': 'dep',
    'attributes': ['PUNCT'],
    'link': 'dep',
    'spans': [{'start': 41, 'end': 43}],
    'children': [{'word': 'saw',
      'nodeType': 'dep',
      'attributes': ['VERB'],
      'link': 'dep',
      'spans': [{'start': 45, 'end': 49}],
      'children': [{'word': 'I',
        'nodeType': 'nsubj',
        'attributes': ['PRON'],
        'link': 'nsubj',
        'spans': [{'start': 43, 'end': 45}]},
       {'word': 'man',
        'nodeType': 'dep',
        'attributes': ['NOUN'],
        'link': 'dep',
        'spans': [{'start': 51, 'end': 55}],
        'children': [{'word': 'a',
          'nodeType': 'det',
          'attributes': ['DET'],
          'link': 'det',
          'spans': [{'start': 49, 'end': 51}]},
         {'word': 'wearing',
          'nodeType': 'dep',
          'attributes': ['VERB'],
          'link': 'dep',
          'spans': [{'start': 55, 'end': 63}],
          'children': [{'word': 'shirt',
            'nodeType': 'dep',
            'attributes': ['NOUN'],
            'link': 'dep',
            'spans': [{'start': 70, 'end': 76}],
            'children': [{'word': 'a',
              'nodeType': 'dep',
              'attributes': ['DET'],
              'link': 'dep',
              'spans': [{'start': 63, 'end': 65}]},
             {'word': 'blue',
              'nodeType': 'dep',
              'attributes': ['ADJ'],
              'link': 'dep',
              'spans': [{'start': 65, 'end': 70}]}]}]}]}]}]},
   {'word': '.',
    'nodeType': 'punct',
    'attributes': ['PUNCT'],
    'link': 'punct',
    'spans': [{'start': 76, 'end': 78}]}]},
 'nodeTypeToStyle': {'root': ['color5', 'strong'],
  'dep': ['color5', 'strong'],
  'nsubj': ['color1'],
  'nsubjpass': ['color1'],
  'csubj': ['color1'],
  'csubjpass': ['color1'],
  'pobj': ['color2'],
  'dobj': ['color2'],
  'iobj': ['color2'],
  'mark': ['color2'],
  'pcomp': ['color2'],
  'xcomp': ['color2'],
  'ccomp': ['color2'],
  'acomp': ['color2'],
  'aux': ['color3'],
  'cop': ['color3'],
  'det': ['color3'],
  'conj': ['color3'],
  'cc': ['color3'],
  'prep': ['color3'],
  'number': ['color3'],
  'possesive': ['color3'],
  'poss': ['color3'],
  'discourse': ['color3'],
  'expletive': ['color3'],
  'prt': ['color3'],
  'advcl': ['color3'],
  'mod': ['color4'],
  'amod': ['color4'],
  'tmod': ['color4'],
  'quantmod': ['color4'],
  'npadvmod': ['color4'],
  'infmod': ['color4'],
  'advmod': ['color4'],
  'appos': ['color4'],
  'nn': ['color4'],
  'neg': ['color0'],
  'punct': ['color0']},
 'linkToPosition': {'nsubj': 'left',
  'nsubjpass': 'left',
  'csubj': 'left',
  'csubjpass': 'left',
  'pobj': 'right',
  'dobj': 'right',
  'iobj': 'right',
  'pcomp': 'right',
  'xcomp': 'right',
  'ccomp': 'right',
  'acomp': 'right'}}
1

There is 1 best solution below

0
On BEST ANSWER

This will no doubt need to be optimized and cleaned up, but it does enable you to parse the dependency tree from AllenNLP by items of interest (in this case man). Hopefully, this helps someone else out.

Given the text and a key/value pair (word as the key, man as the value), you get the result shown below.

Helper Functions:

def get_entity_attributes(obj, key, value):
    """Return every dict in a nested JSON structure where ``dict[key] == value``.

    The structure is walked depth-first in iteration order, descending into
    dicts and lists. A matching dict is reported before any match found
    inside it, and the search continues into matching dicts, so nested
    matches are returned as well.

    Every dict is tested, whether it is the top-level object, a value inside
    another dict (such as the tree's ``'root'`` node), or an element of a
    list (such as a ``'children'`` entry).

    Args:
        obj: The decoded JSON value (dict, list, or scalar) to search.
        key: The dictionary key to look up in each dict.
        value: The value that ``key`` must map to for a dict to match.

    Returns:
        A list of the matching dict objects themselves (not copies), in the
        order they were encountered.
    """
    matches = []

    def extract(node):
        """Record ``node`` if it matches, then search its nested containers."""
        if isinstance(node, dict):
            if key in node and value == node[key]:
                matches.append(node)
            for child in node.values():
                # Scalars cannot contain further dicts, so skip them.
                if isinstance(child, (dict, list)):
                    extract(child)
        elif isinstance(node, list):
            for item in node:
                extract(item)

    extract(obj)
    return matches

def parse_attributes(obj, key):
    """Gather the leaf values found under ``key`` throughout a nested JSON value.

    Dicts and lists are searched depth-first in iteration order. A value is
    collected only when it is stored under ``key`` and is not itself a dict
    or list; container values are searched instead of being returned.

    Args:
        obj: The decoded JSON value (dict, list, or scalar) to search.
        key: The dictionary key whose values should be gathered.

    Returns:
        A list of the gathered values in the order they were encountered.
    """
    found = []

    def visit(node):
        """Append matches from ``node`` and everything nested inside it."""
        if isinstance(node, dict):
            for name, child in node.items():
                if isinstance(child, (dict, list)):
                    visit(child)
                elif name == key:
                    found.append(child)
            return
        if not isinstance(node, list):
            # Scalars hold nothing to search.
            return
        for element in node:
            visit(element)

    visit(obj)
    return found

# Build the list of word tokens that remain once stop words and punctuation are dropped
def get_clean_list(entities):
    """Return the words from ``entities`` that are neither stop words nor punctuation.

    Each word is looked up in ``nlp.vocab`` and kept only when both the
    ``is_stop`` and ``is_punct`` flags of the resulting lexeme are falsy.
    Order and duplicates are preserved.

    NOTE(review): relies on a module-level ``nlp`` that is not defined in
    this snippet — presumably a loaded spaCy pipeline; confirm.

    Args:
        entities: An iterable of word strings.

    Returns:
        A new list with the retained words.
    """

    def is_content_word(token):
        lexeme = nlp.vocab[token]
        return not (lexeme.is_stop or lexeme.is_punct)

    return [token for token in entities if is_content_word(token)]

View Output:

# Parse the example sentence and list the words attached to one entity.
text = "When I was walking to the park yesterday, I saw a man wearing a blue shirt."
tree = predictor.predict(sentence=text)

# Look for nodes whose 'word' field equals the entity of interest.
key = "word"
entity = "man"
entities = get_entity_attributes(tree, key, entity)

for ent in entities:
    # Only the printed label differs between 'dep' nodes and all other node types.
    label = 'Attributes' if ent['nodeType'] == 'dep' else 'Action Attributes'

    # Every word in the matched subtree, the entity's own word included.
    attributes = parse_attributes(ent, key)
    clean_attributes = get_clean_list(attributes)

    # Drop the entity itself from its attribute list. get_clean_list may have
    # already filtered it out (e.g. when the entity is a stop word), in which
    # case an unconditional remove() would raise ValueError.
    if entity in clean_attributes:
        clean_attributes.remove(entity)

    print(f'entity: {entity} {label}: {clean_attributes}')

Gives you:

entity: man Attributes: ['wearing', 'shirt', 'blue']