How do I turn a list of JSON objects into a Spark dataframe in Code Workbook?

476 Views Asked by At

How can I turn this list of JSON objects into a Spark dataframe?

[
  {
    '1': 'A', 
    '2': 'B'
  }, 
  {
    '1': 'A', 
    '3': 'C'
  }
] 

into

 1     2     3
 A     B     null
 A     null  C

I've tried spark.read.json(spark.sparkContext.parallelize(d)) and various combinations of that with json.dumps(d).

2

There are 2 best solutions below

0
On

You can use spark.createDataFrame(d) to get the desired effect.

You do get a deprecation warning about inferring schema from dictionaries, so the "right" way to do this is to first create the rows:

from pyspark.sql import Row
data = [{'1': 'A', '2': 'B'}, {'1': 'A', '3': 'C'}]
schema = ['1', '2', '3']
# Build one Row per record. Columns a record lacks are filled with None so
# that every Row exposes the same set of fields.
rows = [
    Row(**{column: record.get(column) for column in schema})
    for record in data
]

then create the DataFrame:

# Build the DataFrame from the list of Row objects held in ``rows``.
df = spark.createDataFrame(rows)
0
On

I had to slay this dragon to import JIRA issues. They came back as a dataset of response objects, each containing an inner array of issue JSON objects.

This code worked as a single transformation to get to the properly-parsed JSON objects in a DataFrame:

import json
from pyspark.sql import Row
from pyspark.sql.functions import explode

def issues_enumerated(All_Issues_Paged):
    """Flatten paged JIRA responses into a DataFrame with one row per issue.

    Each row of ``All_Issues_Paged`` must have a ``response`` field holding a
    JSON document (as a string) whose top-level ``issues`` key is an array of
    issue objects. Every element of every ``issues`` array becomes one row of
    the result; the schema is inferred by ``spark.read.json``.

    Args:
        All_Issues_Paged: DataFrame of raw responses with a ``response``
            string column.

    Returns:
        DataFrame of parsed issue objects. The inferred schema is also
        printed as a side effect.

    Raises:
        KeyError: if a response document has no ``issues`` key (raised
            inside the Spark job when it is evaluated).
    """

    def issue_json_strings(input_row: Row) -> list:
        """Return each issue of one response re-serialised as a JSON string."""
        response = json.loads(input_row['response'])
        return [json.dumps(issue) for issue in response['issues']]

    # Flatten straight to an RDD of JSON strings. Going through an
    # intermediate DataFrame of arrays plus ``explode`` would require Spark
    # to infer the array element type, which fails when the input is empty
    # or when the sampled responses all carry an empty ``issues`` array.
    issue_json_strings_rdd = All_Issues_Paged.rdd.flatMap(issue_json_strings)
    # Let Spark infer the (large, nested) issue schema from the JSON itself.
    issues_df = spark.read.json(issue_json_strings_rdd)
    issues_df.printSchema()
    return issues_df

Schema is too big to show, but here's a snippet:

root
 |-- expand: string (nullable = true)
 |-- fields: struct (nullable = true)
 |    |-- aggregateprogress: struct (nullable = true)
 |    |    |-- percent: long (nullable = true)
 |    |    |-- progress: long (nullable = true)
 |    |    |-- total: long (nullable = true)
 |    |-- aggregatetimeestimate: long (nullable = true)
 |    |-- aggregatetimeoriginalestimate: long (nullable = true)
 |    |-- aggregatetimespent: long (nullable = true)
 |    |-- assignee: struct (nullable = true)
 |    |    |-- accountId: string (nullable = true)
 |    |    |-- accountType: string (nullable = true)
 |    |    |-- active: boolean (nullable = true)
 |    |    |-- avatarUrls: struct (nullable = true)
 |    |    |    |-- 16x16: string (nullable = true)
 |    |    |    |-- 24x24: string (nullable = true)
 |    |    |    |-- 32x32: string (nullable = true)
 |    |    |    |-- 48x48: string (nullable = true)
 |    |    |-- displayName: string (nullable = true)
 |    |    |-- emailAddress: string (nullable = true)
 |    |    |-- self: string (nullable = true)
 |    |    |-- timeZone: string (nullable = true)
 |    |-- components: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- self: string (nullable = true)
 |    |-- created: string (nullable = true)
 |    |-- creator: struct (nullable = true)
 |    |    |-- accountId: string (nullable = true)
 |    |    |-- accountType: string (nullable = true)
 |    |    |-- active: boolean (nullable = true)
 |    |    |-- avatarUrls: struct (nullable = true)
 |    |    |    |-- 16x16: string (nullable = true)
 |    |    |    |-- 24x24: string (nullable = true)
 |    |    |    |-- 32x32: string (nullable = true)
 |    |    |    |-- 48x48: string (nullable = true)
 |    |    |-- displayName: string (nullable = true)
 |    |    |-- emailAddress: string (nullable = true)
 |    |    |-- self: string (nullable = true)
 |    |    |-- timeZone: string (nullable = true)
 |    |-- customfield_10000: string (nullable = true)
 |    |-- customfield_10001: struct (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- isShared: boolean (nullable = true)
 |    |    |-- title: string (nullable = true)
 |    |-- customfield_10002: string (nullable = true)
 |    |-- customfield_10003: string (nullable = true)
 |    |-- customfield_10004: struct (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- self: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |    |-- customfield_10005: string (nullable = true)
 |    |-- customfield_10006: string (nullable = true)
 |    |-- customfield_10007: string (nullable = true)
 |    |-- customfield_10008: struct (nullable = true)
 |    |    |-- data: struct (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- issueType: struct (nullable = true)
 |    |    |    |    |-- iconUrl: string (nullable = true)
 |    |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- key: string (nullable = true)
 |    |    |    |-- keyNum: long (nullable = true)
 |    |    |    |-- projectId: long (nullable = true)
 |    |    |    |-- summary: string (nullable = true)
 |    |    |-- hasEpicLinkFieldDependency: boolean (nullable = true)
 |    |    |-- nonEditableReason: struct (nullable = true)
 |    |    |    |-- message: string (nullable = true)
 |    |    |    |-- reason: string (nullable = true)
 |    |    |-- showField: boolean (nullable = true)
 |    |-- customfield_10009: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- boardId: long (nullable = true)
 |    |    |    |-- completeDate: string (nullable = true)
 |    |    |    |-- endDate: string (nullable = true)
 |    |    |    |-- goal: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- startDate: string (nullable = true)
 |    |    |    |-- state: string (nullable = true)

...