Snowflake View too slow to query with dynamic pivot

666 Views Asked by At

I have created a stored procedure in Snowflake to dynamically pivot the table and create a view from JSON data. The query creates a view with 370 columns as I desired but it's too slow to query over the view in fact querying a single row takes up to 20 mins

create or replace procedure create_pivot_view()
returns string
language javascript
execute as caller as
$$
  var cols_query = ` select
concat('\\'',
listagg( 
    distinct f.value:signalName::STRING,'\\',\\''),'\\'') as signal_name_list
FROM
(select *
FROM 
    PM_POWER),table(flatten(input=>c1:data:baseData:signals, mode=>'ARRAY')) as f`;
  var stmt1 = snowflake.createStatement({sqlText: cols_query});
  var results1 = stmt1.execute();
  results1.next();
  var col_list = results1.getColumnValue(1);
   pivot_query = `
create or replace view AWSS3_PM.PUBLIC.PM_POWER_CN8000_V2 as 
select  * from (
select figures,stats,SignalName,id,Latitude,Longitude,Altitude
from (
select
  c1:id::STRING as id,
  c1:data:baseData:unitID::varchar as UnitID,
  c1:data:baseData:latitude::varchar as Latitude,
  c1:data:baseData:longitude::varchar as Longitude,
  c1:data:baseData:altitude::varchar as Altitude,
  c1:timestamp::varchar as TimeStamp,
  f.value:"dataValue"::varchar as SignalDataValue,
  f.value:"dataValueValid"::varchar as SignalDataValueValid,
  f.value:"signalID"::varchar as SignalID,
  f.value:"units"::varchar as SignalUnits,
  f.value:"dataValueEnum"::varchar as SignalDataEnum,
  f.value:"signalName"::varchar as SignalName
from
(
select *
FROM 
    PM_POWER), table(flatten(input=>c1:data:baseData:signals, mode=>'ARRAY')) as f
) flt
unpivot (figures for stats in(UnitID,SignalDataValue, SignalDataValueValid, SignalID, SignalUnits, SignalDataEnum, TimeStamp))
) up
pivot (min(up.figures) for up.SignalName in (${col_list}
))

     `;
  var stmt2 = snowflake.createStatement({sqlText: pivot_query});
  stmt2.execute();
  
  return pivot_query;
 
$$;
  
call create_pivot_view();

Any suggestions to fasten this approach or an alternative would be really grateful to improve the performance of the query.

A single row of JSON data looks like the below:

{
  "data": {
    "baseData": {
      "altitude": 0,
      "altitudeValid": "false",
      "customerID": "CN",
      "kind": "Power",
      "latitude": 0,
      "latitudeValid": "false",
      "longitude": 0,
      "longitudeValid": "false",
      "name": "Predictive Maintenance Data.Power",
      "signals": [
        {
          "dataValue": 3112900,
          "dataValueValid": "true",
          "signalID": 3424,
          "signalName": "mainGeneratorPower",
          "units": "kW"
        },
        {
          "dataValue": 1.4035000801086426,
          "dataValueValid": "true",
          "signalID": 380,
          "signalName": "DB_HandlePos",
          "units": "V"
        },
        {
          "dataValue": 2,
          "dataValueEnum": "Reverse",
          "dataValueValid": "true",
          "signalID": 813,
          "signalName": "reverserPos",
          "units": "none"
        },
        {
          "dataValue": 291400,
          "dataValueValid": "true",
          "signalID": 586,
          "signalName": "tractiveEffort",
          "units": "none"
        },
        {
          "dataValue": 8,
          "dataValueEnum": "T 8",
          "dataValueValid": "true",
          "signalID": 899,
          "signalName": "throttlePos",
          "units": "none"
        },
        {
          "dataValue": 47.05950164794922,
          "dataValueValid": "true",
          "signalID": 1805,
          "signalName": "AWTF",
          "units": "°C"
        },
        {
          "dataValue": 0.2971585690975189,
          "dataValueValid": "true",
          "signalID": 4925,
          "signalName": "ChpDty",
          "units": "none"
        },
        {
          "dataValue": 20.14109992980957,
          "dataValueValid": "true",
          "signalID": 4835,
          "signalName": "LDBBLWA",
          "units": "A"
        },
        {
          "dataValue": 36.02000045776367,
          "dataValueValid": "true",
          "signalID": 2669,
          "signalName": "IcGVelM",
          "units": "km/hr"
        },
        {
          "dataValue": 479185.125,
          "dataValueValid": "true",
          "signalID": 1070,
          "signalName": "WPEgILP",
          "units": "PSIG"
        },
        {
          "dataValue": 293026.875,
          "dataValueValid": "true",
          "signalID": 1799,
          "signalName": "WPEgOtP",
          "units": "PSIG"
        },
        {
          "dataValue": 926750,
          "dataValueValid": "true",
          "signalID": 4698,
          "signalName": "MR2 Prs",
          "units": "PSIG"
        },
        {
          "dataValue": 24,
          "dataValueEnum": "ON",
          "dataValueValid": "true",
          "signalID": 664,
          "signalName": "MVCC>",
          "units": "none"
        },
        {
          "dataValue": 907422.625,
          "dataValueValid": "true",
          "signalID": 4804,
          "signalName": "SR Pres",
          "units": "PSIG"
        }
      ],
      "unitID": "CN 8000",
      "ver": "1.0.0"
    },
    "baseType": "PredictiveMaintenanceData"
  },
  "dataName": "CN8000.Prod.PredictiveMaintenanceData",
  "id": "18a89f9e-9620-4453-a546-23412025e7c0",
  "tags": {
    "iaapl.access.level1": "Private",
    "iaapl.access.level2": "OEM",
    "iaapl.internal.deviceID": "",
    "iaapl.internal.deviceName": "",
    "iaapl.internal.encodeTime": "2021-02-25T07:41:19.000Z",
    "iaapl.internal.sender": "Intelligent",
    "iaapl.software.name": "",
    "iaapl.software.partNumber": 0,
    "iaapl.software.version": ""
  },
  "timestamp": "2021-02-25T07:32:31.000Z"
}

The result should look like :

enter image description here

Thanks in Advance

1

There are 1 best solutions below

7
On

the bits where you type

( select * FROM PM_POWER)

could just be

PM_POWER

Your JSON is malformed.

You are reading from data:baseData:signals but there is no signals in you data..

If you want help with your View performance, I would remove the whole Stored Procedure part. And get SELECT that get the result you want from the demo data you provide. Because presently.

Update:

So your example data still does not generate the desired output. So if this is a performance improvement project, it's super frustrating, to not even have the input + code = output

anyways, here's you data and code mixed together:

with pm_power as (
    select parse_json('{
  "data": {
    "baseData": {
      "altitude": 0,
      "altitudeValid": "false",
      "customerID": "CN",
      "kind": "Power",
      "latitude": 0,
      "latitudeValid": "false",
      "longitude": 0,
      "longitudeValid": "false",
      "name": "Predictive Maintenance Data.Power",
      "signals": [
        {
          "dataValue": 3112900,
          "dataValueValid": "true",
          "signalID": 3424,
          "signalName": "mainGeneratorPower",
          "units": "kW"
        },
        {
          "dataValue": 1.4035000801086426,
          "dataValueValid": "true",
          "signalID": 380,
          "signalName": "DB_HandlePos",
          "units": "V"
        },
        {
          "dataValue": 2,
          "dataValueEnum": "Reverse",
          "dataValueValid": "true",
          "signalID": 813,
          "signalName": "reverserPos",
          "units": "none"
        },
        {
          "dataValue": 291400,
          "dataValueValid": "true",
          "signalID": 586,
          "signalName": "tractiveEffort",
          "units": "none"
        },
        {
          "dataValue": 8,
          "dataValueEnum": "T 8",
          "dataValueValid": "true",
          "signalID": 899,
          "signalName": "throttlePos",
          "units": "none"
        },
        {
          "dataValue": 47.05950164794922,
          "dataValueValid": "true",
          "signalID": 1805,
          "signalName": "AWTF",
          "units": "°C"
        },
        {
          "dataValue": 0.2971585690975189,
          "dataValueValid": "true",
          "signalID": 4925,
          "signalName": "ChpDty",
          "units": "none"
        },
        {
          "dataValue": 20.14109992980957,
          "dataValueValid": "true",
          "signalID": 4835,
          "signalName": "LDBBLWA",
          "units": "A"
        },
        {
          "dataValue": 36.02000045776367,
          "dataValueValid": "true",
          "signalID": 2669,
          "signalName": "IcGVelM",
          "units": "km/hr"
        },
        {
          "dataValue": 479185.125,
          "dataValueValid": "true",
          "signalID": 1070,
          "signalName": "WPEgILP",
          "units": "PSIG"
        },
        {
          "dataValue": 293026.875,
          "dataValueValid": "true",
          "signalID": 1799,
          "signalName": "WPEgOtP",
          "units": "PSIG"
        },
        {
          "dataValue": 926750,
          "dataValueValid": "true",
          "signalID": 4698,
          "signalName": "MR2 Prs",
          "units": "PSIG"
        },
        {
          "dataValue": 24,
          "dataValueEnum": "ON",
          "dataValueValid": "true",
          "signalID": 664,
          "signalName": "MVCC>",
          "units": "none"
        },
        {
          "dataValue": 907422.625,
          "dataValueValid": "true",
          "signalID": 4804,
          "signalName": "SR Pres",
          "units": "PSIG"
        }
      ],
      "unitID": "CN 8000",
      "ver": "1.0.0"
    },
    "baseType": "PredictiveMaintenanceData"
  },
  "dataName": "CN8000.Prod.PredictiveMaintenanceData",
  "id": "18a89f9e-9620-4453-a546-23412025e7c0",
  "tags": {
    "iaapl.access.level1": "Private",
    "iaapl.access.level2": "OEM",
    "iaapl.internal.deviceID": "",
    "iaapl.internal.deviceName": "",
    "iaapl.internal.encodeTime": "2021-02-25T07:41:19.000Z",
    "iaapl.internal.sender": "Intelligent",
    "iaapl.software.name": "",
    "iaapl.software.partNumber": 0,
    "iaapl.software.version": ""
  },
  "timestamp": "2021-02-25T07:32:31.000Z"
}')as c1
) 
select  * 
from (
    select 
        figures,
        stats,
        SignalName,
        id,
        Latitude,
        Longitude,
        Altitude
    from (
        select
            c1:id::STRING as id,
            c1:data:baseData:unitID::varchar as UnitID,
            c1:data:baseData:latitude::varchar as Latitude,
            c1:data:baseData:longitude::varchar as Longitude,
            c1:data:baseData:altitude::varchar as Altitude,
            c1:timestamp::varchar as TimeStamp,
            f.value:"dataValue"::varchar as SignalDataValue,
            f.value:"dataValueValid"::varchar as SignalDataValueValid,
            f.value:"signalID"::varchar as SignalID,
            f.value:"units"::varchar as SignalUnits,
            f.value:"dataValueEnum"::varchar as SignalDataEnum,
            f.value:"signalName"::varchar as SignalName
        from PM_POWER
            ,table( flatten(input=>c1:data:baseData:signals, mode=>'ARRAY') ) as f
    ) flt
        unpivot (figures for stats in(UnitID, SignalDataValue, SignalDataValueValid, SignalID, SignalUnits, SignalDataEnum, TimeStamp))
) up
pivot (min(up.figures) for up.SignalName in (
'mainGeneratorPower','DB_HandlePos','reverserPos','tractiveEffort','throttlePos','AWTF','LDBBLWA','IcGVelM','WPEgOtP','MR2 Prs','SR Pres','ChpDty','MVCC>','WPEgILP'
))
STATS ID LATITUDE LONGITUDE ALTITUDE 'mainGeneratorPower' 'DB_HandlePos' 'reverserPos' 'tractiveEffort' 'throttlePos' 'AWTF' 'LDBBLWA' 'IcGVelM' 'WPEgOtP' 'MR2 Prs' 'SR Pres' 'ChpDty' 'MVCC>' 'WPEgILP'
UNITID 18a89f9e-9620-4453-a546-23412025e7c0 0 0 0 CN 8000 CN 8000 CN 8000 CN 8000 CN 8000 CN 8000 CN 8000 CN 8000 CN 8000 CN 8000 CN 8000 CN 8000 CN 8000 CN 8000
SIGNALID 18a89f9e-9620-4453-a546-23412025e7c0 0 0 0 3424 380 813 586 899 1805 4835 2669 1799 4698 4804 4925 664 1070
SIGNALUNITS 18a89f9e-9620-4453-a546-23412025e7c0 0 0 0 kW V none none none °C A km/hr PSIG PSIG PSIG none none PSIG
TIMESTAMP 18a89f9e-9620-4453-a546-23412025e7c0 0 0 0 2021-02-25T07:32:31.000Z 2021-02-25T07:32:31.000Z 2021-02-25T07:32:31.000Z 2021-02-25T07:32:31.000Z 2021-02-25T07:32:31.000Z 2021-02-25T07:32:31.000Z 2021-02-25T07:32:31.000Z 2021-02-25T07:32:31.000Z 2021-02-25T07:32:31.000Z 2021-02-25T07:32:31.000Z 2021-02-25T07:32:31.000Z 2021-02-25T07:32:31.000Z 2021-02-25T07:32:31.000Z 2021-02-25T07:32:31.000Z
SIGNALDATAVALUEVALID 18a89f9e-9620-4453-a546-23412025e7c0 0 0 0 true true true true true true true true true true true true true true
SIGNALDATAENUM 18a89f9e-9620-4453-a546-23412025e7c0 0 0 0 Reverse T 8 ON
SIGNALDATAVALUE 18a89f9e-9620-4453-a546-23412025e7c0 0 0 0 3112900 1.4035000801086426 2 291400 8 47.05950164794922 20.14109992980957 36.02000045776367 293026.875 926750 907422.625 0.2971585690975189 24 479185.125

output as picture

An Answer:

so this SQL gets the answer in the same structure BUT timestamp,unitid are not rows as those are all the same value, so it seem having them as rows is a mistake, and they should be a column..

select
    case lower(o.key )
        when 'datavalue' then 'SignalDataValue'
        when 'datavaluevalid' then 'SignalDataValueValid'
        when 'signalid' then 'SignalID'
        when 'units' then 'SignalUnits'
        when 'datavalueenum' then 'SignalDataEnum'
        when 'signalname' then 'SignalName'
    end as stats,
    c1:id::STRING as id,
    c1:data:baseData:unitID::varchar as UnitID,
    c1:data:baseData:latitude::varchar as Latitude,
    c1:data:baseData:longitude::varchar as Longitude,
    c1:data:baseData:altitude::varchar as Altitude,
    c1:timestamp::varchar as TimeStamp
    ,max(iff(f.value:"signalName"='mainGeneratorPower', o.value, null)) as "'mainGeneratorPower'"
    ,max(iff(f.value:"signalName"='DB_HandlePos', o.value, null)) as "'DB_HandlePos'"
    ,max(iff(f.value:"signalName"='reverserPos', o.value, null)) as "'reverserPos'"
    ,max(iff(f.value:"signalName"='tractiveEffort', o.value, null)) as "'tractiveEffort'"
    ,max(iff(f.value:"signalName"='throttlePos', o.value, null)) as "'throttlePos'"
    ,max(iff(f.value:"signalName"='AWTF', o.value, null)) as "'AWTF'"
    ,max(iff(f.value:"signalName"='LDBBLWA', o.value, null)) as "'LDBBLWA'"
    ,max(iff(f.value:"signalName"='IcGVelM', o.value, null)) as "'IcGVelM'"
    ,max(iff(f.value:"signalName"='WPEgOtP', o.value, null)) as "'WPEgOtP'"
    ,max(iff(f.value:"signalName"='MR2 Prs', o.value, null)) as "'MR2 Prs'"
    ,max(iff(f.value:"signalName"='SR Pres', o.value, null)) as "'SR Pres'"
    ,max(iff(f.value:"signalName"='ChpDty', o.value, null)) as "'ChpDty'"
    ,max(iff(f.value:"signalName"='MVCC>', o.value, null)) as "'MVCC>'"
    ,max(iff(f.value:"signalName"='WPEgILP', o.value, null)) as "'WPEgILP'"
from PM_POWER
    ,table( flatten(input=>c1:data:baseData:signals, mode=>'ARRAY') ) as f
    ,table( flatten(input=>f.value, mode=>'OBJECT')) as o
WHERE o.key != 'signalName'
GROUP BY 1,2,3,4,5,6,7
ORDER BY 1;

gives:

which means your Store Procedure needs to build that.. which is you are happy that this is more performant, you can solve how to build that above in your SP.

enter image description here