Dynamic Length macro

117 Views Asked by At

I have a project in which I often merge datasets by using data and set i.e.

data want;
set have1 have2;
run;

The problem is, these datasets are often the same but the observations have different lengths and truncation of the data occurs. So I am setting out in writing a macro which allows dynamic length altercation based on the input dataset with the longest length for character variables. So far I've built this code, very simply

%Macro Formatting;

proc contents data = engdata.assets2 out = Assets1 noprint;
run;

data Assets2;
set Assets1;

keep NAME LENGTH;
if FORMAT = "$";
run;

proc contents data = historic.assets2016 out = HAssets1 noprint;
run;

data HAssets2;
set HAssets1;

keep NAME LENGTH;
if FORMAT = "$";
run;

proc contents data = engdata.Liabilities2 out = Liabilities1 noprint;
run;

data Liabilities2;
set Liabilities1;

keep NAME LENGTH;
if FORMAT = "$";

run;

proc contents data = historic.Liabilities2016 out = HLiabilities1 noprint;
run;

data HLiabilities2;
set HLiabilities1;

keep NAME LENGTH;
if FORMAT = "$";

run;

proc contents data = engdata.bonds2 out = bonds1 noprint;
run;

data bonds2;
set bonds1;

keep NAME LENGTH;
if FORMAT = "$";

run;

proc contents data = engdata.Irswaps2 out = Irswaps1 noprint;
run;

data Irswaps2;
set Irswaps1;

keep NAME LENGTH;
if FORMAT = "$";

run;

proc contents data = historic.Money_Market_2016 out = MoneyMarket1 noprint;
run;

data MoneyMarket2;
set MoneyMarket1;

keep NAME LENGTH;
if FORMAT = "$";

run;

proc sql;
    create table AllLength as
    select a.*
          ,a.Length as Length1
          ,b.Length as Length2
          ,c.Length as Length3
          ,d.Length as Length4
          ,e.Length as Length5
          ,f.Length as Length6
          ,g.Length as Length7
    from Liabilities2 as a
    left join Assets2 as b
    on a.Name = b.Name
    left join Bonds2 as c
    on a.Name = c.Name
    left join Irswaps2 as d
    on a.Name = d.Name
    left join HLiabilities2 as e
    on a.Name = e.Name
    left join HAssets2 as f
    on a.Name = f.Name
    left join MoneyMarket2 as g
    on a.Name = g.Name
    order by Name;
quit;

data AllLength2;
set AllLength;

array LengthVar Length1-Length7;
largest = max(of LengthVar[*]);
index    = whichn(largest, of LengthVar[*]);
Varname = vname(LengthVar[index]);

keep name largest;

run;

proc sql noprint;
select name into: Var1 separated by " " from AllLength2;
select largest into: Var2 separated by " " from AllLength2;
quit;

%put &var1;
%put &var2;

%let index = 1;

%do %until (%Scan(&Var1,&index," ")=);

%let Varr1 = %Scan(&Var1,&index," ");
%let Varr2 = %Scan(&Var2,&index," ");

data engdata.liabilities2;
length &Varr1 $&Varr2..;
set engdata.liabilities2;

format &Varr1 $&Varr2..;
informat &Varr1 $&Varr2..;

run;

data engdata.assets2;
length &Varr1 $&Varr2..;
set engdata.assets2;

format &Varr1 $&Varr2..;
informat &Varr1 $&Varr2..;

run;

data engdata.bonds2;
length &Varr1 $&Varr2..;
set engdata.bonds2;

format &Varr1 $&Varr2..;
informat &Varr1 $&Varr2..;

run;

data engdata.irswaps2;
length &Varr1 $&Varr2..;
set engdata.irswaps2;

format &Varr1 $&Varr2..;
informat &Varr1 $&Varr2..;

run;

data historic.liabilities2016;
length &Varr1 $&Varr2..;
set historic.liabilities2016;

format &Varr1 $&Varr2..;
informat &Varr1 $&Varr2..;

run;

data historic.assets2016;
length &Varr1 $&Varr2..;
set historic.assets2016;

format &Varr1 $&Varr2..;
informat &Varr1 $&Varr2..;

run;

data Historic.moneymarket2016;
length &Varr1 $&Varr2..;
set Historic.moneymarket2016;

format &Varr1 $&Varr2..;
informat &Varr1 $&Varr2..;

run;

%let index = %eval(&Index + 1);

%end;

%mend;

%Formatting;

At times, there variables I am looking format are not present in some datasets and I get the following in my log

NOTE: There were 0 observations read from the data set HISTORIC.MONEYMARKET2016.
NOTE: The data set HISTORIC.MONEYMARKET2016 has 0 observations and 11 variables.
NOTE: DATA statement used (Total process time):
  real time           0.02 seconds
  cpu time            0.01 seconds

When I look at the dataset, everything is there? Do I lose any work, is there a way to use this loop and when the variable is not present, to just skip it?

1

There are 1 best solutions below

0
On BEST ANSWER

Instead of manipulating all the data sets individually, which can cause a lot of disk activity, consider writing code that constructs an attrib statement that is a homogenization of the variable attributes. The constructed statement would be placed prior to the data stacking SET statement, thus forcing the pdv to use the homogenized attributes, which means all the incoming data would fit the length attributes of pdv and no warnings would occur.

For example, consider three data sets

data one;
  s='aaaa';
  y=4;
  length y 4;
run;

data two;
  length s $50;
  t = 'for 2';
  y = 1.75;
run;

data three;
  length s $20;
  z = -1;
run;

Which are to be stacked in a homogeneous manner

%big_stack_attack (datasets=
  one
  two
  three,
  out=next_big_thing
)

The stacking macro is a simple wrapper with one extra twist, obtaining attrib statements that homogenize the data set variables.

%macro big_stack_attack(datasets=, out=);

  %local attr_code;

  %* obtain the attrib statements that homogenize the data;
  %homogenize (datasets=&datasets, result=attr_code);

  * stack the data, using the attrib statements first to predefine the PDV ;
  * into which the SET statement will place values;

  data &out;
    &attr_code;
    set &datasets;
  run;

%mend big_stack_attack;

The macro for constructing the attrib statements examines the contents of the datasets and uses the longest length for constructed attrib statements

%macro homogenize (datasets=, result=);

  %* construct attribute statements as the result value
   * The statements use the longest length when a variable+type appears
   * in more than one dataset
   * No checks are done for like named variables of differing types;

  %* extract each data set ;

  %local i N;
  %let i = 1;
  %do %while (%length(%scan(&datasets,&i)));
    %local data&i;
    %let data&i = %scan(&datasets,&i);
    %let i = %eval(&i + 1);
  %end;
  %let N = %eval (&i - 1);

  %* get contents of each data set;

  %do i = 1 %to &N;
    proc contents noprint data=&&data&i out=_contents&i;
    run;
  %end;

  %* construct and concatenate an attrib statement for each variable+type;

  proc sql noprint;
    select "attrib " 
      || trim(name) || " length="
      || case when type=2 then "$" else " " end
      || cats(max(length))
      || case 
          when type=2 then " format=$" || cats(max(length)) || "."
          else " " 
         end
    into
      :&result       %* NOTE: result parameter is name of macro-var in containing scope;
    separated by 
      ';'
    from 
(
  %do i = 1 %to &N;
    %if &i > 1 %then UNION;
    select * from _contents&i
  %end;
)
    group by name, type
    ;
  quit;

%mend homogenize;

The case of like-name variables of different types would need additional coding and requirements (should the character variable be try-coerced to a numeric type value, or should the numeric variable to coerced to a character type value)