How do I create columns that add a tag if it matches with a keyword from a different dataset?

39 Views Asked by At

I have 2 datasets: 1 with a list of general goods and another with a list of transactions:

Keyword Goods
Soap A Soap
Soap B Soap
Shampoo Shampoo
ID Date Txn
1 1/22 Soap A 100 ml
1 1/23 Soap A 50 ml
2 1/24 Soap B 100 ml
2 1/24 Shampoo 50 ml
3 1/24 Juice 100 g

I want to create something like below that would check if the txn column matches the keyword column and create a tagging per goods for each ID (e.g. if an ID had any txns with the keyword 'Soap A' or 'Soap B', the column Soap would be tagged as 1 and 0 if not)

ID Soap Shampoo
1 1 0
2 1 1
3 0 0

I know I can make this happen by coding manually using if then statements but I have a long list of goods so I want to see if there's a more efficient way to do this through do loops.

2

There are 2 best solutions below

0
Reeza On BEST ANSWER

Here's one way to do this:


*create fake data;
data lookup;
infile cards dlm=',';
input Keyword $ Goods $;
cards;
Soap A,  Soap
Soap B,  Soap
Shampoo, Shampoo
;
run;

data rawData;
infile cards dlm=',' truncover;
input ID  $ _Date $    Txn  $25.;
informat id $8. _date $8. txn $25.;
date = mdy(input(scan(_date, 1, "/"), 8.), 1, input(scan(_date, 2, "/"), 8.));
format date yymmn6.;
drop _date;
cards;
1,   1/22,    Soap A 100 ml
1,   1/23,    Soap A 50 ml
2,   1/24,    Soap B 100 ml
2,   1/24,    Shampoo 50 ml
3,   1/24,    Juice 100 g
;
run;

*get the dimensions of the number of lookup terms;
proc sql noprint;
select count(*) into :num_search_terms from lookup;
quit;

%put &num_search_terms;

*lookup values;
data search;
array _lookup(&num_search_terms., 2) $ _temporary_;

/*2*/
*load array into memory;
   if _n_ = 1 then do j=1 to &num_search_terms.;
   set lookup;
   _lookup(j,1) = keyword;
   _lookup(j,2) = Goods;
   end;
 
 set rawData;
 do i=1 to &num_search_terms.;
    if find(txn, _lookup(i, 1), 'it')>0 then term=_lookup(i, 2);
 end;
 
 if not missing(term) then Value=1;
 
 drop i j keyword;
run;

*summarize to one record per id, term;
proc sql;
create table data2transpose as
select id, term, max(value) as Value
from search
group by id, term
order by 1;
quit;

*flip to desired format;
proc transpose data=data2transpose out=want;
by id;
var value;
id term;
run; 
0
d r On

Oracle SQL
You could use conditional aggregation or pivoting to get expected result on db side using just sql...

WITH    --  S a m p l e    D a t a :
    goods (KEYWORD, GOODS) as
        ( Select 'Soap A',  'Soap'    From Dual Union All
          Select 'Soap B',  'Soap'    From Dual Union All
          Select 'Shampoo', 'Shampoo' From Dual 
        ),
    txns (ID, A_DATE, TXN) AS
        ( Select 1, '1/22', 'Soap A 100 ml' From Dual Union All
          Select 1, '1/23', 'Soap A 50 ml'  From Dual Union All
          Select 2, '1/24', 'Soap B 100 ml' From Dual Union All
          Select 2, '1/24', 'Shampoo 50 ml' From Dual Union All
          Select 3, '1/24', 'Juice 100 g'   From Dual
        )
--  SQL - conditional aggregation
Select    t.ID, 
          Max( Case When Nvl(g.GOODS, '0') = 'Soap' Then '1' Else '0' End ) "SOAP",
          Max( Case When Nvl(g.GOODS, '0') = 'Shampoo' Then '1' Else '0' End ) "SHAMPOO"
From      txns t
Left Join goods g ON( g.KEYWORD = SubStr(t.TXN, InStr(t.TXN, g.KEYWORD), Length(g.KEYWORD)) )
Group By  t.ID
Order By  t.ID
/*    R e s u l t :
ID       SOAP     SHAMPOO
1           1           0
2           1           1
3           0           0   */

Using PIVOT results the same as above:

--    SQL - PIVOT
Select  ID, LEAST(SOAP, 1) "SOAP", LEAST(SHAMPOO, 1) "SHAMPOO" 
From    ( Select    Distinct t.ID, g.GOODS
          From      txns t
          Left Join goods g ON( g.KEYWORD = SubStr(t.TXN, InStr(t.TXN, g.KEYWORD), Length(g.KEYWORD)) )
        )
PIVOT ( Count(*) For GOODS IN('Soap' "SOAP", 'Shampoo' "SHAMPOO") )