Identifying Sequences of Rows that Meet a Condition

101 Views Asked by At

I have this table in SQL (sample_table):

CREATE TABLE student_table (
    student INT,
    var CHAR(1),
    d1 DATE,
    d2 DATE
);


INSERT INTO student_table (student, var, d1, d2) VALUES
    (1, 'd', '2008-09-27', '2008-10-02'),
    (1, 'd', '2008-11-14', '2008-11-21'),
    (2, 'd', '2007-01-15', '2007-01-20'),
    (2, 'd', '2010-03-04', '2010-03-10'),
    (3, 'a', '2011-03-24', '2011-04-02'),
    (3, 'a', '2015-01-12', '2015-01-14'),
    (4, 'e', '2009-07-18', '2009-07-23'),
    (4, 'a', '2020-06-19', '2020-06-27'),
    (5, 'c', '2009-11-26', '2009-11-28'),
    (5, 'a', '2015-12-24', '2015-12-25'),
    (6, 'c', '2009-01-09', '2009-01-18'),
    (6, 'a', '2018-11-21', '2018-11-30'),
    (7, 'b', '2003-07-08', '2003-07-14'),
    (7, 'a', '2006-06-01', '2006-06-06'),
    (7, 'a', '2010-02-26', '2010-03-07'),
    (8, 'b', '2004-11-11', '2004-11-21'),
    (8, 'a', '2014-03-26', '2014-03-30'),
    (8, 'a', '2021-05-06', '2021-05-12'),
    (8, 'c', '2023-04-28', '2023-05-06');


 student var         d1         d2
       1   d 2008-09-27 2008-10-02
       1   d 2008-11-14 2008-11-21
       2   d 2007-01-15 2007-01-20
       2   d 2010-03-04 2010-03-10
       3   a 2011-03-24 2011-04-02
       3   a 2015-01-12 2015-01-14
       4   e 2009-07-18 2009-07-23
       4   a 2020-06-19 2020-06-27
       5   c 2009-11-26 2009-11-28
       5   a 2015-12-24 2015-12-25
       6   c 2009-01-09 2009-01-18
       6   a 2018-11-21 2018-11-30
       7   b 2003-07-08 2003-07-14
       7   a 2006-06-01 2006-06-06
       7   a 2010-02-26 2010-03-07
       8   b 2004-11-11 2004-11-21
       8   a 2014-03-26 2014-03-30
       8   a 2021-05-06 2021-05-12
       8   c 2023-04-28 2023-05-06

I am trying to use a CASE WHEN statement to identify 4 different groups of students:

  • students that never had var=a
  • students that only have var=a
  • students that had var=a at some point but as per their latest row they dont have var=a
  • students that had var=a, then had var !=a and then went back to having var=a at least once (e.g. of conditions that match this - student_i: a,b,a,,c .... student_j: a,b,a ... student_k: a,b,c,a )

I having difficulty figuring out how to use CASE WHEN statements to make sure no student is double counted in two groups. I tried to write the CASE WHEN statements different ways and settled on the following:

WITH student_var_sequence AS (
    SELECT 
        student,
        var,
        ROW_NUMBER() OVER (PARTITION BY student ORDER BY d1, d2) AS row_num,
        COUNT(*) OVER (PARTITION BY student) AS total_rows
    FROM sample_table
),
student_var_groups AS (
    SELECT 
        student,
        MAX(CASE WHEN var = 'a' THEN row_num ELSE 0 END) AS last_a_row,
        MAX(row_num) AS last_row
    FROM student_var_sequence
    GROUP BY student
),
student_var_cases AS (
    SELECT 
        svs.student,
        svs.var,
        CASE
            WHEN svg.last_a_row = 0 THEN 'Never had a'
            WHEN svg.last_a_row = svg.last_row THEN 'Only have a'
            WHEN svg.last_a_row < svg.last_row THEN 'Had a but not now'
            WHEN svg.last_a_row < MAX(svs.row_num) OVER (PARTITION BY svs.student) THEN 'Had a, not a, then a again'
            ELSE 'Other'
        END AS new_var
    FROM student_var_sequence svs
    JOIN student_var_groups svg ON svs.student = svg.student
)
SELECT * FROM student_var_cases;

The results look like this:

 student var           new_var
       1   d       Never had a
       1   d       Never had a
       2   d       Never had a
       2   d       Never had a
       3   a       Only have a
       3   a       Only have a
       4   a       Only have a
       4   e       Only have a
       5   a       Only have a
       5   c       Only have a
       6   a       Only have a
       6   c       Only have a
       7   a       Only have a
       7   a       Only have a
       7   b       Only have a
       8   a Had a but not now
       8   a Had a but not now
       8   b Had a but not now
       8   c Had a but not now

I can see mistakes here - e.g. student_5 is said to only has "a", but I can see that this is not true.

Is there a way I can simplify my SQL code to fix these errors?

Attempt 2:

WITH 
student_sequence AS (
    SELECT 
        student_id,
        var,
        ROW_NUMBER() OVER (PARTITION BY student_id ORDER BY d1, d2) AS row_num
    FROM student_table
),

-- Students that never had var=a
never_a AS (
    SELECT student_id
    FROM student_sequence
    GROUP BY student_id
    HAVING COUNT(CASE WHEN var = 'a' THEN 1 END) = 0
),

-- Students that only have var=a
only_a AS (
    SELECT student_id
    FROM student_sequence
    GROUP BY student_id
    HAVING COUNT(CASE WHEN var != 'a' THEN 1 END) = 0
),

-- Students that had var=a at some point but as per their latest row they dont have var=a
had_a_not_now AS (
    SELECT student_id
    FROM student_sequence
    WHERE student_id NOT IN (SELECT student_id FROM never_a)
    AND student_id NOT IN (SELECT student_id FROM only_a)
    GROUP BY student_id
    HAVING MAX(CASE WHEN var = 'a' THEN row_num END) < MAX(row_num)
),

-- Students that had var=a, then had var !=a and then went back to having var=a at least once
a_not_a_a AS (
    SELECT student_id
    FROM student_sequence
    WHERE student_id NOT IN (SELECT student_id FROM never_a)
    AND student_id NOT IN (SELECT student_id FROM only_a)
    AND student_id NOT IN (SELECT student_id FROM had_a_not_now)
)

-- Combine all groups
SELECT 
    student_id,
    CASE 
        WHEN student_id IN (SELECT student_id FROM never_a) THEN 'Never had a'
        WHEN student_id IN (SELECT student_id FROM only_a) THEN 'Only have a'
        WHEN student_id IN (SELECT student_id FROM had_a_not_now) THEN 'Had a but not now'
        WHEN student_id IN (SELECT student_id FROM a_not_a_a) THEN 'Had a, not a, then a again'
    END AS student_group
FROM student_sequence
GROUP BY student_id;
0

There are 0 best solutions below