How to remove duplicates using time difference with linq

103 Views Asked by At

I have an IEnumerable of an item class defined like this:

/// <summary>
/// One checkup entry: a subject/doctor pair plus the moment it was recorded.
/// </summary>
class Checkup
{
    /// <summary>Identifier of the subject; part of the key used to detect duplicates.</summary>
    public Guid SubjectGuid { get; set; }
    /// <summary>Identifier of the doctor; part of the key used to detect duplicates.</summary>
    public Guid DoctorGuid { get; set; }
    /// <summary>Timestamp of the checkup; the value the 20-minute rule is applied to.</summary>
    public DateTime Date {get; set;}
}

For example I have the following data set:

SubjectGuid                          DoctorGuid                           Date
2b9dd19f-9ce4-4a0a-832c-e941f5dc0234 6cfbdc40-d30d-4b49-9f02-ca116bcffee0 03.03.2024 11:04:46.644
2b9dd19f-9ce4-4a0a-832c-e941f5dc0234 6cfbdc40-d30d-4b49-9f02-ca116bcffee0 03.03.2024 11:12:27.369
2b9dd19f-9ce4-4a0a-832c-e941f5dc0234 6cfbdc40-d30d-4b49-9f02-ca116bcffee0 03.03.2024 11:30:36.564
bf4e7572-2328-4b19-9a9e-a67aa2f24fba b8dedde9-a397-4469-9c1b-22af4194f35f 03.03.2024 16:24:46.935
bf4e7572-2328-4b19-9a9e-a67aa2f24fba b8dedde9-a397-4469-9c1b-22af4194f35f 03.03.2024 16:39:27.853
bf4e7572-2328-4b19-9a9e-a67aa2f24fba b8dedde9-a397-4469-9c1b-22af4194f35f 03.03.2024 16:59:27.853
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:00:00.000
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:21:00.203
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:50:33.906

And I want to remove duplicates if the difference between the date is less than or equal to twenty minutes, while the SubjectGuid and DoctorGuid are the same. If there are more than two duplicates, then the element with the most recent date within 20 minutes should remain:

SubjectGuid                          DoctorGuid                           Date
2b9dd19f-9ce4-4a0a-832c-e941f5dc0234 6cfbdc40-d30d-4b49-9f02-ca116bcffee0 03.03.2024 11:04:46.644
2b9dd19f-9ce4-4a0a-832c-e941f5dc0234 6cfbdc40-d30d-4b49-9f02-ca116bcffee0 03.03.2024 11:30:36.564
bf4e7572-2328-4b19-9a9e-a67aa2f24fba b8dedde9-a397-4469-9c1b-22af4194f35f 03.03.2024 16:24:46.935
bf4e7572-2328-4b19-9a9e-a67aa2f24fba b8dedde9-a397-4469-9c1b-22af4194f35f 03.03.2024 16:59:27.853
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:00:00.000
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:21:00.203
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:50:33.906

Here is the code (LINQPad) with which I tried to get the desired result, but it does not work as I would like:

void Main()
{
    // Builds one sample checkup from its textual GUIDs and timestamp.
    Checkup Create(string subject, string doctor, string date) => new Checkup
    {
        SubjectGuid = Guid.Parse(subject),
        DoctorGuid = Guid.Parse(doctor),
        Date = DateTime.Parse(date)
    };

    const string subjectA = "2b9dd19f-9ce4-4a0a-832c-e941f5dc0234";
    const string doctorA = "6cfbdc40-d30d-4b49-9f02-ca116bcffee0";
    const string subjectB = "bf4e7572-2328-4b19-9a9e-a67aa2f24fba";
    const string doctorB = "b8dedde9-a397-4469-9c1b-22af4194f35f";
    const string subjectC = "efea799a-c755-4f4f-ad80-8ac63fd35d07";
    const string doctorC = "b8e88718-55f0-4700-9900-09e8d440345b";

    var checkups = new List<Checkup>
    {
        Create(subjectA, doctorA, "03.03.2024 11:04:46.644"),
        Create(subjectA, doctorA, "03.03.2024 11:12:27.369"),
        Create(subjectA, doctorA, "03.03.2024 11:30:36.564"),
        Create(subjectB, doctorB, "03.03.2024 16:24:46.935"),
        Create(subjectB, doctorB, "03.03.2024 16:39:27.853"),
        Create(subjectB, doctorB, "03.03.2024 16:59:27.853"),
        Create(subjectC, doctorC, "03.03.2024 19:00:00.000"),
        Create(subjectC, doctorC, "03.03.2024 19:21:00.203"),
        Create(subjectC, doctorC, "03.03.2024 19:50:33.906"),
    };
    checkups.Dump();

    // Length of one bucket, in ticks. Buckets are fixed slots counted from tick 0,
    // so two checkups less than 20 minutes apart can still fall into different
    // buckets when they straddle a slot boundary.
    long slotTicks = TimeSpan.FromMinutes(20).Ticks;

    // Group by subject, doctor and slot index, then keep the latest checkup of each group.
    var checkupsNoDuplicates =
        from c in checkups
        group c by new { c.SubjectGuid, c.DoctorGuid, Time = c.Date.Ticks / slotTicks } into slot
        select slot.OrderByDescending(x => x.Date).FirstOrDefault();
    checkupsNoDuplicates.Dump();
}

/// <summary>
/// One checkup entry: a subject/doctor pair plus the moment it was recorded.
/// </summary>
class Checkup
{
    /// <summary>Identifier of the subject; part of the grouping key.</summary>
    public Guid SubjectGuid { get; set; }
    /// <summary>Identifier of the doctor; part of the grouping key.</summary>
    public Guid DoctorGuid { get; set; }
    /// <summary>Timestamp of the checkup; used both for bucketing and for ordering.</summary>
    public DateTime Date { get; set; }
}

This code produces this result:

SubjectGuid                          DoctorGuid                           Date
2b9dd19f-9ce4-4a0a-832c-e941f5dc0234 6cfbdc40-d30d-4b49-9f02-ca116bcffee0 03.03.2024 11:12:27
2b9dd19f-9ce4-4a0a-832c-e941f5dc0234 6cfbdc40-d30d-4b49-9f02-ca116bcffee0 03.03.2024 11:30:36
bf4e7572-2328-4b19-9a9e-a67aa2f24fba b8dedde9-a397-4469-9c1b-22af4194f35f 03.03.2024 16:39:27
bf4e7572-2328-4b19-9a9e-a67aa2f24fba b8dedde9-a397-4469-9c1b-22af4194f35f 03.03.2024 16:59:27
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:00:00
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:21:00
efea799a-c755-4f4f-ad80-8ac63fd35d07 b8e88718-55f0-4700-9900-09e8d440345b 03.03.2024 19:50:33

Also, dividing ticks will not always work. For example, the ticks division in this code:

void Main()
{
    // Length of one fixed slot, in ticks.
    long slotTicks = TimeSpan.FromMinutes(20).Ticks;

    // Two timestamps roughly 14 minutes apart; print each one followed by its slot index
    // to show that they do not necessarily share a slot.
    foreach (string text in new[] { "03.01.2024 01:27:44.907", "03.01.2024 01:41:55.088" })
    {
        DateTime parsed = DateTime.Parse(text);
        Console.WriteLine(parsed.ToString("MM.dd.yyyy hh:mm:ss.fff"));
        Console.WriteLine(parsed.Ticks / slotTicks);
    }
}

Produces result:

01.03.2024 01:27:44.907
53199868
01.03.2024 01:41:55.088
53199869
3

There are 3 best solutions below

1
Mario Vernari On BEST ANSWER

You can leverage the Aggregate, which performs a reduction.

Here we go:

// Walk the checkups oldest-first and fold them into the list of entries to keep.
List<Checkup> result = checkups
    .OrderBy(c => c.Date)
    .Aggregate<Checkup, List<Checkup>>(
        [],
        (kept, current) =>
        {
            // Most recently kept entry for the same subject/doctor pair, if any.
            Checkup? previous = kept.LastOrDefault(
                k => k.SubjectGuid == current.SubjectGuid && k.DoctorGuid == current.DoctorGuid);

            // A duplicate is an entry at most 20 minutes after the last kept one of its pair.
            bool isDuplicate = previous != null
                && current.Date - previous.Date <= TimeSpan.FromMinutes(20);

            if (!isDuplicate)
            {
                kept.Add(current);
            }

            return kept;
        });

First there is an ordering step (ascending), which is optional if the input data is guaranteed to be already sorted by date.

Then, there is the aggregation block.

The Aggregate function takes as first argument the initial value of the expected result, which is an empty List<Checkup> in our case.

The core function is its second argument, which operates on the "accumulator" and the current item. During the iteration, the accumulator represents the result of the current step. The function then computes the post-step result, which becomes the accumulator for the next iteration.

When the iteration is over, the actual result is what the accumulator contains.

What does the core function do in our case?

  1. It finds the last entry of the same (SubjectGuid, DoctorGuid) pair.

  2. If there is none, it adds the current item to the accumulator.

  3. If there is one, it checks whether enough time (more than 20 minutes) has elapsed, and adds the item if so.

1
Nitesh On

In your current approach, the issue is because of dividing ticks by TimeSpan.FromMinutes(20).

Ticks might not give the desired result for grouping dates in 20-minute intervals.

Instead, you can use a custom grouping logic (helper method) to achieve the desired result of removing duplicates within a 20-minute window.

Something like this:

void Main()
{
    // Builds one sample checkup from its textual GUIDs and timestamp.
    Checkup Create(string subject, string doctor, string date) => new Checkup
    {
        SubjectGuid = Guid.Parse(subject),
        DoctorGuid = Guid.Parse(doctor),
        Date = DateTime.Parse(date)
    };

    const string subjectA = "2b9dd19f-9ce4-4a0a-832c-e941f5dc0234";
    const string doctorA = "6cfbdc40-d30d-4b49-9f02-ca116bcffee0";
    const string subjectB = "bf4e7572-2328-4b19-9a9e-a67aa2f24fba";
    const string doctorB = "b8dedde9-a397-4469-9c1b-22af4194f35f";
    const string subjectC = "efea799a-c755-4f4f-ad80-8ac63fd35d07";
    const string doctorC = "b8e88718-55f0-4700-9900-09e8d440345b";

    var checkups = new List<Checkup>
    {
        Create(subjectA, doctorA, "03.03.2024 11:04:46.644"),
        Create(subjectA, doctorA, "03.03.2024 11:12:27.369"),
        Create(subjectA, doctorA, "03.03.2024 11:30:36.564"),
        Create(subjectB, doctorB, "03.03.2024 16:24:46.935"),
        Create(subjectB, doctorB, "03.03.2024 16:39:27.853"),
        Create(subjectB, doctorB, "03.03.2024 16:59:27.853"),
        Create(subjectC, doctorC, "03.03.2024 19:00:00.000"),
        Create(subjectC, doctorC, "03.03.2024 19:21:00.203"),
        Create(subjectC, doctorC, "03.03.2024 19:50:33.906"),
    };

    // Bucket the checkups by subject, doctor and time slot.
    var buckets = checkups.GroupBy(c => new
    {
        c.SubjectGuid,
        c.DoctorGuid,
        TimeSlot = GetTimeSlot(c.Date)
    });

    // From every bucket keep only the checkup with the latest date.
    var checkupsNoDuplicates = buckets
        .Select(bucket => bucket.OrderByDescending(entry => entry.Date).First());

    foreach (var item in checkupsNoDuplicates)
    {
        Console.WriteLine($"{item.SubjectGuid} - {item.DoctorGuid} - {item.Date}");
    }
}

// Maps a date to the index of the fixed 20-minute slot that contains it.
// Slots are counted from tick 0, so two dates less than 20 minutes apart
// can still receive different indexes when they straddle a slot boundary.
private long GetTimeSlot(DateTime date)
{
    TimeSpan slotLength = TimeSpan.FromMinutes(20);

    // Integer division drops the remainder, leaving the slot index.
    return date.Ticks / slotLength.Ticks;
}
2
John Wu On

If I understand the requirement, you need the last item (in date/time order) of any grouping of items with the same GUID within a 20-minute span.

To put this a different way, you need the set of all items where there does not exist any item within the next 20 minutes with the same GUIDs.

You can achieve this with !Any.

    // Keep x only when no other item with the same SubjectGuid/DoctorGuid has a
    // date inside the window (x.Date, x.Date + 20 minutes].
    // The lower bound is required: without it every earlier item also satisfies
    // the predicate, so any item that has a sibling would be filtered out.
    // Items of one pair that share exactly the same date do not suppress each other.
    var filteredData = data.Where
    (
        x => !data.Any
        (
            y => y != x
              && y.SubjectGuid == x.SubjectGuid
              && y.DoctorGuid == x.DoctorGuid
              && y.Date > x.Date
              && y.Date <= x.Date.AddMinutes(20)
        )
    );