Get the diff of two Polars DataFrames in Rust

126 Views Asked by At
use polars::prelude::*;

/// Builds a small example frame, prints it, then prints only the rows
/// whose `nrs` value is strictly less than 4.
fn main() -> PolarsResult<()> {
    let original = df!(
        "nrs" => &[Some(1), Some(2), Some(3), Some(4), Some(5)],
        "names" => &[Some("foo"), Some("ham"), Some("spam"), Some("eggs"), None],
        "groups" => &["A", "A", "B", "C", "B"],
    )?;
    println!("{:?}", original);

    // Predicate applied by the filter: keep rows where `nrs < 4`.
    let below_four = col("nrs").lt(lit(4));

    // `lazy()` consumes the eager frame; `collect()` materialises the result.
    let filtered = original.lazy().filter(below_four).collect()?;
    println!("{:?}", filtered);

    Ok(())
}
shape: (5, 3)
┌─────┬───────┬────────┐
│ nrs ┆ names ┆ groups │
│ --- ┆ ---   ┆ ---    │
│ i32 ┆ str   ┆ str    │
╞═════╪═══════╪════════╡
│ 1   ┆ foo   ┆ A      │
│ 2   ┆ ham   ┆ A      │
│ 3   ┆ spam  ┆ B      │
│ 4   ┆ eggs  ┆ C      │
│ 5   ┆ null  ┆ B      │
└─────┴───────┴────────┘
shape: (3, 3)
┌─────┬───────┬────────┐
│ nrs ┆ names ┆ groups │
│ --- ┆ ---   ┆ ---    │
│ i32 ┆ str   ┆ str    │
╞═════╪═══════╪════════╡
│ 1   ┆ foo   ┆ A      │
│ 2   ┆ ham   ┆ A      │
│ 3   ┆ spam  ┆ B      │
└─────┴───────┴────────┘

I have one dataframe defined. I then apply some conditions to filter out rows, which yields a new dataframe. How can I get the diff between the two? In this case, the diff should be a dataframe containing the 2 rows removed by the filter.

PS: Instead of reversing the condition, a more general way is preferred.

1

There is 1 best solution below

0
On BEST ANSWER

You can use an anti-join (enabled with the feature semi_anti_join) to achieve this:

use polars::prelude::*;

/// Demonstrates recovering the rows a filter removed by anti-joining the
/// original frame against the filtered frame on every column.
fn main() -> PolarsResult<()> {
    let frame = df!(
        "nrs" => &[Some(1), Some(2), Some(3), Some(4), Some(5)],
        "names" => &[Some("foo"), Some("ham"), Some("spam"), Some("eggs"), None],
        "groups" => &["A", "A", "B", "C", "B"],
    )?;

    println!("{:?}", frame);

    // shape: (5, 3)
    // ┌─────┬───────┬────────┐
    // │ nrs ┆ names ┆ groups │
    // │ --- ┆ ---   ┆ ---    │
    // │ i32 ┆ str   ┆ str    │
    // ╞═════╪═══════╪════════╡
    // │ 1   ┆ foo   ┆ A      │
    // │ 2   ┆ ham   ┆ A      │
    // │ 3   ┆ spam  ┆ B      │
    // │ 4   ┆ eggs  ┆ C      │
    // │ 5   ┆ null  ┆ B      │
    // └─────┴───────┴────────┘

    // Switch to the lazy API once; both the filter and the join start from here.
    let source = frame.lazy();

    // Rows that survive the filter (`nrs < 4`). Kept lazy so it can feed the join.
    let kept = source.clone().filter(col("nrs").lt(lit(4)));

    println!("{:?}", kept.clone().collect()?);

    // shape: (3, 3)
    // ┌─────┬───────┬────────┐
    // │ nrs ┆ names ┆ groups │
    // │ --- ┆ ---   ┆ ---    │
    // │ i32 ┆ str   ┆ str    │
    // ╞═════╪═══════╪════════╡
    // │ 1   ┆ foo   ┆ A      │
    // │ 2   ┆ ham   ┆ A      │
    // │ 3   ┆ spam  ┆ B      │
    // └─────┴───────┴────────┘

    // Every column takes part in the join key, on both sides.
    let key_columns = [col("nrs"), col("names"), col("groups")];

    // Anti-join: the rows of `source` that have no matching row in `kept`.
    // NOTE(review): row 5 holds a null in `names`; whether null keys match
    // each other is governed by the join's null-handling setting — confirm
    // before relying on this when the filtered frame may itself contain nulls.
    let removed = source.join(
        kept,
        &key_columns,
        &key_columns,
        JoinArgs::new(JoinType::Anti),
    );

    println!("{:?}", removed.collect()?);

    // shape: (2, 3)
    // ┌─────┬───────┬────────┐
    // │ nrs ┆ names ┆ groups │
    // │ --- ┆ ---   ┆ ---    │
    // │ i32 ┆ str   ┆ str    │
    // ╞═════╪═══════╪════════╡
    // │ 4   ┆ eggs  ┆ C      │
    // │ 5   ┆ null  ┆ B      │
    // └─────┴───────┴────────┘
    Ok(())
}