Aggregation-Percentile Clickhouse

214 Views Asked by At

I would like to store the data of percentile 99 in every 5 minutes on ClickHouse. But, in case that I want to calculate the p99 for 10 minutes. the average p99 of 2 interval may be not accurate.

I understand that there are method 'T-digest' function that we can merge the t-digest for each interval to estimate the whole percentile. but, I still confuse how can I store the result of T-digest in ClickHouse. Or there are other suitable way to do.

Can anyone suggest me for this scenario. Appreciated for all response.

Thank you is much.

Best Regards,

1

There are 1 best solutions below

0
Denny Crane On

https://clickhouse.com/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest/

Computes an approximate quantile of a numeric data sequence using the t-digest algorithm.

quantilesTDigestState uses significantly less data than quantiles in many cases and approximation error is less 1% ( your mileage may vary ).

example with quantiles_tdigest AggregateFunction(quantilesTDigest(0.75, 0.9, 0.95, 0.99), UInt32) CODEC (ZSTD(1))

drop database if exists db_test;

create database db_test;


CREATE TABLE db_test.raw_local (
  time DateTime('Europe/Warsaw'),
  cdn_id UInt32,
  continent UInt32,
  cdn_response_time_ms UInt32
)
ENGINE = MergeTree PARTITION BY toYYYYMMDD(time)
ORDER BY (cdn_id,time);


CREATE TABLE db_test.hour_local (
   cdn_id UInt32,
   continent UInt32,
   grouped DateTime,
   quantiles_exact AggregateFunction(quantilesExact(0.75, 0.9, 0.95, 0.99), UInt32)
)
ENGINE = AggregatingMergeTree
PARTITION BY toYYYYMMDD(grouped)
ORDER BY (cdn_id, continent, grouped);


CREATE MATERIALIZED VIEW db_test.hour_mv TO db_test.hour_local AS
SELECT cdn_id,
       continent,
       toStartOfHour(time) AS grouped ,
       quantilesExactState(0.75, 0.9, 0.95, 0.99)(cdn_response_time_ms) AS quantiles_exact
FROM db_test.raw_local group by cdn_id, continent, grouped;


CREATE TABLE db_test.hour_local_optimized (
   cdn_id UInt32,
   continent UInt32,
   grouped DateTime,
   quantiles_tdigest AggregateFunction(quantilesTDigest(0.75, 0.9, 0.95, 0.99), UInt32) CODEC (ZSTD(1))
)
ENGINE = AggregatingMergeTree
PARTITION BY toYYYYMMDD(grouped)
ORDER BY (cdn_id, continent, grouped);

CREATE MATERIALIZED VIEW db_test.hour_local_optimized_mv TO db_test.hour_local_optimized
 AS
SELECT cdn_id,
       continent,
       toStartOfHour(time) AS grouped ,
       quantilesTDigestState(0.75, 0.9, 0.95, 0.99)(cdn_response_time_ms) AS quantiles_tdigest
FROM db_test.raw_local  group by cdn_id, continent, grouped;


CREATE TABLE db_test.hour_local_optimized_2 (
    cdn_id UInt32,
    continent UInt32,
    grouped DateTime,
    quantiles_tdigest AggregateFunction(quantilesTDigest(0.75, 0.9, 0.95, 0.99), UInt32) CODEC (ZSTD(2)))
ENGINE = AggregatingMergeTree
PARTITION BY toYYYYMMDD(grouped)
ORDER BY (cdn_id, continent, grouped);



CREATE MATERIALIZED VIEW db_test.hour_local_optimized_2_mv TO db_test.hour_local_optimized_2
AS
SELECT cdn_id,
      continent,
      toStartOfHour(time) AS grouped ,
      quantilesTDigestState(0.75, 0.9, 0.95, 0.99)(cdn_response_time_ms) AS quantiles_tdigest
FROM db_test.raw_local  group by cdn_id, continent, grouped;


SET max_partitions_per_insert_block = 0;
insert into db_test.raw_local select toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) from numbers(0,100000000);
insert into db_test.raw_local select toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) from numbers(100000000, 100000000);
insert into db_test.raw_local select toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) from numbers(200000000, 100000000);
insert into db_test.raw_local select toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) from numbers(300000000, 100000000);
insert into db_test.raw_local select toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) from numbers(400000000, 100000000);
insert into db_test.raw_local select toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) from numbers(500000000, 100000000);
insert into db_test.raw_local select toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) from numbers(600000000, 100000000);
insert into db_test.raw_local select toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) from numbers(700000000, 100000000);


SELECT
    database,
    table,
    formatReadableSize(sum(data_compressed_bytes) AS size) AS compressed,
    formatReadableSize(sum(data_uncompressed_bytes) AS usize) AS uncompressed,
    round(usize / size, 2) AS compr_rate,
    sum(rows) AS rows,
    count() AS part_count
FROM system.parts
WHERE (active = 1) AND (table LIKE '%') AND (database LIKE 'db_test')
GROUP BY
    database,
    table
ORDER BY size DESC;


┌─database─┬─table──────────────────┬─compressed─┬─uncompressed─┬─compr_rate─┬──────rows─┬─part_count─┐
│ db_test  │ raw_local              │ 4.04 GiB   │ 11.92 GiB    │       2.95 │ 800000000 │        395 │
│ db_test  │ hour_local             │ 1.50 GiB   │ 2.98 GiB     │       1.98 │     33450 │        100 │
│ db_test  │ hour_local_optimized   │ 41.76 MiB  │ 126.26 MiB   │       3.02 │     33450 │        100 │
│ db_test  │ hour_local_optimized_2 │ 38.99 MiB  │ 126.26 MiB   │       3.24 │     33450 │        100 │
└──────────┴────────────────────────┴────────────┴──────────────┴────────────┴───────────┴────────────┘


SELECT
    continent,
    quantilesExact(0.75, 0.9, 0.95, 0.99)(cdn_response_time_ms)
FROM db_test.raw_local
WHERE (toYYYYMMDD(time) = 20210102) AND (cdn_id = 2)
GROUP BY continent;
┌─continent─┬─quantilesExact(0.75, 0.9, 0.95, 0.99)(cdn_response_time_ms)─┐
│         0 │ [218,262,276,288]                                           │
│         4 │ [218,261,276,288]                                           │
│         3 │ [218,262,276,288]                                           │
│         2 │ [218,261,276,288]                                           │
│         1 │ [218,262,276,288]                                           │
└───────────┴─────────────────────────────────────────────────────────────┘
5 rows in set. Elapsed: 0.036 sec. Processed 2.90 million rows, 46.36 MB (80.28 million rows/s., 1.28 GB/s.)

SELECT
    continent,
    quantilesExactMerge(0.75, 0.9, 0.95, 0.99)(quantiles_exact)
FROM db_test.hour_local
WHERE (toYYYYMMDD(grouped) = 20210102) AND (cdn_id = 2)
GROUP BY continent;
┌─continent─┬─quantilesExactMerge(0.75, 0.9, 0.95, 0.99)(quantiles_exact)─┐
│         0 │ [218,262,276,288]                                           │
│         4 │ [218,261,276,288]                                           │
│         3 │ [218,262,276,288]                                           │
│         2 │ [218,261,276,288]                                           │
│         1 │ [218,262,276,288]                                           │
└───────────┴─────────────────────────────────────────────────────────────┘
5 rows in set. Elapsed: 0.041 sec.

SELECT
    continent,
    arrayMap(x -> round(x), quantilesTDigestMerge(0.75, 0.9, 0.95, 0.99)(quantiles_tdigest))
FROM db_test.hour_local_optimized
WHERE (toYYYYMMDD(grouped) = 20210102) AND (cdn_id = 2)
GROUP BY continent
┌─continent─┬─arrayMap(lambda(tuple(x), round(x)), quantilesTDigestMerge(0.75, 0.9, 0.95, 0.99)(quantiles_tdigest))─┐
│         0 │ [218,262,276,288]                                                                                     │
│         4 │ [218,261,276,288]                                                                                     │
│         3 │ [218,262,276,288]                                                                                     │
│         2 │ [218,261,276,288]                                                                                     │
│         1 │ [218,262,276,288]                                                                                     │
└───────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────┘
5 rows in set. Elapsed: 0.005 sec.


┌─continent─┬─arrayMap(lambda(tuple(x), round(x)), quantilesTDigestMerge(0.75, 0.9, 0.95, 0.99)(quantiles_tdigest))─┐
│         0 │ [218,262,276,288]                                                                                     │
│         4 │ [218,261,276,288]                                                                                     │
│         3 │ [218,262,276,288]                                                                                     │
│         2 │ [218,261,276,288]                                                                                     │
│         1 │ [218,262,276,288]                                                                                     │
└───────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────┘
5 rows in set. Elapsed: 0.006 sec.