├── .gitignore ├── fuzz ├── .gitignore ├── fuzz_targets │ ├── serde.rs │ └── estimator.rs └── Cargo.toml ├── benches ├── error_rate.png ├── estimate_time.png ├── insert_time.png ├── memory_bytes.png ├── requirements.txt ├── analyze.py └── cardinality_estimator.rs ├── examples └── estimator.rs ├── LICENSE ├── .github └── workflows │ ├── semgrep.yml │ └── ci.yml ├── Makefile ├── Cargo.toml ├── src ├── lib.rs ├── small.rs ├── representation.rs ├── serde.rs ├── array.rs ├── hyperloglog.rs └── estimator.rs └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | -------------------------------------------------------------------------------- /fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | corpus 3 | artifacts 4 | coverage 5 | -------------------------------------------------------------------------------- /benches/error_rate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cloudflare/cardinality-estimator/main/benches/error_rate.png -------------------------------------------------------------------------------- /benches/estimate_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cloudflare/cardinality-estimator/main/benches/estimate_time.png -------------------------------------------------------------------------------- /benches/insert_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cloudflare/cardinality-estimator/main/benches/insert_time.png -------------------------------------------------------------------------------- /benches/memory_bytes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cloudflare/cardinality-estimator/main/benches/memory_bytes.png -------------------------------------------------------------------------------- /benches/requirements.txt: -------------------------------------------------------------------------------- 1 | chdb==1.1.0 2 | matplotlib==3.8.2 3 | numpy==1.26.2 4 | pandas==2.1.4 5 | pyarrow==14.0.2 6 | seaborn==0.13.0 7 | tabulate==0.9.0 -------------------------------------------------------------------------------- /fuzz/fuzz_targets/serde.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use cardinality_estimator::estimator::CardinalityEstimator; 4 | use libfuzzer_sys::fuzz_target; 5 | 6 | fuzz_target!(|data: &[u8]| { 7 | if let Ok(mut estimator) = serde_json::from_slice::>(data) { 8 | estimator.insert(&1); 9 | assert!(estimator.estimate() > 0); 10 | } 11 | }); 12 | -------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cardinality-estimator-fuzz" 3 | version = "0.0.0" 4 | publish = false 5 | edition = "2021" 6 | 7 | [package.metadata] 8 | cargo-fuzz = true 9 | 10 | [dependencies] 11 | cardinality-estimator = { path = "..", features = ["with_serde"] } 12 | libfuzzer-sys = "0.4" 13 | serde_json = "1.0.115" 14 | wyhash = "0.5.0" 15 | 16 | [[bin]] 17 | name = "estimator" 18 | path = "fuzz_targets/estimator.rs" 19 | test = false 20 | doc = false 21 | bench = false 22 | 23 | [[bin]] 24 | name = "serde" 25 | path = 
"fuzz_targets/serde.rs" 26 | test = false 27 | doc = false 28 | bench = false 29 | -------------------------------------------------------------------------------- /examples/estimator.rs: -------------------------------------------------------------------------------- 1 | use cardinality_estimator::CardinalityEstimator; 2 | 3 | fn main() { 4 | let mut estimator1 = CardinalityEstimator::::new(); 5 | for i in 0..10 { 6 | estimator1.insert(&i); 7 | } 8 | println!("estimator1 estimate = {}", estimator1.estimate()); 9 | 10 | let mut estimator2 = CardinalityEstimator::::new(); 11 | for i in 10..15 { 12 | estimator2.insert(&i); 13 | } 14 | println!("estimator2 estimate = {}", estimator2.estimate()); 15 | 16 | estimator1.merge(&estimator2); 17 | println!("merged estimate = {}", estimator1.estimate()); 18 | } 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2023 Cloudflare, Inc. and contributors. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /.github/workflows/semgrep.yml: -------------------------------------------------------------------------------- 1 | 2 | on: 3 | pull_request: {} 4 | workflow_dispatch: {} 5 | push: 6 | branches: 7 | - main 8 | - master 9 | schedule: 10 | - cron: '0 0 * * *' 11 | name: Semgrep config 12 | jobs: 13 | semgrep: 14 | name: semgrep/ci 15 | runs-on: ubuntu-latest 16 | env: 17 | SEMGREP_APP_TOKEN: ${{ secrets.SEMGREP_APP_TOKEN }} 18 | SEMGREP_URL: https://cloudflare.semgrep.dev 19 | SEMGREP_APP_URL: https://cloudflare.semgrep.dev 20 | SEMGREP_VERSION_CHECK_URL: https://cloudflare.semgrep.dev/api/check-version 21 | container: 22 | image: semgrep/semgrep 23 | steps: 24 | - uses: actions/checkout@v4 25 | - run: semgrep ci 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: test bench bench-extended fuzz-estimator fuzz-serde lint fmt clean build doc 2 | 3 | all: build 4 | 5 | build: 6 | cargo build 7 | 8 | test: 9 | cargo test --features with_serde 10 | 11 | bench: export RUSTFLAGS = -C target-cpu=native 12 | bench: export N = 1048576 13 | bench: export BENCH_RESULTS_PATH = target/bench_results/$(shell date '+%Y%m%d_%H%M%S') 14 | bench: 15 | mkdir -p $(BENCH_RESULTS_PATH) 16 | cargo criterion --bench cardinality_estimator --message-format json | tee $(BENCH_RESULTS_PATH)/results.json 17 | python3 benches/analyze.py 18 | 19 | fuzz-estimator: 20 | RUSTFLAGS="-Z sanitizer=address" cargo +nightly fuzz run estimator -- -max_len=65536 21 | 22 | fuzz-serde: 23 | RUSTFLAGS="-Z sanitizer=address" cargo +nightly fuzz run serde -- -max_len=65536 24 | 25 | lint: 26 | cargo clippy --features with_serde -- -D warnings 27 | 28 | fmt: 29 | cargo fmt --all 30 | 31 | clean: 32 | cargo clean 33 
| 34 | doc: 35 | cargo doc 36 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/estimator.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use cardinality_estimator::estimator::CardinalityEstimator; 4 | use libfuzzer_sys::fuzz_target; 5 | use wyhash::wyhash; 6 | 7 | fuzz_target!(|data: &[u8]| { 8 | if data.is_empty() { 9 | return; 10 | } 11 | 12 | let split_index = wyhash(data, 0) as usize % data.len(); 13 | let (first_half, second_half) = data.split_at(split_index); 14 | 15 | let mut estimator1 = CardinalityEstimator::<&[u8]>::new(); 16 | for chunk in first_half.chunks(4) { 17 | estimator1.insert(&chunk); 18 | assert!(estimator1.estimate() > 0); 19 | assert!(estimator1.size_of() > 0); 20 | 21 | } 22 | 23 | let mut estimator2 = CardinalityEstimator::<&[u8]>::new(); 24 | for chunk in second_half.chunks(4) { 25 | estimator2.insert(&chunk); 26 | assert!(estimator2.estimate() > 0); 27 | assert!(estimator2.size_of() > 0); 28 | } 29 | 30 | estimator1.merge(&estimator2); 31 | }); 32 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cardinality-estimator" 3 | version = "1.0.2" 4 | edition = "2021" 5 | authors = ["Alex Bocharov "] 6 | description = "A crate for estimating the cardinality of distinct elements in a stream or dataset." 7 | documentation = "https://docs.rs/cardinality-estimator" 8 | license = "Apache-2.0" 9 | readme = "README.md" 10 | repository = "https://github.com/cloudflare/cardinality-estimator" 11 | keywords = ["cardinality", "distinct-count", "hyperloglog", "probabilistic", "sketch"] 12 | categories = ["algorithms", "data-structures"] 13 | 14 | [dependencies] 15 | enum_dispatch = "0.3.13" 16 | serde = { version = "1.0", optional = true } 17 | wyhash = "0.5.0" 18 | 19 | [dev-dependencies] 20 | amadeus-streaming = "0.4.3" 21 | criterion = { version = "0.5.0", features = ["html_reports"] } 22 | dhat = "0.3.3" 23 | hyperloglog = "1.0.2" 24 | hyperloglogplus = "0.4.1" 25 | pprof = { version = "0.14.0", features = ["flamegraph", "criterion", "protobuf-codec"] } 26 | probabilistic-collections = "0.7.0" 27 | rand = "0.8.5" 28 | serde_json = "1.0" 29 | tabled = "0.15.0" 30 | test-case = "3.3.1" 31 | 32 | [[bench]] 33 | name = "cardinality_estimator" 34 | harness = false 35 | 36 | [features] 37 | default = [] 38 | with_serde = ["serde"] 39 | 40 | [profile.release] 41 | debug = 1 42 | 43 | [lints.clippy] 44 | cast_lossless = "deny" 45 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | 3 | name: CI 4 | 5 | jobs: 6 | check: 7 | name: Check 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout sources 11 | uses: actions/checkout@v4 12 | 13 | - name: Install stable toolchain 14 | uses: dtolnay/rust-toolchain@master 15 | with: 16 | toolchain: stable 17 | 18 | - name: Run cargo check 19 | run: cargo check 20 | 21 | test: 22 | name: Test Suite 23 | runs-on: ubuntu-latest 24 | steps: 25 | - name: Checkout sources 26 | uses: actions/checkout@v4 27 | 28 | - name: Install stable toolchain 29 | uses: dtolnay/rust-toolchain@master 30 | with: 31 | toolchain: stable 32 | 33 | - name: Run cargo test 34 | run: cargo test --features with_serde 35 | 36 | lints: 37 | name: 
Lints 38 | runs-on: ubuntu-latest 39 | steps: 40 | - name: Checkout sources 41 | uses: actions/checkout@v4 42 | 43 | - name: Install stable toolchain 44 | uses: dtolnay/rust-toolchain@master 45 | with: 46 | toolchain: stable 47 | components: rustfmt, clippy 48 | 49 | - name: Run cargo fmt 50 | run: cargo fmt --all -- --check 51 | 52 | - name: Run cargo clippy 53 | run: cargo clippy --features with_serde -- -D warnings 54 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! `cardinality-estimator` is a Rust crate for efficiently estimating the number of distinct elements in a stream or dataset. 2 | //! 3 | //! This library uses a HyperLogLog++ variant optimized for a low memory footprint and high accuracy, suitable for large-scale data analysis tasks. 4 | //! 5 | //! The cardinality estimator estimates the number of distinct elements in a stream or dataset and is parameterized by the const `P` and `W` parameters: 6 | //! - `P`: precision parameter in the [4..18] range, which defines 7 | //! the number of bits to use for HyperLogLog register indices. 8 | //! - `W`: width parameter in the [4..6] range, which defines 9 | //! the number of bits to use for HyperLogLog register width. 10 | //! 11 | //! # Data-structure design rationale 12 | //! 13 | //! ## Low memory footprint 14 | //! 15 | //! For parameters P = 12, W = 6: 16 | //! - Cardinality in [0..2] range - 8 bytes (small representation) 17 | //! - Cardinality in [3..4] range - 24 bytes (array representation) 18 | //! - Cardinality in [5..8] range - 40 bytes (array representation) 19 | //! - Cardinality in [9..16] range - 72 bytes (array representation) 20 | //! - ... 21 | //! - Cardinality in [65..128] range - 520 bytes (array representation) 22 | //! - Cardinality in [129..] range - 3092 bytes (hyperloglog representation) 23 | //! 24 | //! ## Low latency 25 | //! - Auto-vectorization for slice operations via compiler hints 26 | //! to use SIMD instructions when using `chunks_exact`. 27 | //! - The number of zero registers and the registers' harmonic sum are 28 | //! stored and updated dynamically as more data is inserted, 29 | //! allowing for truly constant-time `estimate` operations. 30 | //! - Efficient polynomial computation using Horner's method. 31 | //! 32 | //! ## High accuracy 33 | //! - For the small cardinality range (<= 128 for P = 12, W = 6) 34 | //! cardinality is counted exactly (up to the chance of hash collisions). 35 | //! - For the large cardinality range, HyperLogLog++ is used with LogLog-Beta bias correction. 36 | //! - Expected error (1.04 / sqrt(2^P)): 37 | //! - P = 10, W = 5: 0.0325 38 | //! - P = 12, W = 6: 0.0162 39 | //! - P = 14, W = 6: 0.0081 40 | //! - P = 18, W = 6: 0.0020 41 | //! 42 | //! # Data storage format 43 | //! The cardinality estimator stores data in one of three representations: 44 | //! - `Small` representation - see the `small` module for more details. 45 | //! - `Array` representation - see the `array` module for more details. 46 | //! - `HyperLogLog` representation - see the `hyperloglog` module for more details. 47 | //! 48 | //! The lowest 2 bits of the `data` field select the representation; the remaining bits hold 49 | //! either two inline encoded hashes (`Small`) or a pointer to a heap-allocated 50 | //! `u32` slice (plus the item count in the case of `Array`).
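//!
//! # Example
//! A minimal usage sketch mirroring `examples/estimator.rs`; the `usize` item type and the
//! default hasher and precision parameters are illustrative choices, not requirements:
//! ```
//! use cardinality_estimator::CardinalityEstimator;
//!
//! let mut estimator = CardinalityEstimator::<usize>::new();
//! for i in 0..10usize {
//!     estimator.insert(&i);
//! }
//! // Small cardinalities are counted exactly by the `Small`/`Array` representations.
//! assert_eq!(estimator.estimate(), 10);
//! ```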
51 | mod array; 52 | pub mod estimator; 53 | mod hyperloglog; 54 | mod representation; 55 | #[cfg(feature = "with_serde")] 56 | mod serde; 57 | mod small; 58 | 59 | pub use estimator::*; 60 | -------------------------------------------------------------------------------- /src/small.rs: -------------------------------------------------------------------------------- 1 | //! ## Small representation 2 | //! Allows to estimate cardinality in [0..2] range and uses only 8 bytes of memory. 3 | //! 4 | //! The `data` format of small representation: 5 | //! - 0..1 bits - store representation type (bits are set to `00`) 6 | //! - 2..33 bits - store 31-bit encoded hash 7 | //! - 34..63 bits - store 31-bit encoded hash 8 | 9 | use std::fmt::{Debug, Formatter}; 10 | 11 | use crate::array::Array; 12 | use crate::representation::{RepresentationTrait, REPRESENTATION_SMALL}; 13 | 14 | /// Mask used for extracting hashes stored in small representation (31 bits) 15 | const SMALL_MASK: usize = 0x0000_0000_7fff_ffff; 16 | 17 | /// Small representation container 18 | #[derive(PartialEq)] 19 | pub(crate) struct Small(usize); 20 | 21 | impl Small { 22 | /// Insert encoded hash into `Small` representation. 23 | /// Returns true on success, false otherwise. 24 | #[inline] 25 | pub(crate) fn insert(&mut self, h: u32) -> bool { 26 | let h1 = self.h1(); 27 | if h1 == 0 { 28 | self.0 |= (h as usize) << 2; 29 | return true; 30 | } else if h1 == h { 31 | return true; 32 | } 33 | 34 | let h2 = self.h2(); 35 | if h2 == 0 { 36 | self.0 |= (h as usize) << 33; 37 | return true; 38 | } else if h2 == h { 39 | return true; 40 | } 41 | 42 | false 43 | } 44 | 45 | /// Return 1-st encoded hash 46 | #[inline] 47 | fn h1(&self) -> u32 { 48 | ((self.0 >> 2) & SMALL_MASK) as u32 49 | } 50 | 51 | /// Return 2-nd encoded hash 52 | #[inline] 53 | fn h2(&self) -> u32 { 54 | ((self.0 >> 33) & SMALL_MASK) as u32 55 | } 56 | 57 | /// Return items stored within `Small` representation 58 | #[inline] 59 | pub(crate) fn items(&self) -> [u32; 2] { 60 | [self.h1(), self.h2()] 61 | } 62 | } 63 | 64 | impl RepresentationTrait for Small { 65 | /// Insert encoded hash into `Small` representation. 
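/// Returns the updated `data` word. If both slots are already occupied by two different
/// hashes, the contents are repacked into an `Array` representation holding the two existing
/// hashes plus the new one, and that `Array`'s `data` word is returned instead.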
66 | fn insert_encoded_hash(&mut self, h: u32) -> usize { 67 | if self.insert(h) { 68 | self.to_data() 69 | } else { 70 | // upgrade from `Small` to `Array` representation 71 | let items = self.items(); 72 | let arr = Array::::from_vec(vec![items[0], items[1], h, 0], 3); 73 | arr.to_data() 74 | } 75 | } 76 | 77 | /// Return cardinality estimate of `Small` representation 78 | #[inline] 79 | fn estimate(&self) -> usize { 80 | match (self.h1(), self.h2()) { 81 | (0, 0) => 0, 82 | (_, 0) => 1, 83 | (_, _) => 2, 84 | } 85 | } 86 | 87 | /// Return memory size of `Small` representation 88 | fn size_of(&self) -> usize { 89 | std::mem::size_of::() 90 | } 91 | 92 | /// Free memory occupied by the `Small` representation 93 | #[inline] 94 | unsafe fn drop(&mut self) {} 95 | 96 | /// Convert `Small` representation to `data` 97 | #[inline] 98 | fn to_data(&self) -> usize { 99 | self.0 | REPRESENTATION_SMALL 100 | } 101 | } 102 | 103 | impl Debug for Small { 104 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 105 | f.write_str(&self.to_string()) 106 | } 107 | } 108 | 109 | impl From for Small { 110 | /// Create new instance of `Small` from given `data` 111 | fn from(data: usize) -> Self { 112 | Self(data) 113 | } 114 | } 115 | 116 | #[cfg(test)] 117 | mod tests { 118 | use super::*; 119 | 120 | #[test] 121 | fn small_size() { 122 | assert_eq!(std::mem::size_of::>(), 8); 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /benches/analyze.py: -------------------------------------------------------------------------------- 1 | import os 2 | import chdb 3 | import pandas as pd 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | def analyze(bench_results_path): 9 | df = chdb.query(f""" 10 | SELECT 11 | toUInt64(extract(id, '/.*/(\d+)')) as cardinality, 12 | extract(id, '/(.*)/') as estimator, 13 | round(mean.estimate / if(cardinality = 0, 1, cardinality), 2) as time 14 | FROM 15 | '{bench_results_path}/results.json' 16 | WHERE 17 | match(id, 'insert') 18 | ORDER BY 19 | cardinality 20 | """, 'DataFrame') 21 | render_comparison(bench_results_path, df, 'insert', 'time', 'linear') 22 | 23 | df = chdb.query(f""" 24 | SELECT 25 | toUInt64(extract(id, '/.*/(\d+)')) as cardinality, 26 | extract(id, '/(.*)/') as estimator, 27 | round(mean.estimate, 2) as time 28 | FROM 29 | '{bench_results_path}/results.json' 30 | WHERE 31 | match(id, 'estimate') 32 | ORDER BY 33 | cardinality 34 | """, 'DataFrame') 35 | 36 | render_comparison(bench_results_path, df, 'estimate', 'time', 'log') 37 | 38 | df = pd.read_table(f'{bench_results_path}/memory_usage.md', delimiter='|', comment='-', skipinitialspace=True) 39 | df = df.dropna(axis=1, how='all').rename(columns=lambda x: x.strip().replace('_', '-')).dropna() 40 | for column in df.columns[1:]: 41 | df[column] = df[column].apply(lambda x: sum(int(num) for num in x.split('/')[:2])) 42 | df_memory = df.melt(id_vars='cardinality', var_name='estimator', value_name='bytes') 43 | render_comparison(bench_results_path, df_memory, 'memory', 'bytes', 'log') 44 | 45 | df = pd.read_table(f'{bench_results_path}/relative_error.md', delimiter='|', comment='-', skipinitialspace=True) 46 | df = df.dropna(axis=1, how='all').rename(columns=lambda x: x.strip().replace('_', '-')).dropna() 47 | df_error = df.melt(id_vars='cardinality', var_name='estimator', value_name='rate') 48 | render_comparison(bench_results_path, df_error, 'error', 'rate', 'linear', (0.0, 0.05)) 49 | 50 | 51 | def 
render_comparison(bench_results_path, df, operation, metric, yscale, ylim=None): 52 | pivot_df = df.pivot(index='cardinality', columns='estimator', values=metric) 53 | pivot_df = pivot_df[df["estimator"].unique()] 54 | 55 | md_path = f'{bench_results_path}/{operation}_{metric}.md' 56 | pivot_df.apply(highlight_min, axis=1).to_markdown(md_path, tablefmt='github') 57 | print(f"saved markdown table to {md_path}") 58 | 59 | plt.figure(figsize=(12, 8)) 60 | 61 | colors = { 62 | 'cardinality-estimator': 'green', 63 | 'amadeus-streaming': 'blue', 64 | 'probabilistic-collections': 'red', 65 | 'hyperloglog': 'purple', 66 | 'hyperloglogplus': 'orange', 67 | # ... define other colors for other estimators if there are any ... 68 | } 69 | 70 | palette_dict = {est: colors[est] for est in df['estimator'].unique() if est in colors} 71 | 72 | sns.scatterplot(data=df, x='cardinality', y=metric, hue='estimator', style='estimator', palette=palette_dict, s=100) 73 | 74 | for i, estimator in enumerate(df['estimator'].unique()): 75 | subset = df[df['estimator'] == estimator] 76 | plt.plot(subset['cardinality'], subset[metric], linestyle='--', linewidth=0.5, color=palette_dict[estimator]) 77 | 78 | plt.xscale('log', base=2) 79 | plt.yscale(yscale) 80 | if ylim: 81 | plt.ylim(ylim) 82 | 83 | x_ticks_estimate = df['cardinality'].unique() 84 | selected_x_ticks_estimate = [x for x in x_ticks_estimate if x > 0] 85 | x_labels_estimate = [str(int(x)) if x > 0 else '0' for x in selected_x_ticks_estimate] 86 | 87 | plt.xticks(selected_x_ticks_estimate, labels=x_labels_estimate, rotation=45) 88 | plt.grid(True, which='major', linestyle='--', linewidth=0.5) 89 | plt.xlabel('Cardinality') 90 | plt.ylabel(metric) 91 | 92 | plot_path = f'{bench_results_path}/{operation}_{metric}.png' 93 | plt.savefig(plot_path) 94 | print(f"saved plot to {plot_path}") 95 | 96 | 97 | # Function to highlight the minimum value in each row (size) 98 | def highlight_min(row): 99 | min_val = row.min() 100 | return row.apply(lambda x: '**' + str(x) + '**' if x == min_val else str(x)) 101 | 102 | 103 | if __name__ == '__main__': 104 | analyze(os.environ["BENCH_RESULTS_PATH"]) 105 | -------------------------------------------------------------------------------- /src/representation.rs: -------------------------------------------------------------------------------- 1 | use std::hash::{Hash, Hasher}; 2 | 3 | use enum_dispatch::enum_dispatch; 4 | 5 | use crate::array::{Array, MAX_CAPACITY}; 6 | use crate::hyperloglog::HyperLogLog; 7 | use crate::representation::RepresentationError::*; 8 | use crate::small::Small; 9 | use crate::CardinalityEstimator; 10 | 11 | /// Masks used for storing and retrieving representation type stored in lowest 2 bits of `data` field. 12 | pub(crate) const REPRESENTATION_MASK: usize = 0x0000_0000_0000_0003; 13 | pub(crate) const REPRESENTATION_SMALL: usize = 0x0000_0000_0000_0000; 14 | pub(crate) const REPRESENTATION_ARRAY: usize = 0x0000_0000_0000_0001; 15 | pub(crate) const REPRESENTATION_HLL: usize = 0x0000_0000_0000_0003; 16 | 17 | /// Representation types supported by `CardinalityEstimator` 18 | #[repr(u8)] 19 | #[derive(Debug, PartialEq)] 20 | #[enum_dispatch] 21 | pub(crate) enum Representation<'a, const P: usize, const W: usize> { 22 | Small(Small), 23 | Array(Array<'a, P, W>), 24 | Hll(HyperLogLog<'a, P, W>), 25 | } 26 | 27 | /// Representation trait which must be implemented by all representations. 
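///
/// `enum_dispatch` generates a `RepresentationTrait` impl for the `Representation` enum that
/// forwards each call to the active variant, avoiding `dyn` dispatch. Note that
/// `insert_encoded_hash` returns the updated `data` word, because an insert may upgrade the
/// underlying representation (`Small` -> `Array` -> `HyperLogLog`); callers are expected to
/// write the returned value back into the estimator's `data` field.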
28 | #[enum_dispatch(Representation)] 29 | pub(crate) trait RepresentationTrait { 30 | fn insert_encoded_hash(&mut self, h: u32) -> usize; 31 | fn estimate(&self) -> usize; 32 | fn size_of(&self) -> usize; 33 | unsafe fn drop(&mut self); 34 | fn to_data(&self) -> usize; 35 | fn to_string(&self) -> String { 36 | format!("estimate: {}, size: {}", self.estimate(), self.size_of()) 37 | } 38 | } 39 | 40 | /// Representation error 41 | #[derive(Debug)] 42 | pub enum RepresentationError { 43 | InvalidRepresentation, 44 | SmallRepresentationInvalid, 45 | ArrayRepresentationInvalid, 46 | HllRepresentationInvalid, 47 | } 48 | 49 | impl Representation<'_, P, W> { 50 | /// Returns the representation type of `CardinalityEstimator`. 51 | /// 52 | /// This method extracts the representation based on the lowest 2 bits of `data`. 53 | /// 54 | /// Valid encodings: 55 | /// - `0` for `Small` representation 56 | /// - `1` for `Array` representation 57 | /// - `3` for `HLL` representation 58 | /// 59 | /// If `data` is not encoded as 0, 1, or 3, the function defaults to `Small` with value of 0 60 | /// as a safe fallback to handle unexpected conditions. 61 | #[inline] 62 | pub(crate) fn from_data(data: usize) -> Self { 63 | match data & REPRESENTATION_MASK { 64 | REPRESENTATION_SMALL => Representation::Small(Small::from(data)), 65 | REPRESENTATION_ARRAY => Representation::Array(Array::from(data)), 66 | REPRESENTATION_HLL => Representation::Hll(HyperLogLog::::from(data)), 67 | _ => Representation::Small(Small::from(0)), 68 | } 69 | } 70 | 71 | /// Create new cardinality estimator from data and optional vector 72 | pub fn try_from( 73 | data: usize, 74 | opt_vec: Option>, 75 | ) -> Result, RepresentationError> 76 | where 77 | T: Hash + ?Sized, 78 | H: Hasher + Default, 79 | { 80 | let mut estimator = CardinalityEstimator::::new(); 81 | estimator.data = match data & REPRESENTATION_MASK { 82 | REPRESENTATION_SMALL if opt_vec.is_some() => return Err(SmallRepresentationInvalid), 83 | REPRESENTATION_SMALL => Small::::from(data).to_data(), 84 | REPRESENTATION_ARRAY => { 85 | let vec = opt_vec.ok_or(ArrayRepresentationInvalid)?; 86 | let len = vec.len(); 87 | if len <= 2 || len > MAX_CAPACITY { 88 | return Err(ArrayRepresentationInvalid); 89 | } 90 | Array::::from_vec(vec, len).to_data() 91 | } 92 | REPRESENTATION_HLL => { 93 | let vec = opt_vec.ok_or(HllRepresentationInvalid)?; 94 | if vec.len() != HyperLogLog::::HLL_SLICE_LEN { 95 | return Err(HllRepresentationInvalid); 96 | } 97 | HyperLogLog::::from(vec).to_data() 98 | } 99 | _ => return Err(InvalidRepresentation), 100 | }; 101 | 102 | Ok(estimator) 103 | } 104 | } 105 | 106 | #[cfg(test)] 107 | mod tests { 108 | use super::*; 109 | 110 | #[test] 111 | fn small_size() { 112 | assert_eq!(std::mem::size_of::>(), 32); 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /src/serde.rs: -------------------------------------------------------------------------------- 1 | //! # Serde module for CardinalityEstimator 2 | //! 3 | //! This module provides serde-based (serialization and deserialization) features for 4 | //! `CardinalityEstimator`. It uses `serde`'s custom serialization and deserialization mechanisms. 5 | //! 6 | //! `CardinalityEstimator` has a usize field, `data`, and an optional `Vec` hidden behind a 7 | //! pointer within `data`. During serialization, these fields are converted into a tuple: 8 | //! `(data, Option>)`. 9 | //! 10 | //! 
During deserialization, the tuple is converted back into the `CardinalityEstimator` struct, 11 | //! handling the case where the `Vec` may be `None` (indicating a "small" estimator). 12 | //! 13 | //! This allows `CardinalityEstimator` to be easily serialized/deserialized, for storage, 14 | //! transmission, and reconstruction. 15 | //! 16 | //! Refer to the serde documentation for more details on custom serialization and deserialization: 17 | //! - [Serialization](https://serde.rs/impl-serialize.html) 18 | //! - [Deserialization](https://serde.rs/impl-deserialize.html) 19 | use std::hash::{Hash, Hasher}; 20 | use std::ops::Deref; 21 | 22 | use serde::de::Error; 23 | use serde::ser::SerializeTuple; 24 | use serde::{Deserialize, Serialize}; 25 | 26 | use crate::estimator::CardinalityEstimator; 27 | use crate::representation::Representation; 28 | 29 | impl Serialize for CardinalityEstimator 30 | where 31 | T: Hash + ?Sized, 32 | H: Hasher + Default, 33 | { 34 | fn serialize(&self, serializer: S) -> Result 35 | where 36 | S: serde::Serializer, 37 | { 38 | // Begin a new serialized tuple with two elements. 39 | let mut tup = serializer.serialize_tuple(2)?; 40 | 41 | // The first element is the data field of the estimator. 42 | tup.serialize_element(&self.data)?; 43 | match self.representation() { 44 | Representation::Small(_) => { 45 | // If the estimator is small, the second element is a None value. This indicates that 46 | // the estimator is using the small data optimization and has no separate slice data. 47 | tup.serialize_element(&None::>)?; 48 | } 49 | Representation::Array(arr) => { 50 | // If the estimator is slice, the second element is a option containing slice data. 51 | tup.serialize_element(&Some(arr.deref()))?; 52 | } 53 | Representation::Hll(hll) => { 54 | // If the estimator is HLL, the second element is a option containing HLL data. 55 | tup.serialize_element(&Some(hll.data))?; 56 | } 57 | } 58 | 59 | // Finalize the tuple. 60 | tup.end() 61 | } 62 | } 63 | 64 | impl<'de, T, H, const P: usize, const W: usize> Deserialize<'de> 65 | for CardinalityEstimator 66 | where 67 | T: Hash + ?Sized, 68 | H: Hasher + Default, 69 | { 70 | fn deserialize(deserializer: D) -> Result 71 | where 72 | D: serde::Deserializer<'de>, 73 | { 74 | // Deserialize the tuple that was serialized by the serialize method. The first element 75 | // of the tuple is the data field of the estimator, and the second element is an Option 76 | // that contains the array data if the estimator is not small. 
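// `Representation::try_from` validates the pair before reconstructing anything: the tag bits
// in `data` must be consistent with the presence and length of the optional vector, otherwise
// deserialization fails instead of fabricating a pointer from untrusted input.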
77 | let (data, opt_vec): (usize, Option>) = Deserialize::deserialize(deserializer)?; 78 | Representation::try_from(data, opt_vec).map_err(|e| Error::custom(format!("{:?}", e))) 79 | } 80 | } 81 | 82 | #[cfg(test)] 83 | pub mod tests { 84 | use super::*; 85 | use test_case::test_case; 86 | 87 | #[test_case(0; "empty set")] 88 | #[test_case(1; "single element")] 89 | #[test_case(2; "two distinct elements")] 90 | #[test_case(100; "hundred distinct elements")] 91 | #[test_case(10000; "ten thousand distinct elements")] 92 | fn test_serde(n: usize) { 93 | let mut original_estimator = CardinalityEstimator::::new(); 94 | 95 | for i in 0..n { 96 | let item = &format!("item{}", i); 97 | original_estimator.insert(&item); 98 | } 99 | 100 | let serialized = serde_json::to_string(&original_estimator).expect("serialization failed"); 101 | assert!( 102 | !serialized.is_empty(), 103 | "serialized string should not be empty" 104 | ); 105 | 106 | let deserialized_estimator: CardinalityEstimator = 107 | serde_json::from_str(&serialized).expect("deserialization failed"); 108 | 109 | assert_eq!( 110 | original_estimator.representation(), 111 | deserialized_estimator.representation() 112 | ); 113 | } 114 | 115 | #[test] 116 | fn test_deserialize_invalid_json() { 117 | let invalid_json = "{ invalid_json_string }"; 118 | let result: Result, _> = serde_json::from_str(invalid_json); 119 | 120 | assert!( 121 | result.is_err(), 122 | "Deserialization should fail for invalid JSON" 123 | ); 124 | } 125 | 126 | #[test_case("[12345,null]".as_bytes(); "case 1")] 127 | #[test_case(&[91, 49, 55, 44, 13, 10, 91, 13, 93, 93]; "case 2")] 128 | #[test_case(&[91, 51, 44, 10, 110, 117, 108, 108, 93, 122]; "case 3")] 129 | #[test_case(&[91, 51, 44, 10, 110, 117, 108, 108, 93]; "case 4")] 130 | fn test_failed_deserialization(input: &[u8]) { 131 | let result: Result, _> = serde_json::from_slice(input); 132 | assert!(result.is_err()); 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /src/array.rs: -------------------------------------------------------------------------------- 1 | //! ## Array representation 2 | //! Allows to estimate medium cardinality in [3..MAX_CAPACITY] range. 3 | //! 4 | //! The `data` format of array representation: 5 | //! - 0..1 bits - store representation type (bits are set to `01`) 6 | //! - 2..55 bits - store pointer to `u32` slice (on `x86_64` systems only 48-bits are needed). 7 | //! - 56..63 bits - store number of items `N` stored in array 8 | //! 9 | //! Slice encoding: 10 | //! - data[0..N] - store `N` encoded hashes 11 | //! - data[N..] - store zeros used for future hashes 12 | 13 | use std::fmt::{Debug, Formatter}; 14 | use std::mem::{size_of, size_of_val}; 15 | use std::ops::Deref; 16 | use std::slice; 17 | 18 | use crate::hyperloglog::HyperLogLog; 19 | use crate::representation::{RepresentationTrait, REPRESENTATION_ARRAY}; 20 | 21 | /// Maximum number of elements stored in array representation 22 | pub(crate) const MAX_CAPACITY: usize = 128; 23 | /// Bit offset of the array's length 24 | const LEN_OFFSET: usize = 56; 25 | /// Mask used for accessing heap allocated data stored at the pointer in `data` field. 26 | const PTR_MASK: usize = ((1 << LEN_OFFSET) - 1) & !3; 27 | 28 | /// Array representation container 29 | pub(crate) struct Array<'a, const P: usize, const W: usize> { 30 | /// Number of items stored in the array 31 | len: usize, 32 | /// Array of items. Not all items are used, only first `len` of them. 
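/// Unused tail slots are zero-filled and reserved for future inserts.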
33 | /// The length of the slice is actually the current capacity. 34 | arr: &'a mut [u32], 35 | } 36 | 37 | impl<'a, const P: usize, const W: usize> Array<'a, P, W> { 38 | /// Insert encoded hash into `Array` representation 39 | /// Returns true on success, false otherwise. 40 | #[inline] 41 | pub(crate) fn insert(&mut self, h: u32) -> bool { 42 | let cap = self.arr.len(); 43 | let found = if cap == 4 { 44 | contains_fixed_vectorized::<4>(self.arr.try_into().unwrap(), h) 45 | } else if cap == 8 { 46 | contains_fixed_vectorized::<8>(self.arr.try_into().unwrap(), h) 47 | } else { 48 | // calculate rounded up slice length for efficient look up in batches 49 | let rlen = 16 * self.len.div_ceil(16); 50 | // SAFETY: `rlen` guaranteed to be within `self.arr` boundaries 51 | contains_vectorized::<16>(unsafe { self.arr.get_unchecked(..rlen) }, h) 52 | }; 53 | 54 | if found { 55 | return true; 56 | } 57 | 58 | if self.len < cap { 59 | // if there are available slots in current array - append to it 60 | self.arr[self.len] = h; 61 | self.len += 1; 62 | return true; 63 | } 64 | 65 | if cap < MAX_CAPACITY { 66 | // double array capacity up to `MAX_CAPACITY` 67 | let new_arr = Self::from_vec(vec![0; cap * 2], self.len + 1); 68 | new_arr.arr[..self.len].copy_from_slice(self.arr); 69 | new_arr.arr[self.len] = h; 70 | unsafe { self.drop() }; 71 | *self = new_arr; 72 | return true; 73 | }; 74 | 75 | false 76 | } 77 | 78 | /// Create new instance of `Array` representation from vector 79 | #[inline] 80 | pub(crate) fn from_vec(mut arr: Vec, len: usize) -> Array<'a, P, W> { 81 | let cap = arr.len(); 82 | let ptr = arr.as_mut_ptr(); 83 | std::mem::forget(arr); 84 | // SAFETY: valid pointer from vector being used to create slice reference 85 | let arr = unsafe { slice::from_raw_parts_mut(ptr, cap) }; 86 | Self { len, arr } 87 | } 88 | } 89 | 90 | impl RepresentationTrait for Array<'_, P, W> { 91 | /// Insert encoded hash into `HyperLogLog` representation. 92 | #[inline] 93 | fn insert_encoded_hash(&mut self, h: u32) -> usize { 94 | if self.insert(h) { 95 | self.to_data() 96 | } else { 97 | // upgrade from `Array` to `HyperLogLog` representation 98 | let mut hll = HyperLogLog::::new(self); 99 | unsafe { self.drop() }; 100 | hll.insert_encoded_hash(h) 101 | } 102 | } 103 | 104 | /// Return cardinality estimate of `Array` representation 105 | #[inline] 106 | fn estimate(&self) -> usize { 107 | self.len 108 | } 109 | 110 | /// Return memory size of `Array` representation 111 | #[inline] 112 | fn size_of(&self) -> usize { 113 | size_of::() + size_of_val(self.arr) 114 | } 115 | 116 | /// Free memory occupied by the `Array` representation 117 | /// SAFETY: caller of this method must ensure that `self.arr` holds valid slice elements. 
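/// In particular, `self.arr` must point at a heap allocation originally produced by
/// `from_vec`, since it is reclaimed here via `Box::from_raw`.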
118 | #[inline] 119 | unsafe fn drop(&mut self) { 120 | drop(Box::from_raw(self.arr)); 121 | } 122 | 123 | /// Convert `Array` representation to `data` 124 | #[inline] 125 | fn to_data(&self) -> usize { 126 | (self.len << LEN_OFFSET) | (PTR_MASK & self.arr.as_ptr() as usize) | REPRESENTATION_ARRAY 127 | } 128 | } 129 | 130 | impl Debug for Array<'_, P, W> { 131 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 132 | f.write_str(&self.to_string()) 133 | } 134 | } 135 | 136 | impl PartialEq for Array<'_, P, W> { 137 | fn eq(&self, other: &Self) -> bool { 138 | self.deref() == other.deref() 139 | } 140 | } 141 | 142 | impl From for Array<'_, P, W> { 143 | /// Create new instance of `Array` from given `data` 144 | #[inline] 145 | fn from(data: usize) -> Self { 146 | let ptr = (data & PTR_MASK) as *mut u32; 147 | let len = data >> LEN_OFFSET; 148 | let cap = len.next_power_of_two(); 149 | let arr = unsafe { slice::from_raw_parts_mut(ptr, cap) }; 150 | Self { len, arr } 151 | } 152 | } 153 | 154 | impl Deref for Array<'_, P, W> { 155 | type Target = [u32]; 156 | 157 | fn deref(&self) -> &Self::Target { 158 | &self.arr[..self.len] 159 | } 160 | } 161 | 162 | /// Vectorized linear array search benefiting from SIMD instructions (e.g. AVX2). 163 | /// 164 | /// Input slice length assumed to be divisible by `N` to perform efficient 165 | /// batch comparisons of slice elements to provided value `v`. 166 | /// 167 | /// Assembly output: https://godbolt.org/z/eb8Kob9fa 168 | /// Background reading: https://tinyurl.com/2e4srh2d 169 | #[inline] 170 | fn contains_vectorized(a: &[u32], v: u32) -> bool { 171 | debug_assert_eq!(a.len() % N, 0); 172 | a.chunks_exact(N) 173 | .any(|chunk| contains_fixed_vectorized::(chunk.try_into().unwrap(), v)) 174 | } 175 | 176 | /// Vectorized linear fixed array search 177 | #[inline] 178 | fn contains_fixed_vectorized(a: [u32; N], v: u32) -> bool { 179 | let mut res = false; 180 | for x in a { 181 | res |= x == v 182 | } 183 | res 184 | } 185 | 186 | #[cfg(test)] 187 | mod tests { 188 | use super::*; 189 | 190 | #[test] 191 | fn array_size() { 192 | assert_eq!(std::mem::size_of::>(), 24); 193 | } 194 | } 195 | -------------------------------------------------------------------------------- /benches/cardinality_estimator.rs: -------------------------------------------------------------------------------- 1 | #[global_allocator] 2 | static ALLOC: dhat::Alloc = dhat::Alloc; 3 | 4 | use std::hash::{BuildHasherDefault, Hash}; 5 | 6 | use cardinality_estimator::CardinalityEstimator; 7 | use criterion::measurement::WallTime; 8 | use criterion::{ 9 | black_box, criterion_group, criterion_main, BenchmarkGroup, BenchmarkId, Criterion, Throughput, 10 | }; 11 | use hyperloglogplus::HyperLogLog as HyperLogLogTrait; 12 | use pprof::criterion::{Output, PProfProfiler}; 13 | use rand::rngs::StdRng; 14 | use rand::{Rng, SeedableRng}; 15 | use tabled::settings::{Settings, Style}; 16 | use tabled::{Table, Tabled}; 17 | use wyhash::WyHash; 18 | 19 | /// Insert and estimate operations are benchmarked against cardinalities ranging from 0 to 20 | /// `DEFAULT_MAX_CARDINALITY` or environment variable `N` (if defined) with cardinality doubled 21 | /// with every iteration as [0, 1, 2, ..., N]. 22 | const DEFAULT_MAX_CARDINALITY: usize = 256; 23 | 24 | criterion_group! 
{ 25 | name = benches; 26 | config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Protobuf)); 27 | targets = benchmark 28 | } 29 | criterion_main!(benches); 30 | 31 | fn benchmark(c: &mut Criterion) { 32 | let bench_results_path = std::env::var("BENCH_RESULTS_PATH").unwrap(); 33 | let max_cardinality = std::env::var("N") 34 | .ok() 35 | .and_then(|v| v.parse().ok()) 36 | .unwrap_or(DEFAULT_MAX_CARDINALITY); 37 | 38 | let cardinalities: Vec = std::iter::once(0) 39 | .chain((0..).map(|c| 1 << c)) 40 | .take_while(|&c| c <= max_cardinality) 41 | .collect(); 42 | 43 | let mut group = c.benchmark_group("insert"); 44 | for &cardinality in &cardinalities { 45 | group.throughput(Throughput::Elements(cardinality.max(1) as u64)); 46 | bench_insert::(&mut group, cardinality); 47 | bench_insert::(&mut group, cardinality); 48 | bench_insert::(&mut group, cardinality); 49 | bench_insert::(&mut group, cardinality); 50 | bench_insert::(&mut group, cardinality); 51 | } 52 | group.finish(); 53 | 54 | let mut group = c.benchmark_group("estimate"); 55 | group.throughput(Throughput::Elements(1)); 56 | for &cardinality in &cardinalities { 57 | bench_estimate::(&mut group, cardinality); 58 | bench_estimate::(&mut group, cardinality); 59 | bench_estimate::(&mut group, cardinality); 60 | bench_estimate::(&mut group, cardinality); 61 | bench_estimate::(&mut group, cardinality); 62 | } 63 | group.finish(); 64 | 65 | let results: Vec = cardinalities 66 | .iter() 67 | .map(|&cardinality| StatRecord { 68 | cardinality, 69 | cardinality_estimator: measure_allocations::(cardinality), 70 | amadeus_streaming: measure_allocations::(cardinality), 71 | probabilistic_collections: measure_allocations::(cardinality), 72 | hyperloglog: measure_allocations::(cardinality), 73 | hyperloglogplus: measure_allocations::(cardinality), 74 | }) 75 | .collect(); 76 | 77 | let table_config = Settings::default().with(Style::markdown()); 78 | std::fs::write( 79 | format!("{}/memory_usage.md", bench_results_path), 80 | Table::new(results).with(table_config).to_string(), 81 | ) 82 | .unwrap(); 83 | 84 | let results: Vec = cardinalities 85 | .iter() 86 | .map(|&cardinality| StatRecord { 87 | cardinality, 88 | cardinality_estimator: measure_error::(cardinality), 89 | amadeus_streaming: measure_error::(cardinality), 90 | probabilistic_collections: measure_error::(cardinality), 91 | hyperloglog: measure_error::(cardinality), 92 | hyperloglogplus: measure_error::(cardinality), 93 | }) 94 | .collect(); 95 | 96 | let table_config = Settings::default().with(Style::markdown()); 97 | std::fs::write( 98 | format!("{}/relative_error.md", bench_results_path), 99 | Table::new(results).with(table_config).to_string(), 100 | ) 101 | .unwrap(); 102 | } 103 | 104 | /// Cardinality estimator trait representing common estimator operations. 
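///
/// Each competing crate is wrapped in a small newtype implementing this trait, so the same
/// insert/estimate/merge benchmark loops and measurement helpers run unchanged against every
/// implementation.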
105 | trait CardinalityEstimatorTrait { 106 | fn new() -> Self; 107 | fn insert(&mut self, item: &T); 108 | fn estimate(&mut self) -> usize; 109 | fn merge(&mut self, rhs: &Self); 110 | fn name() -> String; 111 | } 112 | 113 | fn bench_insert>( 114 | group: &mut BenchmarkGroup, 115 | cardinality: usize, 116 | ) { 117 | group.bench_with_input( 118 | BenchmarkId::new(E::name(), cardinality), 119 | &cardinality, 120 | |b, &cardinality| { 121 | b.iter(|| { 122 | let mut estimator = E::new(); 123 | for i in 0..black_box(cardinality) { 124 | estimator.insert(black_box(&i)); 125 | } 126 | }); 127 | }, 128 | ); 129 | } 130 | 131 | fn bench_estimate>( 132 | group: &mut BenchmarkGroup, 133 | cardinality: usize, 134 | ) { 135 | group.bench_with_input( 136 | BenchmarkId::new(E::name(), cardinality), 137 | &cardinality, 138 | |b, &cardinality| { 139 | let mut estimator = E::new(); 140 | for i in 0..black_box(cardinality) { 141 | estimator.insert(black_box(&i)); 142 | } 143 | b.iter(|| estimator.estimate()); 144 | }, 145 | ); 146 | } 147 | 148 | fn measure_allocations>(cardinality: usize) -> String { 149 | let _profiler = dhat::Profiler::builder().testing().build(); 150 | let mut estimator = E::new(); 151 | for i in 0..cardinality { 152 | estimator.insert(&i); 153 | } 154 | let stats = dhat::HeapStats::get(); 155 | format!( 156 | "{} / {} / {}", 157 | std::mem::size_of::(), 158 | stats.total_bytes, 159 | stats.total_blocks, 160 | ) 161 | } 162 | 163 | fn measure_error>(cardinality: usize) -> String { 164 | let n = 100; 165 | let mut total_relative_error: f64 = 0.0; 166 | let mut rng = StdRng::seed_from_u64(12345); 167 | for _ in 0..n { 168 | let mut estimator = E::new(); 169 | for _ in 0..cardinality { 170 | estimator.insert(&rng.gen()); 171 | } 172 | let relative_error = if cardinality == 0 { 173 | 0.0 174 | } else { 175 | (estimator.estimate() as f64 - cardinality as f64).abs() / cardinality as f64 176 | }; 177 | total_relative_error += relative_error; 178 | } 179 | let avg_relative_error = total_relative_error / f64::from(n); 180 | 181 | if avg_relative_error < 1.0 { 182 | format!("{:.4}", avg_relative_error) 183 | } else { 184 | format!("{:.2e}", avg_relative_error) 185 | } 186 | } 187 | 188 | #[derive(Tabled)] 189 | struct StatRecord { 190 | cardinality: usize, 191 | cardinality_estimator: String, 192 | amadeus_streaming: String, 193 | probabilistic_collections: String, 194 | hyperloglog: String, 195 | hyperloglogplus: String, 196 | } 197 | 198 | struct CardinalityEstimatorMut(CardinalityEstimator); 199 | 200 | impl CardinalityEstimatorTrait for CardinalityEstimatorMut { 201 | fn new() -> Self { 202 | Self(CardinalityEstimator::new()) 203 | } 204 | 205 | fn insert(&mut self, item: &usize) { 206 | self.0.insert(item); 207 | } 208 | 209 | fn estimate(&mut self) -> usize { 210 | self.0.estimate() 211 | } 212 | 213 | fn merge(&mut self, rhs: &Self) { 214 | self.0.merge(&rhs.0); 215 | } 216 | 217 | fn name() -> String { 218 | "cardinality-estimator".to_string() 219 | } 220 | } 221 | 222 | struct AmadeusStreamingEstimator(amadeus_streaming::HyperLogLog); 223 | 224 | impl CardinalityEstimatorTrait for AmadeusStreamingEstimator { 225 | fn new() -> Self { 226 | AmadeusStreamingEstimator(amadeus_streaming::HyperLogLog::new(0.01625)) 227 | } 228 | 229 | fn insert(&mut self, item: &usize) { 230 | self.0.push(item) 231 | } 232 | 233 | fn estimate(&mut self) -> usize { 234 | self.0.len() as usize 235 | } 236 | 237 | fn merge(&mut self, rhs: &Self) { 238 | self.0.union(&rhs.0); 239 | } 240 | 241 | fn name() -> 
String { 242 | "amadeus-streaming".to_string() 243 | } 244 | } 245 | 246 | struct ProbabilisticCollections(probabilistic_collections::hyperloglog::HyperLogLog); 247 | 248 | impl CardinalityEstimatorTrait for ProbabilisticCollections { 249 | fn new() -> Self { 250 | Self(probabilistic_collections::hyperloglog::HyperLogLog::new( 251 | 0.004, 252 | )) 253 | } 254 | 255 | fn insert(&mut self, item: &usize) { 256 | self.0.insert(item); 257 | } 258 | 259 | fn estimate(&mut self) -> usize { 260 | self.0.len() as usize 261 | } 262 | 263 | fn merge(&mut self, rhs: &Self) { 264 | self.0.merge(&rhs.0); 265 | } 266 | 267 | fn name() -> String { 268 | "probabilistic-collections".to_string() 269 | } 270 | } 271 | 272 | struct HyperLogLog(hyperloglog::HyperLogLog); 273 | 274 | impl CardinalityEstimatorTrait for HyperLogLog { 275 | fn new() -> Self { 276 | Self(hyperloglog::HyperLogLog::new(0.004)) 277 | } 278 | 279 | fn insert(&mut self, item: &usize) { 280 | self.0.insert(item); 281 | } 282 | 283 | fn estimate(&mut self) -> usize { 284 | self.0.len() as usize 285 | } 286 | 287 | fn merge(&mut self, rhs: &Self) { 288 | self.0.merge(&rhs.0); 289 | } 290 | 291 | fn name() -> String { 292 | "hyperloglog".to_string() 293 | } 294 | } 295 | 296 | struct HyperLogLogPlus(hyperloglogplus::HyperLogLogPlus>); 297 | 298 | impl CardinalityEstimatorTrait for HyperLogLogPlus { 299 | fn new() -> Self { 300 | Self( 301 | hyperloglogplus::HyperLogLogPlus::new(12, BuildHasherDefault::::default()) 302 | .unwrap(), 303 | ) 304 | } 305 | 306 | fn insert(&mut self, item: &usize) { 307 | self.0.insert(item); 308 | } 309 | 310 | fn estimate(&mut self) -> usize { 311 | self.0.count() as usize 312 | } 313 | 314 | fn merge(&mut self, rhs: &Self) { 315 | self.0.merge(&rhs.0).unwrap(); 316 | } 317 | 318 | fn name() -> String { 319 | "hyperloglogplus".to_string() 320 | } 321 | } 322 | -------------------------------------------------------------------------------- /src/hyperloglog.rs: -------------------------------------------------------------------------------- 1 | //! ## HyperLogLog representation 2 | //! Allows to estimate large cardinality in `[N..]` range, where `N` is based on `P` and `W`. 3 | //! This representation uses modified HyperLogLog++ with `M` registers of `W` width. 4 | //! 5 | //! [Original HyperLogLog++ paper](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf) 6 | //! 7 | //! The `data` format of HyperLogLog representation: 8 | //! - 0..1 bits - store representation type (bits are set to `11`) 9 | //! - 2..63 bits - store pointer to `u32` slice (on `x86_64 systems only 48-bits are needed). 10 | //! 11 | //! Slice encoding: 12 | //! - data[0] - stores number of HyperLogLog registers set to 0. 13 | //! - data[1] - stores harmonic sum of HyperLogLog registers (`f32` transmuted into `u32`). 14 | //! - data[2..] - stores register ranks using `W` bits per each register. 15 | 16 | use std::fmt::{Debug, Formatter}; 17 | use std::mem::{size_of, size_of_val}; 18 | use std::slice; 19 | 20 | use crate::representation::{RepresentationTrait, REPRESENTATION_HLL}; 21 | 22 | /// Mask used for accessing heap allocated data stored at the pointer in `data` field. 
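/// Clearing the low 2 tag bits recovers the original pointer, which is valid because the
/// heap-allocated `u32` slice is at least 4-byte aligned, so a real pointer never has those
/// bits set.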
23 | const PTR_MASK: usize = !3; 24 | 25 | #[derive(PartialEq)] 26 | pub(crate) struct HyperLogLog<'a, const P: usize = 12, const W: usize = 6> { 27 | pub(crate) data: &'a mut [u32], 28 | } 29 | 30 | impl HyperLogLog<'_, P, W> { 31 | /// Number of HyperLogLog registers 32 | const M: usize = 1 << P; 33 | /// HyperLogLog representation `u32` slice length based on #registers, stored zero registers, harmonic sum, and 34 | /// one extra element for branchless register updates (see `set_register` for more details). 35 | pub(crate) const HLL_SLICE_LEN: usize = Self::M * W / 32 + 3; 36 | 37 | /// Create new instance of `HyperLogLog` representation from items 38 | #[inline] 39 | pub(crate) fn new(items: &[u32]) -> Self { 40 | let mut hll_data = vec![0u32; Self::HLL_SLICE_LEN]; 41 | let data = (PTR_MASK & hll_data.as_mut_ptr() as usize) | 3; 42 | std::mem::forget(hll_data); 43 | let mut hll = Self::from(data); 44 | 45 | hll.data[0] = Self::M as u32; 46 | hll.data[1] = (Self::M as f32).to_bits(); 47 | 48 | for &h in items.iter() { 49 | hll.insert_encoded_hash(h); 50 | } 51 | 52 | hll 53 | } 54 | 55 | /// Return normal index and rank from encoded sparse hash 56 | #[inline] 57 | fn decode_hash(h: u32) -> (u32, u32) { 58 | let rank = h & ((1 << W) - 1); 59 | let idx = (h >> W) & ((1 << P) - 1); 60 | (idx, rank) 61 | } 62 | 63 | /// Insert encoded hash into HyperLogLog representation 64 | #[inline] 65 | fn update_rank(&mut self, idx: u32, new_rank: u32) { 66 | let old_rank = self.get_register(idx); 67 | if new_rank > old_rank { 68 | self.set_register(idx, old_rank, new_rank); 69 | } 70 | } 71 | 72 | /// Get HyperLogLog `idx` register 73 | #[inline] 74 | fn get_register(&self, idx: u32) -> u32 { 75 | let bit_idx = (idx as usize) * W; 76 | let u32_idx = (bit_idx / 32) + 2; 77 | let bit_pos = bit_idx % 32; 78 | // SAFETY: `self.data` is always guaranteed to have these elements. 79 | let bits = unsafe { self.data.get_unchecked(u32_idx..u32_idx + 2) }; 80 | let bits_1 = W.min(32 - bit_pos); 81 | let bits_2 = W - bits_1; 82 | let mask_1 = (1 << bits_1) - 1; 83 | let mask_2 = (1 << bits_2) - 1; 84 | 85 | ((bits[0] >> bit_pos) & mask_1) | ((bits[1] & mask_2) << bits_1) 86 | } 87 | 88 | /// Set HyperLogLog `idx` register to new value `rank` 89 | #[inline] 90 | fn set_register(&mut self, idx: u32, old_rank: u32, new_rank: u32) { 91 | let bit_idx = (idx as usize) * W; 92 | let u32_idx = (bit_idx / 32) + 2; 93 | let bit_pos = bit_idx % 32; 94 | // SAFETY: `self.data` is always guaranteed to have these elements. 95 | let bits = unsafe { self.data.get_unchecked_mut(u32_idx..u32_idx + 2) }; 96 | let bits_1 = W.min(32 - bit_pos); 97 | let bits_2 = W - bits_1; 98 | let mask_1 = (1 << bits_1) - 1; 99 | let mask_2 = (1 << bits_2) - 1; 100 | 101 | // Unconditionally update two `u32` elements based on `new_rank` bits and masks 102 | bits[0] &= !(mask_1 << bit_pos); 103 | bits[0] |= (new_rank & mask_1) << bit_pos; 104 | bits[1] &= !mask_2; 105 | bits[1] |= (new_rank >> bits_1) & mask_2; 106 | 107 | // Update HyperLogLog's number of zero registers and harmonic sum 108 | // SAFETY: `self.data` is always guaranteed to have 0-th and 1-st elements. 
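// The zero-register counter is updated branchlessly: the boolean conditions are converted to
// 0/1 integers and combined with `&`, so the counter only decreases when a register leaves
// rank 0 and can never underflow. The harmonic sum is then adjusted incrementally by swapping
// the old rank's 2^-rank contribution for the new rank's, which is what keeps `estimate` O(1).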
109 | let zeros_and_sum = unsafe { self.data.get_unchecked_mut(0..2) }; 110 | zeros_and_sum[0] -= u32::from(old_rank == 0) & u32::from(zeros_and_sum[0] > 0); 111 | 112 | let mut sum = f32::from_bits(zeros_and_sum[1]); 113 | sum -= 1.0 / ((1u64 << u64::from(old_rank)) as f32); 114 | sum += 1.0 / ((1u64 << u64::from(new_rank)) as f32); 115 | zeros_and_sum[1] = sum.to_bits(); 116 | } 117 | 118 | /// Merge two `HyperLogLog` representations. 119 | #[inline] 120 | pub(crate) fn merge(&mut self, rhs: &HyperLogLog) { 121 | for idx in 0..Self::M as u32 { 122 | let lhs_rank = self.get_register(idx); 123 | let rhs_rank = rhs.get_register(idx); 124 | if rhs_rank > lhs_rank { 125 | self.set_register(idx, lhs_rank, rhs_rank); 126 | } 127 | } 128 | } 129 | } 130 | 131 | impl RepresentationTrait for HyperLogLog<'_, P, W> { 132 | /// Insert encoded hash into `HyperLogLog` representation. 133 | #[inline] 134 | fn insert_encoded_hash(&mut self, h: u32) -> usize { 135 | let (idx, rank) = Self::decode_hash(h); 136 | self.update_rank(idx, rank); 137 | self.to_data() 138 | } 139 | 140 | /// Return cardinality estimate of `HyperLogLog` representation 141 | #[inline] 142 | fn estimate(&self) -> usize { 143 | // SAFETY: `self.data` is always guaranteed to have 0-th and 1-st elements. 144 | let zeros = unsafe { *self.data.get_unchecked(0) }; 145 | let sum = f64::from(f32::from_bits(unsafe { *self.data.get_unchecked(1) })); 146 | let estimate = alpha(Self::M) * ((Self::M * (Self::M - zeros as usize)) as f64) 147 | / (sum + beta_horner(f64::from(zeros), P)); 148 | (estimate + 0.5) as usize 149 | } 150 | 151 | /// Return memory size of `HyperLogLog` 152 | #[inline] 153 | fn size_of(&self) -> usize { 154 | size_of::() + size_of_val(self.data) 155 | } 156 | 157 | /// Free memory occupied by the `HyperLogLog` representation 158 | /// SAFETY: caller of this method must ensure that `self.data` holds valid slice elements. 159 | #[inline] 160 | unsafe fn drop(&mut self) { 161 | drop(Box::from_raw(self.data)); 162 | } 163 | 164 | /// Convert `HyperLogLog` representation to `data` 165 | #[inline] 166 | fn to_data(&self) -> usize { 167 | (PTR_MASK & self.data.as_ptr() as usize) | REPRESENTATION_HLL 168 | } 169 | } 170 | 171 | impl From for HyperLogLog<'_, P, W> { 172 | /// Create new instance of `HyperLogLog` from given `data` 173 | #[inline] 174 | fn from(data: usize) -> Self { 175 | let ptr = (data & PTR_MASK) as *mut u32; 176 | // SAFETY: caller of this method must ensure that `data` contains valid slice pointer. 
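// The slice length is not stored in `data`: it is always exactly `HLL_SLICE_LEN`, derived
// from the `P` and `W` const parameters, so the tagged pointer alone is enough to rebuild the
// slice.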
177 | let data = unsafe { slice::from_raw_parts_mut(ptr, Self::HLL_SLICE_LEN) }; 178 | Self { data } 179 | } 180 | } 181 | 182 | impl From> for HyperLogLog<'_, P, W> { 183 | /// Create new instance of `HyperLogLog` from given `hll_data` 184 | #[inline] 185 | fn from(mut hll_data: Vec) -> Self { 186 | let data = (PTR_MASK & hll_data.as_mut_ptr() as usize) | 3; 187 | std::mem::forget(hll_data); 188 | Self::from(data) 189 | } 190 | } 191 | 192 | impl Clone for HyperLogLog<'_, P, W> { 193 | /// Clone `HyperLogLog` representation 194 | #[inline] 195 | fn clone(&self) -> Self { 196 | let mut hll_data = self.data.to_vec(); 197 | let data = (PTR_MASK & hll_data.as_mut_ptr() as usize) | 3; 198 | std::mem::forget(hll_data); 199 | Self::from(data) 200 | } 201 | } 202 | 203 | impl Debug for HyperLogLog<'_, P, W> { 204 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 205 | f.write_str(&self.to_string()) 206 | } 207 | } 208 | 209 | /// Parameter for bias correction 210 | #[inline] 211 | fn alpha(m: usize) -> f64 { 212 | match m { 213 | 16 => 0.673, 214 | 32 => 0.697, 215 | 64 => 0.709, 216 | _ => 0.7213 / (1.0 + 1.079 / (m as f64)), 217 | } 218 | } 219 | /// Computes LogLog-Beta estimate bias correction using Horner's method. 220 | /// 221 | /// Paper: https://arxiv.org/pdf/1612.02284.pdf 222 | /// Wikipedia: https://en.wikipedia.org/wiki/Horner%27s_method 223 | #[inline] 224 | fn beta_horner(z: f64, precision: usize) -> f64 { 225 | let beta = BETA[precision - 4]; 226 | let zl = (z + 1.0).ln(); 227 | let mut res = 0.0; 228 | for i in (1..8).rev() { 229 | res = res * zl + beta[i]; 230 | } 231 | res * zl + beta[0] * z 232 | } 233 | 234 | /// LogLog-Beta polynomial coefficients for precision in [4..18] range. 235 | const BETA: [[f64; 8]; 15] = [ 236 | // p = 4 237 | [ 238 | -0.582581413904517, 239 | -1.93530035756005, 240 | 11.079323758035073, 241 | -22.131357446444323, 242 | 22.505391846630037, 243 | -12.000723834917984, 244 | 3.220579408194167, 245 | -0.342225302271235, 246 | ], 247 | // p = 5 248 | [ 249 | -0.7518999460733967, 250 | -0.959003007774876, 251 | 5.59973713221416, 252 | -8.209763699976552, 253 | 6.509125489447204, 254 | -2.683029373432373, 255 | 0.5612891113138221, 256 | -0.0463331622196545, 257 | ], 258 | // p = 6 259 | [ 260 | 29.825790096961963, 261 | -31.328708333772592, 262 | -10.594252303658228, 263 | -11.572012568909962, 264 | 3.818875437390749, 265 | -2.416013032853081, 266 | 0.4542208940970826, 267 | -0.0575155452020420, 268 | ], 269 | // p = 7 270 | [ 271 | 2.810292129082006, 272 | -3.9780498518175995, 273 | 1.3162680041351582, 274 | -3.92524863358059, 275 | 2.008083575394647, 276 | -0.7527151937556955, 277 | 0.1265569894242751, 278 | -0.0109946438726240, 279 | ], 280 | // p = 8 281 | [ 282 | 1.0063354488755052, 283 | -2.005806664051124, 284 | 1.6436974936651412, 285 | -2.7056080994056617, 286 | 1.392099802442226, 287 | -0.4647037427218319, 288 | 0.07384282377269775, 289 | -0.00578554885254223, 290 | ], 291 | // p = 9 292 | [ 293 | -0.09415657458167959, 294 | -0.7813097592455053, 295 | 1.7151494675071246, 296 | -1.7371125040651634, 297 | 0.8644150848904892, 298 | -0.23819027465047218, 299 | 0.03343448400269076, 300 | -0.00207858528178157, 301 | ], 302 | // p = 10 303 | [ 304 | -0.25935400670790054, 305 | -0.5259830199980581, 306 | 1.4893303492587684, 307 | -1.2964271408499357, 308 | 0.6228475621722162, 309 | -0.1567232677025104, 310 | 0.02054415903878563, 311 | -0.00112488483925502, 312 | ], 313 | // p = 11 314 | [ 315 | -4.32325553856025e-01, 316 | 
-1.08450736399632e-01,
317 | 6.09156550741120e-01,
318 | -1.65687801845180e-02,
319 | -7.95829341087617e-02,
320 | 4.71830602102918e-02,
321 | -7.81372902346934e-03,
322 | 5.84268708489995e-04,
323 | ],
324 | // p = 12
325 | [
326 | -3.84979202588598e-01,
327 | 1.83162233114364e-01,
328 | 1.30396688841854e-01,
329 | 7.04838927629266e-02,
330 | -8.95893971464453e-03,
331 | 1.13010036741605e-02,
332 | -1.94285569591290e-03,
333 | 2.25435774024964e-04,
334 | ],
335 | // p = 13
336 | [
337 | -0.41655270946462997,
338 | -0.22146677040685156,
339 | 0.38862131236999947,
340 | 0.4534097974606237,
341 | -0.36264738324476375,
342 | 0.12304650053558529,
343 | -0.0170154038455551,
344 | 0.00102750367080838,
345 | ],
346 | // p = 14
347 | [
348 | -3.71009760230692e-01,
349 | 9.78811941207509e-03,
350 | 1.85796293324165e-01,
351 | 2.03015527328432e-01,
352 | -1.16710521803686e-01,
353 | 4.31106699492820e-02,
354 | -5.99583540511831e-03,
355 | 4.49704299509437e-04,
356 | ],
357 | // p = 15
358 | [
359 | -0.38215145543875273,
360 | -0.8906940053609084,
361 | 0.3760233577467887,
362 | 0.9933597744068238,
363 | -0.6557744163831896,
364 | 0.1833234212970361,
365 | -0.02241529633062872,
366 | 0.00121399789330194,
367 | ],
368 | // p = 16
369 | [
370 | -0.3733187664375306,
371 | -1.41704077448123,
372 | 0.40729184796612533,
373 | 1.5615203390658416,
374 | -0.9924223353428613,
375 | 0.2606468139948309,
376 | -0.03053811369682807,
377 | 0.00155770210179105,
378 | ],
379 | // p = 17
380 | [
381 | -0.36775502299404605,
382 | 0.5383142235137797,
383 | 0.7697028927876792,
384 | 0.5500258358645056,
385 | -0.7457558826114694,
386 | 0.2571183578582195,
387 | -0.03437902606864149,
388 | 0.00185949146371616,
389 | ],
390 | // p = 18
391 | [
392 | -0.3647962332596054,
393 | 0.9973041232863503,
394 | 1.5535438623008122,
395 | 1.2593267719802892,
396 | -1.5332594820911016,
397 | 0.4780104220005659,
398 | -0.05951025172951174,
399 | 0.00291076804642205,
400 | ],
401 | ];
402 | 
403 | #[cfg(test)]
404 | mod tests {
405 | use super::*;
406 | 
407 | #[test]
408 | fn hyperloglog_size() {
409 | assert_eq!(std::mem::size_of::<HyperLogLog<12, 6>>(), 16);
410 | }
411 | }
412 | 
--------------------------------------------------------------------------------
/src/estimator.rs:
--------------------------------------------------------------------------------
1 | use std::fmt::{Debug, Formatter};
2 | use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};
3 | use std::marker::PhantomData;
4 | use std::ops::Deref;
5 | 
6 | use wyhash::WyHash;
7 | 
8 | use crate::representation::{Representation, RepresentationTrait};
9 | 
10 | /// Ensure that only 64-bit architecture is being used.
11 | #[cfg(target_pointer_width = "64")] 12 | pub struct CardinalityEstimator 13 | where 14 | T: Hash + ?Sized, 15 | H: Hasher + Default, 16 | { 17 | /// Data field represents tagged pointer with its format described in lib.rs 18 | pub(crate) data: usize, 19 | /// Zero-sized build hasher 20 | build_hasher: BuildHasherDefault, 21 | /// Zero-sized phantom data for type `T` 22 | _phantom_data: PhantomData, 23 | } 24 | 25 | impl CardinalityEstimator 26 | where 27 | T: Hash + ?Sized, 28 | H: Hasher + Default, 29 | { 30 | /// Creates new instance of `CardinalityEstimator` 31 | #[inline] 32 | pub fn new() -> Self { 33 | // Ensure that `P` and `W` are in correct range at compile time 34 | const { assert!(P >= 4 && P <= 18 && W >= 4 && W <= 6) } 35 | 36 | Self { 37 | // Start with empty small representation 38 | data: 0, 39 | build_hasher: BuildHasherDefault::default(), 40 | _phantom_data: PhantomData, 41 | } 42 | } 43 | 44 | /// Insert a hashable item into `CardinalityEstimator` 45 | #[inline] 46 | pub fn insert(&mut self, item: &T) { 47 | let hash = self.build_hasher.hash_one(&item); 48 | self.insert_hash(hash); 49 | } 50 | 51 | /// Return cardinality estimate 52 | #[inline] 53 | pub fn estimate(&self) -> usize { 54 | self.representation().estimate() 55 | } 56 | 57 | /// Merge cardinality estimators 58 | #[inline] 59 | pub fn merge(&mut self, rhs: &Self) { 60 | match (self.representation(), rhs.representation()) { 61 | (_, Representation::Small(rhs_small)) => { 62 | for h in rhs_small.items() { 63 | if h != 0 { 64 | self.insert_encoded_hash(h); 65 | } 66 | } 67 | } 68 | (_, Representation::Array(rhs_arr)) => { 69 | for &h in rhs_arr.deref() { 70 | self.insert_encoded_hash(h); 71 | } 72 | } 73 | (Representation::Small(lhs_small), Representation::Hll(rhs_hll)) => { 74 | let mut hll = rhs_hll.clone(); 75 | for h in lhs_small.items() { 76 | if h != 0 { 77 | hll.insert_encoded_hash(h); 78 | } 79 | } 80 | self.data = hll.to_data(); 81 | } 82 | (Representation::Array(mut lhs_arr), Representation::Hll(rhs_hll)) => { 83 | let mut hll = rhs_hll.clone(); 84 | for &h in lhs_arr.deref() { 85 | hll.insert_encoded_hash(h); 86 | } 87 | unsafe { lhs_arr.drop() }; 88 | self.data = hll.to_data(); 89 | } 90 | (Representation::Hll(mut lhs_hll), Representation::Hll(rhs_hll)) => { 91 | lhs_hll.merge(&rhs_hll); 92 | } 93 | } 94 | } 95 | 96 | /// Returns the representation type of `CardinalityEstimator`. 
97 | #[inline] 98 | pub(crate) fn representation(&self) -> Representation { 99 | Representation::::from_data(self.data) 100 | } 101 | 102 | /// Insert hash into `CardinalityEstimator` 103 | #[inline] 104 | pub fn insert_hash(&mut self, hash: u64) { 105 | self.insert_encoded_hash(Self::encode_hash(hash)); 106 | } 107 | 108 | /// Insert encoded hash into `CardinalityEstimator` 109 | #[inline] 110 | fn insert_encoded_hash(&mut self, h: u32) { 111 | self.data = self.representation().insert_encoded_hash(h); 112 | } 113 | 114 | /// Compute the sparse encoding of the given hash 115 | #[inline] 116 | fn encode_hash(hash: u64) -> u32 { 117 | let idx = (hash as u32) & ((1 << (32 - W - 1)) - 1); 118 | let rank = (!hash >> P).trailing_zeros() + 1; 119 | (idx << W) | rank 120 | } 121 | 122 | /// Return memory size of `CardinalityEstimator` 123 | pub fn size_of(&self) -> usize { 124 | self.representation().size_of() 125 | } 126 | } 127 | 128 | impl Default for CardinalityEstimator 129 | where 130 | T: Hash + ?Sized, 131 | H: Hasher + Default, 132 | { 133 | fn default() -> Self { 134 | Self::new() 135 | } 136 | } 137 | 138 | impl Clone for CardinalityEstimator 139 | where 140 | T: Hash + ?Sized, 141 | H: Hasher + Default, 142 | { 143 | /// Clone `CardinalityEstimator` 144 | fn clone(&self) -> Self { 145 | let mut estimator = Self::new(); 146 | estimator.merge(self); 147 | estimator 148 | } 149 | } 150 | 151 | impl Drop for CardinalityEstimator 152 | where 153 | T: Hash + ?Sized, 154 | H: Hasher + Default, 155 | { 156 | /// Free memory occupied by `CardinalityEstimator` 157 | #[inline] 158 | fn drop(&mut self) { 159 | unsafe { self.representation().drop() }; 160 | } 161 | } 162 | 163 | impl PartialEq for CardinalityEstimator 164 | where 165 | T: Hash + ?Sized, 166 | H: Hasher + Default, 167 | { 168 | /// Compare cardinality estimators 169 | fn eq(&self, rhs: &Self) -> bool { 170 | self.representation() == rhs.representation() 171 | } 172 | } 173 | 174 | impl Debug for CardinalityEstimator 175 | where 176 | T: Hash + ?Sized, 177 | H: Hasher + Default, 178 | { 179 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 180 | write!(f, "{:?}", self.representation()) 181 | } 182 | } 183 | 184 | #[cfg(test)] 185 | pub mod tests { 186 | use super::*; 187 | use test_case::test_case; 188 | 189 | #[test_case(0 => "representation: Small(estimate: 0, size: 8), avg_err: 0.0000")] 190 | #[test_case(1 => "representation: Small(estimate: 1, size: 8), avg_err: 0.0000")] 191 | #[test_case(2 => "representation: Small(estimate: 2, size: 8), avg_err: 0.0000")] 192 | #[test_case(3 => "representation: Array(estimate: 3, size: 24), avg_err: 0.0000")] 193 | #[test_case(4 => "representation: Array(estimate: 4, size: 24), avg_err: 0.0000")] 194 | #[test_case(8 => "representation: Array(estimate: 8, size: 40), avg_err: 0.0000")] 195 | #[test_case(16 => "representation: Array(estimate: 16, size: 72), avg_err: 0.0000")] 196 | #[test_case(17 => "representation: Array(estimate: 17, size: 136), avg_err: 0.0000")] 197 | #[test_case(28 => "representation: Array(estimate: 28, size: 136), avg_err: 0.0000")] 198 | #[test_case(29 => "representation: Array(estimate: 29, size: 136), avg_err: 0.0000")] 199 | #[test_case(56 => "representation: Array(estimate: 56, size: 264), avg_err: 0.0000")] 200 | #[test_case(57 => "representation: Array(estimate: 57, size: 264), avg_err: 0.0000")] 201 | #[test_case(128 => "representation: Array(estimate: 128, size: 520), avg_err: 0.0000")] 202 | #[test_case(129 => "representation: Hll(estimate: 131, size: 
660), avg_err: 0.0001")] 203 | #[test_case(256 => "representation: Hll(estimate: 264, size: 660), avg_err: 0.0119")] 204 | #[test_case(512 => "representation: Hll(estimate: 512, size: 660), avg_err: 0.0151")] 205 | #[test_case(1024 => "representation: Hll(estimate: 1033, size: 660), avg_err: 0.0172")] 206 | #[test_case(10_000 => "representation: Hll(estimate: 10417, size: 660), avg_err: 0.0281")] 207 | #[test_case(100_000 => "representation: Hll(estimate: 93099, size: 660), avg_err: 0.0351")] 208 | fn test_estimator_p10_w5(n: usize) -> String { 209 | evaluate_cardinality_estimator(CardinalityEstimator::::new(), n) 210 | } 211 | 212 | #[test_case(0 => "representation: Small(estimate: 0, size: 8), avg_err: 0.0000")] 213 | #[test_case(1 => "representation: Small(estimate: 1, size: 8), avg_err: 0.0000")] 214 | #[test_case(2 => "representation: Small(estimate: 2, size: 8), avg_err: 0.0000")] 215 | #[test_case(3 => "representation: Array(estimate: 3, size: 24), avg_err: 0.0000")] 216 | #[test_case(4 => "representation: Array(estimate: 4, size: 24), avg_err: 0.0000")] 217 | #[test_case(8 => "representation: Array(estimate: 8, size: 40), avg_err: 0.0000")] 218 | #[test_case(16 => "representation: Array(estimate: 16, size: 72), avg_err: 0.0000")] 219 | #[test_case(32 => "representation: Array(estimate: 32, size: 136), avg_err: 0.0000")] 220 | #[test_case(64 => "representation: Array(estimate: 64, size: 264), avg_err: 0.0000")] 221 | #[test_case(128 => "representation: Array(estimate: 128, size: 520), avg_err: 0.0000")] 222 | #[test_case(129 => "representation: Hll(estimate: 130, size: 3092), avg_err: 0.0001")] 223 | #[test_case(256 => "representation: Hll(estimate: 254, size: 3092), avg_err: 0.0029")] 224 | #[test_case(512 => "representation: Hll(estimate: 498, size: 3092), avg_err: 0.0068")] 225 | #[test_case(1024 => "representation: Hll(estimate: 1012, size: 3092), avg_err: 0.0130")] 226 | #[test_case(4096 => "representation: Hll(estimate: 4105, size: 3092), avg_err: 0.0089")] 227 | #[test_case(10_000 => "representation: Hll(estimate: 10068, size: 3092), avg_err: 0.0087")] 228 | #[test_case(100_000 => "representation: Hll(estimate: 95628, size: 3092), avg_err: 0.0182")] 229 | fn test_estimator_p12_w6(n: usize) -> String { 230 | evaluate_cardinality_estimator(CardinalityEstimator::::new(), n) 231 | } 232 | 233 | #[test_case(0 => "representation: Small(estimate: 0, size: 8), avg_err: 0.0000")] 234 | #[test_case(1 => "representation: Small(estimate: 1, size: 8), avg_err: 0.0000")] 235 | #[test_case(2 => "representation: Small(estimate: 2, size: 8), avg_err: 0.0000")] 236 | #[test_case(3 => "representation: Array(estimate: 3, size: 24), avg_err: 0.0000")] 237 | #[test_case(4 => "representation: Array(estimate: 4, size: 24), avg_err: 0.0000")] 238 | #[test_case(8 => "representation: Array(estimate: 8, size: 40), avg_err: 0.0000")] 239 | #[test_case(16 => "representation: Array(estimate: 16, size: 72), avg_err: 0.0000")] 240 | #[test_case(32 => "representation: Array(estimate: 32, size: 136), avg_err: 0.0000")] 241 | #[test_case(64 => "representation: Array(estimate: 64, size: 264), avg_err: 0.0000")] 242 | #[test_case(128 => "representation: Array(estimate: 128, size: 520), avg_err: 0.0000")] 243 | #[test_case(129 => "representation: Hll(estimate: 129, size: 196628), avg_err: 0.0000")] 244 | #[test_case(256 => "representation: Hll(estimate: 256, size: 196628), avg_err: 0.0000")] 245 | #[test_case(512 => "representation: Hll(estimate: 511, size: 196628), avg_err: 0.0004")] 246 | #[test_case(1024 => 
"representation: Hll(estimate: 1022, size: 196628), avg_err: 0.0014")] 247 | #[test_case(4096 => "representation: Hll(estimate: 4100, size: 196628), avg_err: 0.0009")] 248 | #[test_case(10_000 => "representation: Hll(estimate: 10007, size: 196628), avg_err: 0.0008")] 249 | #[test_case(100_000 => "representation: Hll(estimate: 100240, size: 196628), avg_err: 0.0011")] 250 | fn test_estimator_p18_w6(n: usize) -> String { 251 | evaluate_cardinality_estimator(CardinalityEstimator::::new(), n) 252 | } 253 | 254 | fn evaluate_cardinality_estimator( 255 | mut e: CardinalityEstimator, 256 | n: usize, 257 | ) -> String { 258 | let mut total_relative_error: f64 = 0.0; 259 | for i in 0..n { 260 | e.insert(&i); 261 | let estimate = e.estimate() as f64; 262 | let actual = (i + 1) as f64; 263 | let error = estimate - actual; 264 | let relative_error = error.abs() / actual; 265 | total_relative_error += relative_error; 266 | } 267 | 268 | let avg_relative_error = total_relative_error / ((n + 1) as f64); 269 | 270 | // Compute the expected standard error for HyperLogLog based on the precision 271 | let standard_error = 1.04 / 2.0f64.powi(P as i32).sqrt(); 272 | let tolerance = 1.2; 273 | 274 | assert!( 275 | avg_relative_error <= standard_error * tolerance, 276 | "Average relative error {} exceeds acceptable threshold {}", 277 | avg_relative_error, 278 | standard_error * tolerance 279 | ); 280 | 281 | format!( 282 | "representation: {:?}, avg_err: {:.4}", 283 | e, avg_relative_error 284 | ) 285 | } 286 | 287 | #[test_case(0, 0 => "Small(estimate: 0, size: 8)")] 288 | #[test_case(0, 1 => "Small(estimate: 1, size: 8)")] 289 | #[test_case(1, 0 => "Small(estimate: 1, size: 8)")] 290 | #[test_case(1, 1 => "Small(estimate: 2, size: 8)")] 291 | #[test_case(1, 2 => "Array(estimate: 3, size: 24)")] 292 | #[test_case(2, 1 => "Array(estimate: 3, size: 24)")] 293 | #[test_case(2, 2 => "Array(estimate: 4, size: 24)")] 294 | #[test_case(2, 3 => "Array(estimate: 5, size: 40)")] 295 | #[test_case(2, 4 => "Array(estimate: 6, size: 40)")] 296 | #[test_case(4, 2 => "Array(estimate: 6, size: 40)")] 297 | #[test_case(3, 2 => "Array(estimate: 5, size: 40)")] 298 | #[test_case(3, 3 => "Array(estimate: 6, size: 40)")] 299 | #[test_case(3, 4 => "Array(estimate: 7, size: 40)")] 300 | #[test_case(4, 3 => "Array(estimate: 7, size: 40)")] 301 | #[test_case(4, 4 => "Array(estimate: 8, size: 40)")] 302 | #[test_case(4, 8 => "Array(estimate: 12, size: 72)")] 303 | #[test_case(8, 4 => "Array(estimate: 12, size: 72)")] 304 | #[test_case(4, 12 => "Array(estimate: 16, size: 72)")] 305 | #[test_case(12, 4 => "Array(estimate: 16, size: 72)")] 306 | #[test_case(1, 127 => "Array(estimate: 128, size: 520)")] 307 | #[test_case(1, 128 => "Hll(estimate: 130, size: 3092)")] 308 | #[test_case(127, 1 => "Array(estimate: 128, size: 520)")] 309 | #[test_case(128, 1 => "Hll(estimate: 130, size: 3092)")] 310 | #[test_case(128, 128 => "Hll(estimate: 254, size: 3092)")] 311 | #[test_case(512, 512 => "Hll(estimate: 1012, size: 3092)")] 312 | #[test_case(10000, 0 => "Hll(estimate: 10068, size: 3092)")] 313 | #[test_case(0, 10000 => "Hll(estimate: 10068, size: 3092)")] 314 | #[test_case(4, 10000 => "Hll(estimate: 10068, size: 3092)")] 315 | #[test_case(10000, 4 => "Hll(estimate: 10068, size: 3092)")] 316 | #[test_case(17, 10000 => "Hll(estimate: 10073, size: 3092)")] 317 | #[test_case(10000, 17 => "Hll(estimate: 10073, size: 3092)")] 318 | #[test_case(10000, 10000 => "Hll(estimate: 19974, size: 3092)")] 319 | fn test_merge(lhs_n: usize, rhs_n: usize) -> String 
{ 320 | let mut lhs = CardinalityEstimator::::new(); 321 | for i in 0..lhs_n { 322 | lhs.insert(&i); 323 | } 324 | 325 | let mut rhs = CardinalityEstimator::::new(); 326 | for i in lhs_n..lhs_n + rhs_n { 327 | rhs.insert(&i); 328 | } 329 | 330 | lhs.merge(&rhs); 331 | 332 | format!("{:?}", lhs) 333 | } 334 | 335 | #[test] 336 | fn test_insert() { 337 | // Create a new CardinalityEstimator. 338 | let mut e = CardinalityEstimator::::new(); 339 | 340 | // Ensure initial estimate is 0. 341 | assert_eq!(e.estimate(), 0); 342 | 343 | // Insert a test item and validate estimate. 344 | e.insert("test item 1"); 345 | assert_eq!(e.estimate(), 1); 346 | 347 | // Re-insert the same item, estimate should remain the same. 348 | e.insert("test item 1"); 349 | assert_eq!(e.estimate(), 1); 350 | 351 | // Insert a new distinct item, estimate should increase. 352 | e.insert("test item 2"); 353 | assert_eq!(e.estimate(), 2); 354 | } 355 | } 356 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cardinality-estimator 2 | ![build](https://img.shields.io/github/actions/workflow/status/cloudflare/cardinality-estimator/ci.yml?branch=main) 3 | [![docs.rs](https://docs.rs/cardinality-estimator/badge.svg)](https://docs.rs/cardinality-estimator) 4 | [![crates.io](https://img.shields.io/crates/v/cardinality-estimator.svg)](https://crates.io/crates/cardinality-estimator) 5 | [![License](https://img.shields.io/badge/license-Apache%202.0-blue)](LICENSE) 6 | 7 | `cardinality-estimator` is a Rust crate designed to estimate the number of distinct elements in a stream or dataset in an efficient manner. 8 | This library uses HyperLogLog++ with an optimized low memory footprint and high accuracy approach, suitable for large-scale data analysis tasks. 9 | We're using `cardinality-estimator` for large-scale machine learning, computing cardinality features across multiple dimensions of the request. 10 | 11 | ## Overview 12 | Our `cardinality-estimator` is highly efficient in terms of memory usage, latency, and accuracy. 13 | This is achieved by leveraging a combination of unique data structure design, efficient algorithms, and HyperLogLog++ for high cardinality ranges. 14 | 15 | ## Getting Started 16 | To use `cardinality-estimator`, add it to your `Cargo.toml` under `[dependencies]`: 17 | ```toml 18 | [dependencies] 19 | cardinality-estimator = "1.0.0" 20 | ``` 21 | Then, import `cardinality-estimator` in your Rust program: 22 | ```rust 23 | use cardinality_estimator::CardinalityEstimator; 24 | 25 | let mut estimator = CardinalityEstimator::<12, 6>::new(); 26 | estimator.insert("test"); 27 | let estimate = estimator.estimate(); 28 | 29 | println!("estimate = {}", estimate); 30 | ``` 31 | 32 | Please refer to our [examples](examples) and [benchmarks](benches) in the repository for more complex scenarios. 33 | 34 | ## Low memory footprint 35 | The `cardinality-estimator` achieves low memory footprint by leveraging an efficient data storage format. 36 | The data is stored in three different representations - `Small`, `Array`, and `HyperLogLog` - depending on the cardinality range. 37 | For instance, for a cardinality of 0 to 2, only **8 bytes** of stack memory and 0 bytes of heap memory are used. 38 | 39 | ## Low latency 40 | The crate offers low latency by using auto-vectorization for slice operations via compiler hints to use SIMD instructions. 
41 | The number of zero registers and the registers' harmonic sum are stored and updated dynamically as more data is inserted, resulting in fast estimate operations.
42 | 
43 | ## High accuracy
44 | The cardinality-estimator achieves high accuracy by using precise counting for small cardinality ranges and HyperLogLog++ with LogLog-Beta bias correction for larger ranges.
45 | This provides expected error rates as low as 0.02% for large cardinalities.
46 | 
47 | ## Benchmarks
48 | 
49 | To run the benchmarks, you first need to install the `cargo-criterion` binary:
50 | ```shell
51 | cargo install cargo-criterion
52 | ```
53 | 
54 | Then run the benchmarks with JSON output to save results for further analysis:
55 | ```shell
56 | make bench
57 | ```
58 | 
59 | We've benchmarked cardinality-estimator against several other crates in the ecosystem:
60 | * [hyperloglog](https://crates.io/crates/hyperloglog)
61 | * [hyperloglogplus](https://crates.io/crates/hyperloglogplus)
62 | * [amadeus-streaming](https://crates.io/crates/amadeus-streaming)
63 | * [probabilistic-collections](https://crates.io/crates/probabilistic-collections)
64 | 
65 | Please note that the [hyperloglog](https://github.com/jedisct1/rust-hyperloglog/blob/1.0.2/src/lib.rs#L33) and [probabilistic-collections](https://gitlab.com/jeffrey-xiao/probabilistic-collections-rs/-/blob/da2a331e9679e4686bdcc772c369b639b9c33dee/src/hyperloglog.rs#L103) crates have a bug in the calculation of precision `p` based on the provided `probability`:
66 | * incorrect formula: `p = (1.04 / error_probability).powi(2).ln().ceil() as usize;`
67 | * corrected formula: `p = (1.04 / error_probability).powi(2).log2().ceil() as usize;`
68 | 
69 | We're continuously working to make `cardinality-estimator` the fastest, lightest, and most accurate tool for cardinality estimation in Rust.
70 | 
71 | The benchmarks presented below were executed on a Linux laptop with a `13th Gen Intel(R) Core(TM) i7-13800H` processor and compiler flags set to `RUSTFLAGS=-C target-cpu=native`.
72 | 
73 | ### Memory usage
74 | ![Cardinality Estimators Memory Usage](benches/memory_bytes.png)
75 | 
76 | The table below compares the memory usage of different cardinality estimators.
77 | The number in each cell represents `stack memory bytes / heap memory bytes / heap memory blocks` at each measured cardinality.
78 | 
79 | Our `cardinality-estimator` achieves the lowest stack and heap memory allocations across all measured cardinalities.
80 | 
81 | Note that the `hyperloglogplus` implementation has particularly high memory usage, especially for cardinalities above 256. A small self-check sketch using the crate's `size_of()` method is shown below, followed by the full comparison table.
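For a quick self-check outside the benchmark harness, the total footprint reported by the crate itself can be queried through the public `size_of()` method defined in `src/estimator.rs`. This is only a minimal sketch: it assumes the `<T, H, P, W>` generic parameter order from `src/estimator.rs` and the `wyhash` crate as the hasher, and it reports a single total rather than the stack/heap/blocks breakdown used in the table.

```rust
use cardinality_estimator::CardinalityEstimator;
use wyhash::WyHash;

fn main() {
    // Generic params: item type, hasher, precision P, register width W (assumed order).
    let mut estimator = CardinalityEstimator::<usize, WyHash, 12, 6>::new();
    println!("cardinality 0: {} bytes", estimator.size_of());

    for i in 0..1024usize {
        estimator.insert(&i);
        // Print at the same power-of-two cardinalities as the table below.
        if (i + 1).is_power_of_two() {
            println!("cardinality {}: {} bytes", i + 1, estimator.size_of());
        }
    }
}
```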
82 | 83 | | cardinality | cardinality_estimator | amadeus_streaming | probabilistic_collections | hyperloglog | hyperloglogplus | 84 | |-------------|-----------------------|-------------------|---------------------------|----------------|--------------------| 85 | | 0 | **8 / 0 / 0** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4464 / 2 | 160 / 0 / 0 | 86 | | 1 | **8 / 0 / 0** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 36 / 1 | 87 | | 2 | **8 / 0 / 0** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 36 / 1 | 88 | | 4 | **8 / 16 / 1** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 92 / 2 | 89 | | 8 | **8 / 48 / 2** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 188 / 3 | 90 | | 16 | **8 / 112 / 3** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 364 / 4 | 91 | | 32 | **8 / 240 / 4** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 700 / 5 | 92 | | 64 | **8 / 496 / 5** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 1400 / 13 | 93 | | 128 | **8 / 1008 / 6** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 3261 / 23 | 94 | | 256 | **8 / 4092 / 7** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 10361 / 43 | 95 | | 512 | **8 / 4092 / 7** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 38295 / 83 | 96 | | 1024 | **8 / 4092 / 7** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 146816 / 163 | 97 | | 2048 | **8 / 4092 / 7** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 207711 / 194 | 98 | | 4096 | **8 / 4092 / 7** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 207711 / 194 | 99 | | 8192 | **8 / 4092 / 7** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 207711 / 194 | 100 | | 16384 | **8 / 4092 / 7** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 207711 / 194 | 101 | | 32768 | **8 / 4092 / 7** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 207711 / 194 | 102 | | 65536 | **8 / 4092 / 7** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 207711 / 194 | 103 | | 131072 | **8 / 4092 / 7** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 207711 / 194 | 104 | | 262144 | **8 / 4092 / 7** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 207711 / 194 | 105 | | 524288 | **8 / 4092 / 7** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 207711 / 194 | 106 | | 1048576 | **8 / 4092 / 7** | 48 / 4096 / 1 | 128 / 4096 / 1 | 120 / 4096 / 1 | 160 / 207711 / 194 | 107 | 108 | ### Insert performance 109 | ![Cardinality Estimators Insert Time](benches/insert_time.png) 110 | 111 | Table below represents insert time in nanoseconds per element. 112 | 113 | Our `cardinality-estimator` demonstrates the lowest insert time for most of the cardinalities. 
114 | 
115 | | cardinality | cardinality-estimator | amadeus-streaming | probabilistic-collections | hyperloglog | hyperloglogplus |
116 | |---------------|-------------------------|---------------------|-----------------------------|---------------|-------------------|
117 | | 0 | **0.64** | 88.12 | 70.19 | 82.69 | 17.45 |
118 | | 1 | **2.42** | 91.5 | 80.2 | 131.86 | 60.65 |
119 | | 2 | **2.21** | 44.3 | 45.34 | 81.48 | 34.96 |
120 | | 4 | **6.9** | 25.59 | 24.85 | 54.38 | 36.22 |
121 | | 8 | **7.27** | 15.62 | 17.92 | 43.54 | 35.55 |
122 | | 16 | **6.99** | 12.15 | 14.44 | 37.24 | 33.4 |
123 | | 32 | **7.9** | 9.6 | 12.78 | 34.23 | 32.49 |
124 | | 64 | 10.14 | **8.97** | 11.86 | 32.55 | 39.04 |
125 | | 128 | 15.47 | **8.52** | 11.49 | 31.76 | 48.37 |
126 | | 256 | 13.42 | **8.01** | 11.24 | 31.44 | 65.58 |
127 | | 512 | 9.92 | **8.1** | 11.11 | 31.34 | 100.25 |
128 | | 1024 | 8.32 | **8.14** | 12.52 | 31.73 | 171.71 |
129 | | 2048 | **7.31** | 7.92 | 12.52 | 32.03 | 120.71 |
130 | | 4096 | **7.11** | 8.01 | 11.04 | 32.73 | 63.5 |
131 | | 8192 | 8.81 | **8.02** | 10.97 | 33.08 | 37.36 |
132 | | 16384 | 8.08 | **8.01** | 11.03 | 32.75 | 22.24 |
133 | | 32768 | **6.55** | 7.96 | 11.01 | 32.37 | 13.3 |
134 | | 65536 | **5.35** | 7.96 | 10.96 | 31.95 | 8.41 |
135 | | 131072 | **4.48** | 7.9 | 10.97 | 31.71 | 5.71 |
136 | | 262144 | **3.91** | 7.95 | 10.95 | 31.52 | 4.26 |
137 | | 524288 | 3.58 | 7.64 | 10.95 | 31.47 | **3.47** |
138 | | 1048576 | 3.35 | 7.95 | 10.95 | 31.47 | **3.04** |
139 | 
140 | ### Estimate performance
141 | ![Cardinality Estimators Estimate Time](benches/estimate_time.png)
142 | 
143 | Table below represents estimate time in nanoseconds per call.
144 | 
145 | Our `cardinality-estimator` shows the lowest estimate time for most cardinalities, especially small cardinalities up to 128.
146 | 
147 | Note that the `amadeus-streaming` implementation is also quite effective at the estimate operation; however, it has higher memory usage, as indicated by the table above.
148 | The `probabilistic-collections`, `hyperloglog`, and `hyperloglogplus` implementations have much higher estimate times, especially for higher cardinalities. The fast estimate operation in `cardinality-estimator` comes from the incremental bookkeeping sketched below, ahead of the results table.
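The constant-cost estimate is possible because the two statistics the estimator needs — the number of zero registers and the registers' harmonic sum — are maintained incrementally on every insert, as described in the Low latency section. Below is a simplified sketch of that bookkeeping; the names are illustrative and not the crate's internal API (the real logic lives in `update_rank` / `estimate` in `src/hyperloglog.rs`).

```rust
/// Illustrative running statistics for a HyperLogLog with M registers.
struct RunningStats {
    zeros: u32,        // number of registers still equal to 0
    harmonic_sum: f32, // sum over registers of 2^-rank
}

impl RunningStats {
    /// Called whenever a register changes from `old_rank` to a larger `new_rank`.
    fn on_register_update(&mut self, old_rank: u32, new_rank: u32) {
        if old_rank == 0 && self.zeros > 0 {
            self.zeros -= 1;
        }
        self.harmonic_sum -= 1.0 / (1u64 << old_rank) as f32;
        self.harmonic_sum += 1.0 / (1u64 << new_rank) as f32;
    }
}

fn main() {
    // With these running stats, computing an estimate needs no pass over the
    // registers: `zeros` and `harmonic_sum` plug straight into the
    // bias-corrected HyperLogLog formula.
    let mut stats = RunningStats { zeros: 4096, harmonic_sum: 4096.0 };
    stats.on_register_update(0, 3);
    println!("zeros = {}, harmonic sum = {}", stats.zeros, stats.harmonic_sum);
}
```

In the actual implementation these two values are kept in the first two slots of the register slice, so they are updated in place alongside the registers themselves.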
149 | 
150 | | cardinality | cardinality-estimator | amadeus-streaming | probabilistic-collections | hyperloglog | hyperloglogplus |
151 | |---------------|-------------------------|---------------------|-----------------------------|---------------|-------------------|
152 | | 0 | **0.18** | 7.9 | 15576.4 | 125.03 | 24.89 |
153 | | 1 | **0.18** | 9.19 | 15619.8 | 134.3 | 64.62 |
154 | | 2 | **0.18** | 9.18 | 15615.5 | 134.4 | 70.51 |
155 | | 4 | **0.18** | 9.2 | 15642.7 | 134.01 | 89.16 |
156 | | 8 | **0.18** | 9.19 | 15611.1 | 134.41 | 132.0 |
157 | | 16 | **0.18** | 9.19 | 15621.6 | 134.39 | 211.4 |
158 | | 32 | **0.18** | 9.19 | 15637.1 | 130.58 | 357.55 |
159 | | 64 | **0.18** | 9.19 | 15626 | 130.26 | 619.95 |
160 | | 128 | **0.18** | 9.18 | 15640.8 | 130.33 | 1134.12 |
161 | | 256 | 11.31 | **9.09** | 15668 | 133.5 | 2205.7 |
162 | | 512 | 11.3 | **9.09** | 15652 | 129.58 | 4334.05 |
163 | | 1024 | 11.31 | **9.09** | 15687.1 | 129.79 | 8392.59 |
164 | | 2048 | 11.28 | 9.11 | 15680.4 | 129.8 | **8.08** |
165 | | 4096 | **11.29** | 38.63 | 15803.4 | 129.49 | 4342.07 |
166 | | 8192 | **11.28** | 38.98 | 23285 | 129.51 | 4345.7 |
167 | | 16384 | **11.29** | 38.17 | 26950.7 | 132.96 | 4341.9 |
168 | | 32768 | **6.02** | 10.86 | 31168 | 7674.3 | 4334.98 |
169 | | 65536 | 6.05 | **4.1** | 33123.8 | 40986.4 | 4327.48 |
170 | | 131072 | 6.02 | **4.1** | 33772.4 | 42113.7 | 4327.29 |
171 | | 262144 | 6.02 | **4.11** | 34711.7 | 43587 | 4329.63 |
172 | | 524288 | 6.02 | **4.1** | 36091.2 | 43582.8 | 4327.8 |
173 | | 1048576 | 6.02 | **4.11** | 37877.1 | 45055.3 | 4327.37 |
174 | 
175 | ### Error rate
176 | ![Cardinality Estimators Error Rate](benches/error_rate.png)
177 | 
178 | Table below represents the average absolute relative error across 100 runs of each estimator on random elements at the given cardinality.
179 | 
180 | Our `cardinality-estimator` performs on par with the `amadeus-streaming` and `hyperloglog` estimators, but has an especially low error rate for cardinalities up to 128. For reference, the theoretical HyperLogLog standard error is about `1.04 / sqrt(2^P)`, i.e. roughly 1.6% at precision `P = 12`, which is consistent with the ~0.01 average errors observed below.
181 | 
182 | Note that the `probabilistic-collections` implementation appears to have a bug in its estimate operation for cardinalities >= 32768. A sketch for reproducing this kind of error measurement is shown below, ahead of the table.
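The per-cardinality error can be reproduced with a loop that compares the estimate against the true count after each insertion, in the spirit of `evaluate_cardinality_estimator` in `src/estimator.rs`. A minimal sketch follows, assuming the `<T, H, P, W>` generic order from `src/estimator.rs` and the `wyhash` crate as the hasher; it performs a single run over sequential items rather than the 100 randomized runs averaged in the table.

```rust
use cardinality_estimator::CardinalityEstimator;
use wyhash::WyHash;

fn main() {
    let n = 100_000usize;
    // Generic params: item type, hasher, precision P, register width W (assumed order).
    let mut estimator = CardinalityEstimator::<usize, WyHash, 12, 6>::new();
    let mut total_relative_error = 0.0f64;

    for i in 0..n {
        estimator.insert(&i);
        let actual = (i + 1) as f64;
        let estimate = estimator.estimate() as f64;
        // Accumulate |estimate - actual| / actual for the running average.
        total_relative_error += (estimate - actual).abs() / actual;
    }

    println!(
        "estimate = {}, avg relative error = {:.4}",
        estimator.estimate(),
        total_relative_error / n as f64
    );
}
```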
183 | 184 | | cardinality | cardinality_estimator | amadeus_streaming | probabilistic_collections | hyperloglog | hyperloglogplus | 185 | |-------------|-----------------------|-------------------|---------------------------|-------------|-----------------| 186 | | 0 | **0.0000** | **0.0000** | **0.0000** | **0.0000** | **0.0000** | 187 | | 1 | **0.0000** | **0.0000** | **0.0000** | **0.0000** | **0.0000** | 188 | | 2 | **0.0000** | **0.0000** | **0.0000** | **0.0000** | **0.0000** | 189 | | 4 | **0.0000** | **0.0000** | **0.0000** | **0.0000** | **0.0000** | 190 | | 8 | **0.0000** | **0.0000** | **0.0000** | **0.0000** | **0.0000** | 191 | | 16 | **0.0000** | 0.0019 | 0.0013 | 0.0025 | **0.0000** | 192 | | 32 | **0.0000** | 0.0041 | 0.0031 | 0.0041 | **0.0000** | 193 | | 64 | **0.0000** | 0.0066 | 0.0086 | 0.0078 | **0.0000** | 194 | | 128 | **0.0000** | 0.0123 | 0.0116 | 0.0140 | **0.0000** | 195 | | 256 | 0.0080 | 0.0097 | 0.0094 | 0.0084 | **0.0000** | 196 | | 512 | 0.0088 | 0.0100 | 0.0087 | 0.0090 | **0.0000** | 197 | | 1024 | 0.0080 | 0.0094 | 0.0101 | 0.0095 | **0.0000** | 198 | | 2048 | 0.0092 | 0.0093 | **0.0090** | 0.0107 | 0.0100 | 199 | | 4096 | **0.0099** | 0.0108 | 0.0113 | 0.0114 | 0.0103 | 200 | | 8192 | 0.0096 | **0.0095** | 0.0131 | 0.0126 | 0.0109 | 201 | | 16384 | 0.0116 | **0.0107** | 0.0204 | 0.0229 | 0.0117 | 202 | | 32768 | 0.0125 | **0.0109** | 1.46e14 | 0.0437 | 0.0116 | 203 | | 65536 | 0.0132 | 0.0133 | 2.81e14 | 0.0143 | **0.0118** | 204 | | 131072 | **0.0116** | 0.0121 | 1.41e14 | 0.0128 | 0.0127 | 205 | | 262144 | 0.0137 | 0.0144 | 7.04e13 | 0.0122 | **0.0116** | 206 | | 524288 | 0.0138 | 0.0136 | 3.52e13 | **0.0116** | 0.0121 | 207 | | 1048576 | 0.0113 | 0.0124 | 1.76e13 | 0.0141 | **0.0110** | 208 | | **mean** | 0.0064 | 0.0078 | 3.14e13 | 0.0101 | **0.0052** | 209 | --------------------------------------------------------------------------------