Implementing HLL in python to estimate the cardinality

66 Views Asked by At

I'm trying to implement the HLL algorithm in python.

I'm using data folders with the format of 13 bytes

"\x00\x01\x02\x03\x05\x06\x07\x08\x09\x0a\x0b\x0c",described as follows:

srcIP = "\x00\x01\x02\x03"
srcPort = "\x04\x05"
dstIP = "\x06\x07\x08\x09"
dstPort = "\x0a\x0b"
protocol = "\x0c"

The code is as follows:

import hashlib
import math

#Actual
with open(""""folder location"""") as fl:
        tmp_trace = []
        pairSrcDst ={}
        while True:
            five_tuple = fl.read(13)
            ipPair = five_tuple[0:4] + five_tuple[6:10]
            if five_tuple == b"":
                break
            tmp_trace.append(ipPair)

for pair in tmp_trace:
   if pair[0:4] in pairSrcDst.keys():
       pairSrcDst[pair[0:4]] += 1
   else:
       pairSrcDst[pair[0:4]] = 1

accuratecount = len(pairSrcDst)
print(accuratecount)

#Estimation

def harmonic_mean(arr):
    n = len(arr)
    sum = 0
    for i in range(n):
        if arr[i] != 0:
            sum += 1.0 / arr[i]
    result = n / sum
    return result

def count_leading_zeros(string):
    count = 0
    for char in string:
        if char == '0':
            count += 1
        else:
            return count
    print(count)
    return count


with open(""""folder location"""") as fl:
    j = 0
    numberofcontainers = 32  # NUMBER OF BITS FOR BUCKET
    numberofbucketbits = math.ceil(math.log2(numberofcontainers))
    containerlist = [0] * numberofcontainers
    while True:
        five_tuple = fl.read(13)
        IPset = five_tuple[0:4] + five_tuple[6:10]
        if five_tuple == b"":
            break


        hashvalue = hashlib.sha256(IPset).digest()
        integer_value = int.from_bytes(hashvalue, byteorder='big')
        hashvalueinbit = bin(integer_value)[2:]

        bucket1 = hashvalueinbit[:numberofbucketbits] 
        bucket2 = hashvalueinbit[numberofbucketbits:]
        containerindex = int(bucket1,2)
        leadingzeros = count_leading_zeros(bucket2)
        if containerlist[containerindex] < leadingzeros:
            containerlist[containerindex] = leadingzeros


    print("container values",containerlist)

    # STEP 2 - Counting
    HarmonicMean = harmonic_mean(containerlist)
    #print(HarmonicMean)
    twopow = 2 ** HarmonicMean
    phi = 0.77351
    bitxphi = numberofcontainers / phi
    #print(bitxphi)
    Estimatedcount = twopow * bitxphi

    print("the estimated count is:", Estimatedcount)
    print("the accuracte count is :", accuratecount)

    perc = (1 - abs(accuratecount - Estimatedcount) / accuratecount) * 100

    print("the accuracy percentage is", perc)

I have tried several bucket sizes, hash functions, still the estimation is completely higher than the accurate counting.

Could anyone see a potential issues with this please? Thank you

0

There are 0 best solutions below