Flajolet Martin Algorithm Implementation

287 Views Asked by At

I am trying to implement Flajolet Martin algorithm. I have a dataset with over 6000 records but the output of the following code is 4096. Please help me in understanding the mistake being made by me.

import xxhash

import math

def return_trailing_zeroes(s):

s = str(s)  

  rev = s[::-1] 

  count = 0

  for i in rev:

    if i is '0':

     count = count + 1

   else:

    break       

  return count

def gethash(line):

  num=abs(xxhash.xxh32(line).intdigest())

  return num

fp=open("/content/drive/MyDrive/Data.txt","r")

h_max=0

for line in fp:

  hash_value_1 = gethash(line)

  binary_1 = format(hash_value_1, '032b')       

  t1 = return_trailing_zeroes(binary_1)

  if t1>h_max:

    h_max=t1

fp.close()

print(2**h_max)

I tried this implementation of HyperLogLog algorithm and the output of the following code is 2560.

def return_trailing_zeroes(s): s = str(s) rev = s[::-1] count = 0 for i in rev: if i is '0': count = count + 1 else: break
return count

h1_m=0

h2_m=0

h3_m=0

h4_m=0

fp=open("/content/drive/MyDrive/Data.txt","r")

for line in fp:

hash_value_1 = abs(xxhash.xxh32(line).intdigest())

hash_value_2 = abs(hash32(line))

hash_value_3 = abs(jhashcode.hashcode(line))

hash_value_4 = abs(mmh3.hash(line))

binary_1 = format(hash_value_1, '032b')

binary_2 = format(hash_value_2, '032b')

binary_3 = format(hash_value_3, '032b')

binary_4 = format(hash_value_4, '032b')

t1 = return_trailing_zeroes(binary_1)

t2 = return_trailing_zeroes(binary_2)

t3 = return_trailing_zeroes(binary_3)

t4 = return_trailing_zeroes(binary_4)

if t1>h1_m: h1_m=t1

if t2>h2_m: h2_m=t2

if t3>h3_m: h3_m=t3

if t4>h4_m: h4_m=t4

fp.close()

avg_hash12 = (2**(h1_m) + 2**(h2_m))/ float(2)

avg_hash34 = (2**(h3_m) + 2**(h4_m))/ float(2)

distinct_elements = math.ceil(statistics.median([avg_hash12, avg_hash34]))

print(distinct_elements)

0

There are 0 best solutions below