# 巧用redis的hyper loglog进行uv数据的统计

## 存储结构

``````struct hllhdr {
char magic;      /* "HYLL" */
uint8_t encoding;   /* HLL_DENSE or HLL_SPARSE. */
uint8_t notused; /* Reserved for future use, must be zero. */
uint8_t card;    /* Cached cardinality, little endian. */
uint8_t registers[]; /* Data bytes. */
};
``````

## 算法

hyper loglog是采用基数统计来计算的，也就是需要分桶，redis默认分为16384个桶，每个桶占用6bit。

``````uint64_t MurmurHash64A (const void * key, int len, unsigned int seed) {
const uint64_t m = 0xc6a4a7935bd1e995;
const int r = 47;
uint64_t h = seed ^ (len * m);
const uint8_t *data = (const uint8_t *)key;
const uint8_t *end = data + (len-(len&7));

while(data != end) {
uint64_t k;

#if (BYTE_ORDER == LITTLE_ENDIAN)
#ifdef USE_ALIGNED_ACCESS
memcpy(&k,data,sizeof(uint64_t));
#else
k = *((uint64_t*)data);
#endif
#else
k = (uint64_t) data;
k |= (uint64_t) data << 8;
k |= (uint64_t) data << 16;
k |= (uint64_t) data << 24;
k |= (uint64_t) data << 32;
k |= (uint64_t) data << 40;
k |= (uint64_t) data << 48;
k |= (uint64_t) data << 56;
#endif

k *= m;
k ^= k >> r;
k *= m;
h ^= k;
h *= m;
data += 8;
}

switch(len & 7) {
case 7: h ^= (uint64_t)data << 48; /* fall-thru */
case 6: h ^= (uint64_t)data << 40; /* fall-thru */
case 5: h ^= (uint64_t)data << 32; /* fall-thru */
case 4: h ^= (uint64_t)data << 24; /* fall-thru */
case 3: h ^= (uint64_t)data << 16; /* fall-thru */
case 2: h ^= (uint64_t)data << 8; /* fall-thru */
case 1: h ^= (uint64_t)data;
h *= m; /* fall-thru */
};

h ^= h >> r;
h *= m;
h ^= h >> r;
return h;
}
``````

``````int hllPatLen(unsigned char *ele, size_t elesize, long *regp) {
uint64_t hash, bit, index;
int count;

/* Count the number of zeroes starting from bit HLL_REGISTERS
* (that is a power of two corresponding to the first bit we don't use
* as index). The max run can be 64-P+1 = Q+1 bits.
*
* Note that the final "1" ending the sequence of zeroes must be
* included in the count, so if we find "001" the count is 3, and
* the smallest count possible is no zeroes at all, just a 1 bit
* at the first position, that is a count of 1.
*
* This may sound like inefficient, but actually in the average case
* there are high probabilities to find a 1 after a few iterations. */
index = hash & HLL_P_MASK; /* Register index. */
hash >>= HLL_P; /* Remove bits used to address the register. */
hash |= ((uint64_t)1<<HLL_Q); /* Make sure the loop terminates
and count will be <= Q+1. */
bit = 1;
count = 1; /* Initialized to 1 since we count the "00000...1" pattern. */
while((hash & bit) == 0) {
count++;
bit <<= 1;
}
*regp = (int) index;
return count;
}``````

``````/* Low level function to set the dense HLL register at 'index' to the
* specified value if the current value is smaller than 'count'.
*
* 'registers' is expected to have room for HLL_REGISTERS plus an
* additional byte on the right. This requirement is met by sds strings
* automatically since they are implicitly null terminated.
*
* The function always succeed, however if as a result of the operation
* the approximated cardinality changed, 1 is returned. Otherwise 0
* is returned. */
int hllDenseSet(uint8_t *registers, long index, uint8_t count) {
uint8_t oldcount;

HLL_DENSE_GET_REGISTER(oldcount,registers,index);
if (count > oldcount) {
HLL_DENSE_SET_REGISTER(registers,index,count);
return 1;
} else {
return 0;
}
}``````

## 使用场景

``pfadd USER:LOGIN:2020121608 A``

``pfadd USER:LOGIN:2020121608 B C D A``

``pfcount USER:LOGIN:2020121608``

``pfadd USER:LOGIN:2020121609 B C D A``

``pfcount USER:LOGIN:2020121608 USER:LOGIN:2020121609``