In [38]:
import numpy as np
import pandas as pd

data = pd.read_csv('filter_sizes2.csv', index_col=0)
data.head()
Out[38]:
size basic_size output_size prev_size output_and_prev_size input_size txid_size basic_n output_n prev_n output_and_prev_n input_n txid_n
height
1 215 7 4 1 4 1 4 2 1 0 1 0 1
2 215 7 4 1 4 1 4 2 1 0 1 0 1
3 215 7 4 1 4 1 4 2 1 0 1 0 1
4 215 7 4 1 4 1 4 2 1 0 1 0 1
5 215 7 4 1 4 1 4 2 1 0 1 0 1
In [44]:
import math
import scipy.optimize

def expected_data_size(c, N, B, S):
    """
    The expected size of data downloaded in bits if a client does not match a block.
    
    c is the number of filter elements the client is watching
    N is the number of filter elements in a block
    S is the size of the full block
    P is the Golomb-Rice parameter
    
    Assuming filter sizes that are N * P bits, this returns the filter size plus block size times false
    positive match probability.
    """
    P = 2.0 ** B
    M = 1.497137 * P
    bits = B + 1.0 / (1 - (1 - 1.0 / M) ** P)
    fp = (1 - (1 - 1.0 / (N * P))**(c*N))
    return N * bits + 8 * S * fp
In [45]:
def generate_func(N, S):
    """
    Generate a numpy-optimized function to compute expected data sizes. Takes numpy arrays with
    N and S and returns a lambda accepting scalar c value.
    """
    B = np.arange(1, 49)
    P = 2.0 ** B
    M = 1.497137 * P
    bits = B + 1.0 / (1 - (1 - 1.0 / M) ** P)
    Y = np.power(1 - 1.0 / np.outer(M, np.maximum(1, N)), N)
    return lambda c: np.outer(bits, N) + 8 * S * (1 - Y ** c)
In [49]:
N = data['output_n'].values
S = data['size'].values
func = generate_func(N, S)

y10 = np.sum(func(10), 1)
y100 = np.sum(func(100), 1)
y1000 = np.sum(func(1000), 1)
y10000 = np.sum(func(10000), 1)
In [50]:
P = pd.RangeIndex(1, 49)
y10 = pd.Series(y10, index=P)
y100 = pd.Series(y100, index=P)
y1000 = pd.Series(y1000, index=P)
y10000 = pd.Series(y10000, index=P)

{
    'M=10': y10.idxmin(),
    'M=100': y100.idxmin(),
    'M=1,000': y1000.idxmin(),
    'M=10,000': y10000.idxmin(),
}
Out[50]:
{'M=10': 13, 'M=100': 16, 'M=1,000': 20, 'M=10,000': 23}
In [51]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(10,5))

plt.plot(y10, label='M=10')
plt.plot(y100, label='M=100')
plt.plot(y1000, label='M=1,000')
plt.plot(y10000, label='M=10,000')
plt.title('Expected data size (bits) vs P')
plt.legend()
Out[51]:
<matplotlib.legend.Legend at 0x7f810be8a1d0>