Code 10: Probabilistic Programming Languages

from scipy import stats
import pymc3 as pm
import theano
import numpy as np
import matplotlib.pyplot as plt
import arviz as az

import datetime
print(f"Last Run {datetime.datetime.now()}")
Last Run 2021-11-19 16:52:42.074557
az.style.use("arviz-grayscale")
plt.rcParams["figure.dpi"] = 300

Posterior Computation

Code 10.1

from jax import grad

simple_grad = grad(lambda x: x**2)
print(simple_grad(4.0))
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
8.0

Code 10.2

from jax import grad
from jax.scipy.stats import norm

def model(test_point, observed):
    z_pdf = norm.logpdf(test_point, loc=0, scale=5)
    x_pdf = norm.logpdf(observed, loc=test_point, scale=1)
    logpdf = z_pdf + x_pdf
    return logpdf

model_grad = grad(model)

observed, test_point = 5.0, 2.5 
logp_val = model(test_point, observed)
grad = model_grad(test_point, observed)
print(f"log_p_val: {logp_val}")
print(f"grad: {grad}")
log_p_val: -6.697315216064453
grad: 2.4000000953674316

Code 10.3

with pm.Model() as model:
    z = pm.Normal("z", 0., 5.)
    x = pm.Normal("x", mu=z, sd=1., observed=observed)

func = model.logp_dlogp_function()
func.set_extra_values({})
print(func(np.array([test_point])))
[array(-6.69731498), array([2.4])]

Code 10.4

def fraud_detector(fraud_observations, non_fraud_observations, fraud_prior=8, non_fraud_prior=6):
    """Conjugate Beta Binomial model for fraud detection"""
    expectation = (fraud_prior + fraud_observations) / (
        fraud_prior + fraud_observations + non_fraud_prior + non_fraud_observations)
    
    if expectation > .5:
        return {"suspend_card":True}

%timeit fraud_detector(2, 0)
188 ns ± 6.11 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)

PPL Driven Transformations

Code 10.9

observed = np.repeat(2, 2)
pdf = stats.norm(0, 1).pdf(observed)
np.prod(pdf, axis=0)
0.0029150244650281948

Code 10.10

observed = np.repeat(2, 1000)
pdf = stats.norm(0, 1).pdf(observed)
np.prod(pdf, axis=0)
0.0

Code 10.11

1.2 - 1
0.19999999999999996
pdf[0], np.prod(pdf, axis=0)
(0.05399096651318806, 0.0)

Code 10.12

logpdf = stats.norm(0, 1).logpdf(observed)
np.log(pdf[0]), logpdf[0], logpdf.sum()
(-2.9189385332046727, -2.9189385332046727, -2918.9385332046736)
np.log(pdf[0])
-2.9189385332046727

Distribution Transforms

Code 10.13

lower, upper = -1, 2
domain = np.linspace(lower, upper, 5)
transform = np.log(domain - lower) - np.log(upper - domain)
print(f"Original domain: {domain}")
print(f"Transformed domain: {transform}")
Original domain: [-1.   -0.25  0.5   1.25  2.  ]
Transformed domain: [       -inf -1.09861229  0.          1.09861229         inf]
/var/folders/7p/srk5qjp563l5f9mrjtp44bh800jqsw/T/ipykernel_3646/2568761612.py:3: RuntimeWarning: divide by zero encountered in log
  transform = np.log(domain - lower) - np.log(upper - domain)

Code 10.14

with pm.Model() as model:
    x = pm.Uniform("x", -1., 2.)
    
model.vars
[x_interval__ ~ TransformedDistribution]

Code 10.15

print(model.logp({"x_interval__":-2}),
      model.logp_nojac({"x_interval__":-2}))
print(model.logp({"x_interval__":1}),
      model.logp_nojac({"x_interval__":1}))
-2.2538560220859454 -1.0986122886681098
-1.6265233750364456 -1.0986122886681098

Code 10.16

import tensorflow_probability as tfp
tfd = tfp.distributions
tfb = tfp.bijectors

lognormal0 = tfd.LogNormal(0., 1.)
lognormal1 = tfd.TransformedDistribution(tfd.Normal(0., 1.), tfb.Exp())
x = lognormal0.sample(100)

np.testing.assert_array_equal(lognormal0.log_prob(x), lognormal1.log_prob(x))

Code 10.17

y_observed = stats.norm(0, .01).rvs(20)

with pm.Model() as model_transform:
    sd = pm.HalfNormal("sd", 5)
    y = pm.Normal("y", mu=0, sigma=sd, observed=y_observed)
    trace_transform = pm.sample(chains=1, draws=100000)

print(model_transform.vars)
print(f"Diverging: {trace_transform.get_sampler_stats('diverging').sum()}")
/var/folders/7p/srk5qjp563l5f9mrjtp44bh800jqsw/T/ipykernel_3646/2626254216.py:6: FutureWarning: In v4.0, pm.sample will return an `arviz.InferenceData` object instead of a `MultiTrace` by default. You can pass return_inferencedata=True or return_inferencedata=False to be safe and silence this warning.
  trace_transform = pm.sample(chains=1, draws=100000)
Auto-assigning NUTS sampler...
INFO:pymc3:Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
INFO:pymc3:Initializing NUTS using jitter+adapt_diag...
Sequential sampling (1 chains in 1 job)
INFO:pymc3:Sequential sampling (1 chains in 1 job)
NUTS: [sd]
INFO:pymc3:NUTS: [sd]
100.00% [101000/101000 00:33<00:00 Sampling chain 0, 0 divergences]
Sampling 1 chain for 1_000 tune and 100_000 draw iterations (1_000 + 100_000 draws total) took 34 seconds.
INFO:pymc3:Sampling 1 chain for 1_000 tune and 100_000 draw iterations (1_000 + 100_000 draws total) took 34 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks
INFO:pymc3:Only one chain was sampled, this makes it impossible to run some convergence checks
[sd_log__ ~ TransformedDistribution]
Diverging: 0

Code 10.18

with pm.Model() as model_no_transform:
    sd = pm.HalfNormal("sd", 5, transform=None)
    y = pm.Normal("y", mu=0, sigma=sd, observed=y_observed)
    trace_no_transform = pm.sample(chains=1, draws=100000)

print(model_no_transform.vars)
print(f"Diverging: {trace_no_transform.get_sampler_stats('diverging').sum()}")
/var/folders/7p/srk5qjp563l5f9mrjtp44bh800jqsw/T/ipykernel_3646/2939584159.py:4: FutureWarning: In v4.0, pm.sample will return an `arviz.InferenceData` object instead of a `MultiTrace` by default. You can pass return_inferencedata=True or return_inferencedata=False to be safe and silence this warning.
  trace_no_transform = pm.sample(chains=1, draws=100000)
Auto-assigning NUTS sampler...
INFO:pymc3:Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
INFO:pymc3:Initializing NUTS using jitter+adapt_diag...
Sequential sampling (1 chains in 1 job)
INFO:pymc3:Sequential sampling (1 chains in 1 job)
NUTS: [sd]
INFO:pymc3:NUTS: [sd]
100.00% [101000/101000 00:33<00:00 Sampling chain 0, 17 divergences]
Sampling 1 chain for 1_000 tune and 100_000 draw iterations (1_000 + 100_000 draws total) took 34 seconds.
INFO:pymc3:Sampling 1 chain for 1_000 tune and 100_000 draw iterations (1_000 + 100_000 draws total) took 34 seconds.
There were 17 divergences after tuning. Increase `target_accept` or reparameterize.
ERROR:pymc3:There were 17 divergences after tuning. Increase `target_accept` or reparameterize.
Only one chain was sampled, this makes it impossible to run some convergence checks
INFO:pymc3:Only one chain was sampled, this makes it impossible to run some convergence checks
[sd ~ HalfNormal]
Diverging: 17

Operation Graphs and Automatic Reparameterization

x = 3
y = 1
x * y / x + 2
3.0

Code 10.19

theano.config.compute_test_value = 'ignore'
x = theano.tensor.vector("x")
y = theano.tensor.vector("y")
out = x*(y/x) + 0
theano.printing.debugprint(out)
Elemwise{add,no_inplace} [id A] ''   
 |Elemwise{mul,no_inplace} [id B] ''   
 | |x [id C]
 | |Elemwise{true_div,no_inplace} [id D] ''   
 |   |y [id E]
 |   |x [id C]
 |InplaceDimShuffle{x} [id F] ''   
   |TensorConstant{0} [id G]

Code 10.20

fgraph = theano.function([x,y], [out])
theano.printing.debugprint(fgraph)
DeepCopyOp [id A] 'y'   0
 |y [id B]

Code 10.21

fgraph([1],[3])
[array([3.])]

Figure 10.1 and Figure 10.2

theano.printing.pydotprint(
    out, outfile="img/chp10/symbolic_graph_unopt.png",
    var_with_name_simple=False, high_contrast=False, with_ids=True)
theano.printing.pydotprint(
    fgraph, 
    outfile="img/chp10/symbolic_graph_opt.png", 
    var_with_name_simple=False, high_contrast=False, with_ids=True)
The output file is available at img/chp10/symbolic_graph_unopt.png
The output file is available at img/chp10/symbolic_graph_opt.png

Code 10.22

with pm.Model() as model_normal:
    x = pm.Normal("x", 0., 1.)
    
theano.printing.debugprint(model_normal.logpt)
Sum{acc_dtype=float64} [id A] '__logp'   
 |MakeVector{dtype='float64'} [id B] ''   
   |Sum{acc_dtype=float64} [id C] ''   
     |Sum{acc_dtype=float64} [id D] '__logp_x'   
       |Elemwise{switch,no_inplace} [id E] ''   
         |Elemwise{mul,no_inplace} [id F] ''   
         | |TensorConstant{1} [id G]
         | |Elemwise{mul,no_inplace} [id H] ''   
         |   |TensorConstant{1} [id I]
         |   |Elemwise{gt,no_inplace} [id J] ''   
         |     |TensorConstant{1.0} [id K]
         |     |TensorConstant{0} [id L]
         |Elemwise{true_div,no_inplace} [id M] ''   
         | |Elemwise{add,no_inplace} [id N] ''   
         | | |Elemwise{mul,no_inplace} [id O] ''   
         | | | |Elemwise{neg,no_inplace} [id P] ''   
         | | | | |TensorConstant{1.0} [id Q]
         | | | |Elemwise{pow,no_inplace} [id R] ''   
         | | |   |Elemwise{sub,no_inplace} [id S] ''   
         | | |   | |x ~ Normal [id T]
         | | |   | |TensorConstant{0.0} [id U]
         | | |   |TensorConstant{2} [id V]
         | | |Elemwise{log,no_inplace} [id W] ''   
         | |   |Elemwise{true_div,no_inplace} [id X] ''   
         | |     |Elemwise{true_div,no_inplace} [id Y] ''   
         | |     | |TensorConstant{1.0} [id Q]
         | |     | |TensorConstant{3.141592653589793} [id Z]
         | |     |TensorConstant{2.0} [id BA]
         | |TensorConstant{2.0} [id BB]
         |TensorConstant{-inf} [id BC]

Effect handling

Code 10.23

import jax
import numpyro
from tensorflow_probability.substrates import jax as tfp_jax

tfp_dist = tfp_jax.distributions
numpyro_dist = numpyro.distributions

root = tfp_dist.JointDistributionCoroutine.Root
def tfp_model():
    x = yield root(tfp_dist.Normal(loc=1.0, scale=2.0, name="x"))
    z = yield root(tfp_dist.HalfNormal(scale=1., name="z"))
    y = yield tfp_dist.Normal(loc=x, scale=z, name="y")
    
def numpyro_model():
    x = numpyro.sample("x", numpyro_dist.Normal(loc=1.0, scale=2.0))
    z = numpyro.sample("z", numpyro_dist.HalfNormal(scale=1.0))
    y = numpyro.sample("y", numpyro_dist.Normal(loc=x, scale=z))
try:
    print(tfp_model())
except:
    pass
<generator object tfp_model at 0x7fec68bafac0>
try:
    print(numpyro_model())
except:
    pass

Code 10.24

sample_key = jax.random.PRNGKey(52346)

# Draw samples
jd = tfp_dist.JointDistributionCoroutine(tfp_model)
tfp_sample = jd.sample(1, seed=sample_key)

predictive = numpyro.infer.Predictive(numpyro_model, num_samples=1)
numpyro_sample = predictive(sample_key)

# Evaluate log prob
log_likelihood_tfp = jd.log_prob(tfp_sample)
log_likelihood_numpyro = numpyro.infer.util.log_density(
    numpyro_model, [], {},
    # Samples returning from JointDistributionCoroutine is a
    # Namedtuple like Python object, we convert it to a dictionary
    # so that numpyro can recognize it.
    params=tfp_sample._asdict())

# Validate that we get the same log prob
np.testing.assert_allclose(log_likelihood_tfp, log_likelihood_numpyro[0], rtol=1e-6)

Code 10.25

# Condition z to .01 in TFP and sample
jd.sample(z=.01, seed=sample_key)
StructTuple(
  x=DeviceArray(0.42507368, dtype=float32),
  z=DeviceArray(0.01, dtype=float32),
  y=DeviceArray(0.42213476, dtype=float32)
)
# Condition z to .01 in NumPyro and sample
predictive = numpyro.infer.Predictive(
    numpyro_model, num_samples=1, params={"z": np.asarray(.01)})
predictive(sample_key)
{'x': DeviceArray([1.1232405], dtype=float32),
 'y': DeviceArray([1.1318897], dtype=float32),
 'z': array([0.01])}

Code 10.26

# Conditioned z to .01 in TFP and construct conditional distributions
dist, value = jd.sample_distributions(z=.01, seed=sample_key)
assert dist.y.loc == value.x
assert dist.y.scale == value.z

# Conditioned z to .01 in NumPyro and construct conditional distributions
model = numpyro.handlers.substitute(numpyro_model, data={"z": .01})
with numpyro.handlers.seed(rng_seed=sample_key):
    # Under the seed context, the default behavior of a NumPyro model is the
    # same as in Pyro: drawing prior sample.
    model_trace = numpyro.handlers.trace(numpyro_model).get_trace()
assert model_trace["y"]["fn"].loc == model_trace["x"]["value"]
assert model_trace["y"]["fn"].scale == model_trace["z"]["value"]

Designing a PPL

Code 10.27

import numpy as np
from scipy import stats

# Draw 2 samples from a Normal(1., 2.) distribution
x = stats.norm.rvs(loc=1.0, scale=2.0, size=2, random_state=1234)
# Evaluate the log probability of the samples 
logp = stats.norm.logpdf(x, loc=1.0, scale=2.0)

Code 10.28

random_variable_x = stats.norm(loc=1.0, scale=2.0)

x = random_variable_x.rvs(size=2, random_state=1234)
logp = random_variable_x.logpdf(x)

Code 10.29

x = stats.norm(loc=1.0, scale=2.0)
y = stats.norm(loc=x, scale=0.1)
y.rvs()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/var/folders/7p/srk5qjp563l5f9mrjtp44bh800jqsw/T/ipykernel_3646/2580115495.py in <module>
      1 x = stats.norm(loc=1.0, scale=2.0)
      2 y = stats.norm(loc=x, scale=0.1)
----> 3 y.rvs()

/opt/miniconda3/envs/bmcp/lib/python3.9/site-packages/scipy/stats/_distn_infrastructure.py in rvs(self, size, random_state)
    471         kwds = self.kwds.copy()
    472         kwds.update({'size': size, 'random_state': random_state})
--> 473         return self.dist.rvs(*self.args, **kwds)
    474 
    475     def sf(self, x):

/opt/miniconda3/envs/bmcp/lib/python3.9/site-packages/scipy/stats/_distn_infrastructure.py in rvs(self, *args, **kwds)
   1092             vals = self._rvs(*args, size=size, random_state=random_state)
   1093 
-> 1094         vals = vals * scale + loc
   1095 
   1096         # do not forget to restore the _random_state

TypeError: unsupported operand type(s) for +: 'float' and 'rv_frozen'

Code 10.30

class RandomVariable:
    def __init__(self, distribution):
        self.distribution = distribution

    def __array__(self):
        return np.asarray(self.distribution.rvs())

x = RandomVariable(stats.norm(loc=1.0, scale=2.0))
z = RandomVariable(stats.halfnorm(loc=0., scale=1.))
y = RandomVariable(stats.norm(loc=x, scale=z))

for i in range(5):
    print(np.asarray(y))
1.1571257036684732
0.5129008281470354
-0.2750327073812393
-0.1797237136258827
-2.7978765955524114

Code 10.31

class RandomVariable:
    def __init__(self, distribution, value=None):
        self.distribution = distribution
        self.set_value(value)
    
    def __repr__(self):
        return f"{self.__class__.__name__}(value={self.__array__()})"

    def __array__(self, dtype=None):
        if self.value is None:
            return np.asarray(self.distribution.rvs(), dtype=dtype)
        return self.value

    def set_value(self, value=None):
        self.value = value
    
    def log_prob(self, value=None):
        if value is not None:
            self.set_value(value)
        return self.distribution.logpdf(np.array(self))

x = RandomVariable(stats.norm(loc=1.0, scale=2.0))
z = RandomVariable(stats.halfnorm(loc=0., scale=1.))
y = RandomVariable(stats.norm(loc=x, scale=z))

Code 10.32

for i in range(3):
    print(y)

print(f"  Set x=5 and z=0.1")
x.set_value(np.asarray(5))
z.set_value(np.asarray(0.05))
for i in range(3):
    print(y)

print(f"  Reset z")
z.set_value(None)
for i in range(3):
    print(y)
RandomVariable(value=2.113889264473519)
RandomVariable(value=-0.9153866580169316)
RandomVariable(value=2.7839356392110224)
  Set x=5 and z=0.1
RandomVariable(value=4.971704202268103)
RandomVariable(value=4.925771989212217)
RandomVariable(value=5.024323806499401)
  Reset z
RandomVariable(value=4.732558936959928)
RandomVariable(value=4.998079910235703)
RandomVariable(value=4.905656937449344)

Code 10.33

# Observed y = 5.
y.set_value(np.array(5.))

posterior_density = lambda xval, zval: x.log_prob(xval) + z.log_prob(zval) + \
                y.log_prob()
posterior_density(np.array(0.), np.array(1.))
-15.881815599614018

Code 10.34

def log_prob(xval, zval, yval=5):
    x_dist = stats.norm(loc=1.0, scale=2.0)
    z_dist = stats.halfnorm(loc=0., scale=1.)
    y_dist = stats.norm(loc=xval, scale=zval)
    return x_dist.logpdf(xval) + z_dist.logpdf(zval) + y_dist.logpdf(yval)

log_prob(0, 1)
-15.881815599614018

Code 10.35

def prior_sample():
    x = stats.norm(loc=1.0, scale=2.0).rvs()
    z = stats.halfnorm(loc=0., scale=1.).rvs()
    y = stats.norm(loc=x, scale=z).rvs()
    return x, z, y

prior_sample()
(-0.6274777874169912, 0.9358428814217807, 0.029987051428045586)

Shape handling

Code 10.36

def prior_sample(size):
    x = stats.norm(loc=1.0, scale=2.0).rvs(size=size)
    z = stats.halfnorm(loc=0., scale=1.).rvs(size=size)
    y = stats.norm(loc=x, scale=z).rvs()
    return x, z, y

print([x.shape for x in prior_sample(size=(2))])
print([x.shape for x in prior_sample(size=(2, 3, 5))])
[(2,), (2,), (2,)]
[(2, 3, 5), (2, 3, 5), (2, 3, 5)]

Code 10.37

n_row, n_feature = 1000, 5
X = np.random.randn(n_row, n_feature)

def lm_prior_sample0():
    intercept = stats.norm(loc=0, scale=10.0).rvs()
    beta = stats.norm(loc=np.zeros(n_feature), scale=10.0).rvs()
    sigma = stats.halfnorm(loc=0., scale=1.).rvs()
    y_hat = X @ beta + intercept
    y = stats.norm(loc=y_hat, scale=sigma).rvs()
    return intercept, beta, sigma, y

def lm_prior_sample(size=10):
    if isinstance(size, int):
        size = (size,)
    else:
        size = tuple(size)
    intercept = stats.norm(loc=0, scale=10.0).rvs(size=size)
    beta = stats.norm(loc=np.zeros(n_feature), scale=10.0).rvs(
        size=size + (n_feature,))
    sigma = stats.halfnorm(loc=0., scale=1.).rvs(size=size)
    y_hat = np.squeeze(X @ beta[..., None]) + intercept[..., None]
    y = stats.norm(loc=y_hat, scale=sigma[..., None]).rvs()
    return intercept, beta, sigma, y
print([x.shape for x in lm_prior_sample0()])
[(), (5,), (), (1000,)]
print([x.shape for x in lm_prior_sample(size=())])
print([x.shape for x in lm_prior_sample(size=10)])
print([x.shape for x in lm_prior_sample(size=[10, 3])])
[(), (5,), (), (1000,)]
[(10,), (10, 5), (10,), (10, 1000)]
[(10, 3), (10, 3, 5), (10, 3), (10, 3, 1000)]
# def lm_prior_sample2(size=10):
#     intercept = stats.norm(loc=0, scale=10.0).rvs(size=size)
#     beta = stats.multivariate_normal(
#         mean=np.zeros(n_feature), cov=10.0).rvs(size=size)
#     sigma = stats.halfnorm(loc=0., scale=1.).rvs(size=size)
#     y_hat = np.einsum('ij,...j->...i', X, beta) + intercept[..., None]
#     y = stats.norm(loc=y_hat, scale=sigma[..., None]).rvs()
#     return intercept, beta, sigma, y

# print([x.shape for x in lm_prior_sample2(size=())])
# print([x.shape for x in lm_prior_sample2(size=10)])
# print([x.shape for x in lm_prior_sample2(size=(10, 3))])

Code 10.38

import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

jd = tfd.JointDistributionSequential([
    tfd.Normal(0, 10),
    tfd.Sample(tfd.Normal(0, 10), n_feature),
    tfd.HalfNormal(1),
    lambda sigma, beta, intercept: tfd.Independent(
        tfd.Normal(
            loc=tf.einsum("ij,...j->...i", X, beta) + intercept[..., None],
            scale=sigma[..., None]),
        reinterpreted_batch_ndims=1,
        name="y")
])

print(jd)

n_sample = [3, 2]
for log_prob_part in jd.log_prob_parts(jd.sample(n_sample)):
    assert log_prob_part.shape == n_sample
tfp.distributions.JointDistributionSequential("JointDistributionSequential", batch_shape=[[], [], [], []], event_shape=[[], [5], [], [1000]], dtype=[float32, float32, float32, float32])