In [3]:
import pymc3 as pm
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import theano as th
th.config.warn.round = False
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option("display.width",110)
pd.set_option("display.max_columns",12)
np.set_printoptions(linewidth=110, edgeitems=6, suppress=True)
In [2]:
data = np.array([51.06, 55.12, 53.73, 50.24, 52.05, 56.40, 48.45, 52.34,
55.65, 51.49, 51.86, 63.43, 53.00, 56.09, 51.93, 52.31, 52.33, 57.48, 57.44,
55.14, 53.93, 54.62, 56.09, 68.58, 51.36, 55.47, 50.73, 51.94, 54.95, 50.39,
52.91, 51.5, 52.68, 47.72, 49.73, 51.82, 54.99, 52.84, 53.19, 54.52, 51.46,
53.73, 51.61, 49.81, 52.42, 54.3, 53.84, 53.16]).astype(np.float32)
data = data / 5  # rescale; the sample mean is now ~10.7
_ = sns.kdeplot(data)
General Experimenting
In [3]:
# general experimenting from the PyMC3 API quickstart
with pm.Model() as model:
    mu = pm.Normal('mu', mu=0, sd=50, testval=0.1)
    plus_2 = pm.Deterministic('mu plus 2', mu + 2)
    x = pm.Uniform('x', lower=0, upper=1)
    xx = pm.Normal('xx', mu=0, sd=1, shape=5)
    obs = pm.Normal('obs', mu=mu, sd=1, observed=data)
model.basic_RVs, model.observed_RVs, model.free_RVs, model.deterministics
model.named_vars
# model.logp({'mu': 0})  # raises: logp needs a value for every free RV (see below)
mu.random(size=3)
xx.random(size=3)
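The commented-out logp call above fails because model.logp expects a value for every free RV, including auto-transformed ones (e.g. the interval transform of x). A minimal sketch, using model.test_point as a complete point:
In [ ]:
# evaluate the model's joint log-density at its default starting point;
# model.test_point has an entry for every (transformed) free RV
model.logp(model.test_point)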
In [4]:
# live plotting of inference
%matplotlib notebook
with model:
    trace = pm.sample(2000, njobs=4, n_init=80000, live_plot=True, refresh_every=500, roll_over=1000)
%matplotlib inline
trace['mu'].shape
trace.nchains
trace.get_values('mu', chains=1).shape
In [5]:
_ = pm.traceplot(trace)
_ = plt.show()
_ = plt.figure(figsize=(15,3))
_ = pm.forestplot(trace)
_ = plt.show()
pm.gelman_rubin(trace)
_ = pm.plot_posterior(trace)
In [12]:
with model:
    post_pred = pm.sample_ppc(trace, samples=1000)
post_pred['obs'].shape
ax = sns.distplot(post_pred['obs'].mean(axis=1), label='Posterior predictive means')
_ = ax.axvline(data.mean(), color='r', ls='--', label='Observed mean')
_ = ax.legend()
_ = plt.show()
_ = sns.distplot(mu.random(size=10000))
Finding mean and standard deviation
In [7]:
%%time
np.random.seed(1)
with pm.Model() as m1:
    mu = pm.Uniform('mu', min(data), max(data))
    sigma = pm.HalfNormal('sigma', sd=10)
    y = pm.Normal('y', mu=mu, sd=sigma, observed=data)  # a Student's t would be more robust to outliers
    start = pm.find_MAP()
    step = pm.NUTS()  # pm.Metropolis()
    trace_m1 = pm.sample(4000, step=step, start=start, njobs=4)
trace_m1b = trace_m1[500:]  # discard burn-in
In [8]:
# compare priors with posteriors for mu and sigma
_ = plt.figure(figsize=(20,7))
_ = plt.subplot(121)
_ = plt.title('mu')
_ = sns.distplot(mu.random(size=100000), label='prior')
_ = sns.distplot(trace_m1b.mu, label='posterior')
_ = plt.axvline(x=data.mean(), lw=1, color='r')
_ = plt.legend()
_ = plt.subplot(122)
_ = plt.title('sigma')
_ = sns.distplot(sigma.random(size=100000), label='prior')
_ = sns.distplot(trace_m1b.sigma, label='posterior')
_ = plt.axvline(x=data.std(), lw=1, color='r')
_ = plt.legend()
_ = plt.xlim(0,30)
#_ = plt.hist(sigma.random(size=100000), bins=100, normed=True, alpha=0.7, label='prior')
#_ = plt.hist(trace_m1b.sigma, bins=100, normed=True, alpha=0.7, label='posterior')
#_ = plt.hist(mu.random(size=100000), bins=100, normed=True, alpha=0.7, label='prior')
#_ = plt.hist(trace_m1b.mu, bins=100, normed=True, alpha=0.7, label='posterior')
In [10]:
# show all relevant plots
_ = pm.traceplot(trace_m1b, lines={'mu': data.mean(), 'sigma': data.std()})
_ = pm.plot_posterior(trace_m1b, varnames=['mu'], kde_plot=True, rope=[10.4,10.6], ref_val=11)
_ = pm.plot_posterior(trace_m1b, varnames=['sigma'], kde_plot=True, rope=[0.7, 0.8], ref_val=1)
_ = pm.autocorrplot(trace_m1b)
_ = plt.figure(figsize=(4,3))
_ = plt.plot(pm.geweke(trace_m1b['mu'])[:,1], 'o')
_ = plt.axhline(1, c='red'); _ = plt.axhline(-1, c='red')
_ = plt.gca().margins(0.05)
In [43]:
# show all relevant diagnostics
pm.diagnostics.gelman_rubin(trace_m1b)
pm.diagnostics.effective_n(trace_m1b)
pm.df_summary(trace_m1b)
pd.DataFrame(data).describe().T
In [44]:
# generate posterior samples
with m1:
    ppc_m1b = pm.sample_ppc(trace_m1b, samples=2000)
samples = ppc_m1b['y'][:, 0]  # posterior predictive draws for a single observation slot
_ = sns.kdeplot(data)
_ = sns.kdeplot(samples)
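The KDE overlay is only a visual check; a numeric posterior predictive check on a test statistic is sketched below, assuming ppc_m1b from above.
In [ ]:
# Bayesian p-value sketch: how often does the simulated sample maximum
# exceed the observed maximum? Values near 0 or 1 signal misfit.
sim_max = ppc_m1b['y'].max(axis=1)
(sim_max >= data.max()).mean()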
In [ ]:
# demo of sampling from the prior (omit the observed keyword)
with pm.Model() as prior_context:
    sigma = pm.Gamma('gamma', alpha=2.0, beta=1.0)
    mu = pm.Normal('mu', mu=0, sd=sigma)
    trace = pm.sample(5000, step=pm.Metropolis())
_ = pm.traceplot(trace)
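Newer PyMC3 releases (3.5+) can also draw from the prior directly, without MCMC; a minimal sketch against the same prior_context:
In [ ]:
# forward-sample the prior instead of running Metropolis (requires PyMC3 >= 3.5)
with prior_context:
    prior_draws = pm.sample_prior_predictive(samples=5000)
_ = sns.distplot(prior_draws['mu'])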
Linear Regression
In [ ]:
ans = sns.load_dataset('anscombe')
x_2 = ans[ans.dataset == 'II']['x'].values
y_2 = ans[ans.dataset == 'II']['y'].values
x_2 = x_2 - x_2.mean()
y_2 = y_2 - y_2.mean()
#plt.scatter(x_2, y_2)
#plt.xlabel('$x$', fontsize=16)
#plt.ylabel('$y$', fontsize=16, rotation=0)
In [ ]:
np.random.seed(1)
with pm.Model() as model_poly:
    alpha = pm.Normal('alpha', mu=0, sd=10)
    beta1 = pm.Normal('beta1', mu=0, sd=1)
    beta2 = pm.Normal('beta2', mu=0, sd=1)
    epsilon = pm.Uniform('epsilon', lower=0, upper=10)
    mu = alpha + beta1 * x_2 + beta2 * x_2**2
    y_pred = pm.Normal('y_pred', mu=mu, sd=epsilon, observed=y_2)
    start = pm.find_MAP()
    step = pm.NUTS(scaling=start)
    trace_poly = pm.sample(3000, step=step, start=start)
In [ ]:
_ = pm.traceplot(trace_poly)
_ = pm.plot_posterior(trace_poly, kde_plot=True)
In [ ]:
x_p = np.linspace(-6, 6)
y_p = trace_poly['alpha'].mean() + trace_poly['beta1'].mean() * x_p + trace_poly['beta2'].mean() * x_p**2
_ = plt.scatter(x_2, y_2)
_ = plt.xlabel('$x$', fontsize=16)
_ = plt.ylabel('$y$', fontsize=16, rotation=0)
_ = plt.plot(x_p, y_p, c='r')
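The red curve uses only the posterior means; the sketch below overlays thinned posterior draws to visualize the uncertainty of the fit.
In [ ]:
# overlay curves from ~100 random posterior draws to show fit uncertainty
_ = plt.scatter(x_2, y_2)
for i in np.random.randint(0, len(trace_poly), 100):
    y_i = (trace_poly['alpha'][i] + trace_poly['beta1'][i] * x_p
           + trace_poly['beta2'][i] * x_p**2)
    _ = plt.plot(x_p, y_i, c='g', alpha=0.05)
_ = plt.plot(x_p, y_p, c='r')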
German Tank Problem with PyMC3
In [4]:
tanks = np.array([10, 256, 202, 97]).astype(np.int32)
tanks
In [5]:
m = tanks.mean()
s = tanks.std()
m, s
m + 2*s
In [6]:
# minimum-variance unbiased estimator: m + m/k - 1, with sample maximum m and sample size k
# here: 256 + 256/4 - 1 = 319
max(tanks) + max(tanks) / len(tanks) - 1
In [7]:
# sanity check that ppf and cdf are inverses (scipy.stats is already imported as stats)
stats.norm.ppf(0.96943389677160008)
stats.norm.cdf(1.8725380240735363)
In [24]:
# P(N | tanks) ∝ P(tanks | N) * P(N)
with pm.Model() as tank_m:
    N = pm.DiscreteUniform("N", lower=tanks.max(), upper=10000)  # define prior
    tank_obs = pm.DiscreteUniform("tank_r", lower=0, upper=N, observed=tanks)  # define likelihood
    tank_r = pm.sample(10000, njobs=4)
tank_r = tank_r[2000:]  # discard burn-in
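The model is simple enough to cross-check by direct enumeration; under the DiscreteUniform(0, N) likelihood each observation contributes 1/(N+1), and the flat prior cancels in the normalization. A minimal sketch:
In [ ]:
# exact posterior over N by enumeration, for comparison with the MCMC result
Ns = np.arange(tanks.max(), 10001)
post = (Ns + 1.0) ** (-len(tanks))  # P(tanks | N) for DiscreteUniform(0, N)
post /= post.sum()
(Ns * post).sum()  # posterior mean of N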
In [25]:
_ = pm.traceplot(tank_r)
_ = pm.plot_posterior(tank_r, kde_plot=False)
pm.summary(tank_r)
German Tank Problem with PyStan
In [18]:
import pystan
ocode = """
data {
int M; // number of serial numbers observed
real tanks[M]; // serial numbers
}
parameters {
real N;
}
model {
N ~ uniform(max(tanks), 10000); // P(N)
tanks ~ uniform(0, N); // P(D|N)
}
"""
data = {'tanks': tanks, 'M': len(tanks)}
fit = pystan.stan(model_code=ocode, data=data, init='init_r', iter=100000, warmup=1000, chains=4)
fit
In [19]:
fig = fit.plot()
fig.set_size_inches((12, 2))
plt.tight_layout()
plt.show()
fit.extract()['N'].mean()
Estimate the Upper Limit of an Array of Random Numbers
In [20]:
# generate a 5% sample from a range with a known maximum
max_num = 246
num_range = np.arange(max_num) + 1
sample_np = np.random.choice(num_range, int(max_num * 0.05), replace=False)  # without replacement: serial numbers are unique
sample_np
sample = th.shared(sample_np)  # a shared variable, so the same model can be reused with different data
In [21]:
with pm.Model() as rand_m:
    N = pm.DiscreteUniform("N", lower=sample.max(), upper=max_num*5)  # define prior
    tank_obs = pm.DiscreteUniform("tank_r", lower=0, upper=N, observed=sample)  # define likelihood
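Because sample is a Theano shared variable, fresh data can be swapped in without recompiling the model. A minimal sketch; sample_np2 is a hypothetical second draw, named here only for illustration:
In [ ]:
# swap in a fresh sample and re-run inference on the same model graph
sample_np2 = np.random.choice(num_range, int(max_num * 0.05), replace=False)  # hypothetical new data
sample.set_value(sample_np2)
with rand_m:
    rand_r2 = pm.sample(10000, njobs=4)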
In [22]:
with rand_m:
    start = {'N': max_num*2}
    rand_r = pm.sample(50000, start=start, njobs=4)
In [26]:
rand_r = rand_r[5000:]  # discard burn-in
_ = pm.traceplot(rand_r)
_ = pm.plot_posterior(rand_r, kde_plot=False, point_estimate='median')
pm.summary(rand_r)