%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
import warnings
warnings.filterwarnings("ignore")
def gaussian_distribution(x, mu, sigma):
'''
function:Gaussian distribution
mu: mean
sigma: std
'''
return 1 / (np.sqrt(2 * np.pi) * sigma) * np.exp(- (x - mu)**2 / (2 * np.power(sigma, 2) ))
total = 100000000
x = np.random.rand(total,1)
y = np.random.rand(total,1)
dist = np.sqrt(np.multiply(x, x) + np.multiply(y, y))
(dist < 1).sum() / total * 4
3.14166516
fig, ax = plt.subplots(1,2, figsize=(12,5))
theta = 2
x = np.arange(0,15,0.1)
y1 = theta * np.exp(-x * theta)
ax[0].plot(x, y1)
ax[0].set_title('PDF')
ax[0].set_xlabel('X')
ax[0].set_ylabel('Y')
y2 = 1 - np.exp(-x * theta)
ax[1].plot(x, y2)
ax[1].set_title('CDF')
ax[1].set_xlabel('X')
ax[1].set_ylabel('P')
Text(0, 0.5, 'P')
# 指数分布
theta = 2
x = np.arange(0,5,0.1)
y = theta * np.exp(-x * theta)
# 通过均匀分布进行采样
sample_num = 10000
z = np.random.uniform(0,1, sample_num) # 均匀分布采样
sample_x = - np.log(1 - z) / theta
plt.plot(x, y)
sns.distplot(sample_x)
<AxesSubplot:ylabel='Density'>
$ \large p_1(x) = 0.3 e^{-(x-0.3)^2} + 0.7 e^{-(x-2)^2/0.3} $
$ \large p_2(x) = \frac{p_1(x)}{Z_p} ,Z_p为归一化常数,未知(实际约为1.2113)$
目标:对$p_2(x)$进行接受-拒绝采样
步骤:
1.定义建议分布
$q(z) = Gassian(1.4,1.2)$
2.定义k
k=2.5
def p1(x):
return 0.3 * np.exp(-(x-0.3)**2) + 0.7 * np.exp(-(x-2.)**2/0.3)
def p2(x, z=1.2113):
return p1(x) / z
def kq(x, k):
return k * norm.pdf(x, loc=1.4, scale=1.2)
# return k * gaussian_distribution(x, mu=1.4, sigma=1.2) # 自定义正态分布
x = np.arange(-4.,6.,0.01)
k = 2.5
# 对 p1(x) 可视化
plt.plot(x, p1(x),color = "red")
# 对 k*q(x) 可视化
plt.plot(x, kq(x, k), color = "blue")
plt.xticks([])
plt.yticks([])
plt.ylim(0,0.9)
plt.xlim(-4,6)
plt.plot([0.3,0.3],[0,0.54601532],color = "black")
plt.plot(0.3,0.54601532,'b.')
plt.fill_between(x,p1(x), kq(x, k),color = (0.7,0.7,0.7))
plt.annotate('$x_0$',xy=(0.,0),xytext=(0.2,-0.06),fontsize=15)
plt.annotate('$u_0$',xy=(0.,0.),xytext=(0.35,0.15),fontsize=15)
plt.annotate('$kq(x_0)$',xy=(0.,0.),xytext=(-1.2,0.55),fontsize=15)
plt.annotate('$p(x)$',xy=(0.,0.),xytext=(2,0.15),fontsize=15)
plt.annotate('$kq(x)$',xy=(0.,0.),xytext=(2.7,0.5),fontsize=15)
plt.show()
接受-拒绝采样:
x = np.arange(-4.,6.,0.01)
# p1(x) 可视化
plt.plot(x, p1(x),color = "red")
size = 50000
mu = 1.4
sigma = 1.2
k = 2.5
# 1. 从 q(x) 采样,得到 sample_x
sample_x = np.random.normal(loc = mu, scale = sigma, size = size)
# 2. 计算 q
q = norm.pdf(sample_x, loc=mu, scale=sigma)
# q = 1/(np.sqrt(2*np.pi)*sigma)*np.exp(-0.5*(sample_x-mu)**2/sigma**2)
# 3. 计算 p
p = p1(sample_x)
# 4. 生成 0-1 随机数
u = np.random.uniform(low = 0, high = 1, size = size)
# 5. 选择采样点
sample = sample_x[p / (q * k) >= u]
sns.distplot(sample)
plt.show()
$f(x) \sim \mathbf{N}(0,1)$ ,求$P(X > 8)$的概率。
定义$g(x) \sim \mathbf{N}(8,1)$,对原目标进行如下转换:
$$
\begin{equation}
\begin{aligned}
P(X > 8)
&= \mathbf{E}_f[\mathbb{I}_{x > 8}] \\
&= \int _R \mathbb{I}_{x > 8} f(x) dx \\
&= \int _R \mathbb{I}_{x > 8} \frac{f(x)}{g(x)} g(x) dx \\
&= \Large \int _R \mathbb{I}_{x > 8} \frac{\frac{1}{\sqrt{2 \pi} } \mathbf{e}^{\frac{- x^2}{2}}}{\frac{1}{\sqrt{2 \pi} } \mathbf{e}^{\frac{- (x-8)^2}{2}}} \frac{1}{\sqrt{2 \pi} } \mathbf{e}^{\frac{- (x-8)^2}{2}} dx \\
&= \Large \int _R \mathbb{I}_{x > 8} \mathbf{e} ^{32-8x} \frac{1}{\sqrt{2 \pi} } \mathbf{e}^{\frac{- (x-8)^2}{2}} dx
\end{aligned}
\end{equation}
$$
定义$w(x) = \mathbb{I}_{x > 8} \mathbf{e} ^{32-8x} $。此时,可以将原问题转为函数$w(x)$关于概率分布$g(x)$期望的问题。接下来,可以从$g(x)$中进行采样,得到样本序列$x_1,x_2,\cdots,x_N$,将采样点带入$w(x)$,可以得到对概率的估计值:
$$
\large P(X > 8) \approx \frac{1}{N} \sum_{i=1} ^N \mathbb{I}_{x_i > 8} \mathbf{e} ^{32-8x_i}
$$
def theoretical():
print('CDF计算的真实概率:', norm.cdf(-8, 0, 1))
def naive_monte():
N = 10000
rand_num = np.random.normal(loc=0, scale=1, size=N)
sample = (rand_num > 8)
print("直接对p(x)采样的概率:", np.mean(sample))
def importance_sampling():
N = 10000
rand_num = np.random.normal(loc=8, scale=1, size=N)
sample = (rand_num > 8) * np.exp(-8 *rand_num + 32)
print("重要性采样概率:", np.mean(sample))
# 1. 直接使用函数f(x)的CDF进行求解
theoretical()
# 2. 通过对函数f(x)进行采样,然后进行求解
naive_monte()
# 3. 通过重要性采样进行求解
importance_sampling()
CDF计算的真实概率: 6.22096057427174e-16 直接对p(x)采样的概率: 0.0 重要性采样概率: 6.200460782041351e-16
结论:重要性采样结果和CDF接近,而直接采样效果很差。