Jupyter 版本的用 Python 实现从有偏人群中取样

也可以点击 这里 获取相关文件呀

Sampling from a biased population

  • 重新创建交互式抽样分布演示中的可视化
  • 关注一个假设性问题,说明当从一个有偏的人群(而不是我们真正关心的整个总体)中取样时会发生什么
In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
# Apply seaborn's default theme so the matplotlib figures below use its styling.
sns.set()
In [3]:
# Simulation parameters. The plot titles further down describe the simulated
# quantity as weight; the unit is not stated anywhere in this notebook.
totalPopSize = 40000            # combined size of both subgroups
gymperc = 0.3                   # share of the population generated as gym goers
mean_uofm, sd_uofm = 155, 5     # normal mean / std. dev. for the non-gym group
mean_gym, sd_gym = 185, 5       # normal mean / std. dev. for the gym group
In [4]:
# Generate the two subgroups as independent normal draws.
# The gym count is rounded once and the other group takes the remainder, so
# the two sizes always add up to totalPopSize. Truncating each float product
# separately with int() can silently drop a member (e.g. int(100 * 0.29) == 28).
gym_count = round(totalPopSize * gymperc)
uofm_students = np.random.normal(mean_uofm, sd_uofm, totalPopSize - gym_count)
students_at_gym = np.random.normal(mean_gym, sd_gym, gym_count)
In [5]:
# Pool both subgroups into one flat array: non-gym students first, gym goers after.
population = np.concatenate((uofm_students, students_at_gym))
In [22]:
# Show each subgroup and the pooled population on a shared x-range so the
# three distributions can be compared directly.
plt.figure(figsize = (10, 12))

# Panel 1: the non-gym share of the population.
plt.subplot(3, 1, 1)
sns.distplot(uofm_students)
plt.title("Uofm students only")
plt.xlim([140, 200])

# Panel 2: the gym goers.
plt.subplot(3, 1, 2)
sns.distplot(students_at_gym)
plt.title("Gym goers only")
plt.xlim([140, 200])

# Panel 3: both groups pooled, with the population mean marked. The 3x1 grid
# reserved this slot but it was previously left empty.
plt.subplot(3, 1, 3)
sns.distplot(population)
plt.title("Full population of UofM students")
plt.axvline(x = np.mean(population))
plt.xlim([140, 200])
plt.show()

如果从整个群体中取样

In [26]:
# Sampling settings: how many samples to draw, and how many students per sample.
numberSamps, sampSize = 5000, 50
In [27]:
# Build the sampling distribution of the mean: draw numberSamps samples of
# sampSize students from the whole population and record each sample's mean.
mean_distribution = np.empty(numberSamps)
# Iterate the range directly; wrapping it in list() only built a throwaway list.
for i in range(numberSamps):
    random_students = np.random.choice(population, sampSize)
    mean_distribution[i] = np.mean(random_students)
In [30]:
# Top panel: the full population, with its mean marked by a vertical line.
plt.figure(figsize = (10, 8))
plt.subplot(2, 1, 1)
sns.distplot(population)
plt.title("Full population of UofM students")
plt.axvline(x = np.mean(population))
plt.xlim([140, 200])

# Bottom panel: distribution of the sample means, with the population mean
# (default colour) and the mean of the sample means (black) for comparison.
plt.subplot(2, 1, 2)
sns.distplot(mean_distribution)
# These means come from samples of the whole population, so the title names
# all UofM students rather than gym goers.
plt.title("Sampling distribution of the mean weight of all UofM students")
plt.axvline(x = np.mean(population))
plt.axvline(x = np.mean(mean_distribution), color = "black")
plt.xlim([140, 200])
plt.show()

如果取一个非代表性的样本

In [31]:
# Sampling settings for this scenario: same number of samples, only 3 students each.
numberSamps, sampSize = 5000, 3
In [32]:
# Repeat the sampling experiment, this time drawing every sample from the gym
# goers only, and store the mean of each sample.
mean_distribution = np.empty(numberSamps)
for i in range(numberSamps):
    random_students = np.random.choice(students_at_gym, size=sampSize)
    mean_distribution[i] = random_students.mean()
In [33]:
# Reference values shared by both panels.
population_mean = np.mean(population)
gym_mean = np.mean(students_at_gym)
weight_range = (140, 200)

plt.figure(figsize=(10, 8))

# Top panel: the full population with its mean marked.
plt.subplot(2, 1, 1)
sns.distplot(population)
plt.title("Full population of UofM students")
plt.axvline(x=population_mean)
plt.xlim(weight_range)

# Bottom panel: means of the gym-only samples, with the population mean
# (default colour) and the gym goers' mean (black) drawn for comparison.
plt.subplot(2, 1, 2)
sns.distplot(mean_distribution)
plt.title("Sampling distribution of the mean weight of gym goers")
plt.axvline(x=population_mean)
plt.axvline(x=gym_mean, color="black")
plt.xlim(weight_range)

plt.show()
In [ ]: