from datascience import *
%matplotlib inline
path_data = '../../../assets/data/'
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=np.VisibleDeprecationWarning)

# Bootstrap

# Load the compensation table from the CSV file.
sf = Table.read_table('san_francisco_2019.csv')

# Salary threshold: 15 * 20 * 50 = 15,000.
# NOTE(review): presumably $15/hour x 20 hours/week x 50 weeks/year -- confirm.
min_salary = 15 * 20 * 50

# Keep only the rows whose 'Salary' is above the threshold.
sf = sf.where('Salary', are.above(min_salary))

# Bare expression: shows the remaining row count when run as a notebook cell.
sf.num_rows

# Bin edges every 25,000 from 0 up to 725,000.
sf_bins = np.arange(0, 726000, 25000)
sf.hist('Total Compensation', bins=sf_bins)

# Parameter: Median total compensation in the population
def median_comp(t):
    """Return the 50th percentile of the 'Total Compensation' column of table t."""
    compensation = t.column('Total Compensation')
    return percentile(50, compensation)
# Bare expression: shows the population median when run as a notebook cell.
median_comp(sf)

# Bootstrap Estimates of the Parameter (Pretend it is Unknown)
def confidence_interval_95(sample_size, num_repetitions=1000):
    """Return a bootstrap 95% confidence interval for the median total compensation.

    One random sample of ``sample_size`` rows is drawn from the module-level
    table ``sf`` without replacement. That sample is then resampled
    ``num_repetitions`` times, and the interval is the middle 95% of the
    resampled medians.

    Args:
        sample_size: Number of rows to draw from ``sf`` for the original sample.
        num_repetitions: Number of bootstrap resamples to generate
            (defaults to 1000, the value previously hard-coded).

    Returns:
        ``make_array(left, right)``: the 2.5th and 97.5th percentiles of the
        bootstrap medians.
    """
    # Collect one random sample from the population
    our_sample = sf.sample(sample_size, with_replacement=False)

    # Generate the medians of the bootstrap samples. Calling sample() with no
    # arguments resamples the same number of rows, with replacement.
    # The array is built in one step instead of growing it with np.append,
    # which copies the whole array on every iteration; dtype=float keeps the
    # float array that appending to an empty make_array() produced.
    bstrap_medians = np.array(
        [median_comp(our_sample.sample()) for _ in range(num_repetitions)],
        dtype=float,
    )

    # Find the middle 95% of medians; that's a confidence interval
    left = percentile(2.5, bstrap_medians)
    right = percentile(97.5, bstrap_medians)
    return make_array(left, right)
# Bare expression: shows one interval from a sample of 100 rows in a notebook.
confidence_interval_95(100)

# Build 100 intervals, each from a fresh random sample of `sample_size` rows,
# and record them as rows (k, lower, upper).
intervals = Table(['k', 'lower', 'upper'])
sample_size = 32
for k in np.arange(100):
    interval = confidence_interval_95(sample_size)
    intervals = intervals.with_row([
        k, interval.item(0), interval.item(1)
    ])
intervals.show(4)
# The parameter itself: the median of the full table.
truth = median_comp(sf)

# Green vertical line at the true median, spanning one unit per interval.
plots.plot((truth, truth), (0, intervals.num_rows), color='green', lw=3)
plots.xticks([90000, 120000, 150000, 180000])

# One horizontal segment per interval at height k: red when the interval
# misses the true median, blue when it contains it.
for row in intervals.rows:
    if row.item('lower') > truth or row.item('upper') < truth:
        color = 'red'
    else:
        color = 'blue'
    plots.plot((row.item('lower'), row.item('upper')),
               (row.item('k'), row.item('k')),
               color, lw=1)

# Confidence Interval for Unknown Population Mean
# Random sample of mother-newborn pairs
births = Table.read_table('baby.csv')
births.hist('Maternal Age')

# Average age of mothers in the sample
np.average(births.column('Maternal Age'))

# Question
# What is the average age of the mothers in the population?
def one_bootstrap_mean():
    """Return the average 'Maternal Age' of one resample of the `births` table."""
    ages = births.sample().column('Maternal Age')
    return np.average(ages)


# Generate means from 3000 bootstrap samples
num_repetitions = 3000

# Build the array of bootstrap means in one step; growing it with np.append
# copies the whole array on every iteration.
bstrap_means = np.array(
    [one_bootstrap_mean() for _ in range(num_repetitions)],
    dtype=float,
)

# Bootstrap Percentile Method for Confidence Interval
# The interval of estimates is the “middle 95%” of the bootstrap estimates.
# This is called a 95% confidence interval for the mean age in the population.
# Get the endpoints of the 95% confidence interval
left = percentile(2.5, bstrap_means)
right = percentile(97.5, bstrap_means)

# Bare expression: shows the interval when run as a notebook cell.
make_array(left, right)

# One-column table of the bootstrap means, used for the histogram.
resampled_means = Table().with_columns(
    'Bootstrap Sample Mean', bstrap_means
)
# Histogram of the bootstrap means, with the interval [left, right] drawn as a
# thick yellow segment at height 0.
resampled_means.hist(bins=15)
plots.plot([left, right], [0, 0], color='yellow', lw=8);

# The same interval drawn on the histogram of the sampled maternal ages.
births.hist('Maternal Age')
plots.plot([left, right], [0, 0], color='yellow', lw=8);