from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np
import warnings
warnings.simplefilter("ignore")

# == Percentiles ==
# Manually compute the 55th percentile.
x = make_array(43, 20, 51, 7, 28, 34)

# Step 1. Sort the data
np.sort(x)

# Step 2. Figure out where 55th percentile would be.
# Fraction of the 6 values covered by each sorted position: 1/6, 2/6, ..., 6/6.
np.arange(1, 7) / 6

# True at every sorted position that covers at least 55% of the values.
np.arange(1, 7) / 6 >= 55 / 100

# The first True above is at index 3 (4/6 >= 0.55), so read that element
# of the sorted array.
np.sort(x).item(3)

# Alternatively: One line of code
percentile(55, x)

# == Discussion Question ==
s = make_array(1, 3, 5, 7, 9)

# Each comparison below is a standalone expression whose value is True or
# False; work out the answer before evaluating it.
percentile(10, s) == 0

percentile(39, s) == percentile(40, s)

percentile(40, s) == percentile(41, s)

percentile(50, s) == 5

# == Inference: Estimation ==
# Read the table stored in san_francisco_2019.csv and display its first 3 rows.
sf = Table.read_table('san_francisco_2019.csv')
sf.show(3)

# Who made the most money
sf.sort('Total Compensation', descending=True).show(5)

# Who made the least money
sf.sort('Total Compensation', descending=False).show(5)

# $15/hr, 20 hr/wk, 50 weeks
min_salary = 15 * 20 * 50
# Keep only the rows whose 'Salary' is above that threshold.
sf = sf.where('Salary', are.above(min_salary))

# Population Distribution
# Histogram bin edges shared by the plots below: 0, 25000, ..., 725000.
sf_bins = np.arange(0, 726000, 25000)
sf.hist('Total Compensation', bins=sf_bins)

# An Empirical Distribution
# Draw 400 rows from sf without replacement and plot them on the same bins.
our_sample = sf.sample(400, with_replacement=False)
our_sample.hist('Total Compensation', bins=sf_bins)

# Parameter: Median Total Compensation
# 50th percentile of 'Total Compensation' over every row of sf.
pop_median = percentile(50, sf.column('Total Compensation'))
pop_median

# Estimate: Median of a Sample
percentile(50, our_sample.column('Total Compensation'))

# == Variability of the Estimate ==
def generate_sample_median(samp_size):
    """Return the median 'Total Compensation' of one random sample from sf.

    samp_size: number of rows to draw, without replacement, from the
        module-level table `sf`.

    Returns the 50th percentile (as computed by `percentile`) of the
    sample's 'Total Compensation' column. A new sample is drawn on every
    call, so repeated calls can return different values.
    """
    new_sample = sf.sample(samp_size, with_replacement=False)
    return percentile(50, new_sample.column('Total Compensation'))


generate_sample_median(400)

# == Quantifying Uncertainty ==
# Collect the medians of 1000 samples of size 400, one per loop iteration.
sample_medians = make_array()
for i in np.arange(1000):
    new_median = generate_sample_median(400)
    sample_medians = np.append(sample_medians, new_median)

# Histogram of the simulated medians, with the population median drawn as a
# red dot at height 0.
med_bins = np.arange(120000, 160000, 2000)
Table().with_column('Sample Medians', sample_medians).hist(bins=med_bins)
plots.ylim(-0.000005, 0.00014)
plots.scatter(pop_median, 0, color='red');

# Distance from the population median to the 2.5th and 97.5th percentiles of
# the simulated medians.
print('Less than 2.5% of estimates are more than', pop_median - percentile(2.5, sample_medians), 'lower than', pop_median)
print('Less than 2.5% of estimates are more than', percentile(97.5, sample_medians) - pop_median, 'higher than', pop_median)

# == Bootstrap ==
# Sample randomly
# - from the original sample
# - with replacement
# - the same number of times as the original sample size
# Default behavior of t.sample():
# at random with replacement,
# the same number of times as the number of rows in t
bootstrap_sample = our_sample.sample()
bootstrap_sample.hist('Total Compensation', bins=sf_bins)
percentile(50, bootstrap_sample.column('Total Compensation'))


def one_bootstrap_median():
    """Return the median 'Total Compensation' of one resample of our_sample.

    Takes no arguments; it reads the module-level table `our_sample` and
    resamples it with `our_sample.sample()` (default arguments). A new
    resample is drawn on every call, so repeated calls can return
    different values.
    """
    # draw the bootstrap sample
    resample = our_sample.sample()
    # return the median total compensation in the bootstrap sample
    return percentile(50, resample.column('Total Compensation'))


one_bootstrap_median()

# Generate the medians of 1000 bootstrap samples
# Append one bootstrap median per iteration, num_repetitions times in total.
num_repetitions = 1000
bstrap_medians = make_array()
for i in np.arange(num_repetitions):
    bstrap_medians = np.append(bstrap_medians, one_bootstrap_median())

# Histogram of the bootstrap medians, with the population median marked at
# height 0.
resampled_medians = Table().with_column('Bootstrap Sample Median', bstrap_medians)
median_bins = np.arange(120000, 160000, 2000)
resampled_medians.hist(bins=median_bins)
# Plotting parameters; you can ignore this code
parameter_green = '#32CD32'
plots.ylim(-0.000005, 0.00014)
plots.scatter(pop_median, 0, color=parameter_green, s=40, zorder=2)
plots.title('Bootstrap Medians and the Parameter (Green Dot)');

# == Percentile Method: Middle 95% of the Bootstrap Estimates ==
# Endpoints of the interval: the 2.5th and 97.5th percentiles of the
# bootstrap medians.
left = percentile(2.5, bstrap_medians)
right = percentile(97.5, bstrap_medians)
make_array(left, right)

# Redraw the bootstrap histogram with the interval as a yellow segment at
# height 0 and the population median as a dot on top of it.
resampled_medians.hist(bins=median_bins)
# Plotting parameters; you can ignore this code
plots.ylim(-0.000005, 0.00014)
plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=3, zorder=1)
plots.scatter(pop_median, 0, color=parameter_green, s=40, zorder=2);