from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Lecture 14
# Random Sampling
# We load a dataset of all national United flights from 6/1/15 to 8/9/15, along with each flight's destination and how long it was delayed, in minutes.
# Load the flight table and prepend a 'Row' column holding each row's
# 0-based position, so sampled rows can be traced back to the full table.
united = Table.read_table('united.csv')
united = united.with_column('Row', np.arange(united.num_rows)).move_to_start('Row')
united

# Some deterministic samples:
# rows whose destination is JFK
united.where('Destination', 'JFK')
# every 1000th row, starting from row 0
united.take(np.arange(0, united.num_rows, 1000))
# three explicitly chosen row positions
united.take(make_array(34, 6321, 10040))

# Random samples:
# five distinct rows chosen at random, shown in their original order
united.sample(5, with_replacement=False).sort('Row')

# The same idea done by hand: pick five distinct row positions, then take them.
rows = np.random.choice(np.arange(united.num_rows), 5, replace=False)
united.take(rows).sort('Row')

# Systematic sample: a random starting row in [0, 1000), then every 1000th row.
start = np.random.choice(np.arange(1000))
systematic_sample = united.take(np.arange(start, united.num_rows, 1000))
systematic_sample.show()

# Distributions
# A one-column table holding the six faces of a die.
die = Table().with_column('Face', np.arange(1, 7))
die

# Ten rolls, simulated by sampling rows of the table.
die.sample(10)

# Histogram of the faces with the default bins.
die.hist()

# Bin edges 0.5, 1.5, ..., 6.5: each integer face sits in the middle of its
# own unit-width bin.
roll_bins = np.arange(0.5, 6.6, 1)
die.hist(bins=roll_bins)

# Histograms of simulated rolls for increasing sample sizes.
die.sample(10).hist(bins=roll_bins)
die.sample(1000).hist(bins=roll_bins)
die.sample(100000).hist(bins=roll_bins)

# Large Random Samples
united

# 5-minute-wide bins with edges from -20 to 200 minutes.
united_bins = np.arange(-20, 201, 5)
united.hist('Delay', bins = united_bins)

# Smallest, largest and average delay over the whole table.
min(united.column('Delay'))
max(united.column('Delay'))
np.average(united.column('Delay'))

# Delay histograms for random samples of 10 and of 1000 flights, drawn with
# the same bins as the full-table histogram so they can be compared directly.
united.sample(10).hist('Delay', bins = united_bins)
united.sample(1000).hist('Delay', bins = united_bins)

# Simulating Statistics
# Median delay over the whole table, then over one random sample of 10 flights.
np.median(united.column('Delay'))
np.median(united.sample(10).column('Delay'))


def sample_median(size):
    """Return the median 'Delay' of one random sample of `size` rows of `united`.

    Each call draws a fresh sample, so repeated calls generally return
    different values. The sample is drawn with `Table.sample` using its
    default settings.
    """
    return np.median(united.sample(size).column('Delay'))


sample_median(10)

# Simulate the statistic: collect the medians of 1000 independent samples of
# size 10, starting from an empty array and appending one median per round.
sample_medians = make_array()
for i in np.arange(1000):
    new_median = sample_median(10)
    sample_medians = np.append(sample_medians, new_median)

# Unit-width bins with edges from -10 to 30 minutes.
Table().with_column('Sample medians', sample_medians).hist(bins = np.arange(-10,31))

# Same simulation with samples of size 1000, plotted with the same bins so
# the two histograms can be compared.
sample_medians = make_array()
for i in np.arange(1000):
    new_median = sample_median(1000)
    sample_medians = np.append(sample_medians, new_median)

Table().with_column(
    'Sample medians', sample_medians).hist(bins = np.arange(-10,31))