Skip to article frontmatter · Skip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

Lecture 14

Random Sampling

We load a dataset of all United national flights from 6/1/15 to 8/9/15, with each flight's destination and how long it was delayed, in minutes.

# Read the flight records from disk.
united = Table.read_table('united.csv')
# Number the rows 0, 1, 2, ... in a 'Row' column so that individual flights
# can be referred to by position when sampling below.
united = united.with_column('Row', np.arange(united.num_rows))
# Put the row number first so it is the leftmost column when displayed.
united = united.move_to_start('Row')
united

Some deterministic samples:

# Deterministic sample 1: every flight whose destination is JFK.
united.where('Destination', are.equal_to('JFK'))
# Deterministic sample 2: every 1000th row, starting from row 0.
united.take(np.arange(united.num_rows)[::1000])
# Deterministic sample 3: three hand-picked row indices.
united.take(np.array([34, 6321, 10040]))

Random samples:

# Five distinct rows chosen at random by the table itself, shown in row order.
united.sample(5, with_replacement=False).sort('Row')

# The same idea done by hand: choose five distinct row indices at random,
# then take exactly those rows.
rows = np.random.choice(
    np.arange(united.num_rows),
    size=5,
    replace=False,
)
united.take(rows).sort('Row')

# Systematic sample: pick a random starting row among 0..999, then keep
# every 1000th row from that starting point onward.
start = np.random.choice(np.arange(1000))
systematic_sample = united.take(
    np.arange(start, united.num_rows, 1000)
)
systematic_sample.show()

Distributions

# A fair six-sided die: a one-column table with the faces 1 through 6.
die = Table().with_column('Face', np.arange(1, 7))
die
# Ten simulated rolls. The sample is larger than the table, so this relies on
# sample() drawing rows with replacement.
die.sample(10)
# Histogram of the faces with the default bins.
die.hist()
# Bin edges 0.5, 1.5, ..., 6.5 so that each face sits in the middle of its
# own unit-width bin.
roll_bins = np.arange(1, 8) - 0.5
die.hist(bins=roll_bins)
# Histograms of the rolls for increasingly large samples.
die.sample(10).hist(bins=roll_bins)
die.sample(1000).hist(bins=roll_bins)
die.sample(100000).hist(bins=roll_bins)

Large Random Samples

united
# Bin edges every 5 minutes from -20 up to and including 200.
united_bins = np.arange(-20, 201, 5)
# Histogram of all flight delays over those bins.
united.hist('Delay', bins=united_bins)
# Smallest, largest, and average delay across all flights.
united.column('Delay').min()
united.column('Delay').max()
np.mean(united.column('Delay'))
# Delay histograms for a small and a large random sample, drawn over the
# same bins as the full-table histogram so the shapes can be compared.
united.sample(10).hist('Delay', bins=united_bins)
united.sample(1000).hist('Delay', bins=united_bins)

Simulating Statistics

# Median delay over every flight in the table.
np.median(united.column('Delay'))
# Median delay in one random sample of 10 flights; this value varies from
# run to run because the sample is random.
np.median(united.sample(10).column('Delay'))
def sample_median(size):
    """Return the median delay of one random sample of flights.

    Draws `size` rows from the `united` table using Table.sample's default
    settings and computes the median of their 'Delay' values.
    """
    sampled_delays = united.sample(size).column('Delay')
    return np.median(sampled_delays)
# Try the statistic once on a single sample of 10 flights.
sample_median(10)

# Simulate the statistic 1000 times with samples of size 10.
# The medians are collected in a list and converted to an array once:
# np.append copies the whole array on every call, so growing the array
# inside the loop does quadratic work for no benefit.
sample_medians = np.array([sample_median(10) for _ in np.arange(1000)])

# Histogram of the simulated medians, using unit-width bins from -10 to 30.
Table().with_column('Sample medians', sample_medians).hist(bins=np.arange(-10, 31))
# Repeat the simulation 1000 times, now with samples of size 1000.
# The medians are collected in a list and converted to an array once:
# np.append copies the whole array on every call, so growing the array
# inside the loop does quadratic work for no benefit.
sample_medians = np.array([sample_median(1000) for _ in np.arange(1000)])

# Histogram of the simulated medians, using unit-width bins from -10 to 30.
Table().with_column(
    'Sample medians', sample_medians).hist(bins=np.arange(-10, 31))