Skip to article frontmatter · Skip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

Standard Units

# Load the exam scores from 'exams_fa18.csv' into a Table.
exams = Table.read_table('exams_fa18.csv')
# Preview the first 5 rows.
exams.show(5)
# One histogram per column (overlay=False), with bin edges 0, 5, ..., 100.
exams.hist(overlay=False, bins=np.arange(0,101,5))
def standard_units(x):
    """Express each value of x as a number of SDs away from the mean.

    The mean of x is subtracted from every value and the result is
    divided by the standard deviation of x (as computed by np.std).
    """
    center = np.average(x)
    spread = np.std(x)
    deviations = x - center
    return deviations / spread
# Convert the 'Midterm' scores to standard units and store them as a new column.
midterm_su = standard_units(exams.column('Midterm'))
exams = exams.with_column('Midterm in Standard Units', midterm_su)

# Same conversion for the 'Final' scores.
final_su = standard_units(exams.column('Final'))
exams = exams.with_column('Final in Standard Units', final_su)

# Preview the first 10 rows, now including the two standard-unit columns.
exams.show(10)
# Separate histograms of the two standard-unit columns, using bins of
# width 0.1 that start at -4 and stop before 2.
# NOTE(review): values above roughly 1.9 standard units would fall outside
# these bins -- confirm the range covers the data.
exams.select(
    'Midterm in Standard Units', 'Final in Standard Units'
).hist(overlay=False, bins=np.arange(-4,2,0.1))

The SD and Bell Shaped Curves

# Load the births data from 'baby.csv'.
births = Table.read_table('baby.csv')
# Histogram of 'Maternal Height' with unit-width bins whose edges are
# 56.5, 57.5, ..., 72.5, so each bin is centered on a whole number.
births.hist('Maternal Height', bins = np.arange(56.5, 72.6, 1))
heights = births.column('Maternal Height')
# Mean and SD of the heights (np.std defaults to the population SD, ddof=0).
np.mean(heights), np.std(heights)
# 1 SD below & above the mean
np.mean(heights) - np.std(heights), np.mean(heights) + np.std(heights)

Central Limit Theorem

# Load the flight data from 'united.csv'; this table is the population
# that the sampling functions below draw from.
united = Table.read_table('united.csv')
united
# Bin edges -20, -10, ..., 290 for the delay histogram.
# NOTE(review): delays above 290 would not appear in the histogram --
# confirm this truncation of the right tail is intended.
united_bins = np.arange(-20, 300, 10)
united.hist('Delay', bins=united_bins)
# Population mean and SD of the 'Delay' column.
delays = united.column('Delay')
delay_mean = np.mean(delays)
delay_sd = np.std(delays)
delay_mean, delay_sd
# 50th percentile (median) of the delays, for comparison with the mean.
percentile(50, delays)
def one_sample_mean(sample_size):
    """Return the mean delay of one random sample of flights.

    Draws `sample_size` rows from the `united` table and averages the
    values in their 'Delay' column.
    """
    sample_delays = united.sample(sample_size).column('Delay')
    return np.mean(sample_delays)
# Try it once: the mean delay of a single random sample of 100 flights.
one_sample_mean(100)
def ten_thousand_sample_means(sample_size):
    """Simulate 10,000 sample means for samples of the given size.

    Calls one_sample_mean(sample_size) 10,000 times and collects the
    results.

    Parameters
    ----------
    sample_size : number of flights drawn for each individual sample.

    Returns
    -------
    A NumPy array holding the 10,000 simulated sample means, in the
    order they were generated.
    """
    # Gather the means in a list and convert once at the end. np.append
    # returns a fresh copy of the whole array on every call, so growing
    # an array inside the loop does quadratic copying.
    means = [one_sample_mean(sample_size) for _ in range(10000)]
    return np.array(means)
# Simulate 10,000 means of samples of size 100, then inspect the result.
sample_means_100 = ten_thousand_sample_means(100)
sample_means_100
len(sample_means_100)
# Empirical distribution of the sample mean for samples of size 100.
Table().with_column(
    'Mean of 100 flight delays', sample_means_100).hist(bins=20)

# Print the population mean so it can be compared with the histogram.
print('Population Average:', delay_mean)
# Repeat the simulation with samples of size 400.
sample_means_400 = ten_thousand_sample_means(400)
Table().with_column(
    'Mean of 400 flight delays', sample_means_400).hist(bins=20)

print('Population Average:', delay_mean)