import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
# Use the FiveThirtyEight look for every plot drawn below.
plots.style.use('fivethirtyeight')

# Standard Units

# Load the exam scores table from the CSV file.
exams = Table.read_table('exams_fa18.csv')
exams.show(5)

# One histogram per column, using 5-point-wide bins from 0 up to 100.
exams.hist(overlay=False, bins=np.arange(0, 101, 5))


def standard_units(x):
    """Convert array of values to standard units.

    Each value is expressed as its distance from the average of ``x``,
    measured in standard deviations of ``x`` (``np.std`` with its default
    settings).

    Args:
        x: array of numbers.

    Returns:
        ``(x - np.average(x)) / np.std(x)``, computed element-wise.

    Note:
        There is no guard against a standard deviation of zero (all values
        identical); the division is performed as-is.
    """
    return (x - np.average(x)) / np.std(x)


midterm_su = standard_units(exams.column('Midterm'))
# Attach the standardized scores as new columns next to the raw scores.
exams = exams.with_column('Midterm in Standard Units', midterm_su)
final_su = standard_units(exams.column('Final'))
exams = exams.with_column('Final in Standard Units', final_su)
exams.show(10)

# Histograms of both standardized columns on a shared set of bins
# (width 0.1, from -4 up to 2).
exams.select(
    'Midterm in Standard Units', 'Final in Standard Units'
).hist(overlay=False, bins=np.arange(-4, 2, 0.1))

# The SD and Bell Shaped Curves
births = Table.read_table('baby.csv')

# Bins of width 1 whose edges fall on the half units (56.5, 57.5, ...).
births.hist('Maternal Height', bins=np.arange(56.5, 72.6, 1))

heights = births.column('Maternal Height')
# Bare expression: shows (mean, SD) when run as the last line of a
# notebook cell; it has no visible effect when run as a plain script.
np.mean(heights), np.std(heights)

# 1 SD below & above the mean
np.mean(heights) - np.std(heights), np.mean(heights) + np.std(heights)

# Central Limit Theorem
united = Table.read_table('united.csv')
# Bare expression: displays the table when run as the last line of a
# notebook cell; it has no visible effect when run as a plain script.
united

# Bins of width 10 for the delay histogram, from -20 up to 290.
united_bins = np.arange(-20, 300, 10)
united.hist('Delay', bins=united_bins)

# Mean and SD of the 'Delay' column over the whole table.
delays = united.column('Delay')
delay_mean = np.mean(delays)
delay_sd = np.std(delays)
# Bare expressions below only display a value when they are the last line
# of a notebook cell; run as a plain script they have no visible effect.
delay_mean, delay_sd

percentile(50, delays)


def one_sample_mean(sample_size):
    """Return the mean delay of one random sample of flights.

    Args:
        sample_size: number of rows to draw from the ``united`` table,
            passed straight to ``united.sample`` (all other sampling
            options are left at that method's defaults).

    Returns:
        ``np.mean`` of the 'Delay' column of the sampled rows.
    """
    sampled_flights = united.sample(sample_size)
    return np.mean(sampled_flights.column('Delay'))


one_sample_mean(100)


def ten_thousand_sample_means(sample_size):
    """Simulate 10,000 sample means of flight delays.

    Calls ``one_sample_mean(sample_size)`` 10,000 times, one call after
    another, and collects the results in order.

    Args:
        sample_size: number of flights in each individual sample.

    Returns:
        Array holding the 10,000 sample means.
    """
    # Collect into a list and convert once at the end: np.append returns
    # a fresh copy of the whole array on every call, so appending inside
    # the loop recopies every earlier result on each iteration.
    return np.array([one_sample_mean(sample_size) for _ in range(10000)])


sample_means_100 = ten_thousand_sample_means(100)

sample_means_100

len(sample_means_100)

# Histogram of the simulated means for samples of 100 flights, printed
# alongside the mean delay of the full table for comparison.
Table().with_column(
    'Mean of 100 flight delays', sample_means_100).hist(bins=20)
print('Population Average:', delay_mean)

sample_means_400 = ten_thousand_sample_means(400)
# Histogram of the simulated means for samples of 400 flights, printed
# alongside the mean delay of the full table for comparison.
Table().with_column(
    'Mean of 400 flight delays',
    sample_means_400,
).hist(bins=20)
print('Population Average:', delay_mean)