import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

# Average (Mean)

values = make_array(2, 3, 3, 9)
values

# Several equivalent ways to compute the average of `values`.
sum(values)/len(values)

np.average(values)

np.mean(values)

(2 + 3 + 3 + 9)/4

# The same average as a weighted sum: each distinct value multiplied by
# the proportion of entries equal to it (2 once, 3 twice, 9 once, out of 4).
2*(1/4) + 3*(2/4) + 9*(1/4)

2*0.25 + 3*0.5 + 9*0.25

values_table = Table().with_columns('value', values)
values_table

# Bin edges 0.5, 1.5, ..., 10.5: unit-width bins centered on 1 through 10.
bins_for_display = np.arange(0.5, 10.6, 1)
values_table.hist('value', bins=bins_for_display)

## Make array of 10 2s, 20 3s, and 10 9s
# Same three distinct values as `values` (2, 3, 9) in the same 1:2:1
# proportions, built from repeat counts instead of a hand-typed literal.
new_vals = np.repeat([2, 3, 9], [10, 20, 10])

Table().with_column('value', new_vals).hist(bins=bins_for_display)

np.average(new_vals)

# Redraw the histogram with a grey baseline along y = 0 and a red triangle
# just below it at the average, to show the average as the balance point.
Table().with_column('value', new_vals).hist(bins=bins_for_display)
plots.ylim(-0.04, 0.5)
plots.plot([0, 10], [0, 0], color='grey', lw=2)
# Marker x-position is computed from the data (4.25 for these values)
# rather than hard-coded, so it stays correct if new_vals changes.
plots.scatter(np.average(new_vals), -0.015, marker='^', color='red', s=100)
plots.title('Average as a Center of Gravity');

# Standard Deviation
# Build up the standard deviation of `values` step by step, adding one
# column to sd_table per step.
sd_table = Table().with_columns('Value', values)
sd_table

average_value = np.mean(values)
average_value

# Deviation = value - average
deviations = values - average_value
sd_table = sd_table.with_column('Deviation', deviations)
sd_table

# The deviations from the average add up to zero.
sum(deviations)

sd_table = sd_table.with_column('Squared Deviation', deviations ** 2)
sd_table

# Variance of the data:
# mean squared deviation from average
variance = np.mean(deviations ** 2)
variance

# Standard Deviation (SD):
# root mean squared deviation from average
# = square root of the variance
sd = variance ** 0.5
sd

# The same SD computed in a single call.
np.std(values)

# Chebyshev's Bounds: "Tail" Bounds
# Proportion in the range "mean +/- 3.5 SDs"
# Chebyshev's bound: this proportion is AT LEAST
z = 3.5
1 - 1/z**2

# Proportion in the range "mean +/- 0.5 SDs"
# Chebyshev's bound: this proportion is AT LEAST
# (with z = 0.5 the expression evaluates to -3, so the bound is
# uninformative here)
z = 0.5
1 - 1/z**2

births = Table.read_table('baby.csv').drop('Maternal Smoker')

births.labels

births.hist(overlay=False)

# Mean and SD of one column; note this rebinds the module-level `sd`.
mpw = births.column('Maternal Pregnancy Weight')
mean = np.mean(mpw)
sd = np.std(mpw)
mean, sd

within_3_SDs = births.where(
    'Maternal Pregnancy Weight', are.between(mean - 3*sd, mean + 3*sd))

# Proportion within 3 SDs of the mean
within_3_SDs.num_rows / births.num_rows

# Chebyshev's bound:
# This proportion should be at least
1 - 1/3**2

births.labels

# See if Chebyshev's bounds work for distributions with various shapes
# For every column of `births`, print the percentage of rows that fall
# within 2, 3, 4 and 5 SDs of that column's average.
# NOTE(review): this loop rebinds the module-level names `values`, `mean`,
# `sd` and `z`; after it runs they refer to the last column processed.
for feature in births.labels:
    values = births.column(feature)
    mean = np.mean(values)
    sd = np.std(values)
    print()
    print(feature)
    for z in make_array(2, 3, 4, 5):
        chosen = births.where(feature, are.between(mean - z*sd, mean + z*sd))
        proportion = chosen.num_rows / births.num_rows
        percent = round(proportion * 100, 2)
        print('Average plus or minus', z, 'SDs:', percent, '%')

# Standard Units
def standard_units(x):
    """Convert array x to standard units.

    Each entry is replaced by its deviation from the mean of x, divided
    by the SD of x (as computed by np.std), i.e. the number of SDs the
    entry lies above (positive) or below (negative) the average.

    x is expected to be a NumPy array of numbers; the result is an array
    of the same length.
    """
    return (x - np.mean(x)) / np.std(x)


ages = births.column('Maternal Age')
ages

ages_standard_units = standard_units(ages)
ages_standard_units

# By construction, values in standard units have mean 0 and SD 1
# (up to floating-point rounding).
np.mean(ages_standard_units), np.std(ages_standard_units)
# Discussion Question

# Maternal ages side by side in original units and in standard units.
both = Table().with_columns(
    'Age in Years', ages,
    'Age in Standard Units', ages_standard_units
)
both

np.mean(ages), np.std(ages)

both.hist('Age in Years', bins=np.arange(15, 46, 2))

both.hist('Age in Standard Units', bins=np.arange(-2.2, 3.4, 0.35))
plots.xlim(-2, 3.1);

# The SD and Bell Shaped Curves
# Histogram of maternal heights with unit-width bins (edges at 56.5, 57.5,
# ..., 72.5) and a tick every 2 units starting at 57.
births.hist('Maternal Height', bins=np.arange(56.5, 72.6, 1))
plots.xticks(np.arange(57, 72, 2));

# Estimates by eye
# The average is approximately:
# Locate the point of inflection on the right. The SD is approximately:

# Computed values to compare against the by-eye estimates.
heights = births.column('Maternal Height')
np.mean(heights), np.std(heights)