Skip to article frontmatter | Skip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

Average (Mean)

# Small example data set: four values, with 3 appearing twice.
values = make_array(2, 3, 3, 9)
values
# Three equivalent ways to get the average: by definition (sum / count),
# and with NumPy's two built-in functions.
sum(values)/len(values)
np.average(values)
np.mean(values)
# The same average written out by hand ...
(2 + 3 + 3 + 9)/4
# ... and as a weighted sum: each distinct value times the proportion of
# entries equal to it (2 -> 1/4, 3 -> 2/4, 9 -> 1/4).
2*(1/4) + 3*(2/4) + 9*(1/4)
2*0.25 + 3*0.5 + 9*0.25
# Put the values in a one-column table so they can be drawn as a histogram.
values_table = Table().with_columns('value', values)
values_table
# Bin edges 0.5, 1.5, ..., 10.5: unit-width bins centered on the
# integers 1 through 10.
bins_for_display = np.arange(0.5, 10.6, 1)
values_table.hist('value', bins = bins_for_display)
# Make an array of ten 2s, twenty 3s, and ten 9s.
# np.repeat pairs each value with its repeat count, so the proportions
# (1/4, 2/4, 1/4) are stated explicitly instead of being implied by a
# hand-typed 40-element literal that is easy to miscount.
new_vals = np.repeat([2, 3, 9], [10, 20, 10])
# Histogram of the larger array, drawn with the same bins as before.
Table().with_column('value', new_vals).hist(bins = bins_for_display)
np.average(new_vals)
# Redraw the histogram and mark the average as its balance point.
Table().with_column('value', new_vals).hist(bins = bins_for_display)
# Extend the y-axis below zero to leave room for the marker under the bars.
plots.ylim(-0.04, 0.5)
# Grey horizontal line at height 0: the "beam" the histogram sits on.
plots.plot([0, 10], [0, 0], color='grey', lw=2)
# Red triangle just below the beam at the average. The x-position is
# computed from the data rather than typed in, so the marker stays correct
# if new_vals is changed.
plots.scatter(np.average(new_vals), -0.015, marker='^', color='red', s=100)
plots.title('Average as a Center of Gravity');

Standard Deviation

# Build the standard deviation step by step, one table column per step.
# `values` is the array defined in the Average section.
sd_table = Table().with_columns('Value', values)
sd_table
average_value = np.mean(values)
average_value
# Step 1: deviation of each value from the average.
deviations = values - average_value
sd_table = sd_table.with_column('Deviation', deviations)
sd_table
# Deviations from the average cancel out (their sum is zero), so their
# plain average cannot measure spread -- hence the squaring below.
sum(deviations)
# Step 2: square the deviations so that every term is non-negative.
sd_table = sd_table.with_column('Squared Deviation', deviations ** 2)
sd_table
# Variance of the data:
# mean squared deviation from average

variance = np.mean(deviations ** 2)
variance
# Standard Deviation (SD):
# root mean squared deviation from average
# = square root of the variance

sd = variance ** 0.5
sd
# NumPy's built-in SD, shown for comparison with the hand-computed `sd`.
np.std(values)

Chebyshev’s Bounds: “Tail” Bounds

# Proportion in the range "mean +/- 3.5 SDs"
# Chebyshev's bound: this proportion is AT LEAST
z = 3.5
# 1 - 1/12.25, roughly 0.918
1 - 1/z**2
# Proportion in the range "mean +/- 0.5 SDs"
# Chebyshev's bound: this proportion is AT LEAST
z = 0.5
# 1 - 1/0.25 = -3. A negative lower bound on a proportion says nothing,
# which shows the bound is only informative when z > 1.
1 - 1/z**2
# Load the data from baby.csv and drop the 'Maternal Smoker' column.
births = Table.read_table('baby.csv').drop('Maternal Smoker')
births.labels
# NOTE(review): overlay = False presumably draws one histogram per column
# instead of overlaying them on shared axes -- confirm against the
# datascience Table.hist documentation.
births.hist(overlay = False)
# Mean and SD of one column. Note that `sd` rebinds the name used for the
# small example in the Standard Deviation section.
mpw = births.column('Maternal Pregnancy Weight')
mean = np.mean(mpw)
sd = np.std(mpw)
mean, sd
# Keep only the rows whose weight lies within 3 SDs of the mean.
# NOTE(review): datascience documents are.between(x, y) as including x and
# excluding y -- confirm if the exact endpoints matter.
within_3_SDs = births.where(
    'Maternal Pregnancy Weight', are.between(mean - 3*sd, mean + 3*sd))
# Proportion within 3 SDs of the mean

within_3_SDs.num_rows / births.num_rows
# Chebyshev's bound:
# This proportion should be at least

# 1 - 1/9, roughly 0.889
1 - 1/3**2
births.labels
# See if Chebyshev's bounds work for distributions with various shapes.
#
# For every column of `births`, print the percent of rows that lie within
# 2, 3, 4 and 5 SDs of that column's mean. Chebyshev's bound 1 - 1/z**2
# says these should be at least 75%, 88.89%, 93.75% and 96% respectively.
#
# The loop uses its own names (feature_values, feature_mean, feature_sd,
# num_sds) instead of values / mean / sd / z. Those shorter names are
# assigned earlier in the notebook; rebinding them here would make earlier
# cells produce different results if they were re-run after this one.

for feature in births.labels:
    feature_values = births.column(feature)
    feature_mean = np.mean(feature_values)
    feature_sd = np.std(feature_values)
    print()
    print(feature)
    for num_sds in make_array(2, 3, 4, 5):
        chosen = births.where(
            feature,
            are.between(feature_mean - num_sds*feature_sd,
                        feature_mean + num_sds*feature_sd))
        proportion = chosen.num_rows / births.num_rows
        percent = round(proportion * 100, 2)
        print('Average plus or minus', num_sds, 'SDs:', percent, '%')

Standard Units

def standard_units(x):
    """Return x expressed in standard units.

    Each entry is replaced by the number of standard deviations it lies
    above (positive) or below (negative) the mean of x.
    """
    center = np.mean(x)
    spread = np.std(x)
    return (x - center) / spread
# Convert one column of the births table to standard units.
ages = births.column('Maternal Age')
ages
ages_standard_units = standard_units(ages)
ages_standard_units
# By construction (subtract the mean, divide by the SD) the converted
# values should have mean 0 and SD 1, up to floating-point rounding.
np.mean(ages_standard_units), np.std(ages_standard_units)

Discussion Question

# Put the ages side by side in original units and in standard units.
both = Table().with_columns(
    'Age in Years', ages,
    'Age in Standard Units', ages_standard_units
)
both
np.mean(ages), np.std(ages)
# Histogram in years: bin edges 15, 17, ..., 45 (two-year bins).
both.hist('Age in Years', bins = np.arange(15, 46, 2))
# Histogram in standard units: bins of width 0.35 starting at -2.2.
both.hist('Age in Standard Units', bins = np.arange(-2.2, 3.4, 0.35))
# Limit the visible x-axis to [-2, 3.1].
plots.xlim(-2, 3.1);

The SD and Bell Shaped Curves

# Bin edges 56.5, 57.5, ..., 72.5: unit-width bins centered on whole numbers.
births.hist('Maternal Height', bins = np.arange(56.5, 72.6, 1))
# Label every other whole number on the x-axis: 57, 59, ..., 71.
plots.xticks(np.arange(57, 72, 2));

Estimates by eye

The average is approximately:

Locate the point of inflection on the right. The SD is approximately:

# Compute the actual mean and SD of the heights, to compare with the
# by-eye estimates read off the histogram.
heights = births.column('Maternal Height')
np.mean(heights), np.std(heights)