Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

# HIDDEN
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
def standard_units(arr):
    """ Converts an array to standard units """
    return (arr - np.average(arr))/np.std(arr)

def correlation(t, x, y):
    """ Computes correlation: t is a table, and x and y are column names """
    x_standard = standard_units(t.column(x))
    y_standard = standard_units(t.column(y))
    return np.average(x_standard * y_standard)

def slope(t, x, y):
    """ Computes the slope of the regression line, like correlation above """
    r = correlation(t, x, y)
    y_sd = np.std(t.column(y))
    x_sd = np.std(t.column(x))
    return r * y_sd / x_sd

def intercept(t, x, y):
    """ Computes the intercept of the regression line, like slope above """
    x_mean = np.mean(t.column(x))
    y_mean = np.mean(t.column(y))
    return y_mean - slope(t, x, y)*x_mean

def fitted_values(t, x, y):
    """Return an array of the regression estimates (predictions) at all the x values"""
    a = slope(t, x, y)
    b = intercept(t, x, y)
    return a*t.column(x) + b
demographics = Table.read_table('district_demographics2016.csv')
demographics.sample(6).show()

Linear Fit

estimate of Median Income=aCollege%+b\text{estimate of Median Income} = a \cdot \text{College\%} + b

fitted_income = demographics.select('College%', 'Median Income')
fitted_income = fitted_income.with_columns('Fitted Median Income',
    fitted_values(demographics, 'College%', 'Median Income'))
fitted_income.scatter('College%')
fitted_income.sample(5).show()

Residuals

def residuals(t, x, y):
    predictions = fitted_values(t, x, y)
    return t.column(y) - predictions

Residual plots can tell us about our model fit

Case study 1: 2016 election demographics

income_residuals = fitted_income.with_columns(
    'Residual', residuals(demographics, 'College%', 'Median Income')
)
income_residuals.show(5)

These two functions will help us plot things the rest of the way.

def plot_fitted(t, x, y):
    fitted_label = 'Fitted ' + y
    (t.with_column(fitted_label, fitted_values(t, x, y))
      .select(x, y, fitted_label).scatter(0))

def plot_residuals(t, x, y):
    residuals_label = y + ' Residuals'
    (t.with_columns(residuals_label, residuals(t, x, y))
      .scatter(x, residuals_label))

plot_fitted(demographics, 'College%', 'Median Income')
plot_residuals(demographics, 'College%', 'Median Income')

Case study 2: dugongs

dugong = Table.read_table('dugong.csv')
dugong.show(5)
dugong.scatter('Length', 'Age')
correlation(dugong, 'Length', 'Age')

estimate of Age=aLength+b\text{estimate of Age} = a \cdot \text{Length} + b

plot_fitted(dugong, 'Length', 'Age')
plot_residuals(dugong, 'Length', 'Age')

Case study 3: US women

us_women = Table.read_table('us_women.csv')
us_women.show(5)
us_women.scatter('height')
correlation(us_women, 'height', 'ave weight')

estimate of height=aaverage weight+b\text{estimate of height} = a \cdot \text{average weight} + b

plot_fitted(us_women, 'height', 'ave weight')
plot_residuals(us_women, 'height', 'ave weight')

Case study 4: Heights

family_heights = Table.read_table('family_heights.csv')
family_heights.where('family', '1')
parents = (family_heights.column('father') + family_heights.column('mother'))/2
heights = Table().with_columns(
    'Parent Average', parents,
    'Child', family_heights.column('child')
    )
heights.show(6)

estimate of a child’s adult height=aaverage parent height+b\text{estimate of a child's adult height} = a \cdot \text{average parent height} + b

plot_fitted(heights, 'Parent Average', 'Child')
plot_residuals(heights, 'Parent Average', 'Child')

Mathematical Properties of Regression

The average of the residuals are zero.

round(np.average(residuals(demographics, 'College%', 'Median Income')), 10)
round(np.average(residuals(dugong, 'Length', 'Age')), 10)
round(np.average(residuals(heights, 'Parent Average', 'Child')), 10)

The correlation coefficients between:

  • The residuals and xx

  • The residuals and the fitted values

are both 0.

heights = heights.with_columns(
    'Residual', residuals(heights, 'Parent Average', 'Child'),
    'Fitted Value', fitted_values(heights, 'Parent Average', 'Child')
)
round(correlation(heights, 'Parent Average', 'Residual'), 10)
round(correlation(heights, 'Fitted Value', 'Residual'), 10)