# HIDDEN
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')Regression-Related Functions¶
def standard_units(arr):
""" Converts an array to standard units """
return (arr - np.average(arr))/np.std(arr)
def correlation(t, x, y):
""" Computes correlation: t is a table, and x and y are column names """
x_standard = standard_units(t.column(x))
y_standard = standard_units(t.column(y))
return np.average(x_standard * y_standard)
def slope(t, x, y):
""" Computes the slope of the regression line, like correlation above """
r = correlation(t, x, y)
y_sd = np.std(t.column(y))
x_sd = np.std(t.column(x))
return r * y_sd / x_sd
def intercept(t, x, y):
""" Computes the intercept of the regression line, like slope above """
x_mean = np.mean(t.column(x))
y_mean = np.mean(t.column(y))
return y_mean - slope(t, x, y)*x_mean
def fitted_values(t, x, y):
"""Return an array of the regression estimates (predictions) at all the x values"""
a = slope(t, x, y)
b = intercept(t, x, y)
return a*t.column(x) + bdemographics = Table.read_table('district_demographics2016.csv')
demographics.sample(6).show()Linear Fit
fitted_income = demographics.select('College%', 'Median Income')
fitted_income = fitted_income.with_columns('Fitted Median Income',
fitted_values(demographics, 'College%', 'Median Income'))
fitted_income.scatter('College%')
fitted_income.sample(5).show()Residuals¶
def residuals(t, x, y):
predictions = fitted_values(t, x, y)
return t.column(y) - predictionsResidual plots can tell us about our model fit¶
Case study 1: 2016 election demographics¶
income_residuals = fitted_income.with_columns(
'Residual', residuals(demographics, 'College%', 'Median Income')
)
income_residuals.show(5)These two functions will help us plot things the rest of the way.
def plot_fitted(t, x, y):
fitted_label = 'Fitted ' + y
(t.with_column(fitted_label, fitted_values(t, x, y))
.select(x, y, fitted_label).scatter(0))
def plot_residuals(t, x, y):
residuals_label = y + ' Residuals'
(t.with_columns(residuals_label, residuals(t, x, y))
.scatter(x, residuals_label))plot_fitted(demographics, 'College%', 'Median Income')plot_residuals(demographics, 'College%', 'Median Income')Case study 2: dugongs¶
dugong = Table.read_table('dugong.csv')
dugong.show(5)dugong.scatter('Length', 'Age')correlation(dugong, 'Length', 'Age')plot_fitted(dugong, 'Length', 'Age')plot_residuals(dugong, 'Length', 'Age')Case study 3: US women¶
us_women = Table.read_table('us_women.csv')
us_women.show(5)us_women.scatter('height')correlation(us_women, 'height', 'ave weight')plot_fitted(us_women, 'height', 'ave weight')plot_residuals(us_women, 'height', 'ave weight')Case study 4: Heights¶
family_heights = Table.read_table('family_heights.csv')
family_heights.where('family', '1')parents = (family_heights.column('father') + family_heights.column('mother'))/2
heights = Table().with_columns(
'Parent Average', parents,
'Child', family_heights.column('child')
)
heights.show(6)plot_fitted(heights, 'Parent Average', 'Child')plot_residuals(heights, 'Parent Average', 'Child')Mathematical Properties of Regression¶
The average of the residuals are zero.
round(np.average(residuals(demographics, 'College%', 'Median Income')), 10)round(np.average(residuals(dugong, 'Length', 'Age')), 10)round(np.average(residuals(heights, 'Parent Average', 'Child')), 10)The correlation coefficients between:
The residuals and
The residuals and the fitted values
are both 0.
heights = heights.with_columns(
'Residual', residuals(heights, 'Parent Average', 'Child'),
'Fitted Value', fitted_values(heights, 'Parent Average', 'Child')
)round(correlation(heights, 'Parent Average', 'Residual'), 10)round(correlation(heights, 'Fitted Value', 'Residual'), 10)