Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
# Some functions for plotting. You don't have to understand how any
# of the functions in this cell work, since they use things we 
# haven't learned about in Data 8.

def resize_window(lim=3.5):
    plots.xlim(-lim, lim)
    plots.ylim(-lim, lim)
    
def draw_line(slope=0, intercept=0, x=make_array(-4, 4), color='r'):
    y = x*slope + intercept
    plots.plot(x, y, color=color)
    
def draw_vertical_line(x_position, color='black'):
    x = make_array(x_position, x_position)
    y = make_array(-4, 4)
    plots.plot(x, y, color=color)
    
def make_correlated_data(r):
    x = np.random.normal(0, 1, 1000)
    z = np.random.normal(0, 1, 1000)
    y = r*x + (np.sqrt(1-r**2))*z
    return x, y

def r_table(r):
    """
    Generate a table of 1000 x,y data points in standard units
    whose correlation is approximately equal to r
    """
    np.random.seed(8)
    x, y = make_correlated_data(r)
    return Table().with_columns('x', x, 'y', y)
def demographics_errors(slope, intercept):
    # Use four convenient points from the original data
    sample = [[14.7, 33995], [19.1, 61454], [50.7, 71183], [59.5, 105918]]
    demographics.scatter('College%', 'Median Income', alpha=0.5)
    xlims = make_array(5, 75)
    # Plot a line with the slope and intercept you specified:
    plots.plot(xlims, slope * xlims + intercept, lw=4)
    # Plot red lines from each of the four points to the line
    for x, y in sample:
        plots.plot([x, x], [y, slope * x + intercept], color='r', lw=4)
def show_demographics_rmse(slope, intercept):
    demographics_errors(slope, intercept)
    x = demographics.column('College%')
    y = demographics.column('Median Income')
    prediction = slope * x + intercept
    mse = np.mean((y - prediction) ** 2)
    print("Root mean squared error:", round(mse ** 0.5, 2))
def fitted_values(t, x, y):
    """Return an array of the regressions estimates at all the x values"""
    a = slope(t, x, y)
    b = intercept(t, x, y)
    return a*t.column(x) + b

Slope & Intercept

def standard_units(x):
    """Converts an array x to standard units"""
    return (x - np.mean(x)) / np.std(x)

def correlation(t, x, y):
    x_su = standard_units(t.column(x))
    y_su = standard_units(t.column(y))
    return np.mean(x_su * y_su)
def slope(t, x, y):
    """Computes the slope of the regression line"""
    r = correlation(t, x, y)
    y_sd = np.std(t.column(y))
    x_sd = np.std(t.column(x))
    return r * y_sd / x_sd

def intercept(t, x, y):
    """Computes the intercept of the regression line"""
    x_mean = np.mean(t.column(x))
    y_mean = np.mean(t.column(y))
    return y_mean - slope(t, x, y)*x_mean

Error in Estimation

demographics = Table.read_table('district_demographics2016.csv')
demographics.show(5)
demographics = demographics.drop(
    'State', 'District', 'Percent voting for Clinton')
demographics.show(5)
demographics.scatter('College%', 'Median Income')
college = demographics.column('College%')
income = demographics.column('Median Income')
print("     College% average", np.average(college), 'and SD', np.std(college))
print("Median Income average", np.average(income), 'and SD', np.std(income))
print("Correlation", correlation(demographics, 'College%', 'Median Income'))
regression_slope = slope(demographics, 'College%', 'Median Income')
regression_intercept = intercept(demographics, 'College%', 'Median Income')
regression_slope, regression_intercept
predicted = fitted_values(demographics, 'College%', 'Median Income')
demographics = demographics.with_column(
    'Linear Prediction', predicted)
demographics.scatter('College%')
actual = demographics.column('Median Income')
errors = actual - predicted
demographics.with_column('Error', errors)
demographics.with_column('Error', errors).hist('Error')
np.mean(errors)
np.mean(errors ** 2) ** 0.5
demographics_errors(regression_slope, regression_intercept)
demographics_errors(regression_slope, regression_intercept - 30000)
# takes any slope, any intercept

demographics_errors(1350, 19000)
demographics_errors(-1000, 75000)

Root Mean Square Error

show_demographics_rmse(-1000, 75000)
show_demographics_rmse(1350, 19000)
show_demographics_rmse(regression_slope, regression_intercept)

Numerical Optimization

x = np.arange(1, 3, 0.1)
y = (x-2)**2 + 3
Table().with_columns('x', x, 'y', y).plot('x')
def f(x):
    return ((x-2)**2) + 3
minimize(f)
f(minimize(f))
x = np.arange(-1.5, 1.5, 0.05)
y2 = 2 * np.sin(x*np.pi) + x ** 3 + x ** 4 
Table().with_columns('x', x, 'y', y2).plot('x')
def complicated_function(x):
    return 2 * np.sin(x*np.pi) + x ** 3 + x ** 4 
minimize(complicated_function)

Minimizing RMSE

def demographics_rmse(any_slope, any_intercept):
    x = demographics.column('College%')
    y = demographics.column('Median Income')
    estimate = any_slope*x + any_intercept
    return (np.mean((y - estimate) ** 2)) ** 0.5
demographics_rmse(1500, 20000)
demographics_rmse(1350, 19000)
demographics_rmse(-1000, 75000)
minimize(demographics_rmse)
make_array(regression_slope, regression_intercept)

Nonlinear Regression

shotput = Table.read_table('shotput.csv')
shotput
shotput.scatter('Weight Lifted')
def shotput_linear_rmse(any_slope, any_intercept):
    x = shotput.column('Weight Lifted')
    y = shotput.column('Shot Put Distance')
    estimate = any_slope*x + any_intercept
    return np.mean((y - estimate) ** 2) ** 0.5
best_line = minimize(shotput_linear_rmse)
best_line
weights = shotput.column(0)
linear_fit = best_line.item(0)*weights + best_line.item(1)

shotput.with_column(
    'Best Line', linear_fit
).scatter(0)

Quadratic Function

f(x) = ax2+bx+cf(x) ~=~ ax^2 + bx + c

for constants aa, bb, and cc.

def shotput_quadratic_rmse(a, b, c):
    x = shotput.column('Weight Lifted')
    y = shotput.column('Shot Put Distance')
    estimate = a*(x**2) + b*x + c
    return np.mean((y - estimate) ** 2) ** 0.5
best_quad = minimize(shotput_quadratic_rmse)
best_quad
# x = weight lifted = 100 kg
# Then predicted shot put distance:

(-0.00104)*(100**2) + 0.2827*100 - 1.5318
quad_fit = best_quad.item(0)*(weights**2) + best_quad.item(1)*weights + best_quad.item(2)
shotput.with_column('Best Quadratic Curve', quad_fit).scatter(0)
shotput_linear_rmse(best_line.item(0), best_line.item(1))
shotput_quadratic_rmse(best_quad.item(0), best_quad.item(1), best_quad.item(2))