import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')


def r_scatter(r):
    """Generate a scatter plot with a correlation approximately r.

    Opens a new 5x5 figure and plots 1000 points. ``x`` and ``z`` are
    separate draws from a normal distribution with mean 0 and SD 1, and
    ``y = r*x + sqrt(1 - r**2)*z``. Both axes are fixed to [-4, 4].

    Args:
        r: Target correlation. ``1 - r**2`` is passed to ``np.sqrt``, so
            ``r`` is expected to lie in [-1, 1].
    """
    plots.figure(figsize=(5, 5))
    x = np.random.normal(0, 1, 1000)
    z = np.random.normal(0, 1, 1000)
    # Blend x with the separate noise draw z; the weights r and
    # sqrt(1 - r**2) have squares that sum to 1.
    y = r*x + (np.sqrt(1-r**2))*z
    plots.scatter(x, y, color='darkblue', s=20)
    plots.xlim(-4, 4)
    plots.ylim(-4, 4)


# Prediction
# Note: Child heights are the **adult** heights of children in a family
families = Table.read_table('family_heights.csv')
# Per-family parent average: mean of the 'father' and 'mother' columns.
parent_avgs = (families.column('father') + families.column('mother'))/2
heights = Table().with_columns(
    'Parent Average', parent_avgs,
    'Child', families.column('child'),
)
heights

heights.scatter('Parent Average', 'Child')

# Mean child height among rows whose parent average is selected by
# are.between(67.5, 68.5).
nearby = heights.where('Parent Average', are.between(67.5, 68.5))
nearby_mean = np.average(nearby.column('Child'))
nearby_mean

heights.scatter('Parent Average', 'Child')
# Red vertical lines at 67.5 and 68.5 mark the band of parent averages;
# the red dot at x=68 is plotted at nearby_mean.
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2)
plots.scatter(68, nearby_mean, color='red', s=50);


def predict_child(h):
    """Predict the height of a child whose parents have a parent average height of h.

    The prediction is the average height of the children whose parent average
    height is in the range h plus or minus 0.5, looked up in the module-level
    ``heights`` table.

    Args:
        h: Parent average height to predict for.

    Returns:
        The average of the 'Child' column over the selected rows.
    """
    nearby = heights.where('Parent Average', are.between(h - 1/2, h + 1/2))
    return np.average(nearby.column('Child'))


# Apply the predictor to every row's own parent average.
heights_with_predictions = heights.with_columns(
    'Prediction', heights.apply(predict_child, 'Parent Average'))

heights_with_predictions.scatter('Parent Average')


# Association
hybrid = Table.read_table('hybrid.csv')

hybrid.group('year').barh('year')

hybrid.sort('msrp', descending=True)

hybrid.scatter('mpg', 'msrp')

hybrid.scatter('acceleration', 'msrp')

# Restrict to the rows whose 'class' is 'SUV'.
suv = hybrid.where('class', 'SUV')
suv.num_rows

suv.scatter('acceleration', 'msrp')

suv.scatter('mpg', 'msrp')


def standard_units(x):
    """Convert any array of numbers to standard units.

    Args:
        x: Array of numbers.

    Returns:
        ``(x - mean(x)) / std(x)``, computed with ``np.average`` and
        ``np.std``.
    """
    return (x - np.average(x)) / np.std(x)


# Same mpg-vs-msrp scatter, with both axes in standard units.
Table().with_columns(
    'mpg (standard units)', standard_units(suv.column('mpg')),
    'msrp (standard units)', standard_units(suv.column('msrp'))
).scatter(0, 1)
plots.xlim(-3, 3)
plots.ylim(-3, 3);

suv.scatter('acceleration', 'msrp')

# Same acceleration-vs-msrp scatter, with both axes in standard units.
Table().with_columns(
    'acceleration (standard units)', standard_units(suv.column('acceleration')),
    'msrp (standard units)', standard_units(suv.column('msrp'))
).scatter(0, 1)
plots.xlim(-3, 3)
plots.ylim(-3, 3);


# Correlation
r_scatter(-1)

# Small six-point example used to compute r step by step.
x = np.arange(1, 7, 1)
y = make_array(2, 3, 1, 5, 2, 7)
t = Table().with_columns(
    'x', x,
    'y', y
)
t

t.scatter('x', 'y', s=30, color='red')

# Columns 2 and 3 hold x and y converted to standard units.
t = t.with_columns(
    'x (standard units)', standard_units(x),
    'y (standard units)', standard_units(y)
)
t

t.scatter(2, 3, s=30, color='red')

t = t.with_columns(
    'product of standard units', t.column(2) * t.column(3))
t

# r is the average of the products of the standard units
r = np.average(t.column(2) * t.column(3))
r


def correlation(t, x, y):
    """Return the correlation between two columns of a table.

    Both columns are converted to standard units with ``standard_units``;
    the result is the average of their elementwise products.

    Args:
        t: A table.
        x: Label of the first column.
        y: Label of the second column.

    Returns:
        The average of the products of the two columns in standard units.
    """
    x_in_standard_units = standard_units(t.column(x))
    y_in_standard_units = standard_units(t.column(y))
    return np.average(x_in_standard_units * y_in_standard_units)


correlation(t, 'x', 'y')

suv.scatter('mpg', 'msrp')

correlation(suv, 'mpg', 'msrp')

suv.scatter('acceleration', 'msrp')

correlation(suv, 'acceleration', 'msrp')


# Switching Axes
# Compute the correlation and draw the scatter with the axes in each order.
correlation(t, 'x', 'y')

t.scatter('x', 'y', s=30, color='red')

t.scatter('y', 'x', s=30, color='red')

correlation(t, 'y', 'x')


# Nonlinearity
# y = x**2 on x values from -4 in steps of 0.5 (np.arange stop is 4.1).
new_x = np.arange(-4, 4.1, 0.5)
nonlinear = Table().with_columns(
    'x', new_x,
    'y', new_x**2
)
nonlinear.scatter('x', 'y', s=30, color='r')

correlation(nonlinear, 'x', 'y')


# Outliers
# Four points with y equal to x.
line = Table().with_columns(
    'x', make_array(1, 2, 3, 4),
    'y', make_array(1, 2, 3, 4)
)
line.scatter('x', 'y', s=30, color='r')

correlation(line, 'x', 'y')

# The same four points plus one extra point at (5, 0).
outlier = Table().with_columns(
    'x', make_array(1, 2, 3, 4, 5),
    'y', make_array(1, 2, 3, 4, 0)
)
outlier.scatter('x', 'y', s=30, color='r')

correlation(outlier, 'x', 'y')


# Ecological Correlations
sat2014 = Table.read_table('sat2014.csv').sort('State')
sat2014

sat2014.scatter('Critical Reading', 'Math')

correlation(sat2014, 'Critical Reading', 'Math')


def rate_code(x):
    """Bucket a numeric rate into one of four labels.

    Args:
        x: A number; applied below to the 'Participation Rate' column.

    Returns:
        'low' if x <= 25, 'low-moderate' if x <= 50, 'moderate_high' if
        x <= 75, otherwise 'high'.
    """
    # NOTE(review): 'moderate_high' uses an underscore while 'low-moderate'
    # uses a hyphen; left unchanged because the label is runtime data.
    if x <= 25:
        return 'low'
    elif x <= 50:
        return 'low-moderate'
    elif x <= 75:
        return 'moderate_high'
    else:
        return 'high'


rate_codes = sat2014.apply(rate_code, 'Participation Rate')

sat2014 = sat2014.with_columns('Rate Code', rate_codes)
sat2014

sat2014.scatter('Critical Reading', 'Math', group='Rate Code')

sat2014.where('Rate Code', 'low').show()

sat2014.where('Rate Code', 'high').show()