Skip to article frontmatter · Skip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
# warnings.simplefilter(action='ignore', category=np.VisibleDeprecationWarning)

Appending Arrays

first = np.arange(4)
second = np.arange(10, 17)
first
array([0, 1, 2, 3])
np.append(first, 6)
array([0, 1, 2, 3, 6])
first
array([0, 1, 2, 3])
# how do we change `first`?
first = np.append(first, 6)
np.append(first, second)
array([ 0, 1, 2, 3, 6, 10, 11, 12, 13, 14, 15, 16])
first
array([0, 1, 2, 3, 6])
second
array([10, 11, 12, 13, 14, 15, 16])

Comparison

3 > 1
True
type(3 > 1)
bool
True
True
true
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[26], line 1
----> 1 true

NameError: name 'true' is not defined

(We see that capitalization matters.)

3 = 3
  Cell In[27], line 1
    3 = 3
    ^
SyntaxError: cannot assign to literal here. Maybe you meant '==' instead of '='?
3 == 3.0
True
10 != 2
True
x = 14
y = 3
x > 15
False
12 < x
True
x < 20
True
12 < x < 20
True
10 < x-y < 13
True
x > 13 and y < 3.14159
True

(The comparison 12 < x < 20 is equivalent to 12 < x and x < 20.)

Comparisons with arrays

pets = make_array('cat', 'cat', 'dog', 'cat', 'dog', 'rabbit')
pets == 'cat'
array([ True, True, False, True, False, False], dtype=bool)
1 + 1 + 0 + 1 + 0 + 0
3
sum(make_array(True, True, False, True, False, False))
3
sum(pets == 'cat')
3
np.count_nonzero(pets == 'cat')
3
xs = np.arange(20, 31)
xs
array([20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30])
xs > 28
array([False, False, False, False, False, False, False, False, False, True, True], dtype=bool)
sum(xs > 28)
2

Conditional Statements

age = 20
if age >= 18:
    print('You can legally vote.')
else:
    if age >= 16:
        print("You can't vote just yet, but you can pre-register!")
    else:
        print("You can't vote just yet.")
You can legally vote.
if age >= 18:
    print('You can legally vote.')
if age >= 16:
    print("You can't vote just yet, but you can pre-register!")
else:
    print("You can't vote just yet.")
You can legally vote.
You can't vote just yet, but you can pre-register!
def vote(age):
    """Return a message describing the voting status for someone aged `age`.

    Thresholds are checked from highest to lowest, so the first one that
    `age` meets decides the message; below every threshold the caller is
    told they cannot vote yet.
    """
    thresholds = (
        (18, 'You can legally vote.'),
        (16, "You can't vote just yet, but you can pre-register!"),
    )
    for minimum_age, message in thresholds:
        if age >= minimum_age:
            return message
    return "You can't vote just yet."
vote(3)
"You can't vote just yet."
vote(17)
"You can't vote just yet, but you can pre-register!"
vote(25)
'You can legally vote.'
def rights(age):
    """Return an array of the voting-related rights available at `age`.

    Starts from the array produced by `make_array()` with no arguments and
    appends 'register to vote' when `age` is at least 16, then 'vote' when
    `age` is at least 18. Each threshold is checked independently, so an
    adult receives both entries.
    """
    granted = make_array()
    for minimum_age, right in ((16, 'register to vote'), (18, 'vote')):
        if age >= minimum_age:
            # np.append returns a new array, so rebind the name each time.
            granted = np.append(granted, right)
    return granted
rights(3)
array([], dtype=float64)
rights(17)
array(['register to vote'], dtype='<U32')
rights(25)
array(['register to vote', 'vote'], dtype='<U32')
trip = Table().read_table('trip.csv').sort('Zip Code')
trip.show(5)
Loading...
def trip_kind(start, end):
    """Label a trip 'round trip' when `start` equals `end`, otherwise 'one way'."""
    return 'round trip' if start == end else 'one way'

# trip.show(5)

kinds = trip.apply(trip_kind, 'Start Station', 'End Station')
with_kinds = trip.with_column('Trip Kind', kinds)
with_kinds.show(5)
Loading...
# recall pivot! what will we see?
with_kinds.where('Duration', are.below(600)).pivot('Trip Kind', 'Start Station')
Loading...

Simulation

Let’s play a game: we each roll a die.

If my number is bigger: you pay me a dollar.

If they’re the same: we do nothing.

If your number is bigger: I pay you a dollar.

Steps:

  1. Find a way to simulate two dice rolls.

  2. Compute how much money we win/lose based on the result.

  3. Do steps 1 and 2 10,000 times.

Conditional Statements

# Work in progress
def one_round(my_roll, your_roll):
    """Return 1 when `my_roll` beats `your_roll`.

    This first draft only handles the winning case; for any other pair of
    rolls the function falls off the end and implicitly returns None.
    """
    i_win = my_roll > your_roll
    if i_win:
        return 1
one_round(4, 3)
1
one_round(2, 6)
def one_round(my_roll, your_roll):
    """Return my winnings for one round of the dice game.

    1 if `my_roll` is larger, -1 if `your_roll` is larger, and 0 if the two
    rolls are equal.
    """
    # Each outcome returns immediately, so the checks are written as
    # independent guards rather than one if/elif chain.
    if my_roll > your_roll:
        return 1
    if your_roll > my_roll:
        return -1
    if your_roll == my_roll:
        return 0
one_round(1, 1)
0
one_round(6, 5)
1
one_round(-7, -1)
-1

Random Selection

mornings = make_array('wake up', 'sleep in')
np.random.choice(mornings)
'wake up'
np.random.choice(mornings)
'sleep in'
np.random.choice(mornings)
'wake up'

We can also pass an argument that specifies how many times to make a random choice:

np.random.choice(mornings, 7)
array(['wake up', 'wake up', 'wake up', 'sleep in', 'wake up', 'sleep in', 'wake up'], dtype='<U8')
sum(np.random.choice(mornings, 7) == 'wake up')
5
sum(np.random.choice(mornings, 7) == 'sleep in')
4

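^ Why don’t these two counts (always) sum to 7? Each call to np.random.choice draws a fresh random week, so the 'wake up' count and the 'sleep in' count above come from two different samples.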

morning_week = np.random.choice(mornings, 7)
morning_week
array(['sleep in', 'wake up', 'wake up', 'sleep in', 'wake up', 'wake up', 'wake up'], dtype='<U8')
sum(morning_week == 'wake up')
5
sum(morning_week == 'sleep in')
2

Simulating the roll of a die

die_faces = np.arange(1, 7)
np.random.choice(die_faces)
4
def simulate_one_round():
    """Simulate one round of the game and return my winnings.

    Draws one value from `die_faces` for me, then one for you, and scores
    the pair with `one_round`.
    """
    # Arguments are evaluated left to right: my roll is drawn first.
    return one_round(
        np.random.choice(die_faces),
        np.random.choice(die_faces),
    )
simulate_one_round()
-1

Repeated Betting

results = make_array()
results = np.append(results, simulate_one_round())
results
array([-1., 1.])
results = np.append(results, simulate_one_round())
results
array([-1., 1., 1., -1., -1., 1., -1., -1., 1., 1., 1., 1., 0., 1., 1., 1., 1., -1., 0., 0., -1., 0., 1., -1., 1., 1., -1., 1., 0., 1.])

For Statements

for pet in make_array('cat', 'dog', 'rabbit'):
    print('I love my ' + pet)
I love my cat
I love my dog
I love my rabbit
pet = make_array('cat', 'dog', 'rabbit').item(0)
print('I love my ' + pet)

pet = make_array('cat', 'dog', 'rabbit').item(1)
print('I love my ' + pet)

pet = make_array('cat', 'dog', 'rabbit').item(2)
print('I love my ' + pet)
I love my cat
I love my dog
I love my rabbit
np.arange(10)
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
for i in np.arange(10):
    print('GO BEARSSSS')
GO BEARSSSS
GO BEARSSSS
GO BEARSSSS
GO BEARSSSS
GO BEARSSSS
GO BEARSSSS
GO BEARSSSS
GO BEARSSSS
GO BEARSSSS
GO BEARSSSS

(see slides)

game_outcomes = make_array()

for i in np.arange(5):
    game_outcomes = np.append(game_outcomes, simulate_one_round())
    
game_outcomes
array([ 1., -1., 1., -1., 0.])
game_outcomes = make_array()

for i in np.arange(10000):
    game_outcomes = np.append(game_outcomes, simulate_one_round())
    
game_outcomes
array([-1., 1., 1., ..., -1., 1., 0.])
len(game_outcomes)
10000
results = Table().with_column('My winnings', game_outcomes)
results
Loading...
results.group('My winnings').barh('My winnings')
<Figure size 600x400 with 1 Axes>
sum(game_outcomes)

Another example: simulating heads in 100 coin tosses

coin = make_array('heads', 'tails')
sum(np.random.choice(coin, 100) == 'heads')
51
# Simulate one outcome

def num_heads():
    """Simulate 100 random draws from `coin` and return how many are 'heads'."""
    tosses = np.random.choice(coin, 100)
    # Comparing the array to a string gives booleans; summing counts the Trues.
    return sum(tosses == 'heads')
# Decide how many times you want to repeat the experiment

repetitions = 10000
# Simulate that many outcomes

outcomes = make_array()

for i in np.arange(repetitions):
    outcomes = np.append(outcomes, num_heads())
heads = Table().with_column('Heads', outcomes)
heads.hist(bins = np.arange(29.5, 70.6))
<Figure size 600x400 with 1 Axes>

datascience library

trip.show(3)
Loading...
trip.where('Duration', are.above(1000))
# .column('Duration')
Loading...
big_trip_durations = make_array()
for duration in trip.column('Duration'):
    if duration > 1000:
        big_trip_durations = np.append(big_trip_durations, duration)
np.mean(big_trip_durations)
5259.1915219611847
np.mean(
    trip.where('Duration', are.above(1000)).column('Duration')
)
5259.1915219611847

Optional: Advanced where

ages = make_array(16, 22, 18, 15, 19, 15, 16, 21)
age = Table().with_column('Age', ages)
age
Loading...
age.where('Age', are.above_or_equal_to(18))
Loading...
voter = ages >= 18
voter
array([False, True, True, False, True, False, False, True], dtype=bool)
age.where(voter)
Loading...
is_voter = are.above_or_equal_to(18)
type(is_voter)
datascience.predicates._combinable
is_voter(22)
True
is_voter(3)
False
age.apply(is_voter, 'Age')
array([False, True, True, False, True, False, False, True], dtype=bool)
ages >= 18
array([False, True, True, False, True, False, False, True], dtype=bool)
voter
array([False, True, True, False, True, False, False, True], dtype=bool)
def my_voter_function(x):
    """Return whether `x` is at least 18.

    The comparison is applied as-is, so passing an array yields an
    element-wise boolean array.
    """
    old_enough = x >= 18
    return old_enough
age.where('Age', are.above_or_equal_to(18))
Loading...
age.where(voter)
Loading...