from datascience import *
import numpy as np
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
plots.rcParams["patch.force_edgecolor"] = True

# Use counts and bar charts for categorical distributions

# Keep only the rows of the salary table whose 'season' is 2020.
nba_2020 = Table.read_table('nba_salaries.csv').where('season', 2020)
nba_2020.show(6)

# Bar chart of the number of rows (players) per position.
nba_2020.group('position').barh('position', 'count')

# Same chart, restricted to rows whose salary is above 2e7.
nba_2020.where('salary', are.above(2e7)).group('position').barh('position', 'count')

# Exercise: Show the percentage of players in each position instead of the count.
# One row per position, with a 'count' column holding the number of players.
counts = nba_2020.group('position')

# Convert each count to a percentage of the total number of players.
percentages = counts.with_column(
    'percentage',
    100 * counts.column('count') / sum(counts.column('count')),
)
percentages.barh('position', 'percentage')

# Use binning for numerical distributions
# Data from https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/
top_movies = Table.read_table('top_movies_2026.csv')
top_movies.show(4)

# Display the columns at indices 1, 2 and 3 as currency.
top_movies.set_format([1, 2, 3], CurrencyFormatter).show(4)

# Scatter plot of column 2 (x-axis) against column 3 (y-axis).
top_movies.scatter(2, 3)

# Rows with the largest values in column 3 first.
top_movies.sort(3, descending=True).show(5)

# Task: Visualize the distribution of movie ages
# Age of each movie in years, measured from 2026.
ages = 2026 - top_movies.column('Year')
ages  # bare expression: shown as the cell output when run in a notebook

top_movies = top_movies.with_column('Age', ages)
top_movies

top_movies.select('Title', 'Age').show(6)

min(ages), max(ages)

# If you want to make equally sized bins, np.arange() is a great tool to help you.
# The stop value max(ages) + 10 extends the bin edges past the largest age.
np.arange(0, max(ages) + 10, 10)

top_movies.hist('Age', bins=np.arange(0, max(ages) + 10, 10), unit='Year')

# Otherwise, you can pick your own bins. These are just bins that we picked out.
# Hand-picked, unequal-width bin edges (in years).
my_bins = make_array(0, 5, 10, 15, 25, 60)

# Count how many movies fall into each of the bins defined by my_bins.
binned_data = top_movies.bin('Age', bins=my_bins)
binned_data

# Note: The last “bin” does not include any observations!!
# Introducing the histogram and the area principle
top_movies.hist('Age', bins=my_bins, unit='Year')

# Discussion Question: Compare the bins [15, 25) and [25, 60).
# NOTE(review): the two bin labels were lost when this notebook was exported;
# [15, 25) and [25, 60) are assumed from the edges in my_bins -- confirm
# against the original notebook.
#   - Which one has more movies?
#   - Which one is more crowded?
# Challenge tasks
#
# Task: Find the height of the [15, 25) bin in the histogram above.
# Add a column containing what percent of movies are in each bin
# (the area of each bin).
binned_data = binned_data.with_column(
    'Percent',
    100 * binned_data.column('Age count') / top_movies.num_rows,
)
binned_data.show()

# Area principle: area (percent) = height * width, so height = percent / width.
percent = binned_data.where('bin', 15).column('Percent').item(0)
width = 25 - 15  # the bin runs from 15 to 25 years
height = percent / width
height

# Task: Find the heights of the (rest of the) bins.
# Remember that the last row in the table does not represent a bin!
# Keep every row except the last one.
height_table = binned_data.take(np.arange(binned_data.num_rows - 1))
height_table

# Remember np.diff?
# Differences between consecutive bin edges give the width of each bin;
# using all edges (including the final one) yields one width per kept row.
bin_widths = np.diff(binned_data.column('bin'))
bin_widths

height_table = height_table.with_column('Width', bin_widths)
height_table

# Height = percent per unit of width.
height_table = height_table.with_column(
    'Height',
    height_table.column('Percent') / height_table.column('Width'),
)
height_table

# To check our work one last time, let’s see if the numbers in the last column
# match the heights of the histogram:
top_movies.hist('Age', bins=my_bins, unit='Year')