from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter("ignore")

# Review of the Steps in Classification & Functions
#
#   distance(pt1, pt2): the distance between the arrays pt1 and pt2
#   row_distance(row1, row2): the distance between the rows row1 and row2
#   distances(training, example): training with an added distance column giving
#       the distance between example and each row of training
#   closest(training, example, k): a table of the rows corresponding to the
#       k smallest distances
#   majority_class(topk): the majority class in the 'Class' column
#   classify(training, example, k): the predicted class of example based on a
#       k-nearest-neighbors classifier using the historical sample training
#   classify_all(training, test, k): the test table with a 'Prediction' column
#       that results from calling classify on each test example
#   get_accuracy(t, prediction_label='Prediction'): the accuracy, i.e. the
#       fraction of values in the Prediction column that match the Class column
#   evaluate_accuracy(training, test, k): classify all rows of the test set
#       and return the accuracy
from tqdm.notebook import tqdm # This generates animated progress bars
def distance(pt1, pt2):
    """Return the Euclidean distance between two points, given as arrays."""
    squared_gaps = (pt1 - pt2) ** 2
    return np.sqrt(squared_gaps.sum())
def row_distance(row1, row2):
    """Return the distance between two numerical rows of a table."""
    a, b = np.array(row1), np.array(row2)
    return np.sqrt(np.sum((a - b) ** 2))
def distances(training, example):
    """
    Compute the distance between example and every row in training.

    Returns a copy of training augmented with a 'Distance_to_ex' column
    holding the distance from each row's attributes to example.
    """
    # Distances use only the attribute columns; 'Class' is the label.
    attributes_only = training.drop('Class')
    # Renamed from `distances` to avoid shadowing this function's own name.
    dists = make_array()
    for row in attributes_only.rows:
        dists = np.append(dists, row_distance(row, example))
    return training.with_column('Distance_to_ex', dists)
def closest(training, example, k):
    """
    Return a table of the k closest neighbors to example.
    """
    with_dists = distances(training, example)
    ranked = with_dists.sort('Distance_to_ex')
    return ranked.take(np.arange(k))
def majority_class(topk):
    """
    Return the class with the highest count in topk's 'Class' column.
    """
    by_count = topk.group('Class').sort('count', descending=True)
    # First column of the grouped table holds the class values;
    # row 0 is the most frequent one.
    return by_count.column(0).item(0)
def classify(training, example, k):
    """
    Return the majority class among the
    k nearest neighbors of example.
    """
    neighbors = closest(training, example, k)
    return majority_class(neighbors)
def classify_all(training, test, k):
    """Classify each row of the test table and add a column of the results."""
    # Drop the label column so it never influences the distances.
    unlabeled = test.drop('Class')
    predictions = make_array()
    # tqdm wraps the index range to show an animated progress bar.
    for i in tqdm(np.arange(test.num_rows)):
        predicted = classify(training, unlabeled.row(i), k)
        predictions = np.append(predictions, predicted)
    return test.with_column("Prediction", predictions)
def get_accuracy(t, prediction_label='Prediction'):
    """Return the accuracy on a test table with Class and Prediction columns."""
    matches = t.column('Class') == t.column(prediction_label)
    return np.count_nonzero(matches) / t.num_rows
def evaluate_accuracy(training, test, k):
    """Return the proportion of correctly classified examples
    in the test set."""
    # The fused markdown heading on the return line was a syntax error;
    # it is preserved as the comment heading below.
    predicted = classify_all(training, test, k)
    return get_accuracy(predicted)

# Text Classification
# SMS Spam
# Load the SMS spam dataset and shuffle it for an unbiased train/test split.
from datasets import load_dataset

sms = load_dataset('ucirvine/sms_spam', split='train').shuffle(seed=42)
sms_texts = np.array(sms['sms'])
sms_labels = np.array(sms['label'])
sms_tbl = Table().with_columns('Text', sms_texts, 'Class', sms_labels)

# Inspect the class balance and a few examples of each class.
# (These were fused onto one line by the notebook export; separated here.)
sms_tbl.group('Class').show()
sms_tbl.where('Class', 1).sample(with_replacement=False).show(5)
sms_tbl.where('Class', 0).sample(with_replacement=False).show(5)

# Hand-crafted numeric features for each message.
texts = sms_tbl.column('Text')
sms_data = Table().with_columns(
    'Chars', np.char.str_len(texts),
    'Digits', sum(np.char.count(texts, str(d)) for d in range(10)),
    'Caps', sum(np.char.count(texts, chr(c)) for c in range(65, 91)),  # 'A'..'Z'
    'Exclamations', np.char.count(texts, '!'),
    'Class', sms_tbl.column('Class')
)
sms_data
sms_data.scatter('Digits', 'Caps', group='Class')

# Hold out the first test_size rows (after shuffling) as the test set.
shuffled = sms_data.sample(with_replacement=False)
test_size = 100
train_sms = shuffled.take(np.arange(test_size, shuffled.num_rows))
test_sms = shuffled.take(np.arange(test_size))
print('Training:', train_sms.num_rows, ' Test:', test_sms.num_rows)
evaluate_accuracy(train_sms, test_sms, 5)

# Rotten Tomatoes Movie Reviews
# Load short Rotten Tomatoes reviews (5-10 words) as a two-column table.
reviews_full = load_dataset('rotten_tomatoes', split='train')
reviews_short = reviews_full.filter(lambda x: 5 <= len(x['text'].split()) <= 10)
reviews = Table().with_columns('Text', reviews_short['text'],
                               'Class', reviews_short['label'])
reviews = reviews.sample(with_replacement=False)  # Permute the rows

# Inspect class balance and a few sample reviews.
reviews.group('Class')
reviews.sample(5)

words = [  # The most common adjectives in the data
    'good', 'bad', 'funny', 'little', 'much', 'new', 'best',
    'many', 'own', 'other', 'big', 'great', 'most', 'few',
    'real', 'first', 'full', 'american', 'romantic', 'same', 'old',
    'better', 'young', 'original', 'interesting', 'human',
    'hard', 'cinematic', 'enough', 'emotional', 'last', 'least', 'long',
    'true', 'predictable', 'visual', 'whole', 'high', 'special',
    'entertaining', 'sweet', 'enjoyable', 'narrative', 'familiar'
]

# Count how often each word appears in positive vs. negative reviews.
counts = Table(['Word', 'Positive', 'Negative'])
for word in words:
    has_word = reviews.where('Text', are.containing(word))
    counts = counts.with_row([word, has_word.where('Class', 1).num_rows,
                              has_word.where('Class', 0).num_rows])
counts
reviews.where('Text', are.containing('funny')).where('Class', 0).sample(5, with_replacement=False)

# Build a word-count feature table: one column per word.
texts = reviews.column('Text')
review_words = Table().with_column('Class', reviews.column('Class'))
for word in words:
    review_words = review_words.with_column(word, np.char.count(np.char.lower(texts), word))
review_words.sample(5)

# Train/test split and KNN evaluation on the word-count features.
train_reviews = review_words.take(np.arange(test_size, reviews.num_rows))
test_reviews = review_words.take(np.arange(test_size))
print('Word-count KNN:')
evaluate_accuracy(train_reviews, test_reviews, 5)
classify_all(train_reviews, test_reviews, 5).pivot('Prediction', 'Class')

# Sentence Embeddings
# Data 8 students are not responsible for the details of the code below,
# such as how to call embedder.encode or what it does internally.
from sentence_transformers import SentenceTransformer

# Encode every review as a dense sentence-embedding vector.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
review_emb = embedder.encode(list(reviews.column('Text')), show_progress_bar=True)
print('Embedding shape:', review_emb.shape)

# Keep only the first n_features embedding dimensions as KNN features.
n_features = 64  # Increasing this will help, but above 128 datahub will crash
cols = ['Class', reviews.column('Class')]
for i in range(n_features):
    cols += [f'Embed{i}', review_emb[:, i]]
review_emb_table = Table().with_columns(*cols)
review_emb_table.row(0)

# Same train/test split and KNN evaluation as before, now on embeddings.
train = review_emb_table.take(np.arange(test_size, reviews.num_rows))
test = review_emb_table.take(np.arange(test_size))
evaluate_accuracy(train, test, 5)
classify_all(train, test, 5).pivot('Prediction', 'Class')

# BONUS MATERIAL
# WARNING: Unfortunately, the bonus code below may not run in the standard
# data8 environment (original sentence was truncated in the source).
# Fine-tuned embeddings
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Fine-tune MiniLM as a 2-class sequence classifier on the training reviews.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Everything past the first test_size rows is the fine-tuning data.
finetune_texts = list(reviews.column('Text')[test_size:])
finetune_labels = torch.tensor(list(reviews.column('Class')[test_size:]))

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
batch_size = 16

model.train()
for epoch in tqdm(range(7)):
    for i in range(0, len(finetune_texts), batch_size):
        batch_texts = finetune_texts[i:i+batch_size]
        batch_labels = finetune_labels[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=64)
        loss = model(**inputs, labels=batch_labels).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Switch to inference mode for the evaluation cells below.
model.eval()
# Re-embed all reviews with the fine-tuned encoder (no gradients needed).
eval_texts = list(reviews.column('Text'))
with torch.no_grad():
    inputs = tokenizer(eval_texts, return_tensors='pt', padding=True, truncation=True, max_length=64)
    review_bert_emb = model.bert(**inputs).pooler_output.numpy()
print('Fine-tuned embedding shape:', review_bert_emb.shape)

# Build a feature table from the fine-tuned embeddings and evaluate KNN.
cols = ['Class', reviews.column('Class')]
for i in range(review_bert_emb.shape[1]):
    cols += [f'Embed{i}', review_bert_emb[:, i]]
review_bert_table = Table().with_columns(*cols)
train = review_bert_table.take(np.arange(test_size, reviews.num_rows))
test = review_bert_table.take(np.arange(test_size))
evaluate_accuracy(train, test, 5)

# Compare with the fine-tuned model's own classification head on the test set.
test_texts = list(reviews.column('Text')[:test_size])
test_labels = reviews.column('Class')[:test_size]
with torch.no_grad():
    inputs = tokenizer(test_texts, return_tensors='pt', padding=True, truncation=True, max_length=64)
    predictions = model(**inputs).logits.argmax(dim=1).numpy()
print('BERT classifier accuracy:', np.mean(predictions == test_labels))