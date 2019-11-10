Subscribe to Hacker Noon's best tech stories, delivered at noon
Allowing data scientists and teams to track, compare, explain, and reproduce ML experiments.
# Comet
from comet_ml import Experiment
# Standard packages
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# nltk
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
# sklearn for preprocessing and machine learning models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
# Keras for neural networks
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
# Load the airline tweets CSV into a DataFrame.
raw_df = pd.read_csv('twitter-airline-sentiment/Tweets.csv')

# `shape` is an attribute, not a method; calling it raises TypeError.
raw_df.shape
# >>> (14640, 15)

raw_df.columns
# >>> 'tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
#     'negativereason', 'negativereason_confidence', 'airline',
#     'airline_sentiment_gold', 'name', 'negativereason_gold',
#     'retweet_count', 'text', 'tweet_coord', 'tweet_created',
#     'tweet_location', 'user_timezone'
# Create a Comet experiment to start tracking our work
experiment = Experiment(
    api_key='<HIDDEN>',
    project_name='nlp-airline',
    workspace='demo')
experiment.add_tag('plotting')

airlines = [
    'US Airways',
    'United',
    'American',
    'Southwest',
    'Delta',
    'Virgin America',
]

# Log, for every airline, how many tweets fall into each sentiment class.
for i in airlines:
    new_df = raw_df[raw_df['airline'] == i]
    count = new_df['airline_sentiment'].value_counts()
    # value_counts() is ordered by frequency, not by label, so positional
    # access (count[0], count[1], ...) can attach a count to the wrong name.
    # Look each class up by its label instead; a missing class logs 0.
    # NOTE(review): assumes the column holds exactly the labels
    # 'negative' / 'neutral' / 'positive' — confirm against the CSV.
    for sentiment in ('negative', 'neutral', 'positive'):
        experiment.log_metric(
            '{} {}'.format(i, sentiment),
            int(count.get(sentiment, 0)))
experiment.end()
For this analysis we only need the airline_sentiment, tweet_id, and text features.
# Keep only the columns used for modelling. Take an explicit copy: `df` gets
# a new column assigned further down, and assigning into a slice of `raw_df`
# would trigger pandas' SettingWithCopyWarning.
df = raw_df[['tweet_id', 'text', 'airline_sentiment']].copy()

# A few sample tweets (row label, then column, in a single .loc lookup).
df.loc[1, 'text']
# > "@VirginAmerica plus you've added commercials to the experience... tacky."
df.loc[750, 'text']
# > "@united you are offering us 8 rooms for 32 people #FAIL"
df.loc[5800, 'text']
# > "@SouthwestAir Your #Android Wi-Fi experience is terrible! $8 is a ripoff! I can't get to @NASCAR or MRN for @DISupdates #BudweiserDuels"
from nltk.tokenize import word_tokenize
def tokenize(sentence):
    """Split *sentence* into tokens using NLTK's ``word_tokenize``."""
    return word_tokenize(sentence)
from nltk.corpus import stopwords
class PreProcessor:
    """Token-level text cleaning helpers: stop-word removal and stemming.

    Both helpers operate on an already tokenized sentence (a list of
    strings) and return a new list.

    NOTE(review): callers in this file also invoke ``full_preprocess()``,
    which is not defined in the visible source — confirm where it lives.
    """

    def __init__(self, df, column_name):
        """Set up the English stop-word set and the Snowball stemmer.

        Args:
            df: DataFrame holding the raw text. Accepted for interface
                compatibility; not used by the methods defined here.
            column_name: Name of the text column in ``df``. Likewise unused
                by the methods defined here.
        """
        # `stopwords` and `SnowballStemmer` come from the nltk imports at the
        # top of the file.
        self.stopwords = set(stopwords.words('english'))
        self.stemmer = SnowballStemmer('english')

    def remove_stopwords(self, sentence):
        """Return the tokens of *sentence* that are worth keeping.

        A token is dropped when it is a stop word, is a single character,
        starts with ``//`` or is exactly ``https`` (the last two are the
        pieces a tokenized URL breaks into).

        Args:
            sentence: Iterable of string tokens.

        Returns:
            A new list with the surviving tokens, in their original order.
        """
        return [
            w for w in sentence
            if w not in self.stopwords
            and len(w) > 1
            and w[:2] != '//'
            and w != 'https'
        ]

    def stem(self, sentence):
        """Return a list with every token of *sentence* stemmed."""
        return [self.stemmer.stem(word) for word in sentence]
# Fixed random seed so the shuffle, the split and the models are reproducible.
# NOTE(review): `seed` is used throughout the file but never defined in the
# visible source; 42 is a placeholder — confirm the intended value.
seed = 42

preprocessor = PreProcessor(df, 'text')
# NOTE(review): full_preprocess() is not defined in the visible PreProcessor
# code — confirm it exists and returns one cleaned string per row.
# The column name must match the 'cleaned_text' lookups below.
df['cleaned_text'] = preprocessor.full_preprocess()
df = shuffle(df, random_state=seed)

# Keep 1000 samples of the data as test set
test_set = df[:1000]

# Get training and validation data
X_train, X_val, y_train, y_val = train_test_split(
    df['cleaned_text'][1000:],
    df['airline_sentiment'][1000:],
    test_size=0.2,
    random_state=seed)

# Get sentiment labels for test set
y_test = test_set['airline_sentiment']

# Fit the TF-IDF vocabulary on the training split only, then reuse it for
# the validation and test splits.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)
X_test = vectorizer.transform(test_set['cleaned_text'])
# Start a fresh Comet experiment for the gradient boosting baseline
experiment = Experiment(
    api_key='your-personal-key',
    project_name='nlp-airline',
    workspace='demo',
)

# sklearn's Gradient Boosting Classifier (GBM)
gbm = GradientBoostingClassifier(
    random_state=seed,
    max_depth=6,
    n_estimators=200,
)
gbm.fit(X_train, y_train)

# Predict on both splits, then score each one (accuracy as a fraction,
# rounded to four decimals)
train_pred = gbm.predict(X_train)
val_pred = gbm.predict(X_val)
train_accuracy = round(accuracy_score(y_train, train_pred), 4)
val_accuracy = round(accuracy_score(y_val, val_pred), 4)

# log to comet
experiment.log_metric('val_acc', val_accuracy)
experiment.log_metric('Accuracy', train_accuracy)
# `xgb` is used below but was never imported.
import xgboost as xgb

# NOTE(review): 'estimators' does not look like a native XGBoost training
# parameter; the number of boosting rounds is the 400 passed to xgb.train()
# below. Left in place — confirm and drop if unused.
xgb_params = {
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'eta': 0.1,
    'max_depth': 6,
    'num_class': 3,
    'lambda': 0.8,
    'estimators': 200,
    'seed': seed,
}

# Encode the string sentiment labels as integer class codes
target_train = y_train.astype('category').cat.codes
target_val = y_val.astype('category').cat.codes

# Transform data into a matrix so that we can use XGBoost
d_train = xgb.DMatrix(X_train, label=target_train)
d_val = xgb.DMatrix(X_val, label=target_val)

# Fit XGBoost; stop early when the last watchlist entry (validation) has not
# improved for 50 rounds
watchlist = [(d_train, 'train'), (d_val, 'validation')]
bst = xgb.train(
    xgb_params,
    d_train,
    num_boost_round=400,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=0)

# Check results for XGBoost
train_pred = bst.predict(d_train)
val_pred = bst.predict(d_val)

# Log accuracy as a fraction, the same scale the gradient boosting run uses
# for these metric names, so the runs are directly comparable in Comet.
experiment.log_metric('val_acc', round(accuracy_score(target_val, val_pred), 4))
experiment.log_metric('Accuracy', round(accuracy_score(target_train, train_pred), 4))
# Generator so we can easily feed batches of data to the neural network
def batch_generator(X, y, batch_size, shuffle):
    """Yield ``(X_batch, y_batch)`` mini-batches forever, cycling over the data.

    One pass over the data consists of ``ceil(n_samples / batch_size)``
    batches; the last batch of a pass may be smaller than ``batch_size``.
    After each pass the generator starts over from the first batch.

    Args:
        X: 2-D feature matrix supporting ``X[row_indices, :]``. Sparse
            matrices (anything whose slices expose ``toarray()``) are
            densified per batch; dense arrays are passed through.
        y: Targets indexable by an integer index array, aligned with the
            rows of ``X``.
        batch_size: Number of rows per batch; must be positive.
        shuffle: If true, the row order is shuffled (with ``np.random``)
            before the first pass and again after every pass.

    Yields:
        Tuples ``(X_batch, y_batch)``.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``X`` has no rows.
    """
    n_samples = X.shape[0]
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if n_samples == 0:
        raise ValueError("X must contain at least one row")

    # Ceiling division. Plain `/` gives a float that the integer counter
    # never equals unless n_samples is an exact multiple of batch_size, so
    # the counter would never reset and every later batch would be empty.
    number_of_batches = -(-n_samples // batch_size)

    counter = 0
    sample_index = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(sample_index)

    while True:
        start = batch_size * counter
        batch_index = sample_index[start:start + batch_size]
        X_batch = X[batch_index, :]
        if hasattr(X_batch, 'toarray'):
            # Sparse slice: the network needs a dense array.
            X_batch = X_batch.toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if counter >= number_of_batches:
            # End of a pass: reshuffle if requested and start over.
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0
# Initialize sklearn's one-hot encoder class
onehot_encoder = OneHotEncoder(sparse=False)

# The encoder expects a 2-D input, so turn the labels into a single column.
integer_encoded_train = np.array(y_train).reshape(-1, 1)
onehot_encoded_train = onehot_encoder.fit_transform(integer_encoded_train)

# Only transform the validation labels: re-fitting here would relearn the
# categories from the validation split, which can change the column layout
# (or width) relative to the training targets.
integer_encoded_val = np.array(y_val).reshape(-1, 1)
onehot_encoded_val = onehot_encoder.transform(integer_encoded_val)
# `keras.initializers`, `keras.activations` and `keras.optimizers` are
# referenced below, so the top-level package itself must be imported.
import keras

experiment.add_tag('NN')

# Neural network architecture
initializer = keras.initializers.he_normal(seed=seed)
activation = keras.activations.elu
optimizer = keras.optimizers.Adam(lr=0.0002, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
# NOTE(review): the monitored key depends on the Keras version ('val_acc' vs
# 'val_accuracy') — confirm against the installed version.
es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=4)

# Build model architecture
model = Sequential()
model.add(Dense(20, activation=activation, kernel_initializer=initializer, input_dim=X_train.shape[1]))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax', kernel_initializer=initializer))
# Three mutually exclusive classes with a softmax output and one-hot targets
# call for categorical cross-entropy, not the binary variant.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Hyperparameters
epochs = 15
batch_size = 32

# Whole number of generator steps per epoch (a trailing partial batch counts).
steps_per_epoch = int(np.ceil(X_train.shape[0] / batch_size))

# Fit the model using the batch_generator
hist = model.fit_generator(
    generator=batch_generator(X_train, onehot_encoded_train, batch_size=batch_size, shuffle=True),
    epochs=epochs,
    validation_data=(X_val, onehot_encoded_val),
    steps_per_epoch=steps_per_epoch,
    callbacks=[es])
from comet_ml import Optimizer
# Both modules are referenced below but were not imported.
import logging

import keras

# Search space and objective for Comet's hyperparameter optimizer.
config = {
    "algorithm": "bayes",
    "parameters": {
        "batch_size": {"type": "integer", "min": 16, "max": 128},
        "dropout": {"type": "float", "min": 0.1, "max": 0.5},
        "lr": {"type": "float", "min": 0.0001, "max": 0.001},
    },
    "spec": {
        "metric": "loss",
        "objective": "minimize",
    },
}
opt = Optimizer(config, api_key="<HIDDEN>", project_name="nlp-airline", workspace="demo")

# Each experiment handed out by the optimizer carries one parameter set.
for experiment in opt.get_experiments():
    experiment.add_tag('LR-Optimizer')

    # Neural network architecture
    initializer = keras.initializers.he_normal(seed=seed)
    activation = keras.activations.elu
    optimizer = keras.optimizers.Adam(
        lr=experiment.get_parameter("lr"),
        beta_1=0.99,
        beta_2=0.999,
        epsilon=1e-8)
    es = EarlyStopping(monitor='val_acc',
                       mode='max',
                       verbose=1,
                       patience=4)
    batch_size = experiment.get_parameter("batch_size")

    # Build model architecture: the same network as the baseline, with the
    # dropout rate taken from the optimizer.
    # NOTE(review): the original snippet elided the model definition
    # ("Build model like above"); this reconstruction mirrors the baseline
    # network earlier in the file — confirm it matches the intended model.
    model = Sequential()
    model.add(Dense(20, activation=activation, kernel_initializer=initializer, input_dim=X_train.shape[1]))
    model.add(Dropout(experiment.get_parameter("dropout")))
    model.add(Dense(3, activation='softmax', kernel_initializer=initializer))
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    model.fit_generator(
        generator=batch_generator(X_train, onehot_encoded_train, batch_size=batch_size, shuffle=True),
        epochs=epochs,
        validation_data=(X_val, onehot_encoded_val),
        steps_per_epoch=int(np.ceil(X_train.shape[0] / batch_size)),
        callbacks=[es])

    # Features and labels must come from the same split; the original paired
    # the test features with the validation labels. Score on validation.
    score = model.evaluate(X_val, onehot_encoded_val, verbose=0)
    logging.info("Score %s", score)
,
epoch
, and
batch_size
. The parallel coordinates chart shown below, another native Comet feature, provides a useful visualization of the underlying hyperparameter space our optimizer has traversed:
dropout
values.
val_acc