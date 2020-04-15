Enhance data science skills and jump on a career with Just into Data Tutorials + Applications.
Exploratory data analysis (EDA) is an approach to analyzing data sets to summarize their main characteristics, often with visual methods. A statistical model can be used or not, but primarily EDA is for seeing what the data can tell us beyond the formal modeling or hypothesis testing task.
# import packages
import pandas as pd
import numpy as np
import json
import datetime
import math
from datetime import timedelta, datetime
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8)
pd.options.mode.chained_assignment = None
import seaborn as sns
# read the data
# NOTE(review): unpickling can execute arbitrary code, so only load
# 'sydney.pkl' if it comes from a trusted source.
df = pd.read_pickle('sydney.pkl')
# keep only the numeric columns for the histogram examples
df_numeric = df.select_dtypes(include='number')
# bare expression: displays the frame when this is the last line of a notebook cell
df_numeric
A histogram is an approximate representation of the distribution of numerical data. To construct a histogram, the first step is to “bin” (or “bucket”) the range of values—that is, divide the entire range of values into a series of intervals—and then count how many values fall into each interval.
# Histogram of the 'length' column using 50 bins, with a KDE curve and a rug plot.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases
# (histplot/displot are the replacements) - confirm against the installed version.
sns.distplot(df_numeric['length'], bins=50, kde=True, rug=True)
# this plots multiple seaborn histograms on different subplots.
#
def plot_multiple_histograms(df, cols):
    """Draw one seaborn histogram (with KDE curve) per column on a subplot grid.

    The grid is kept as close to square as possible: ``ceil(sqrt(n))`` columns
    and as many rows as are needed to hold ``n = len(cols)`` plots. Plots are
    placed row by row, left to right.

    Parameters
    ----------
    df : object indexable by column name (e.g. a pandas DataFrame)
        Source of the data; ``df[col]`` is handed to ``sns.distplot``.
    cols : sequence of str
        Names of the columns to plot. An empty sequence draws nothing.

    Returns
    -------
    None
    """
    num_plots = len(cols)
    if num_plots == 0:
        # Nothing to draw; this also avoids dividing by zero columns below.
        return

    num_cols = math.ceil(np.sqrt(num_plots))
    num_rows = math.ceil(num_plots / num_cols)

    # squeeze=False always returns a 2-D array of axes, so the 1x1 and 1xN
    # layouts need no special-casing; ravel() walks it in row-major order.
    _, axs = plt.subplots(num_rows, num_cols, squeeze=False)
    flat_axes = axs.ravel()

    for ax, col in zip(flat_axes, cols):
        sns.distplot(df[col], kde=True, ax=ax)

    # Hide grid cells left over when the plots do not fill the last row.
    for ax in flat_axes[num_plots:]:
        ax.set_visible(False)
# Four columns -> a 2x2 grid of histograms.
plot_multiple_histograms(df, ['length', 'views', 'calories', 'days_since_posted'])
A bar chart or bar plot is a chart or graph that presents categorical data with rectangular bars with heights or lengths proportional to the values that they represent. A bar graph shows comparisons among discrete categories.
#select non-numeric variables
df_non_numeric = df.select_dtypes(exclude='number')
# wide figure (25x7) for the bar chart below
plt.figure(figsize=(25,7))
# one bar per distinct 'area' value, bar height = number of rows
sns.countplot(x="area",
data=df_non_numeric)
# this plots multiple seaborn countplots on different subplots.
#
def plot_multiple_countplots(df, cols):
    """Draw one seaborn count plot per column on a subplot grid.

    The grid is kept as close to square as possible: ``ceil(sqrt(n))`` columns
    and as many rows as are needed to hold ``n = len(cols)`` plots. Plots are
    placed row by row, left to right.

    Parameters
    ----------
    df : object indexable by column name (e.g. a pandas DataFrame)
        Source of the data; ``df[col]`` is handed to ``sns.countplot`` as ``x``.
    cols : sequence of str
        Names of the columns to plot. An empty sequence draws nothing.

    Returns
    -------
    None
    """
    num_plots = len(cols)
    if num_plots == 0:
        # Nothing to draw; this also avoids dividing by zero columns below.
        return

    num_cols = math.ceil(np.sqrt(num_plots))
    num_rows = math.ceil(num_plots / num_cols)

    # squeeze=False always returns a 2-D array of axes, so the 1x1 and 1xN
    # layouts need no special-casing; ravel() walks it in row-major order.
    _, axs = plt.subplots(num_rows, num_cols, squeeze=False)
    flat_axes = axs.ravel()

    for ax, col in zip(flat_axes, cols):
        sns.countplot(x=df[col], ax=ax)

    # Hide grid cells left over when the plots do not fill the last row.
    for ax in flat_axes[num_plots:]:
        ax.set_visible(False)
# Four columns -> a 2x2 grid of count plots.
plot_multiple_countplots(df_non_numeric, ['is_butt_area', 'is_upper_area', 'is_cardio_workout', 'is_strength_workout'])
A scatter plot uses Cartesian coordinates to display values for typically two variables for a set of data. If the points are coded (color/shape/size), one additional variable can be displayed. The data are displayed as a collection of points, each having the value of one variable determining the position on the horizontal axis and the value of the other variable determining the position on the vertical axis.
# Scatter plot of 'length' (x) against 'views' (y).
sns.relplot(x='length', y='views', data=df, aspect=2.0)
# Keep the five most frequent 'area' values and lump the rest into 'Other'
# (despite its name, top6 holds five values; with 'Other' that gives six groups).
top6 = list(df['area'].value_counts().index[:5])
df['area2'] = df['area']
msk = df['area2'].isin(top6)
df.loc[~msk, 'area2'] = 'Other'
# Same treatment for 'workout_type': three most frequent values plus 'Other'.
top4 = list(df['workout_type'].value_counts().index[:3])
df['workout_type2'] = df['workout_type']
msk = df['workout_type2'].isin(top4)
df.loc[~msk, 'workout_type2'] = 'Other'
order = df['area2'].value_counts().index # order the columns from highest count to lowest.
# One count-plot panel per 'area2' group, with bars for each 'workout_type2'.
sns.catplot(x="workout_type2",
col='area2',
col_order=order,
kind="count", data=df,
aspect=0.5)
A box plot (box-and-whisker plot) is a standardized way of displaying the dataset based on a five-number summary: the minimum, the maximum, the sample median, and the first and third quartiles.
# Map the day-of-week numbers 0-6 to day names (0 = Monday ... 6 = Sunday).
to_replace = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}
df['day_of_week_num'] = df['date'].dt.dayofweek
df['day_of_week'] = df['day_of_week_num'].replace(to_replace=to_replace)
# Fixed Monday-to-Sunday order for the x axis.
order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# Box plot of 'views' for each day of the week, using all rows.
sns.boxplot(x="day_of_week", y="views", data=df, order=order)
# Same plot restricted to rows with fewer than 400000 views
# (presumably so that a few very popular videos do not dominate the scale - TODO confirm).
msk = df['views'] < 400000
sns.boxplot(x="day_of_week", y="views", data=df[msk], order=order)
A swarm plot is a categorical scatterplot where the points are adjusted (only along the categorical axis) so that they don’t overlap. This gives a better representation of the distribution of values.
# Swarm plot of 'views' per day of the week, using the rows selected by msk.
sns.swarmplot(x="day_of_week", y="views", data=df[msk], order=order)
# Frequency of each workout type (displayed when this is the last line of a notebook cell).
df['workout_type'].value_counts()
# Collapse 'workout_type' to its three most frequent values plus 'Other'
# (top4 holds three values; with 'Other' that gives four groups).
# NOTE(review): this repeats the workout_type2 computation done earlier in the file.
top4 = list(df['workout_type'].value_counts().index[:3])
df['workout_type2'] = df['workout_type']
msk = df['workout_type2'].isin(top4)
df.loc[~msk, 'workout_type2'] = 'Other'
# Re-create the views filter, because msk was just reused for the workout-type mask.
msk = df['views'] < 400000
# One box-plot panel per day of the week, with a box for each 'workout_type2' group.
sns.catplot(x="workout_type2", y="views",
col="day_of_week",
kind="box", data=df[msk], col_order=order,
aspect=0.5)
A heat map is a data visualization technique that shows the magnitude of a phenomenon as color in two dimensions. The variation in color may be by hue or intensity, giving obvious visual cues to the reader about how the phenomenon is clustered or varies over space.
# Count the non-null 'views' values for every (area, workout_type) pair;
# 'views' only serves as the column being counted.
df_area_workout = df.groupby(['area', 'workout_type'])['views'].count().reset_index()
# Reshape into an area x workout_type table; pairs that never occur become 0.
df_area_workout_pivot = df_area_workout.pivot(index='area', columns='workout_type', values='views').fillna(0)
# Heat map of the counts, annotated with the numbers formatted without decimals.
sns.heatmap(df_area_workout_pivot, annot=True, fmt='.0f', cmap="YlGnBu")
# group of critical features selected
cols = ['length', 'views', 'calories', 'days_since_posted', 'area', 'workout_type', 'day_of_week']
# Work on an explicit copy: the column assignments below must modify df_test
# only, and must not rely on the chained-assignment warning being silenced.
df_test = df[cols].copy()
df_test.head()

# Split the selected columns by dtype; everything that is not numeric is
# treated as categorical.
numeric_columns = set(df_test.select_dtypes(include=['number']).columns)
non_numeric_columns = set(df_test.columns) - numeric_columns
print(numeric_columns)
print(non_numeric_columns)

# For each categorical column, merge values that occur fewer than 5 times
# (and missing values) into a single 'other' category.
for c in non_numeric_columns:
    cnt = df_test[c].value_counts()
    small_cnts = list(cnt[cnt < 5].index)
    s_replace = {sm: 'other' for sm in small_cnts}
    df_test[c] = df_test[c].replace(s_replace)
    df_test[c] = df_test[c].fillna('other')
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
# we are going to look at feature importances so we like putting random features to act as a benchmark.
df_test['rand0'] = np.random.rand(df_test.shape[0])
df_test['rand1'] = np.random.rand(df_test.shape[0])
df_test['rand2'] = np.random.rand(df_test.shape[0])
# testing for relationships.
# for numeric targets.
# The loss is left at its default (least squares). Its spelling differs between
# scikit-learn releases ('ls' in older ones, 'squared_error' in newer ones), so
# naming it explicitly ties the code to one range of versions.
reg = GradientBoostingRegressor(n_estimators=100, max_depth=5,
                                learning_rate=0.1,
                                random_state=1)
# for categorical targets.
# Same reasoning: the default loss is 'deviance' in older releases and
# 'log_loss' in newer ones.
clf = GradientBoostingClassifier(n_estimators=100, max_depth=5,
                                 learning_rate=0.1,
                                 random_state=1)
df_test['calories'] = df_test['calories'].fillna(0) # only calories should have missing values.
# try to predict one feature using the rest of others to test collinearity, so it's easier to interpret the results
for c in cols:
    # c is the thing to predict.
    if c in ['rand0', 'rand1', 'rand2']:
        # the random benchmark columns are never used as targets.
        continue

    X = df_test.drop([c], axis=1) # drop the thing to predict.
    X = pd.get_dummies(X)  # one-hot encode the categorical predictors.
    y = df_test[c]
    print(c)

    if c in non_numeric_columns:
        # categorical target: classifier scored by accuracy.
        scoring = 'accuracy'
        model = clf
        scores = cross_val_score(model, X, y, cv=5, scoring=scoring)
        print(scoring + ": %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    elif c in numeric_columns:
        # numeric target: regressor scored by RMSE. The scorer returns negated
        # values, hence the sign flip and the stripped 'neg_' prefix when printing.
        scoring = 'neg_root_mean_squared_error'
        model = reg
        scores = cross_val_score(model, X, y, cv=5, scoring=scoring)
        print(scoring.replace('neg_', '') + ": %0.2f (+/- %0.2f)" % (-scores.mean(), scores.std() * 2))
    else:
        print('what is this?')
        # no model was selected for this column, so do not fall through and
        # fit a stale (or undefined) `model` below.
        continue

    # Refit on all rows to read the feature importances.
    model.fit(X, y)
    df_importances = pd.DataFrame(
        data={'feature_name': X.columns, 'importance': model.feature_importances_}
    ).sort_values(by='importance', ascending=False)
    top5_features = df_importances.iloc[:5]
    print('top 5 features:')
    print(top5_features)
    print()
# Length, cal
# Scatter plot of 'length' (x) against 'calories' (y); marker size is scaled by
# 'views' within the range given by sizes=(10, 1000).
sns.relplot(x='length',
y='calories', size='views', sizes=(10, 1000), data=df, aspect=3.0)