https://www.linkedin.com/in/sharmistha-chatterjee-7a186310/
An algorithm A satisfies ε-differential privacy if, for every t in the range of A and for every pair of neighboring databases D and D′, $\Pr[A(D) = t] \le e^{\varepsilon} \cdot \Pr[A(D') = t]$.
# Load the complaints CSV
# (downloaded from www.kaggle.com/selener/multi-class-text-classification-tfidf/data).
df = pd.read_csv('data/complaints.csv')
print(df.head(2).T)

# Keep only the product label and the free-text narrative, and drop the
# rows that have no narrative.
df1 = df[['Product', 'Consumer complaint narrative']].copy()
df1 = df1[pd.notnull(df1['Consumer complaint narrative'])]

# Rename the second column to an identifier-friendly name.
df1.columns = ['Product', 'Consumer_complaint']

# Percentage of complaints with text. The value used to be computed and
# thrown away (it is only echoed automatically in a notebook cell), so it
# is printed explicitly here.
total = df1['Consumer_complaint'].notnull().sum()
print(round((total / len(df) * 100), 1))

# Distinct raw product names, before any renaming.
print(pd.DataFrame(df.Product.unique()).values)
# Draw a fixed-seed sample of 10,000 complaints to work with.
df2 = df1.sample(10000, random_state=1).copy()

# Raw product name -> short category label. Several raw names collapse
# onto the same label on purpose.
product_aliases = {
    'Credit reporting, credit repair services, or other personal consumer reports':
        'CreditReporting',
    'Credit reporting': 'CreditReporting',
    'Credit card': 'CreditPrepaidCard',
    'Prepaid card': 'CreditPrepaidCard',
    'Credit card or prepaid card': 'CreditPrepaidCard',
    'Payday loan': 'PersonalLoan',
    'Payday loan, title loan, or personal loan': 'PersonalLoan',
    'Money transfer': 'TransferServices',
    'Virtual currency': 'TransferServices',
    'Money transfer, virtual currency, or money service': 'TransferServices',
    'Student loan': 'StudentLoan',
    'Checking or savings account': 'SavingsAccount',
    'Vehicle loan or lease': 'VehicleLoan',
    'Debt collection': 'DebtCollection',
    'Bank account or service': 'BankAccount',
    'Other financial service': 'FinancialServices',
    'Consumer Loan': 'ConsumerLoan',
    'Money transfers': 'MoneyTransfers',
}

# Apply the renaming to the 'Product' column only, in place.
df2.replace({'Product': product_aliases}, inplace=True)

# Distinct category labels after renaming.
print(pd.DataFrame(df2.Product.unique()))
# Add an integer 'category_id' column holding the label-encoded product.
le = preprocessing.LabelEncoder()
le.fit(df2['Product'])
df2['category_id'] = le.transform(df2['Product'])

# One row per distinct (Product, category_id) pair.
category_id_df = df2.loc[:, ['Product', 'category_id']].drop_duplicates()

print(df2.head())
# Horizontal bar chart of the number of complaints per product category,
# sorted ascending so the largest categories end up at the top.
fig = plt.figure(figsize=(8, 6))
complaints_per_product = (
    df2.groupby('Product').Consumer_complaint.count().sort_values()
)

# Highlight the three largest categories in dark blue and keep the rest
# grey. The list is derived from the actual number of categories rather
# than hard-coding 13 entries, so it stays aligned with the bars when the
# sample contains fewer (or more) categories.
n_products = len(complaints_per_product)
n_highlighted = min(3, n_products)
colors = ['grey'] * (n_products - n_highlighted) + ['darkblue'] * n_highlighted

complaints_per_product.plot.barh(
    ylim=0, color=colors, title='NUMBER OF COMPLAINTS IN EACH PRODUCT CATEGORY\n')
plt.xlabel('Number of ocurrences', fontsize=10)
plt.show()
product_comments = df2['Consumer_complaint'].values  # Collection of documents
product_type = df2['category_id'].values  # Target labels (the encoded product category)

# Each stop word is padded with spaces so that only whole, space-delimited
# occurrences are removed. Built once instead of once per document.
stopword_tokens = [' ' + word + ' ' for word in STOPWORDS]

complains = []  # cleaned complaint texts, aligned with `labels`
labels = []
for raw_text, label in zip(product_comments, product_type):
    labels.append(label)
    # Drop the 'XX' runs and the full stops from the narrative.
    complain = raw_text.replace('XX', '').replace('.', '')
    for token in stopword_tokens:
        complain = complain.replace(token, ' ')
        # Collapse the double space a removal can leave behind. The
        # original replaced a single space with a single space, which
        # did nothing.
        complain = complain.replace('  ', ' ')
    complains.append(complain)
# Build the vocabulary from the cleaned complaints.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(complains)
word_index = tokenizer.word_index
vocab_size = len(word_index)

# Encode the same cleaned texts the vocabulary was fitted on. The original
# encoded the raw `product_comments`, so the cleaning above never reached
# the model input.
sequences = tokenizer.texts_to_sequences(complains)

# Bring every sequence to `max_length` entries.
padded = pad_sequences(sequences, maxlen=max_length)
# Sequential 70% / 20% / remainder split into train / validation / test.
train_size = int(len(product_comments) * 0.7)
validation_size = int(len(product_comments) * 0.2)
validation_end = train_size + validation_size

training_sequences = padded[:train_size]
validation_sequences = padded[train_size:validation_end]
test_sequences = padded[validation_end:]

train_labels = labels[:train_size]
validation_labels = labels[train_size:validation_end]
test_labels = labels[validation_end:]

# Labels as column vectors of shape (n_samples, 1).
training_label_seq = np.array(train_labels).reshape(len(train_labels), 1)
validation_label_seq = np.array(validation_labels).reshape(len(validation_labels), 1)
test_label_seq = np.array(test_labels).reshape(len(test_labels), 1)
# Parse the GloVe text file into {word: vector}. Each line holds a word
# followed by its coefficients, separated by whitespace.
embeddings_index = {}
# The encoding is given explicitly so that parsing does not depend on the
# platform's default locale encoding.
with open('embedding/glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Row i holds the vector of the word whose tokenizer index is i. Words
# without a pretrained vector keep an all-zero row; one extra row is
# allocated so that index `vocab_size` is addressable.
embeddings_matrix = np.zeros((vocab_size + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector
print(len(embeddings_matrix))
# Frozen pretrained embeddings -> dropout -> 1-D convolution -> max
# pooling -> LSTM -> 13-way softmax classifier.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        vocab_size + 1,
        embedding_dim,
        input_length=max_length,
        weights=[embeddings_matrix],
        trainable=False,
    ),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(13, activation='softmax'),
])

# Integer class ids are used as targets, hence the sparse loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()
# Choose the optimizer: plain Adam by default, or the differentially
# private Adam variant configured from the command-line flags.
if not FLAGS.dpsgd:
    optimizer = AdamOptimizer()
else:
    dp_settings = {
        'l2_norm_clip': FLAGS.l2_norm_clip,
        'noise_multiplier': FLAGS.noise_multiplier,
        'num_microbatches': FLAGS.microbatches,
        'learning_rate': FLAGS.learning_rate,
    }
    optimizer = DPAdamGaussianOptimizer(**dp_settings)

# Compile the model with the optimizer selected above.
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# Train with the epoch count and batch size given by the flags. The
# privacy accounting is computed from FLAGS.epochs and FLAGS.batch_size,
# so training must use the same values; the original hard-coded 10 epochs
# and fell back to Keras's default batch size.
num_epochs = FLAGS.epochs
history = model.fit(
    training_sequences,
    training_label_seq,
    epochs=num_epochs,
    batch_size=FLAGS.batch_size,
    validation_data=(validation_sequences, validation_label_seq),
    verbose=2,
)

# NOTE: plot_graphs must already be defined when this point is reached.
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

# scores[1] is the accuracy, the only metric requested at compile time.
scores = model.evaluate(test_sequences, test_label_seq, verbose=0)
print("Accuracy: %.2f%%" % (scores[1] * 100))
# Class probabilities for the test set, one row per sample.
output_test = model.predict(test_sequences)
print(np.shape(output_test))

# Most probable class per sample.
final_pred = np.argmax(output_test, axis=1)
print(np.shape(final_pred))
print(np.shape(test_label_seq))

# Column vector, the same layout as test_label_seq.
final_pred_list = final_pred.reshape(len(test_sequences), 1)
print(np.shape(final_pred_list))

results = confusion_matrix(test_label_seq, final_pred_list)
print(results)

# Per-class precision / recall / F1 / support.
prf_support = metrics.precision_recall_fscore_support(test_label_seq, final_pred_list)
precisions, recall, f1_score, true_sum = prf_support
print("Multi-label Classification LSTM CNN Precision =", precisions)
print("Multi-label Classification LSTM CNN Recall=", recall)
print("Multi-label Classification LSTM CNN F1 Score =", f1_score)

overall_accuracy = accuracy_score(test_label_seq, final_pred_list)
print('Multi-label Classification Accuracy: {}'.format(overall_accuracy))

classes = np.array(range(0, 13))
# Disabled in the original:
#print('Log loss: {}'.format(log_loss(classes[np.argmax(test_label_seq, axis=1)], output_test)))
# Compute the privacy budget expended.
if FLAGS.dpsgd:
    # Number of optimizer steps, based on the total data size (10000).
    # NOTE(review): the model is fitted on the 70% training slice only;
    # confirm whether the accounting should use that size instead.
    eps = compute_epsilon(FLAGS.epochs * 10000 // FLAGS.batch_size)
    print('For delta=1e-5, the current epsilon is: %.2f' % eps)
else:
    # The non-private path uses AdamOptimizer, so the message names Adam
    # rather than SGD.
    print('Trained with vanilla non-private Adam optimizer')
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding (Embedding) (None, 2000, 100) 2078700
_________________________________________________________________
dropout (Dropout) (None, 2000, 100) 0
_________________________________________________________________
conv1d (Conv1D) (None, 1996, 64) 32064
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 499, 64) 0
_________________________________________________________________
lstm (LSTM) (None, 64) 33024
_________________________________________________________________
dense (Dense) (None, 13) 845
=================================================================
Total params: 2,144,633
Trainable params: 65,933
Non-trainable params: 2,078,700
_________________________________________________________________
Train on 7000 samples, validate on 2000 samples
Epoch 1/10
7000/7000 - 67s - loss: 1.7944 - accuracy: 0.3863 - val_loss: 1.5671 - val_accuracy: 0.4805
Epoch 2/10
7000/7000 - 61s - loss: 1.3817 - accuracy: 0.5553 - val_loss: 1.2037 - val_accuracy: 0.6305
Epoch 3/10
7000/7000 - 61s - loss: 1.0965 - accuracy: 0.6531 - val_loss: 1.0746 - val_accuracy: 0.6620
Epoch 4/10
7000/7000 - 61s - loss: 0.9596 - accuracy: 0.6901 - val_loss: 0.9218 - val_accuracy: 0.7005
Epoch 5/10
7000/7000 - 59s - loss: 0.8845 - accuracy: 0.7123 - val_loss: 0.9003 - val_accuracy: 0.7040
Epoch 6/10
7000/7000 - 64s - loss: 0.8186 - accuracy: 0.7330 - val_loss: 0.8818 - val_accuracy: 0.7080
Epoch 7/10
7000/7000 - 63s - loss: 0.7804 - accuracy: 0.7459 - val_loss: 0.8699 - val_accuracy: 0.7195
Epoch 8/10
7000/7000 - 65s - loss: 0.7466 - accuracy: 0.7540 - val_loss: 0.8770 - val_accuracy: 0.7135
Epoch 9/10
7000/7000 - 695s - loss: 0.7047 - accuracy: 0.7639 - val_loss: 0.9187 - val_accuracy: 0.7120
Epoch 10/10
7000/7000 - 67s - loss: 0.6657 - accuracy: 0.7799 - val_loss: 0.8833 - val_accuracy: 0.7200
Accuracy: 72.30%
(1000, 13)
(1000,)
(1000, 1)
(1000, 1)
[[ 13 0 4 4 0 0 3 0 5 1 0 0]
[ 1 2 1 5 2 0 0 0 1 2 0 1]
[ 13 1 71 14 8 0 0 0 2 0 3 0]
[ 1 1 7 329 14 0 6 0 0 1 0 2]
[ 3 0 5 57 163 0 3 1 0 2 1 0]
[ 4 0 0 0 0 0 0 0 0 0 0 0]
[ 1 0 0 9 5 0 95 1 0 3 0 0]
[ 2 1 1 2 1 0 1 1 0 5 0 1]
[ 13 0 2 2 3 0 1 0 11 0 1 0]
[ 0 1 1 8 6 0 1 2 0 33 0 0]
[ 7 1 3 1 0 0 0 0 4 1 0 0]
[ 0 0 1 3 1 0 2 0 0 0 0 5]]
Multi-label Classification LSTM CNN Precision = [0.22413793 0.28571429 0.73958333 0.75806452 0.80295567 0.
0.84821429 0.2 0.47826087 0.6875 0. 0.55555556]
Multi-label Classification LSTM CNN Recall= [0.43333333 0.13333333 0.63392857 0.91135734 0.69361702 0.
0.83333333 0.06666667 0.33333333 0.63461538 0. 0.41666667]
Multi-label Classification LSTM CNN F1 Score = [0.29545455 0.18181818 0.68269231 0.82767296 0.74429224 0.
0.84070796 0.1 0.39285714 0.66 0. 0.47619048]
Multi-label Classification Accuracy: 0.723
For delta=1e-5, the current epsilon is: 8.07
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: object whose ``history`` mapping contains the series for
            ``string`` and for ``'val_' + string``.
        string: name of the metric to plot, e.g. ``"accuracy"`` or ``"loss"``.
    """
    val_key = 'val_' + string
    for series_name in (string, val_key):
        plt.plot(history.history[series_name])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()
def compute_epsilon(steps, num_examples=10000):
    """Return the epsilon spent after ``steps`` optimizer steps, for delta = 1e-5.

    Args:
        steps: number of optimizer steps taken so far.
        num_examples: size of the data set that batches are drawn from,
            used for the sampling probability. Defaults to 10000, the
            value that was hard-coded before.

    Returns:
        The epsilon reported by the RDP accountant, or ``inf`` when no
        noise is added.
    """
    # Without noise there is no privacy guarantee at all.
    if FLAGS.noise_multiplier == 0.0:
        return float('inf')

    # Renyi orders at which the accountant is evaluated.
    orders = [1 + x / 10. for x in range(1, 100)] + list(range(12, 64))
    sampling_probability = FLAGS.batch_size / num_examples
    # Renyi differential privacy of the sampled Gaussian mechanism.
    rdp = compute_rdp(q=sampling_probability,
                      noise_multiplier=FLAGS.noise_multiplier,
                      steps=steps,
                      orders=orders)
    # delta = 1e-5 is below 1 / 10000, the size of the complaint sample
    # (the earlier comment's "70000 training points" was wrong).
    return get_privacy_spent(orders, rdp, target_delta=1e-5)[0]
The complete source code is available at https://github.com/sharmi1206/differential-privacy-tensorflow, along with more examples for text and image classification.