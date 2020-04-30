https://www.linkedin.com/in/sharmistha-chatterjee-7a186310/
import fastText as ft
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import accuracy_score
# fastText's supervised format marks the class of each example with a
# "__label__<class>" token, so prefix every mood with that marker.
labelled_mood = '__label__' + df_train['mood']
df_test['labels_text'] = '__label__' + df_test['mood']
# Training rows become "<label> <tweet>": the label token, one space, the text.
df_train['labels_text'] = labelled_mood.str.cat(df_train['tweet'], sep=' ')
# k-fold cross-validation of one fastText hyper-parameter combination
# (lr_val, wordNgrams_val, epoch_val). Each fold's accuracy is appended to
# `accuracy`; the mean is then stored in `cross_valid_params`.
# NOTE(review): `accuracy` and `cross_valid_params` are created outside this
# snippet; presumably `accuracy` is reset for every parameter combination --
# confirm against the enclosing loop.
kf = KFold(n_splits=k, shuffle=True)
for train_index, test_index in kf.split(YX):
    # kf.split yields positional indices, so rows are selected with .iloc;
    # label-based [] indexing only works when the index is 0..n-1.
    # header=False keeps the column name out of the training file, where
    # fastText would otherwise read it as an unlabelled example.
    # NOTE(review): to_csv quotes rows that contain commas or double quotes,
    # which would put a '"' in front of the label token -- verify the tweets
    # are cleaned of those characters before this point.
    YX.iloc[train_index].to_csv('train.csv', index=False, header=False)

    # Fit model for this set of parameter values
    model = ft.FastText.train_supervised('train.csv',
                                         lr=lr_val,
                                         wordNgrams=wordNgrams_val,
                                         epoch=epoch_val)

    df_valid = pd.DataFrame(data=YX.iloc[test_index]).dropna()
    pred = model.predict(df_valid['labels_text'].tolist())
    # pred[0] holds the predicted labels (one sequence per input text);
    # keep the first label of each and strip the fastText prefix.
    pred = pd.Series(pred[0]).apply(lambda x: x[0].replace('__label__', ''))
    # The true label is the first space-delimited token of the row, minus the
    # prefix. Splitting (rather than slicing up to find(' ')) also handles a
    # row that has a label but no text.
    org = df_valid['labels_text'].apply(
        lambda x: x.split(' ', 1)[0].replace('__label__', '', 1))

    # Accuracy for each cross-validation fold. Called through `metrics` so the
    # call keeps working even if the bare name accuracy_score is rebound to a
    # number elsewhere in the script.
    kf_accuracy = metrics.accuracy_score(org.values, pred.values)
    accuracy.append(kf_accuracy)

mean_acc = np.mean(accuracy)
# NOTE(review): the mean accuracy is used as the dictionary key, so two
# parameter combinations with an identical mean overwrite each other.
cross_valid_params[mean_acc] = [lr_val, wordNgrams_val, epoch_val]
# cross_valid_params is keyed by mean accuracy, so its largest key identifies
# the best-scoring [lr, wordNgrams, epoch] combination.
max_acc = max(cross_valid_params)
best_params = cross_valid_params[max_acc]
lr_val, wordNgrams_val, epoch_val = best_params
# Per-class precision, recall and F1 of the fastText predictions, followed by
# the overall accuracy.
# NOTE(review): `pred_filter` is not defined in the visible code; presumably it
# holds the predictions aligned with `org` -- confirm where it is built.
precisions, recall, f1_score, true_sum = metrics.precision_recall_fscore_support(org.values, pred_filter.values)
print("Fast Text Precision =", precisions)
print("Fast Text Recall=", recall)
print("Fast Text F1 Score =", f1_score)
# The original statement was cut off mid-identifier; it is completed here with
# the same prediction series used for the metrics above.
# NOTE(review): this rebinds the name `accuracy_score`, hiding the function
# imported from sklearn.metrics; later code must call metrics.accuracy_score.
accuracy_score = metrics.accuracy_score(org, pred_filter)
Results of sentiment prediction for BJP
BJP Fast Text Precision = [0.45833333 0.56976744 0.73802395 0.78723404 0.70491803 0.7260274 0.59166667 0.2 ]
BJP Fast Text Recall= [0.36065574 0.37692308 0.84129693 0.71153846 0.61870504 0.79613734 0.73958333 0.01298701]
BJP Fast Text F1 Score = [0.40366972 0.4537037 0.78628389 0.74747475 0.65900383 0.75946776 0.65740741 0.02439024]
BJP Fast Text Accuracy = 70.3%
Results of sentiment prediction for Congress
Classify Fast Text Precision = [0.42857143 0.5483871 0.74174174 0.7826087 0.71311475 0.73465347
0.59504132 0.2 ]
Classify Fast Text Recall= [0.3442623 0.39230769 0.84300341 0.69230769 0.62589928 0.79613734
0.75 0.01298701]
Classify Fast Text F1 Score = [0.38181818 0.4573991 0.78913738 0.73469388 0.66666667 0.76416066
0.66359447 0.02439024]
Accuracy= 70.5%
# Count, for every row of `data`, how many bigram steps are classified into
# each mood, and store the eight counts in the matching "*BM" columns of `df`.

# Mood returned by md.get_n_gram_mood -> column that receives its count.
# The order matches the order in which the columns were originally written.
bigram_column_for_mood = {
    'sadness': 'SBM',
    'joy': 'JBM',
    'faith': 'FBM',
    'neutral': 'NBM',
    'dominance': 'DBM',
    'arousal': 'EBM',
    'fear': 'TBM',
    'anger': 'ABM',
}

for index, row in data.iterrows():
    # Every counter starts at zero for each row.
    bigram_counts = dict.fromkeys(bigram_column_for_mood.values(), 0)
    word_mood = []
    # NOTE(review): row[0] is presumably a sequence of (word, tag)-like items,
    # since only element [0] of each bigram member is used -- confirm.
    for pair in ngrams(row[0], 2):
        word_mood.append(pair[0][0])
        word_mood.append(pair[1][0])
        # NOTE(review): word_mood is created once per row and never cleared, so
        # each call sees every word collected so far, not just the current
        # bigram -- confirm this is intended.
        processed_mood = md.get_n_gram_mood(word_mood)
        # Moods outside the table are ignored, as in the original if/elif chain.
        column = bigram_column_for_mood.get(processed_mood)
        if column is not None:
            bigram_counts[column] += 1

    # DataFrame.set_value was removed in pandas 1.0; .at is the scalar setter.
    for column, count in bigram_counts.items():
        df.at[index, column] = count
Label Encoding of Moods: {'fear': 4, 'neutral': 6, 'dominance': 2, 'joy': 5, 'faith': 3, 'anger': 0, 'sadness': 7, 'arousal': 1}
BJP
-------
Most Informative Features
EUM = 1 1 : 2 = 21.7 : 1.0
SBM = 3 7 : 5 = 17.6 : 1.0
TBM = 2 4 : 5 = 13.6 : 1.0
TUM = 1 4 : 5 = 13.4 : 1.0
pos = 0.0 6 : 3 = 12.8 : 1.0
SUM = 3 7 : 5 = 12.8 : 1.0
STM = 3 7 : 5 = 12.8 : 1.0
SBM = 2 7 : 5 = 12.6 : 1.0
compound = 0.0 6 : 3 = 12.4 : 1.0
ATM = 3 0 : 2 = 11.1 : 1.0
SBM = 1 7 : 5 = 10.5 : 1.0
retweet_count = 5 3 : 5 = 10.5 : 1.0
SUM = 1 7 : 5 = 9.9 : 1.0
STM = 1 7 : 2 = 9.5 : 1.0
DBM = 1 3 : 5 = 8.5 : 1.0
DUM = 6 3 : 5 = 8.3 : 1.0
EBM = 1 1 : 2 = 8.1 : 1.0
ETM = 2 1 : 5 = 7.7 : 1.0
ETM = 3 1 : 5 = 7.7 : 1.0
SUM = 2 7 : 5 = 7.7 : 1.0
accuracy by using Naive Bayes: 0.59
Confusion Matrix from Naive Bayes Classifier:
| 0 1 2 3 4 5 6 7 |
--+-------------------------+
0 | <.> . 1 . . 1 1 . |
1 | . <2> 2 . . 1 5 . |
2 | . 2<15> 2 . 9 . 2 |
3 | . . . <1> . 4 . . |
4 | . . . . <1> 2 2 . |
5 | . 2 8 1 1<22> 2 . |
6 | . . 1 . . . <7> . |
7 | . . . . 2 . 1 <.>|
--+-------------------------+
(row = reference; col = test)
Naive Bayes Precision 0.69
Naive Bayes Recall 0.59
Naive Bayes F_Score 0.57
Congress :
-----------
Most Informative Features
STM = 2 7 : 2 = 35.9 : 1.0
SBM = 3 7 : 2 = 31.4 : 1.0
EUM = 1 1 : 2 = 29.0 : 1.0
SUM = 3 7 : 2 = 23.2 : 1.0
STM = 3 7 : 2 = 23.2 : 1.0
ATM = 3 0 : 2 = 16.5 : 1.0
EUM = 5 1 : 2 = 15.3 : 1.0
SBM = 2 7 : 5 = 15.1 : 1.0
pos = 0.0 6 : 3 = 15.0 : 1.0
compound = 0.0 6 : 3 = 14.3 : 1.0
NBM = 16 1 : 5 = 13.4 : 1.0
SUM = 4 0 : 2 = 12.9 : 1.0
polarity = 0.5 5 : 2 = 12.6 : 1.0
ETM = 3 1 : 5 = 12.3 : 1.0
DBM = 10 7 : 5 = 12.3 : 1.0
SBM = 1 7 : 5 = 12.1 : 1.0
DBM = 1 2 : 5 = 11.9 : 1.0
SUM = 2 7 : 5 = 11.8 : 1.0
retweet_count = 5 3 : 5 = 11.8 : 1.0
TBM = 2 4 : 5 = 11.4 : 1.0
accuracy by using Naive Bayes: 0.79
(row = reference; col = test)
Naive Bayes Classifier Precision 0.80
Naive Bayes Classifier Recall 0.79
Naive Bayes Classifier F_Score 0.78
import nltk
from nltk.tokenize import *
from nltk.util import ngrams
from nltk.classify import *
import preprocessor as p
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
Naive Bayes Training and Testing
# Fit NLTK's Naive Bayes classifier on train_set and list its 20 most
# informative features.
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features(20)

# Accuracy of the fitted classifier on test_set.
nb_accuracy = nltk.classify.util.accuracy(classifier, test_set)
print('accuracy by using Naive Bayes:', nb_accuracy)

# `labels` and `tests` are built outside this snippet.
nb_confusion = nltk.ConfusionMatrix(labels, tests)
print('Naive Bayes Confusion Matrix' , nb_confusion)
nb_report = classification_report(labels, tests)
print('Naive Bayes Classification Report', nb_report)
MaxEnt Training and Testing
# Fit NLTK's Maximum Entropy classifier on train_set and list its 20 most
# informative features.
classifier = MaxentClassifier.train(train_set)
classifier.show_most_informative_features(20)
print('accuracy by using Max Entropy:', nltk.classify.util.accuracy(classifier, test_set))
# The two reports below were labelled "Naive Bayes" (copied from the previous
# section); they describe the Max Entropy classifier.
# NOTE(review): `labels` and `tests` are built outside this snippet; `tests`
# must be recomputed with this classifier, otherwise these reports still
# describe the Naive Bayes predictions -- confirm.
print('Max Entropy Confusion Matrix' , nltk.ConfusionMatrix(labels, tests))
print('Max Entropy Classification Report', classification_report(labels, tests))
BJP:
-----
Label Encoding of Moods
{'fear': 4, 'neutral': 6, 'dominance': 2, 'joy': 5, 'faith': 3, 'anger': 0, 'sadness': 7, 'arousal': 1}
Training and Testing by using Max Entropy.....
==> Training (10 iterations)
Iteration Log Likelihood Accuracy
---------------------------------------
1 -2.07944 0.043
2 -1.30307 0.483
3 -1.20139 0.563
4 -1.11100 0.601
5 -1.03157 0.631
6 -0.96236 0.674
7 -0.90215 0.702
8 -0.84962 0.736
9 -0.80355 0.756
Final -0.76289 0.763
1.134 pos==0.315 and label is 0
1.046 pos==0.277 and label is 7
1.035 neg==0.446 and label is 0
0.995 pos==0.18 and label is 4
0.980 pos==0.302 and label is 1
0.975 pos==0.6509999999999999 and label is 7
0.964 neg==0.231 and label is 7
0.960 pos==0.394 and label is 0
0.955 retweet_count==39 and label is 4
0.954 compound==0.6774 and label is 4
Accuracy from Max Entropy Classifier: 0.51
Confusion Matrix from MaxEnt Classifier:
| 0 1 2 3 4 5 6 7 |
--+-------------------------+
0 | <.> . 1 . . 2 . . |
1 | . <1> 5 . . 4 . . |
2 | . 1<14> . . 15 . . |
3 | . . 1 <.> . 4 . . |
4 | . . . . <.> 5 . . |
5 | . . 7 . .<29> . . |
6 | . . 8 . . . <.> . |
7 | . . 1 . . 2 . <.>|
--+-------------------------+
(row = reference; col = test)
Max Entropy Precision 0.47
Max Entropy Recall 0.51
Max Entropy F_Score 0.43
Congress:
---------
Training and Testing by using Max Entropy.....
==> Training (10 iterations)
Iteration Log Likelihood Accuracy
---------------------------------------
1 -2.07944 0.042
2 -1.30335 0.482
3 -1.20265 0.565
4 -1.11320 0.598
5 -1.03467 0.628
6 -0.96627 0.671
7 -0.90679 0.694
8 -0.85489 0.724
9 -0.80937 0.741
Final -0.76919 0.754
1.161 pos==0.217 and label is 1
1.136 pos==0.315 and label is 0
1.102 pos==0.17600000000000002 and label is 1
1.081 pos==0.277 and label is 7
1.058 pos==0.6970000000000001 and label is 1
1.052 pos==0.35200000000000004 and label is 4
1.052 neg==0.319 and label is 4
1.037 pos==0.38 and label is 4
1.037 neg==0.157 and label is 4
1.032 neg==0.446 and label is 0
Accuracy from MaxEnt Classifier: 0.72
Confusion Matrix from MaxEnt Classifier:
| 0 1 2 3 4 5 6 7 |
--+-------------------------+
0 | <1> . 1 . . 1 . . |
1 | . <4> 2 . . 4 . . |
2 | . .<25> . . 5 . . |
3 | . . . <1> . 4 . . |
4 | . . 1 . <3> 1 . . |
5 | . . 1 . .<35> . . |
6 | . . 3 . . 4 <1> . |
7 | . . 1 . . . . <2>|
--+-------------------------+
(row = reference; col = test)
Max Entropy Precision 0.79
Max Entropy Recall 0.72
Max Entropy F_Score 0.68