Data provided by the Data For Everyone Library on CrowdFlower and downloaded from Kaggle.
# Load the labelled political social-media posts and build a
# bag-of-words representation of the post text.
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# The CSV is read as Latin-1 (iso-8859-1), not UTF-8.
data = pd.read_csv("political_social_media.csv", encoding='iso-8859-1')
data.head(1)

bias_data = data['bias']  # target labels
text_data = data['text']  # raw post text

# One matrix column per vocabulary token; values are per-post token counts.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text_data)
featurenames = vectorizer.get_feature_names_out()
print(X[1])
(0, 11538) 1 (0, 3957) 1 (0, 17022) 1 (0, 6994) 1 (0, 11617) 1 (0, 7938) 1 (0, 4487) 1 (0, 1913) 1 (0, 3115) 1 (0, 12813) 1 (0, 8198) 1 (0, 5434) 1
from sklearn.model_selection import train_test_split

# X = text features, Y = bias labels.
# Fix: pass random_state so the 80/10/10 split is reproducible — the
# original resplit differently on every run, so none of the reported
# accuracies below could be reproduced. (Consider also stratify=bias_data
# to keep label proportions equal across splits — TODO confirm desired.)
X_train, X_vt, Y_train, Y_vt = train_test_split(
    X, bias_data, train_size=0.8, random_state=42)
# Split the held-out 20% evenly into validation and test sets.
X_validation, X_test, Y_validation, Y_test = train_test_split(
    X_vt, Y_vt, test_size=0.5, random_state=42)

# Bare `.shape` expressions only display in a notebook/REPL; print them
# so the split sizes are visible when run as a script too.
print(X_train.shape)
print(X_validation.shape)
print(X_test.shape)
(500, 18220)
# Baseline support-vector classifier with default hyperparameters,
# scored on all three splits.
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train, Y_train)
for split_name, features, labels in (
        ("Training", X_train, Y_train),
        ("Validation", X_validation, Y_validation),
        ("Test", X_test, Y_test)):
    print("{} accuracy = {}".format(split_name, svc.score(features, labels)))
Training accuracy = 0.8385 Validation accuracy = 0.734 Test accuracy = 0.748
# Baseline logistic-regression classifier with default hyperparameters.
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression().fit(X_train, Y_train)
print(f"Training accuracy = {classifier.score(X_train, Y_train)}")
print(f"Validation accuracy = {classifier.score(X_validation, Y_validation)}")
print(f"Test accuracy = {classifier.score(X_test, Y_test)}")
Training accuracy = 0.986 Validation accuracy = 0.74 Test accuracy = 0.762
from sklearn.model_selection import GridSearchCV
We tune the SVC over the following hyperparameter grid using GridSearchCV:
# Hyperparameter grid for the SVC: regularization strength C, kernel,
# and kernel coefficient gamma.
# NOTE(review): scikit-learn's SVC runs on CPU only; the original
# "run with GPU" recommendation did not apply here.
# NOTE(review): the grid search is cross-validated on the validation
# split only, not the training data — confirm this is intentional.
parameters = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'sigmoid', 'rbf'],
    'gamma': ['scale', 'auto'],
}
grid_svc = GridSearchCV(SVC(), parameters)
grid_svc.fit(X_validation, Y_validation)

hyper = grid_svc.best_params_
results = grid_svc.cv_results_
# Report the mean cross-validated score for every parameter combination.
for params, score in zip(results['params'], results['mean_test_score']):
    print("Parameters: {} resulting in validation accuracy: {:.4f}".format(params, score))
Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'} resulting in validation accuracy: 0.7120 Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'sigmoid'} resulting in validation accuracy: 0.7240 Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'} resulting in validation accuracy: 0.7240 Parameters: {'C': 0.1, 'gamma': 'auto', 'kernel': 'linear'} resulting in validation accuracy: 0.7120 Parameters: {'C': 0.1, 'gamma': 'auto', 'kernel': 'sigmoid'} resulting in validation accuracy: 0.7240 Parameters: {'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf'} resulting in validation accuracy: 0.7240 Parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'} resulting in validation accuracy: 0.7200 Parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'sigmoid'} resulting in validation accuracy: 0.7260 Parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'} resulting in validation accuracy: 0.7180 Parameters: {'C': 1, 'gamma': 'auto', 'kernel': 'linear'} resulting in validation accuracy: 0.7200 Parameters: {'C': 1, 'gamma': 'auto', 'kernel': 'sigmoid'} resulting in validation accuracy: 0.7240 Parameters: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'} resulting in validation accuracy: 0.7240 Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'} resulting in validation accuracy: 0.7200 Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'sigmoid'} resulting in validation accuracy: 0.6880 Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'} resulting in validation accuracy: 0.7060 Parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'linear'} resulting in validation accuracy: 0.7200 Parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'sigmoid'} resulting in validation accuracy: 0.7260 Parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'} resulting in validation accuracy: 0.7240 Parameters: {'C': 100, 'gamma': 'scale', 'kernel': 'linear'} resulting in validation accuracy: 0.7200 Parameters: {'C': 100, 'gamma': 'scale', 'kernel': 'sigmoid'} resulting in validation accuracy: 
0.6280 Parameters: {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'} resulting in validation accuracy: 0.7180 Parameters: {'C': 100, 'gamma': 'auto', 'kernel': 'linear'} resulting in validation accuracy: 0.7200 Parameters: {'C': 100, 'gamma': 'auto', 'kernel': 'sigmoid'} resulting in validation accuracy: 0.7220 Parameters: {'C': 100, 'gamma': 'auto', 'kernel': 'rbf'} resulting in validation accuracy: 0.7180
# Refit an SVC on the training data using the best grid-search
# parameters, then report its held-out performance.
svc_best = SVC(**hyper).fit(X_train, Y_train)
print(f"Test accuracy for best parameters = {svc_best.score(X_test, Y_test)}")
print(f"Validation accuracy for best parameters = {svc_best.score(X_validation, Y_validation)}")
print(f"Best value for C = {hyper['C']}")
print(f"Best value for gamma = {hyper['gamma']}")
print(f"Best kernel = {hyper['kernel']}")
Test accuracy for best parameters = 0.706 Validation accuracy for best parameters = 0.68 Best value for C = 1 Best value for gamma = scale Best kernel = sigmoid
We tune the logistic regression model over the following hyperparameter grid:
# Hyperparameter grid for logistic regression: regularization strength,
# iteration budget, and optimizer.
# NOTE(review): as with the SVC search above, cross-validation is run on
# the validation split only — confirm this is intentional.
from sklearn.linear_model import LogisticRegression

parameters = {
    'C': [0.1, 0.6, 1, 10],
    'max_iter': [100, 1000],
    'solver': ['lbfgs', 'liblinear'],
}
grid_lr = GridSearchCV(LogisticRegression(), parameters)
grid_lr.fit(X_validation, Y_validation)

hyper = grid_lr.best_params_
results = grid_lr.cv_results_
# Report the mean cross-validated score for every parameter combination.
for params, score in zip(results['params'], results['mean_test_score']):
    print("Parameters: {} resulting in validation accuracy: {:.4f}".format(params, score))
Parameters: {'C': 0.1, 'max_iter': 100, 'solver': 'lbfgs'} resulting in validation accuracy: 0.7160 Parameters: {'C': 0.1, 'max_iter': 100, 'solver': 'liblinear'} resulting in validation accuracy: 0.7160 Parameters: {'C': 0.1, 'max_iter': 1000, 'solver': 'lbfgs'} resulting in validation accuracy: 0.7160 Parameters: {'C': 0.1, 'max_iter': 1000, 'solver': 'liblinear'} resulting in validation accuracy: 0.7160 Parameters: {'C': 0.6, 'max_iter': 100, 'solver': 'lbfgs'} resulting in validation accuracy: 0.7200 Parameters: {'C': 0.6, 'max_iter': 100, 'solver': 'liblinear'} resulting in validation accuracy: 0.7200 Parameters: {'C': 0.6, 'max_iter': 1000, 'solver': 'lbfgs'} resulting in validation accuracy: 0.7200 Parameters: {'C': 0.6, 'max_iter': 1000, 'solver': 'liblinear'} resulting in validation accuracy: 0.7200 Parameters: {'C': 1, 'max_iter': 100, 'solver': 'lbfgs'} resulting in validation accuracy: 0.7220 Parameters: {'C': 1, 'max_iter': 100, 'solver': 'liblinear'} resulting in validation accuracy: 0.7180 Parameters: {'C': 1, 'max_iter': 1000, 'solver': 'lbfgs'} resulting in validation accuracy: 0.7220 Parameters: {'C': 1, 'max_iter': 1000, 'solver': 'liblinear'} resulting in validation accuracy: 0.7180 Parameters: {'C': 10, 'max_iter': 100, 'solver': 'lbfgs'} resulting in validation accuracy: 0.7200 Parameters: {'C': 10, 'max_iter': 100, 'solver': 'liblinear'} resulting in validation accuracy: 0.7160 Parameters: {'C': 10, 'max_iter': 1000, 'solver': 'lbfgs'} resulting in validation accuracy: 0.7200 Parameters: {'C': 10, 'max_iter': 1000, 'solver': 'liblinear'} resulting in validation accuracy: 0.7160
# Refit logistic regression on the training data using the best
# grid-search parameters, then report its held-out performance.
lr_best = LogisticRegression(**hyper).fit(X_train, Y_train)
print(f"Validation accuracy for best parameters = {lr_best.score(X_validation, Y_validation)}")
print(f"Test accuracy for best parameters = {lr_best.score(X_test, Y_test)}")
print(f"Best value for C = {hyper['C']}")
print(f"Best value for max_iter = {hyper['max_iter']}")
print(f"Best solver = {hyper['solver']}")
Validation accuracy for best parameters = 0.74 Test accuracy for best parameters = 0.762 Best value for C = 1 Best value for max_iter = 100 Best solver = lbfgs