import numpy as np
import pandas as pd
import os
import pickle
import gc
import pandas_profiling as pdb
import matplotlib.pyplot as plt
import tqdm
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import lightgbm as lgb
from lightgbm import early_stopping
import warnings
warnings.filterwarnings("ignore")
from IPython.display import display, HTML, Markdown, Latex
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    # Normalize before drawing so the image, colorbar, and cell annotations
    # all refer to the same matrix.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    # Annotate each cell, switching text color for readability on dark cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, round(cm[i, j], 2),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
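# Minimal usage sketch (not part of the original notebook): draw a toy 2x2
# confusion matrix with the helper above. The counts and the class labels
# [0, 1] are made up purely for illustration.
cm_demo = np.array([[50, 10],
                    [5, 35]])
plt.figure()
plot_confusion_matrix(cm_demo, classes=[0, 1], normalize=True,
                      title='Normalized confusion matrix (demo)')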
def check(df):
    """Quick look at a DataFrame: head, shape, dtypes, and missing values."""
    display(Markdown('### df.head()'))
    display(df.head())
    display(Markdown('### len(df)'), len(df))
    display(Markdown('### len(df.columns)'), len(df.columns))
    display(Markdown('### df.info()'))
    df.info()  # info() prints directly and returns None, so no display() needed
    display(Markdown('### df.isnull().sum()'))
    display(df.isnull().sum())
def cross_validation(x, y):
    """Run stratified 5-fold CV with LightGBM and report accuracy and feature importance."""
    metrics = []
    imp = pd.DataFrame()
    # Stratified folds over the full dataset.
    n_splits = 5
    cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(x, y))
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.1,
        'num_leaves': 16,
        'n_estimators': 100000,
        'random_state': 123,
        'importance_type': 'gain',
        'verbosity': -1,
    }
    for nfold in np.arange(n_splits):
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr = x.loc[idx_tr, :], y.loc[idx_tr, :]
        x_va, y_va = x.loc[idx_va, :], y.loc[idx_va, :]
        model = lgb.LGBMClassifier(**params)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr, y_tr), (x_va, y_va)],
                  callbacks=[early_stopping(100, verbose=False)],
                  )
        y_tr_pred = model.predict(x_tr)
        y_va_pred = model.predict(x_va)
        metric_tr = accuracy_score(y_tr, y_tr_pred)
        metric_va = accuracy_score(y_va, y_va_pred)
        metrics.append([nfold, metric_tr, metric_va])
        _imp = pd.DataFrame({"col": x.columns, "imp": model.feature_importances_, "nfold": nfold})
        imp = pd.concat([imp, _imp], axis=0, ignore_index=True)
    metrics = np.array(metrics)
    display(Markdown("### Cross validation result"))
    print("[cv ] tr: {:.2f}+-{:.2f}, va: {:.2f}+-{:.2f}".format(
        metrics[:, 1].mean(), metrics[:, 1].std(),
        metrics[:, 2].mean(), metrics[:, 2].std(),
    ))
    # Average the per-fold importances.
    imp = imp.groupby("col")["imp"].agg(["mean", "std"])
    imp.columns = ["imp", "imp_std"]
    imp = imp.reset_index(drop=False)
    display(Markdown("### Feature importance"))
    display(imp.sort_values("imp", ascending=False, ignore_index=True))
def eval_baseline(x, y):
    """Train a LightGBM baseline and compare accuracy on validation vs. held-out test data."""
    # Hold out 20% as pseudo test data, then split the rest into train/validation.
    x_tr, x_va2, y_tr, y_va2 = train_test_split(x,
                                                y,
                                                test_size=0.2,
                                                shuffle=True,
                                                stratify=y,
                                                random_state=123)
    x_tr1, x_va1, y_tr1, y_va1 = train_test_split(x_tr,
                                                  y_tr,
                                                  test_size=0.2,
                                                  shuffle=True,
                                                  stratify=y_tr,
                                                  random_state=789)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.1,
        'num_leaves': 16,
        'n_estimators': 100000,
        'random_state': 123,
        'importance_type': 'gain',
        'verbosity': -1,
    }
    model = lgb.LGBMClassifier(**params)
    model.fit(x_tr1,
              y_tr1,
              eval_set=[(x_tr1, y_tr1), (x_va1, y_va1)],
              callbacks=[early_stopping(100, verbose=False)],
              )
    y_va1_pred = model.predict(x_va1)
    y_va2_pred = model.predict(x_va2)
    display(Markdown("### Comparison of model accuracy"))
    print("[Validation data] acc: {:.4f}".format(accuracy_score(y_va1, y_va1_pred)))
    print("[Test data] acc: {:.4f}".format(accuracy_score(y_va2, y_va2_pred)))
    display(Markdown("### Comparison of error distribution"))
    display(Markdown("#### Validation data"))
    plt.figure()
    plt.grid(False)
    plot_confusion_matrix(confusion_matrix(y_va1, y_va1_pred),
                          classes=model.classes_,
                          title='Confusion matrix, without normalization')
    plt.figure()
    plt.grid(False)
    # Pass raw counts and let plot_confusion_matrix normalize,
    # so the matrix is not normalized twice.
    plot_confusion_matrix(confusion_matrix(y_va1, y_va1_pred),
                          classes=model.classes_,
                          normalize=True,
                          title='Normalized confusion matrix')
    display(Markdown("#### Test data"))
    plt.figure()
    plt.grid(False)
    plot_confusion_matrix(confusion_matrix(y_va2, y_va2_pred),
                          classes=model.classes_,
                          title='Confusion matrix, without normalization')
    plt.figure()
    plt.grid(False)
    plot_confusion_matrix(confusion_matrix(y_va2, y_va2_pred),
                          classes=model.classes_,
                          normalize=True,
                          title='Normalized confusion matrix')
    y_va1_pred_prob = model.predict_proba(x_va1)[:, 1]
    y_va2_pred_prob = model.predict_proba(x_va2)[:, 1]
    fig = plt.figure(figsize=(10, 8))
    fig.add_subplot(2, 1, 1)
    plt.title("Validation data")
    plt.hist(y_va1_pred_prob[np.array(y_va1).reshape(-1) == 1], bins=10, alpha=0.5, label="1")
    plt.hist(y_va1_pred_prob[np.array(y_va1).reshape(-1) == 0], bins=10, alpha=0.5, label="0")
    plt.grid()
    plt.legend()
    fig.add_subplot(2, 1, 2)
    plt.title("Test data")
    plt.hist(y_va2_pred_prob[np.array(y_va2).reshape(-1) == 1], bins=10, alpha=0.5, label="1")
    plt.hist(y_va2_pred_prob[np.array(y_va2).reshape(-1) == 0], bins=10, alpha=0.5, label="0")
    plt.grid()
    plt.legend()
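# End-to-end usage sketch (an assumption, not part of the original notebook):
# exercise the helpers above on a synthetic binary-classification dataset.
# The feature names and the single-column label DataFrame are illustrative;
# cross_validation() indexes y with .loc[idx, :], so y is kept as a DataFrame
# with a default RangeIndex.
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=2000, n_features=10,
                                     n_informative=5, random_state=123)
x = pd.DataFrame(X_demo, columns=[f"f{i}" for i in range(X_demo.shape[1])])
y = pd.DataFrame({"target": y_demo})

check(x)                 # quick overview of the feature table
cross_validation(x, y)   # 5-fold CV accuracy and feature importance
eval_baseline(x, y)      # validation vs. pseudo-test comparison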