Snippets for Kaggle

import numpy as np
import pandas as pd
import os
import pickle
import gc
import pandas_profiling as pdp  # note: the package is now distributed as ydata-profiling
import matplotlib.pyplot as plt
import tqdm
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import lightgbm as lgb
from lightgbm import early_stopping
import warnings
warnings.filterwarnings("ignore")
from IPython.display import display, HTML, Markdown, Latex
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    # Normalize before drawing so the heatmap, the printed matrix, and the
    # cell labels all agree (originally imshow ran on the raw counts)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, round(cm[i, j],2),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
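
Example: a minimal usage sketch of plot_confusion_matrix with a hand-made 2x2 count matrix (the numbers are illustrative only, not output from a real model):

cm_demo = np.array([[50, 10],
                    [ 5, 35]])  # rows: true labels, columns: predictions
plt.figure()
plot_confusion_matrix(cm_demo, classes=[0, 1])
plt.show()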

def check(df):
    display(Markdown('### df.head()'))
    display(df.head())
    display(Markdown('### len(df)'),len(df))
    display(Markdown('### len(df.columns)'),len(df.columns))
    display(Markdown('### df.info()'))
    df.info()  # info() prints directly and returns None, so don't wrap it in display()
    display(Markdown('### df.isnull().sum()'))
    display(df.isnull().sum())
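
Example: check is meant to be called on a freshly loaded DataFrame ("train.csv" below is a placeholder path, not part of the snippet):

df_train = pd.read_csv("train.csv")  # hypothetical input file
check(df_train)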
    
def cross_validation(x, y):
    # The CV folds are built from the full x, y, so no separate hold-out
    # split is needed here (the original one was overwritten in the fold loop)
    metrics = []
    imp = pd.DataFrame()
    n_splits = 5
    cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(x, y))
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary', 
        'metric': 'auc',
        'learning_rate': 0.1,
        'num_leaves': 16,
        'n_estimators': 100000,
        "random_state": 123,
        "importance_type": "gain",
        'verbosity': -1,
    }
    for nfold, (idx_tr, idx_va) in enumerate(cv):
        # print("-"*20, 'Fold', nfold, "-"*20)
        # StratifiedKFold yields positional indices, so index with iloc, not loc
        x_tr, y_tr = x.iloc[idx_tr], y.iloc[idx_tr]
        x_va, y_va = x.iloc[idx_va], y.iloc[idx_va]

        model = lgb.LGBMClassifier(**params)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr, y_tr), (x_va, y_va)],
                  callbacks=[early_stopping(100, verbose=False)],
                  )
        y_tr_pred = model.predict(x_tr)
        y_va_pred = model.predict(x_va)
        metric_tr = accuracy_score(y_tr, y_tr_pred)
        metric_va = accuracy_score(y_va, y_va_pred)
        # print("[accuracy] tr: {:.2f}, va: {:.2f}".format(metric_tr, metric_va))    
        metrics.append([nfold, metric_tr, metric_va])
    
        _imp = pd.DataFrame({"col":x.columns, "imp":model.feature_importances_, "nfold":nfold})
        imp = pd.concat([imp, _imp], axis=0, ignore_index=True)

    # print("-"*20, "Result", "-"*20)
    metrics = np.array(metrics)
    # print(metrics)

    display(Markdown("### Cross validation result"))
    print("[cv ] tr: {:.2f}+-{:.2f}, va: {:.2f}+-{:.2f}".format(
        metrics[:,1].mean(), metrics[:,1].std(),
        metrics[:,2].mean(), metrics[:,2].std(),
    ))

    imp = imp.groupby("col")["imp"].agg(["mean", "std"])
    imp.columns = ["imp", "imp_std"]
    imp = imp.reset_index(drop=False)

    display(Markdown("### Feature importance"))
    display(imp.sort_values("imp", ascending=False, ignore_index=True))
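
Example: cross_validation can be smoke-tested on synthetic data; make_classification is an extra import beyond the snippet above, and the pandas wrappers are needed because the folds are indexed with iloc:

from sklearn.datasets import make_classification

# Synthetic binary-classification data wrapped in pandas objects
x_demo, y_demo = make_classification(n_samples=1000, n_features=10,
                                     random_state=123)
x_demo = pd.DataFrame(x_demo, columns=[f"f{i}" for i in range(10)])
y_demo = pd.Series(y_demo, name="target")
cross_validation(x_demo, y_demo)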
    
def eval_baseline(x, y):
    # Hold out 20% as pseudo test data (va2), then split the remainder
    # into train (tr1) and validation (va1)
    x_tr, x_va2, y_tr, y_va2 = train_test_split(x,
                                                y,
                                                test_size=0.2,
                                                shuffle=True,
                                                stratify=y,
                                                random_state=123)
    x_tr1, x_va1, y_tr1, y_va1 = train_test_split(x_tr,
                                                  y_tr,
                                                  test_size=0.2,
                                                  shuffle=True,
                                                  stratify=y_tr,
                                                  random_state=789)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.1,
        'num_leaves': 16,
        'n_estimators': 100000,
        'random_state': 123,
        'importance_type': 'gain',
        'verbosity': -1,
    }
    model = lgb.LGBMClassifier(**params)
    model.fit(x_tr1,
              y_tr1,
              eval_set=[(x_tr1, y_tr1), (x_va1, y_va1)],
              callbacks=[early_stopping(100, verbose=False)],
              )
    y_va1_pred = model.predict(x_va1)
    y_va2_pred = model.predict(x_va2)
    
    display(Markdown("### Comparison of model accuracy"))
    print("[Validation data] acc: {:.4f}".format(accuracy_score(y_va1, y_va1_pred)))
    print("[Test data] acc: {:.4f}".format(accuracy_score(y_va2, y_va2_pred)))

    display(Markdown("### Comparison of error distribution"))
    display(Markdown("#### Validation data"))
    plt.figure()
    plt.grid(False)
    plot_confusion_matrix(confusion_matrix(y_va1, y_va1_pred), 
                          classes=model.classes_,
                          title='Confusion matrix, without normalization')
    plt.figure()
    plt.grid(False)
    # Pass raw counts; plot_confusion_matrix does the row-wise normalization
    # itself (originally the counts were normalized twice)
    plot_confusion_matrix(confusion_matrix(y_va1, y_va1_pred),
                          classes=model.classes_,
                          normalize=True,
                          title='Normalized confusion matrix')
    display(Markdown("#### Test data"))
    plt.figure()
    plt.grid(False)
    plot_confusion_matrix(confusion_matrix(y_va2, y_va2_pred),
                          classes=model.classes_,
                          title='Confusion matrix, without normalization')
    plt.figure()
    plt.grid(False)
    plot_confusion_matrix(confusion_matrix(y_va2, y_va2_pred),
                          classes=model.classes_,
                          normalize=True,
                          title='Normalized confusion matrix')
    
    # Histograms of the predicted positive-class probability, split by true label
    y_va1_pred_prob = model.predict_proba(x_va1)[:, 1]
    y_va2_pred_prob = model.predict_proba(x_va2)[:, 1]
    
    fig = plt.figure(figsize=(10,8))
    fig.add_subplot(2,1,1)
    plt.title("Validation data")
    plt.hist(y_va1_pred_prob[np.array(y_va1).reshape(-1)==1], bins=10, alpha=0.5, label="1")
    plt.hist(y_va1_pred_prob[np.array(y_va1).reshape(-1)==0], bins=10, alpha=0.5, label="0")
    plt.grid()
    plt.legend()

    fig.add_subplot(2,1,2)
    plt.title("Test data")
    plt.hist(y_va2_pred_prob[np.array(y_va2).reshape(-1)==1], bins=10, alpha=0.5, label="1")
    plt.hist(y_va2_pred_prob[np.array(y_va2).reshape(-1)==0], bins=10, alpha=0.5, label="0")
    plt.grid()
    plt.legend()
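
Example: eval_baseline takes the same inputs as cross_validation; outside Jupyter, call plt.show() afterwards so the figures actually render:

eval_baseline(x_demo, y_demo)  # reuses the synthetic data from the example above
plt.show()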