♣️

Classification Code Summary

Tags: Python, MachineLearning
Created: Jan 12, 2023 12:42 AM
Last Updated: July 15, 2023
 
 

Importing Libraries

import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from datetime import datetime

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
 

Adding Utility Functions

def get_validation(y_test, y_pred):
    # confusion_matrix rows are true labels and columns are predicted labels:
    # matrix[0][0] = true 0 / pred 0, matrix[0][1] = true 0 / pred 1,
    # matrix[1][0] = true 1 / pred 0, matrix[1][1] = true 1 / pred 1.
    # Treating class 0 as the positive class (to match precision_0 etc.),
    # matrix[0][1] is a false negative and matrix[1][0] is a false positive.
    matrix = confusion_matrix(y_test, y_pred).tolist()
    return {
        "accuracy": round(accuracy_score(y_test, y_pred), 2),
        "TP": str(matrix[0][0]),
        "FN": str(matrix[0][1]),
        "precision_0": round(precision_score(y_test, y_pred, pos_label=0), 2),
        "recall_0": round(recall_score(y_test, y_pred, pos_label=0), 2),
        "f1score_0": round(f1_score(y_test, y_pred, pos_label=0), 2),
        "TN": str(matrix[1][1]),
        "FP": str(matrix[1][0]),
        "precision_1": round(precision_score(y_test, y_pred, pos_label=1), 2),
        "recall_1": round(recall_score(y_test, y_pred, pos_label=1), 2),
        "f1score_1": round(f1_score(y_test, y_pred, pos_label=1), 2),
    }
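For reference, a quick sanity check of get_validation on hand-made labels; the toy arrays below are made up purely for illustration:

y_true_demo = [0, 0, 0, 1, 1, 1, 1, 1]   # made-up ground truth
y_pred_demo = [0, 0, 1, 1, 1, 1, 1, 0]   # made-up predictions
get_validation(y_true_demo, y_pred_demo)
# {'accuracy': 0.75, 'TP': '2', 'FN': '1', 'precision_0': 0.67, 'recall_0': 0.67,
#  'f1score_0': 0.67, 'TN': '4', 'FP': '1', 'precision_1': 0.8, 'recall_1': 0.8, 'f1score_1': 0.8}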
from io import StringIO
from google.colab import output

io = StringIO()

def autosave_print(*text):
    # Append to the running log and redraw it after clearing the cell output,
    # so messages from earlier calls stay visible.
    output.clear()
    print(*text, file=io)
    print(io.getvalue(), end="")

def clear_print():
    # Empty the log buffer.
    io.seek(0)
    io.truncate(0)

def clear_output():
    # Clear the Colab cell output.
    output.clear()
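A minimal usage sketch for these print helpers, assuming a Colab runtime (they rely on google.colab.output): each autosave_print call clears the cell output and re-prints the whole accumulated log.

clear_print()                     # reset the log buffer
autosave_print("step 1 done")     # output: step 1 done
autosave_print("step 2 done")     # output now shows both "step 1 done" and "step 2 done"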
 

Loading the Data

data = load_breast_cancer()
df_cancer = pd.DataFrame(data=data.data, columns=data.feature_names)
df_cancer['target'] = data.target
df_cancer.info()
df_cancer.head()
df_cancer.target.value_counts()
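For reference, the integer target codes come straight from the dataset; checking target_names makes the later per-class metrics easier to read (in load_breast_cancer, 0 is malignant and 1 is benign):

# Map the integer targets back to their class names.
print(data.target_names)  # ['malignant' 'benign'] -> 0 = malignant, 1 = benign
print(df_cancer['target'].map(dict(enumerate(data.target_names))).value_counts())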
 

Visualizing the Data

cols_size = len(df_cancer.columns)
plt.figure(figsize=(15, 30))
for i, column in enumerate(df_cancer.columns):
    ax = plt.subplot(cols_size // 4 + 1, 4, i + 1)
    label_size = len(df_cancer[column].unique())
    if label_size < 10:
        # Few unique values: treat the feature as categorical and draw a count plot split by target.
        sns.countplot(x=column, hue='target', data=df_cancer, ax=ax)
        plt.title(column)
        plt.xlabel("")
        plt.ylabel("")
        ax.get_legend().remove()
    else:
        # Continuous feature: compare its distribution across the two targets with a violin plot.
        sns.violinplot(x='target', y=column, data=df_cancer, ax=ax)
        plt.title(column)
        plt.xlabel("")
        plt.ylabel("")
 

Printing the Correlation Coefficients

df_cancer.corr()[['target']].sort_values(by='target', key=abs, ascending=False)
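Optionally, the same correlations can be drawn as a heatmap instead of a table; a minimal sketch using the seaborn import from above:

# Heatmap of each feature's correlation with the target, sorted by absolute value.
corr_with_target = df_cancer.corr()[['target']].sort_values(by='target', key=abs, ascending=False)
plt.figure(figsize=(4, 10))
sns.heatmap(corr_with_target, annot=True, fmt=".2f", cmap='coolwarm', vmin=-1, vmax=1)
plt.show()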
 

Splitting Train and Test Data

X = df_cancer.iloc[:, :-1]
y = df_cancer.iloc[:, -1]
print(X.shape, y.shape)

# Note: train_size=0.2 keeps only 20% of the rows for training;
# for a conventional 80/20 split, use test_size=0.2 (or train_size=0.8) instead.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, train_size=0.2, random_state=2045
)
print("Train Size:", X_train.shape, y_train.shape)
print("Test Size:", X_test.shape, y_test.shape)
print("# Train") print(y_train.value_counts() / y_train.shape[0]) print("# Test") print(y_test.value_counts() / y_test.shape[0])
 

Training Each Model

%%time
clear_print()

grap_time = datetime.now()
model_lr = LogisticRegression(n_jobs=-1, verbose=3, random_state=2045)
model_lr.fit(X_train, y_train)
autosave_print("LogisticRegression", datetime.now() - grap_time)

grap_time = datetime.now()
model_dt = DecisionTreeClassifier(random_state=2045)
model_dt.fit(X_train, y_train)
autosave_print("DecisionTreeClassifier", datetime.now() - grap_time)

grap_time = datetime.now()
model_random = RandomForestClassifier(n_jobs=-1, verbose=3, random_state=2045)
model_random.fit(X_train, y_train)
autosave_print("RandomForestClassifier", datetime.now() - grap_time)

grap_time = datetime.now()
model_ada = AdaBoostClassifier(random_state=2045)
model_ada.fit(X_train, y_train)
autosave_print("AdaBoostClassifier", datetime.now() - grap_time)

grap_time = datetime.now()
model_gbm = GradientBoostingClassifier(verbose=3, random_state=2045)
model_gbm.fit(X_train, y_train)
autosave_print("GradientBoostingClassifier", datetime.now() - grap_time)

grap_time = datetime.now()
model_lgbm = LGBMClassifier(n_jobs=-1, random_state=2045)
model_lgbm.fit(X_train, y_train)
autosave_print("LGBMClassifier", datetime.now() - grap_time)

grap_time = datetime.now()
model_xgbm = XGBClassifier(n_jobs=-1, random_state=2045)
model_xgbm.fit(X_train, y_train)
autosave_print("XGBClassifier", datetime.now() - grap_time)

grap_time = datetime.now()
model_kn = KNeighborsClassifier(n_jobs=-1)
model_kn.fit(X_train, y_train)
autosave_print("KNeighborsClassifier", datetime.now() - grap_time)
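One note on the logistic regression fit: on the unscaled breast-cancer features, the default lbfgs solver often stops at max_iter=100 with a ConvergenceWarning. If that warning shows up, a common workaround is to scale the inputs or raise max_iter; the sketch below is an alternative setup, not what was run above:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Alternative: standardize features before logistic regression to help convergence.
model_lr_scaled = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000, random_state=2045),
)
model_lr_scaled.fit(X_train, y_train)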
 

Evaluating Each Model's Results

validations = {}

y_pred_lr = model_lr.predict(X_test)
validations["logistic_regression"] = get_validation(y_test, y_pred_lr)

y_pred_dt = model_dt.predict(X_test)
validations["decision_tree"] = get_validation(y_test, y_pred_dt)

y_pred_random = model_random.predict(X_test)
validations["random_forest"] = get_validation(y_test, y_pred_random)

y_pred_ada = model_ada.predict(X_test)
validations["ada_boosting"] = get_validation(y_test, y_pred_ada)

y_pred_gbm = model_gbm.predict(X_test)
validations["gradient_boosting"] = get_validation(y_test, y_pred_gbm)

y_pred_lgbm = model_lgbm.predict(X_test)
validations["light_gbm"] = get_validation(y_test, y_pred_lgbm)

y_pred_xgbm = model_xgbm.predict(X_test)
validations["extra_gbm"] = get_validation(y_test, y_pred_xgbm)

y_pred_kn = model_kn.predict(X_test)
validations["k_neighbor"] = get_validation(y_test, y_pred_kn)

df = pd.DataFrame(validations).transpose()
df.style.background_gradient(subset=['TP', 'TN'], cmap='BuGn')
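To rank the models at a glance, the same summary DataFrame can be sorted and highlighted; a small sketch (column names follow the get_validation keys above, and the counts stored as strings are converted back to numbers first):

# Convert every column to numeric (TP/FN/TN/FP were stored as strings),
# sort by accuracy, and highlight the best value in each score column.
df_num = df.apply(pd.to_numeric)
df_num.sort_values(by='accuracy', ascending=False).style.highlight_max(
    subset=['accuracy', 'f1score_0', 'f1score_1'], color='lightgreen'
)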