목차: 라이브러리 불러오기 · 유틸 함수 추가하기 · 데이터 불러오기 · 성별 정보 처리하기 · 데이터 시각화하기 · 상관계수 출력하기 · Train, Test 데이터 분리하기 · 모델별 학습하기 · 모델별 학습 결과 평가하기 · 그래프로 시각화하기
라이브러리 불러오기
import seaborn as sns import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.datasets import load_diabetes from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import train_test_split from datetime import datetime from sklearn.linear_model import LinearRegression from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor from lightgbm import LGBMRegressor from xgboost import XGBRegressor from sklearn.neighbors import KNeighborsRegressor from sklearn.metrics import mean_squared_error, r2_score
유틸 함수 추가하기
def get_validation(y_test, y_pred):
    """Return regression metrics (MSE, RMSE, R²) for true vs. predicted values.

    Returns a dict keyed "MSE", "SQRT" (root of MSE) and "R2 Score" so the
    results can be collected into a DataFrame for side-by-side comparison.
    """
    squared_error = mean_squared_error(y_test, y_pred)
    metrics = {}
    metrics["MSE"] = squared_error
    metrics["SQRT"] = np.sqrt(squared_error)
    metrics["R2 Score"] = r2_score(y_test, y_pred)
    return metrics
from io import StringIO
from google.colab import output  # Colab-only: controls the cell's output area

# Shared in-memory buffer holding everything printed via autosave_print.
# NOTE(review): this rebinds the name `io`, shadowing the stdlib `io` module.
io = StringIO()

def autosave_print(*text):
    """Append *text* to the shared buffer, then re-render the whole buffer.

    Clearing the cell before reprinting makes each call show the full
    accumulated log instead of duplicating earlier lines.
    Fix: the original called both output.clear() and clear_output(), which
    itself calls output.clear() — one clear is enough.
    """
    clear_output()
    print(*text, file=io)
    print(io.getvalue(), end="")

def clear_print():
    """Empty the shared buffer without touching the cell's output area."""
    io.seek(0)
    io.truncate(0)

def clear_output():
    """Clear the Colab cell's output area."""
    output.clear()
데이터 불러오기
# Load the scikit-learn diabetes dataset (10 standardized features) into a
# DataFrame and append the regression target as its last column.
data = load_diabetes()
df_diabetes = pd.DataFrame(data=data.data, columns=data.feature_names).assign(target=data.target)
df_diabetes.info()
df_diabetes.head()
성별 정보 처리하기
# Inspect the two standardized sex values, then label-encode them to 0/1.
df_diabetes["sex"].value_counts()
label_encoder = LabelEncoder()
df_diabetes["sex"] = label_encoder.fit_transform(df_diabetes["sex"])
df_diabetes["sex"].value_counts()
데이터 시각화하기
# One subplot per column against the target: violin plot for low-cardinality
# columns (fewer than 10 distinct values, e.g. sex), regression scatter for
# continuous ones.
cols_size = len(df_diabetes.columns)
plt.figure(figsize=(15, 12))
for idx, col in enumerate(df_diabetes.columns, start=1):
    ax = plt.subplot(cols_size // 4 + 1, 4, idx)
    distinct_values = len(df_diabetes[col].unique())
    if distinct_values < 10:
        sns.violinplot(x=col, y='target', hue=col, data=df_diabetes, ax=ax)
        # hue=col adds a redundant legend on every subplot — drop it
        ax.get_legend().remove()
    else:
        sns.regplot(x=col, y='target', data=df_diabetes, ax=ax)
    plt.title(col)
    plt.xlabel("")
    plt.ylabel("")
상관계수 출력하기
# Correlation of every column with the target, ordered by absolute strength.
target_correlation = df_diabetes.corr()[['target']]
target_correlation.sort_values(by='target', key=abs, ascending=False)
Train, Test 데이터 분리하기
# Split features (all columns but the last) and target (last column), then
# split into train/test partitions with a fixed seed for reproducibility.
# NOTE(review): train_size=0.3 keeps only 30% of rows for TRAINING — confirm
# this isn't meant to be test_size=0.3.
X = df_diabetes.iloc[:, :-1]
y = df_diabetes.iloc[:, -1]
print(X.shape, y.shape)
split = train_test_split(X, y, train_size=0.3, random_state=2045)
X_train, X_test, y_train, y_test = split
print("Train Size:", X_train.shape, y_train.shape)
print("Test Size:", X_test.shape, y_test.shape)
모델별 학습하기
%%time clear_print() grap_time = datetime.now() model_lr = LinearRegression(n_jobs=-1) model_lr.fit(X_train, y_train) autosave_print("LinearRegression", datetime.now() - grap_time) grap_time = datetime.now() model_dt = DecisionTreeRegressor(random_state=2045) model_dt.fit(X_train, y_train) autosave_print("DecisionTreeRegressor", datetime.now() - grap_time) grap_time = datetime.now() model_random = RandomForestRegressor(n_jobs=-1, verbose=3, random_state=2045) model_random.fit(X_train, y_train) autosave_print("RandomForestRegressor", datetime.now() - grap_time) grap_time = datetime.now() model_ada = AdaBoostRegressor(random_state=2045) model_ada.fit(X_train, y_train) autosave_print("AdaBoostRegressor", datetime.now() - grap_time) grap_time = datetime.now() model_gbm = GradientBoostingRegressor(verbose=3, random_state=2045) model_gbm.fit(X_train, y_train) autosave_print("GradientBoostingRegressor", datetime.now() - grap_time) grap_time = datetime.now() model_lgbm = LGBMRegressor(n_jobs=-1, random_state=2045) model_lgbm.fit(X_train, y_train) autosave_print("LGBMRegressor", datetime.now() - grap_time) grap_time = datetime.now() model_xgbm = XGBRegressor(n_jobs=-1, random_state=2045) model_xgbm.fit(X_train, y_train) autosave_print("XGBRegressor", datetime.now() - grap_time) grap_time = datetime.now() model_kn = KNeighborsRegressor(n_jobs=-1) model_kn.fit(X_train, y_train) autosave_print("KNeighborsRegressor", datetime.now() - grap_time)
모델별 학습 결과 평가하기
# Evaluate every fitted model on the held-out test split and collect the
# metrics into one table. Each y_pred_* is kept as a module-level name
# because the KDE-plot cell reads them.
validations = {}

y_pred_lr = model_lr.predict(X_test)
# Fixed label: model_lr is LinearRegression — the original stored it under
# "logistic_regression", which also disagreed with the plot cell's label.
validations["linear_regression"] = get_validation(y_test, y_pred_lr)

y_pred_dt = model_dt.predict(X_test)
validations["decision_tree"] = get_validation(y_test, y_pred_dt)

y_pred_random = model_random.predict(X_test)
validations["random_forest"] = get_validation(y_test, y_pred_random)

y_pred_ada = model_ada.predict(X_test)
validations["ada_boosting"] = get_validation(y_test, y_pred_ada)

y_pred_gbm = model_gbm.predict(X_test)
validations["gradient_boosting"] = get_validation(y_test, y_pred_gbm)

y_pred_lgbm = model_lgbm.predict(X_test)
validations["light_gbm"] = get_validation(y_test, y_pred_lgbm)

y_pred_xgbm = model_xgbm.predict(X_test)
validations["extra_gbm"] = get_validation(y_test, y_pred_xgbm)

y_pred_kn = model_kn.predict(X_test)
validations["k_neighbor"] = get_validation(y_test, y_pred_kn)

df = pd.DataFrame(validations).transpose()
# Shade the R² column so the best-scoring model stands out.
df.style.background_gradient(subset=['R2 Score'], cmap='BuGn')
그래프로 시각화하기
# Overlay the KDE of each model's predictions with the train/test target
# distributions to compare their shapes.
plt.figure(figsize=(15, 8))
distributions = [
    (y_train, 'y_train'),
    (y_test, 'y_test'),
    (y_pred_lr, 'linear_regression'),
    (y_pred_dt, 'decision_tree'),
    (y_pred_random, 'random_forest'),
    (y_pred_ada, 'ada_boosting'),
    (y_pred_gbm, 'gradient_boosting'),
    (y_pred_lgbm, 'light_gbm'),
    (y_pred_xgbm, 'extra_gbm'),
    (y_pred_kn, 'k_neighbor'),
]
for values, name in distributions:
    sns.kdeplot(values, label=name)
plt.legend()
plt.show()