🚠

Regression 코드 정리

Tags
Python
MachineLearning
ID matched
Created
Jan 12, 2023 12:42 AM
Last Updated
July 15, 2023
 
 

라이브러리 불러오기

import seaborn as sns import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.datasets import load_diabetes from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import train_test_split from datetime import datetime from sklearn.linear_model import LinearRegression from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor from lightgbm import LGBMRegressor from xgboost import XGBRegressor from sklearn.neighbors import KNeighborsRegressor from sklearn.metrics import mean_squared_error, r2_score
 

유틸 함수 추가하기

def get_validation(y_test, y_pred):
    """Summarise how well a set of predictions matches the true targets.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted values, scored against ``y_test``.

    Returns:
        A dict with three entries: ``"MSE"`` (mean squared error),
        ``"SQRT"`` (square root of that MSE, i.e. the RMSE) and
        ``"R2 Score"`` (coefficient of determination).
    """
    squared_error = mean_squared_error(y_test, y_pred)
    root_error = np.sqrt(squared_error)
    determination = r2_score(y_test, y_pred)
    return {"MSE": squared_error, "SQRT": root_error, "R2 Score": determination}
from io import StringIO
from google.colab import output

# In-memory log of everything passed to autosave_print, so the full history
# can be re-printed after the cell output has been cleared.
# NOTE(review): this name shadows the stdlib ``io`` module; it is kept so any
# other cell that refers to ``io`` keeps working.
io = StringIO()


def autosave_print(*text):
    """Append a message to the log buffer and re-render the whole log.

    The cell output is cleared first, then the complete buffer (including the
    new message) is printed, so only the accumulated log stays visible.

    Args:
        *text: Values forwarded to ``print``; they are written to the buffer
            separated by spaces and terminated by a newline.
    """
    # One clear is enough; the original cleared the output twice in a row.
    clear_output()
    print(*text, file=io)
    # The buffer already ends with a newline, so do not add another one.
    print(io.getvalue(), end="")


def clear_print():
    """Empty the log buffer so the next autosave_print starts a fresh log."""
    io.seek(0)
    io.truncate(0)


def clear_output():
    """Clear the current cell output via ``google.colab.output.clear()``."""
    output.clear()
 

데이터 불러오기

# Load the scikit-learn diabetes dataset and collect features plus target
# into a single DataFrame, with the target as the last column.
data = load_diabetes()
df_diabetes = pd.DataFrame(data["data"], columns=data["feature_names"])
df_diabetes["target"] = data["target"]

# Print column dtypes and non-null counts.
df_diabetes.info()
notion image
# Preview the first five rows of the assembled frame.
df_diabetes.head(n=5)
notion image
 

성별 정보 처리하기

# Count how often each distinct value occurs in the `sex` column.
df_diabetes["sex"].value_counts()
notion image
# Replace the raw `sex` values with integer class indices.
encoder = LabelEncoder()
encoder.fit(df_diabetes["sex"])
df_diabetes["sex"] = encoder.transform(df_diabetes["sex"])

# Show the distribution again, now over the encoded values.
df_diabetes["sex"].value_counts()
notion image
 

데이터 시각화하기

# Draw one subplot per column against `target`, four plots per row.
cols_size = len(df_diabetes.columns)
plots_per_row = 4
# Ceiling division: the previous `cols_size // 4 + 1` reserved an empty extra
# row whenever the column count was an exact multiple of four.
row_count = -(-cols_size // plots_per_row)

plt.figure(figsize=(15, 12))
for i, column in enumerate(df_diabetes.columns):
    ax = plt.subplot(row_count, plots_per_row, i + 1)
    label_size = len(df_diabetes[column].unique())

    if label_size < 10:
        # Few distinct values: treat the column as categorical.
        sns.violinplot(x=column, y='target', hue=column, data=df_diabetes, ax=ax)
        # Whether a legend is drawn depends on the seaborn version, so only
        # remove it when one exists instead of failing on None.
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()
    else:
        # Many distinct values: scatter with a fitted regression line.
        sns.regplot(x=column, y='target', data=df_diabetes, ax=ax)

    # Same decoration for both plot kinds: column name as title, no axis labels.
    ax.set_title(column)
    ax.set_xlabel("")
    ax.set_ylabel("")
notion image
 

상관계수 출력하기

# Correlation of every column with `target`, strongest (by absolute value) first.
correlation_matrix = df_diabetes.corr()
target_correlation = correlation_matrix[["target"]]
target_correlation.sort_values(by="target", key=abs, ascending=False)
notion image
 

Train, Test 데이터 분리하기

# Features are every column except the last; the last column is the target.
X = df_diabetes.iloc[:, :-1]
y = df_diabetes.iloc[:, -1]
print(X.shape, y.shape)

# Hold out 30% of the rows for testing and train on the remaining 70%.
# The previous `train_size=0.3` did the opposite (30% train / 70% test),
# which starves the models of training data.
# NOTE(review): scores recorded from earlier runs used the old split and
# will differ after this change.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2045
)
print("Train Size:", X_train.shape, y_train.shape)
print("Test Size:", X_test.shape, y_test.shape)
notion image
 

모델별 학습하기

%%time clear_print() grap_time = datetime.now() model_lr = LinearRegression(n_jobs=-1) model_lr.fit(X_train, y_train) autosave_print("LinearRegression", datetime.now() - grap_time) grap_time = datetime.now() model_dt = DecisionTreeRegressor(random_state=2045) model_dt.fit(X_train, y_train) autosave_print("DecisionTreeRegressor", datetime.now() - grap_time) grap_time = datetime.now() model_random = RandomForestRegressor(n_jobs=-1, verbose=3, random_state=2045) model_random.fit(X_train, y_train) autosave_print("RandomForestRegressor", datetime.now() - grap_time) grap_time = datetime.now() model_ada = AdaBoostRegressor(random_state=2045) model_ada.fit(X_train, y_train) autosave_print("AdaBoostRegressor", datetime.now() - grap_time) grap_time = datetime.now() model_gbm = GradientBoostingRegressor(verbose=3, random_state=2045) model_gbm.fit(X_train, y_train) autosave_print("GradientBoostingRegressor", datetime.now() - grap_time) grap_time = datetime.now() model_lgbm = LGBMRegressor(n_jobs=-1, random_state=2045) model_lgbm.fit(X_train, y_train) autosave_print("LGBMRegressor", datetime.now() - grap_time) grap_time = datetime.now() model_xgbm = XGBRegressor(n_jobs=-1, random_state=2045) model_xgbm.fit(X_train, y_train) autosave_print("XGBRegressor", datetime.now() - grap_time) grap_time = datetime.now() model_kn = KNeighborsRegressor(n_jobs=-1) model_kn.fit(X_train, y_train) autosave_print("KNeighborsRegressor", datetime.now() - grap_time)
notion image
 

모델별 학습 결과 평가하기

# Predict on the held-out split with every fitted model. The y_pred_* names
# are kept at module level because the plotting cell reads them.
y_pred_lr = model_lr.predict(X_test)
y_pred_dt = model_dt.predict(X_test)
y_pred_random = model_random.predict(X_test)
y_pred_ada = model_ada.predict(X_test)
y_pred_gbm = model_gbm.predict(X_test)
y_pred_lgbm = model_lgbm.predict(X_test)
y_pred_xgbm = model_xgbm.predict(X_test)
y_pred_kn = model_kn.predict(X_test)

# Row label -> predictions. The first entry used to be "logistic_regression",
# but model_lr is a LinearRegression; the label now matches the one used in
# the KDE plot.
predictions = {
    "linear_regression": y_pred_lr,
    "decision_tree": y_pred_dt,
    "random_forest": y_pred_random,
    "ada_boosting": y_pred_ada,
    "gradient_boosting": y_pred_gbm,
    "light_gbm": y_pred_lgbm,
    "extra_gbm": y_pred_xgbm,
    "k_neighbor": y_pred_kn,
}
validations = {
    name: get_validation(y_test, predicted)
    for name, predicted in predictions.items()
}

# One row per model, one column per metric; shade the R2 column.
df = pd.DataFrame(validations).transpose()
df.style.background_gradient(subset=['R2 Score'], cmap='BuGn')
notion image
 

그래프로 시각화하기

# Overlay the density of the true targets and of each model's predictions.
plt.figure(figsize=(15, 8))

density_curves = [
    (y_train, 'y_train'),
    (y_test, 'y_test'),
    (y_pred_lr, 'linear_regression'),
    (y_pred_dt, 'decision_tree'),
    (y_pred_random, 'random_forest'),
    (y_pred_ada, 'ada_boosting'),
    (y_pred_gbm, 'gradient_boosting'),
    (y_pred_lgbm, 'light_gbm'),
    (y_pred_xgbm, 'extra_gbm'),
    (y_pred_kn, 'k_neighbor'),
]
for curve_values, curve_label in density_curves:
    sns.kdeplot(curve_values, label=curve_label)

plt.legend()
plt.show()
notion image