Contents
- Loading Libraries
- Loading Data
- Data Preprocessing
- Data Visualization 1 (pairplot)
- Data Visualization 2 (heatmap)
- Printing Correlation Coefficients
- Simple Regression
- Polynomial Regression
- Multiple Regression
The code below was tested in a Google Colab environment.
Loading Libraries
- Code
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
Loading Data
- Code
DF = sns.load_dataset('mpg')
DF.info()
Data Preprocessing
- Code
# Handle null data: drop rows where horsepower is missing
DF.drop(index=DF[DF['horsepower'].isnull()].index, inplace=True)

# Encode the categorical 'origin' column as integer labels
encoder = LabelEncoder()
DF['origin_state'] = encoder.fit_transform(DF['origin'])
print('origin:', DF['origin_state'].unique())

# Drop the remaining object columns
DF.drop(columns=['name', 'origin'], inplace=True)
print("**" * 20)
DF.info()
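LabelEncoder maps the three origin values to the integers 0, 1, and 2, which quietly imposes an order on a purely nominal category. A hedged alternative is to one-hot encode the same column instead; the sketch below is only an illustration, assuming it runs in place of the LabelEncoder step while the 'origin' column still exists (DF_onehot is a name introduced here).
import pandas as pd

# Sketch only: run instead of the LabelEncoder step, before 'origin' is dropped.
# One-hot encoding creates one 0/1 column per region and avoids an artificial order.
DF_onehot = pd.get_dummies(DF, columns=['origin'], prefix='origin')
print(DF_onehot.filter(like='origin_').head())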
Data Visualization 1 (pairplot)
- All columns
sns.pairplot(DF)
plt.show()
- Selected columns
sns.pairplot(DF, vars=['mpg', 'displacement', 'weight'])
plt.show()
- Specific column
sns.pairplot(DF, y_vars=['mpg'])
plt.show()
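As an optional sketch, the same pair plot can be colored by the encoded origin_state column via the hue parameter, which hints at whether the relationships differ by region; this assumes the preprocessed DF from above.
# Sketch: color points by the encoded origin_state column
sns.pairplot(DF, vars=['mpg', 'displacement', 'weight'], hue='origin_state')
plt.show()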
Data Visualization 2 (heatmap)
- All columns
sns.heatmap(DF.corr())
plt.show()
- Specific column
sns.heatmap(DF.corr()[['mpg']])
plt.show()
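A small optional variation, sketched below: annot, fmt, and cmap (standard sns.heatmap parameters) print the coefficient values inside the cells, which makes the heatmap easier to read at a glance.
# Sketch: annotate each cell with its correlation coefficient
sns.heatmap(DF.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.show()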
Printing Correlation Coefficients
- All columns
DF.corr()
- Specific column
DF.corr()[['mpg']]
- Specific column (sorted)
DF.corr()[['mpg']].sort_values(by='mpg', key=abs, ascending=False)
Simple Regression
- Example
X = DF[['weight']]
y = DF['mpg']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2045)

# Fit a linear model on a single feature (weight)
RA = LinearRegression()
RA.fit(X_train, y_train)
print('weight:', RA.coef_)
print('bias:', RA.intercept_)
print('R Score:', RA.score(X_test, y_test))

# Evaluate on the test set
y_hat_test = RA.predict(X_test)
mse = mean_squared_error(y_test, y_hat_test)
print('MSE: %.2f / RMSE: %.2f' % (mse, np.sqrt(mse)))

# Compare the distributions of actual and predicted values
plt.figure(figsize=(9, 6))
ax1 = sns.kdeplot(y_test, label='y_test')
ax2 = sns.kdeplot(y_hat_test, label='y_hat_simple', ax=ax1)
ax3 = sns.kdeplot(y_train, label='y_train', ax=ax1)
plt.legend()
plt.show()
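Since the KDE plot only compares distributions, a scatter plot of weight against mpg with the fitted line overlaid can make the linear trend itself visible. The sketch below assumes RA, X_test, and y_test from the example above; x_line is a name introduced here for the plotting grid.
import pandas as pd

# Sketch: scatter the test points and overlay the fitted regression line
x_line = pd.DataFrame({'weight': np.linspace(DF['weight'].min(), DF['weight'].max(), 100)})
plt.figure(figsize=(9, 6))
plt.scatter(X_test['weight'], y_test, alpha=0.5, label='test data')
plt.plot(x_line['weight'], RA.predict(x_line), color='red', label='fitted line')
plt.xlabel('weight')
plt.ylabel('mpg')
plt.legend()
plt.show()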
Polynomial Regression
- Example
X = DF[['weight']]
y = DF['mpg']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=6691)

# Expand the single feature into degree-2 polynomial terms
# (fit the transformer on the training data, then reuse it on the test data)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

RA = LinearRegression()
RA.fit(X_train_poly, y_train)
print('weight:', RA.coef_)
print('bias:', RA.intercept_)
print('R Score:', RA.score(X_test_poly, y_test))

# Evaluate on the test set
y_hat_test = RA.predict(X_test_poly)
mse = mean_squared_error(y_test, y_hat_test)
print('MSE: %.2f / RMSE: %.2f' % (mse, np.sqrt(mse)))

# Compare the distributions of actual and predicted values
plt.figure(figsize=(9, 6))
ax1 = sns.kdeplot(y_test, label='y_test')
ax2 = sns.kdeplot(y_hat_test, label='y_hat_poly', ax=ax1)
ax3 = sns.kdeplot(y_train, label='y_train', ax=ax1)
plt.legend()
plt.show()
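The degree is a hyperparameter, and 2 is not necessarily the best choice. As a rough sketch (reusing X_train, X_test, y_train, and y_test from above; pf and model are names introduced here), the test R score can be compared across a few degrees before settling on one.
# Sketch: compare test R scores for polynomial degrees 1-3
for degree in [1, 2, 3]:
    pf = PolynomialFeatures(degree=degree, include_bias=False)
    model = LinearRegression().fit(pf.fit_transform(X_train), y_train)
    print('degree %d R Score: %.3f' % (degree, model.score(pf.transform(X_test), y_test)))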
Multiple Regression
- Example
X = DF[['weight', 'displacement', 'horsepower']]
y = DF['mpg']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2045)

# Fit a linear model on three features
RA = LinearRegression()
RA.fit(X_train, y_train)
print('weight:', RA.coef_)
print('bias:', RA.intercept_)
print('R Score:', RA.score(X_test, y_test))

# Predict and evaluate on the test set
y_hat_test = RA.predict(X_test)
mse = mean_squared_error(y_test, y_hat_test)
print('MSE: %.2f / RMSE: %.2f' % (mse, np.sqrt(mse)))

# Compare the distributions of actual and predicted values
plt.figure(figsize=(9, 6))
ax1 = sns.kdeplot(y_test, label='y_test')
ax2 = sns.kdeplot(y_hat_test, label='y_hat_multiple', ax=ax1)
ax3 = sns.kdeplot(y_train, label='y_train', ax=ax1)
plt.legend()
plt.show()
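A single 70/30 split can make the R score somewhat dependent on the chosen random_state. As an optional sketch (cross_val_score is a standard scikit-learn utility; scores is a name introduced here), k-fold cross-validation on the same X and y gives a more stable estimate.
from sklearn.model_selection import cross_val_score

# Sketch: 5-fold cross-validated R score for the three-feature model
scores = cross_val_score(LinearRegression(), X, y, cv=5, scoring='r2')
print('CV R Score: %.3f (+/- %.3f)' % (scores.mean(), scores.std()))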