import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

np.random.seed(0)

# 15 noisy samples of y = sin(x) + x/6
n = 15
x = np.linspace(0, 10, n) + np.random.randn(n) / 5
y = np.sin(x) + x / 6 + np.random.randn(n) / 10

X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)
Fit the data with polynomial regression
This dataset is clearly non-linear, so we first apply a polynomial feature transformation to add higher-order terms. We test several polynomial degrees, predicting at 100 evenly spaced input points:

def polynomial_regression():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.pipeline import make_pipeline

    # generate points used for prediction
    x_prediction = np.linspace(0, 10, 100)
    x_prediction = x_prediction[:, np.newaxis]  # create matrix version

    # reshape training data to a 2-D column vector for sklearn
    X_poly = X_train[:, np.newaxis]

    result = []
    for degree in [1, 3, 6, 9]:
        model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
        model.fit(X_poly, y_train)
        y_predicted = model.predict(x_prediction)
        result.append(y_predicted)

    return np.array(result)
The predictions of these four trained models over the 100 points are plotted below.
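The figure itself is not reproduced here, but a minimal matplotlib sketch along these lines would regenerate it (matplotlib and the y-axis limits are assumptions, not part of the original code):

import matplotlib.pyplot as plt

predictions = polynomial_regression()        # shape (4, 100), one row per degree
x_prediction = np.linspace(0, 10, 100)

plt.figure(figsize=(8, 5))
plt.scatter(X_train, y_train, label='training data')
for degree, y_pred in zip([1, 3, 6, 9], predictions):
    plt.plot(x_prediction, y_pred, label='degree {}'.format(degree))
plt.ylim(-1, 2.5)   # keep the wildly oscillating high-degree fits in view
plt.legend()
plt.show()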
Find the R-squared score for each degree
Now we list the R^2 scores of polynomial regressions of degree 0 through 9, on both the training set and the test set:

def polynomial_r2_score():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.pipeline import make_pipeline

    # reshape training data to a 2-D column vector for sklearn
    X_poly = X_train[:, np.newaxis]

    train_scores = []
    test_scores = []
    for degree in range(10):
        model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
        model.fit(X_poly, y_train)
        # model.score returns the R^2 coefficient of determination
        train_scores.append(model.score(X_poly, y_train))
        test_scores.append(model.score(X_test[:, np.newaxis], y_test))

    return np.array(train_scores), np.array(test_scores)
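A small driver along these lines (an addition, not part of the original post's code) produces the listing that follows:

train_scores, test_scores = polynomial_r2_score()
for degree in range(10):
    print('degree {} training score = {}, test score = {}'.format(
        degree, train_scores[degree], test_scores[degree]))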
The results are as follows:
degree 0 training score = 0.0, test score = -0.4780864173714179
degree 1 training score = 0.4292457781234663, test score = -0.45237104233936676
degree 2 training score = 0.4510998044408247, test score = -0.06856984149915935
degree 3 training score = 0.587199536877985, test score = 0.005331052945771075
degree 4 training score = 0.9194194471769332, test score = 0.7300494281868128
degree 5 training score = 0.9757864143068216, test score = 0.8770830091535732
degree 6 training score = 0.9901823324795085, test score = 0.9214093981312127
degree 7 training score = 0.9935250927840401, test score = 0.9202150411626775
degree 8 training score = 0.9963754538774482, test score = 0.6324794961087974
degree 9 training score = 0.9980370625648909, test score = -0.6452460241564286
So by test-set R^2, the best model is degree = 6.
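This can also be read off programmatically; a quick sketch with np.argmax (again an addition, not original code):

_, test_scores = polynomial_r2_score()
best_degree = int(np.argmax(test_scores))    # 6 for this seed and split
print('best degree by test R^2:', best_degree)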
Lasso regression
Polynomial regression clearly overfits at high degrees (degree >= 8), so let's try Lasso regression to regularize it.

def lasso_regression():
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import Lasso, LinearRegression
    from sklearn.pipeline import make_pipeline

    # reshape to 2-D column vectors for sklearn
    X_train_2D = X_train[:, np.newaxis]
    X_test_2D = X_test[:, np.newaxis]

    # plain degree-12 polynomial regression
    model_lr = make_pipeline(PolynomialFeatures(12), LinearRegression())
    model_lr.fit(X_train_2D, y_train)

    # the same degree-12 features, but with L1 regularization
    model_lasso = make_pipeline(PolynomialFeatures(12),
                                Lasso(alpha=0.01, max_iter=10000))
    model_lasso.fit(X_train_2D, y_train)

    lr_test_r2_score = model_lr.score(X_test_2D, y_test)
    lasso_test_r2_score = model_lasso.score(X_test_2D, y_test)
    return lr_test_r2_score, lasso_test_r2_score
At degree = 12, Lasso regression reaches a test-set R^2 score of 0.84, while ordinary linear regression gets -4.31. Why can the score be negative? R^2 is defined as 1 - SS_res/SS_tot, so whenever a model's predictions are worse than simply predicting the mean of y_test, SS_res exceeds SS_tot and the score drops below zero; the unregularized degree-12 polynomial oscillates wildly between the training points, which is exactly that case.
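For reference, a hand-rolled version of the score (equivalent to what model.score computes for sklearn regressors) makes the sign behaviour explicit:

def manual_r2(model, X, y):
    # R^2 = 1 - SS_res / SS_tot: it goes negative whenever the model's
    # squared error exceeds that of the constant predictor y.mean()
    y_pred = model.predict(X)
    ss_res = np.sum((y - y_pred) ** 2)
    ss_tot = np.sum((y - np.mean(y)) ** 2)
    return 1 - ss_res / ss_tot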