```python
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

np.random.seed(0)
n = 15
x = np.linspace(0, 10, n) + np.random.randn(n) / 5
y = np.sin(x) + x / 6 + np.random.randn(n) / 10

X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)
```
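With the default test_size of 0.25, this split leaves only 11 training points and 4 test points, which matters later when we look at overfitting. A quick sanity check (the print line is my own):

```python
# n = 15 with the default test_size=0.25 gives an 11/4 split
print(X_train.shape, X_test.shape)  # (11,) (4,)
```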
Fitting the data with polynomial regression
The relationship in this dataset is clearly non-linear, so we first apply a polynomial feature transformation to add higher-degree features. We train models of several polynomial degrees and predict over 100 evenly spaced input points:
```python
def polynomial_regression():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.pipeline import make_pipeline

    # 100 evenly spaced points used for prediction
    x_prediction = np.linspace(0, 10, 100)
    x_prediction = x_prediction[:, np.newaxis]

    # reshape the 1-D training data into a column matrix
    X_poly = X_train[:, np.newaxis]

    result = []
    for degree in [1, 3, 6, 9]:
        model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
        model.fit(X_poly, y_train)
        y_predicted = model.predict(x_prediction)
        result.append(y_predicted)
    return np.array(result)
```
The predictions of these four trained models over the 100 points look like this:
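The original figure is not reproduced here; a minimal matplotlib sketch to regenerate it (the styling choices are my own):

```python
import matplotlib.pyplot as plt

x_prediction = np.linspace(0, 10, 100)
plt.scatter(X_train, y_train, label='training data')
for degree, y_pred in zip([1, 3, 6, 9], polynomial_regression()):
    plt.plot(x_prediction, y_pred, label=f'degree {degree}')
plt.ylim(-1, 3)  # keep the wild high-degree fits from dominating the scale
plt.legend()
plt.show()
```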
Finding the R-squared score for each degree
We now list the polynomial-regression R^2 scores for degree 0 through 9, on both the training set and the test set:
```python
def polynomial_r2_score():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.pipeline import make_pipeline

    # reshape the 1-D data into column matrices
    X_poly = X_train[:, np.newaxis]
    X_test_2D = X_test[:, np.newaxis]

    train_scores = []
    test_scores = []
    for degree in range(10):
        model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
        model.fit(X_poly, y_train)
        # score() returns the R^2 on the given data
        train_scores.append(model.score(X_poly, y_train))
        test_scores.append(model.score(X_test_2D, y_test))
    return np.array(train_scores), np.array(test_scores)
```
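A sketch of the loop that produces the listing below (the format string is my own):

```python
train_scores, test_scores = polynomial_r2_score()
for degree, (train_r2, test_r2) in enumerate(zip(train_scores, test_scores)):
    print(f'degree {degree} training score = {train_r2}, test score = {test_r2}')
```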
The results are as follows:
degree 0 training score = 0.0, test score = -0.4780864173714179
degree 1 training score = 0.4292457781234663, test score = -0.45237104233936676
degree 2 training score = 0.4510998044408247, test score = -0.06856984149915935
degree 3 training score = 0.587199536877985, test score = 0.005331052945771075
degree 4 training score = 0.9194194471769332, test score = 0.7300494281868128
degree 5 training score = 0.9757864143068216, test score = 0.8770830091535732
degree 6 training score = 0.9901823324795085, test score = 0.9214093981312127
degree 7 training score = 0.9935250927840401, test score = 0.9202150411626775
degree 8 training score = 0.9963754538774482, test score = 0.6324794961087974
degree 9 training score = 0.9980370625648909, test score = -0.6452460241564286
So the best model is degree = 6: the training score keeps climbing with degree, but the test score peaks there.
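Picking that degree programmatically, using the polynomial_r2_score() above (a minimal sketch):

```python
_, test_scores = polynomial_r2_score()
best_degree = int(np.argmax(test_scores))  # index coincides with degree here
print(best_degree)  # 6
```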
Lasso regression
Polynomial regression clearly overfits at high degrees (degree >= 8), so let's try Lasso regression to regularize it.
```python
def lasso_regression():
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import Lasso, LinearRegression
    from sklearn.pipeline import make_pipeline

    # reshape the 1-D data into column matrices
    X_train_2D = X_train[:, np.newaxis]
    X_test_2D = X_test[:, np.newaxis]

    # plain linear regression on degree-12 polynomial features
    model_lr = make_pipeline(PolynomialFeatures(12), LinearRegression())
    model_lr.fit(X_train_2D, y_train)

    # the same features, but with an L1 penalty on the coefficients
    model_lasso = make_pipeline(PolynomialFeatures(12),
                                Lasso(alpha=0.01, max_iter=10000))
    model_lasso.fit(X_train_2D, y_train)

    lr_test_r2_score = model_lr.score(X_test_2D, y_test)
    lasso_test_r2_score = model_lasso.score(X_test_2D, y_test)
    return (lr_test_r2_score, lasso_test_r2_score)
```
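Calling it (the print format and rounding are mine):

```python
lr_r2, lasso_r2 = lasso_regression()
print(f'linear regression test R^2: {lr_r2:.2f}')    # roughly -4.31
print(f'lasso regression test R^2:  {lasso_r2:.2f}')  # roughly 0.84
```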
At degree = 12, Lasso regression reaches a test-set R^2 score of 0.84, while ordinary linear regression gets -4.31. Why negative? Because R^2 = 1 - SS_res/SS_tot: whenever a model's test predictions are worse than simply predicting the mean of y, SS_res exceeds SS_tot and the score drops below zero. Here the overfitted degree-12 fit does far worse than the mean on the test set.
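To make this concrete, here is a hand-rolled R^2 (the function and test values are my own illustration, not from the post):

```python
def r2_by_hand(y_true, y_pred):
    # R^2 = 1 - SS_res / SS_tot; unbounded below, at most 1
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - ss_res / ss_tot

# A model that always predicts the mean of y scores exactly 0;
# anything worse than that goes negative.
y_true = np.array([1.0, 2.0, 3.0])
print(r2_by_hand(y_true, np.full(3, y_true.mean())))   # 0.0
print(r2_by_hand(y_true, np.array([5.0, 5.0, 5.0])))   # -13.5, negative
```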