train_df と test_df を用意し、下記の関数 run_xgb_kfold に渡して K-fold 学習・予測を行う。
#-------------------------
# Function definition
#-------------------------
import xgboost as xgb


def run_xgb_kfold(train_df, test_df, features, target, folds, params,
                  stratify_col='manufacturer'):
    """Train XGBoost with K-fold CV and return OOF/test predictions.

    Parameters
    ----------
    train_df, test_df : pd.DataFrame
        Training and test frames. Both are mutated in place: a
        'prediction' column is added to each before returning.
    features : list of str
        Column names used as model inputs.
    target : str
        Name of the target column in ``train_df``.
    folds : cross-validator with ``split`` and ``n_splits``
        e.g. StratifiedKFold; splits are stratified on ``stratify_col``.
    params : dict
        Parameters passed straight to ``xgb.train``.
    stratify_col : str, default 'manufacturer'
        Column used as the stratification label (previously hard-coded).

    Returns
    -------
    (train_df, test_df, feature_imps)
        The two frames with 'prediction' columns, plus a DataFrame of
        per-fold gain importances with columns ['Value', 'Feature', 'fold'].
    """
    oof_preds = np.zeros(train_df.shape[0])          # out-of-fold predictions
    sub_preds = np.zeros(test_df.shape[0])           # averaged test predictions
    cv_list = []                                     # per-fold RMSE scores
    feature_imps = pd.DataFrame()

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[features], train_df[stratify_col])):
        print('FOLD:' + str(n_fold))
        train_x, train_y = train_df[features].iloc[train_idx], train_df[target].iloc[train_idx]
        valid_x, valid_y = train_df[features].iloc[valid_idx], train_df[target].iloc[valid_idx]

        dtrain = xgb.DMatrix(train_x, label=train_y)
        dval = xgb.DMatrix(valid_x, label=valid_y)
        num_round = 30000
        # dtrain is the training set; dval is the early-stopping eval set
        watchlist = [(dtrain, 'train'), (dval, 'eval')]
        model = xgb.train(params,
                          dtrain,
                          num_round,
                          early_stopping_rounds=20,
                          evals=watchlist,
                          )

        # NOTE(review): with early stopping, recent xgboost versions predict
        # with the best iteration by default — confirm for the pinned version.
        oof_preds[valid_idx] = model.predict(dval)
        oof_cv = rmse(valid_y, oof_preds[valid_idx])
        cv_list.append(oof_cv)
        print(cv_list)

        sub_preds += model.predict(xgb.DMatrix(test_df[features])) / folds.n_splits

        # BUG FIX: the original zipped the get_score() dict directly with
        # `features`, which iterates the dict's KEYS (feature names) and pairs
        # them with the feature list — the gain values were dropped entirely.
        # Build (gain, name) pairs from the dict instead. Note get_score()
        # omits features with zero importance.
        gain = model.get_score(importance_type='gain')
        feature_imp = pd.DataFrame(sorted(zip(gain.values(), gain.keys())),
                                   columns=['Value', 'Feature'])
        feature_imp['fold'] = n_fold
        feature_imps = pd.concat([feature_imps, feature_imp], axis=0)

    cv = rmse(train_df[target], oof_preds)
    print('Full OOF RMSE %.6f' % cv)

    train_df['prediction'] = oof_preds
    test_df['prediction'] = sub_preds

    return train_df, test_df, feature_imps
#-------------------------
# Apply XGBoost
#-------------------------
# BUG FIX: .copy() — the boolean-mask slice is a view of `df`; assigning
# the log1p target (and the 'prediction' column inside run_xgb_kfold)
# would trigger SettingWithCopyWarning and risk writing back into `df`.
train_df = df[df['flag'] == 'train'].copy()
train_df['target_col'] = np.log1p(train_df['target_col'])  # log-transform target; invert with np.expm1
test_df = df[df['flag'] == 'test'].copy()

target = 'target_col'
# Columns excluded from the model: identifiers, the target itself, raw text,
# the train/test flag, plus the project-wide drop list.
drop_features = ['id', 'target_col', 'description', 'flag'] + drop_feature_list
features = [f for f in train_df.columns if f not in drop_features]
print('features:', len(features), features)

n_splits = 5
seed = 817
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

params = {
    'objective': 'reg:squarederror',
    # FIX: 'silent' was removed from XGBoost (ignored with a warning);
    # 'verbosity': 0 is the current way to suppress training logs.
    'verbosity': 0,
    'random_state': 1234,
    # evaluation metric used for early stopping (RMSE)
    'eval_metric': 'rmse',
}

train_xgb, test_xgb, feature_imps = run_xgb_kfold(
    train_df, test_df, features, target, folds, params)