def cv_model(clf, train_x, train_y, test_x, clf_name, seed=2023):
    """Run 5-fold CV training for a LightGBM / XGBoost / CatBoost regressor.

    Parameters
    ----------
    clf : module or class
        The model library (``lgb`` / ``xgb``) or the ``CatBoostRegressor`` class.
    train_x : pandas.DataFrame
        Training features (indexed positionally via ``.iloc``).
    train_y : array-like
        Training target; must be indexable by position (``train_y[idx]``).
    test_x : pandas.DataFrame
        Test features.
    clf_name : str
        Which model to train: ``'lgb'``, ``'xgb'`` or ``'cat'``.
    seed : int
        Random seed for both the fold split and the model itself.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof, test_predict)``: out-of-fold predictions on ``train_x`` and
        the fold-averaged predictions on ``test_x``.

    Raises
    ------
    ValueError
        If ``clf_name`` is not one of the supported model names.
    """
    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y = train_x.iloc[train_index], train_y[train_index]
        val_x, val_y = train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'regression',
                'metric': 'mae',
                'min_child_weight': 6,
                'num_leaves': 2 ** 6,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                # Was hard-coded to 2023; honor the `seed` argument instead.
                'seed': seed,
                'nthread': 16,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 2000,
                              valid_sets=[train_matrix, valid_matrix],
                              categorical_feature=[], verbose_eval=200,
                              early_stopping_rounds=100)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        elif clf_name == "xgb":
            xgb_params = {
                'booster': 'gbtree',
                'objective': 'reg:squarederror',
                'eval_metric': 'mae',
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.1,
                'tree_method': 'hist',
                # Was hard-coded to 520; honor the `seed` argument instead.
                'seed': seed,
                'nthread': 16,
            }
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(xgb_params, train_matrix, num_boost_round=2000,
                              evals=watchlist, verbose_eval=200,
                              early_stopping_rounds=100)
            val_pred = model.predict(valid_matrix)
            test_pred = model.predict(test_matrix)
        elif clf_name == "cat":
            # NOTE: the original dict listed 'random_seed' twice (2023, then
            # 11), so only 11 ever took effect; use the single `seed` argument.
            params = {
                'learning_rate': 0.1,
                'depth': 5,
                'bootstrap_type': 'Bernoulli',
                'od_type': 'Iter',
                'od_wait': 100,
                'random_seed': seed,
                'allow_writing_files': False,
            }
            model = clf(iterations=2000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      metric_period=200, use_best_model=True,
                      cat_features=[], verbose=1)
            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)
        else:
            # Previously an unknown name fell through to an
            # UnboundLocalError on val_pred; fail loudly instead.
            raise ValueError("Unknown clf_name: {!r}".format(clf_name))

        oof[valid_index] = val_pred
        # Each fold contributes 1/n_splits of the final test prediction.
        test_predict += test_pred / kf.n_splits

        score = mean_absolute_error(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)

    return oof, test_predict
# Features and target shared by all three models.
X_train = train_df[cols]
y_train = train_df['power']
X_test = test_df[cols]

# LightGBM
lgb_oof, lgb_test = cv_model(lgb, X_train, y_train, X_test, 'lgb')
# XGBoost
xgb_oof, xgb_test = cv_model(xgb, X_train, y_train, X_test, 'xgb')
# CatBoost
cat_oof, cat_test = cv_model(CatBoostRegressor, X_train, y_train, X_test, 'cat')
# Blend the three models with a simple equal-weight average.
blend = [lgb_test, xgb_test, cat_test]
final_test = sum(blend) / len(blend)
# Comments