数据预处理&特征工程——Feature_Engineering.ipynb & Feature_Engineering2.ipynb
mlxtend: 含有聚合算法Stacking
项目整体运行时间预估为60min左右,在Ubuntu系统,8G内存,运行结果见所提交的jupyter notebook文件
import numpy as np import pandas as pd import pickle %matplotlib inline from IPython.display import display
# bids = pd.read_csv("bids.csv")
# Load the cached bids table from its pickle.  The file is opened in binary
# mode (pickle data is bytes, not text) and the handle is closed as soon as
# loading finishes.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# it on a bids.pkl that was produced locally.
with open("bids.pkl", "rb") as bids_file:
    bids = pickle.load(bids_file)
print bids.shape display(bids.head())
(7656329, 9)
bid_id | bidder_id | auction | merchandise | device | time | country | ip | url | |
0 | 0 | 8dac2b259fd1c6d1120e519fb1ac14fbqvax8 | ewmzr | jewelry | phone0 | 9759243157894736 | us | | vasstdc27m7nks3 |
1 | 1 | 668d393e858e8126275433046bbd35c6tywop | aeqok | furniture | phone1 | 9759243157894736 | in | | jmqlhflrzwuay9c |
2 | 2 | aa5f360084278b35d746fa6af3a7a1a5ra3xe | wa00e | home goods | phone2 | 9759243157894736 | py | | vasstdc27m7nks3 |
3 | 3 | 3939ac3ef7d472a59a9c5f893dd3e39fh9ofi | jefix | jewelry | phone4 | 9759243157894736 | in | | vasstdc27m7nks3 |
4 | 4 | 8393c48eaf4b8fa96886edc7cf27b372dsibi | jefix | jewelry | phone5 | 9759243157894736 | in | | vasstdc27m7nks3 |
bidders = bids.groupby("bidder_id")针对国家、商品单一特征多类别转换为多个独立特征进行统计
# Every distinct merchandise value and every distinct country value found in
# the bid log; together they are the keys of each per-bidder row built below.
cates = bids["merchandise"].unique().tolist()
countries = bids["country"].unique().tolist()


def dummy_coun_cate(group):
    """Turn one bidder's merchandise/country values into a flat feature row.

    Parameters
    ----------
    group : the rows of ``bids`` belonging to a single ``bidder_id``; the
        "merchandise" and "country" columns are read.

    Returns
    -------
    pandas.Series keyed by every entry of ``cates`` and ``countries``.
    Merchandise keys hold the number of rows in ``group`` with that
    merchandise value; country keys hold 1 if the country occurs in
    ``group`` and 0 otherwise.
    """
    coun_cate = dict.fromkeys(cates, 0)
    coun_cate.update(dict.fromkeys(countries, 0))

    # Series.items() is available on both old and current pandas, whereas
    # Series.iteritems() was removed in pandas 2.0.
    for cat, value in group["merchandise"].value_counts().items():
        coun_cate[cat] = value

    # Countries are recorded as presence flags, not counts.
    for c in group["country"].unique():
        coun_cate[c] = 1

    return pd.Series(coun_cate)
bidder_coun_cate = bidders.apply(dummy_coun_cate)
display(bidder_coun_cate.describe()) bidder_coun_cate.to_csv("coun_cate.csv")
ad | ae | af | ag | al | am | an | ao | ar | at | ... | vc | ve | vi | vn | ws | ye | za | zm | zw | zz | |
count | 6609.000000 | 6609.000000 | 6609.000000 | 6609.000000 | 6609.000000 | 6609.000000 | 6609.000000 | 6609.000000 | 6609.000000 | 6609.000000 | ... | 6609.000000 | 6609.000000 | 6609.000000 | 6609.000000 | 6609.000000 | 6609.000000 | 6609.000000 | 6609.000000 | 6609.000000 | 6609.000000 |
mean | 0.002724 | 0.205629 | 0.054774 | 0.001059 | 0.048570 | 0.023907 | 0.000303 | 0.036314 | 0.120442 | 0.052655 | ... | 0.000605 | 0.033591 | 0.000303 | 0.130882 | 0.001967 | 0.040551 | 0.274474 | 0.067181 | 0.069753 | 0.000757 |
std | 0.052121 | 0.404191 | 0.227555 | 0.032530 | 0.214984 | 0.152770 | 0.017395 | 0.187085 | 0.325502 | 0.223362 | ... | 0.024596 | 0.180186 | 0.017395 | 0.337297 | 0.044311 | 0.197262 | 0.446283 | 0.250354 | 0.254750 | 0.027497 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 209 columns
def bidder_interval(group):
    """Summarise the gaps between consecutive "time" values of one group.

    The gaps are the first differences of ``group["time"]`` taken in the
    order the rows appear (no sorting is done here).

    Returns a pandas.Series with four entries:
      tmean   - mean gap
      tstd    - standard deviation of the gaps (np.std, population form)
      tmedian - median gap
      tzeros  - number of gaps that are exactly zero
    All four are 0 when the group has fewer than two rows.
    """
    gaps = np.ediff1d(group["time"])

    if len(gaps) > 0:
        zero_gaps = gaps.shape[0] - np.count_nonzero(gaps)
        summary = (np.mean(gaps), np.std(gaps), np.median(gaps), zero_gaps)
    else:
        # A single bid (or none) has no interval to measure.
        summary = (0, 0, 0, 0)

    stats = {}
    for key, value in zip(("tmean", "tstd", "tmedian", "tzeros"), summary):
        stats[key] = value
    return pd.Series(stats)
bidder_inv = bidders.apply(bidder_interval)
display(bidder_inv.describe()) bidder_inv.to_csv("bidder_inv.csv")
tmean | tmedian | tstd | tzeros | |
count | 6.609000e+03 | 6.609000e+03 | 6.609000e+03 | 6609.000000 |
mean | 2.933038e+12 | 1.860285e+12 | 3.440901e+12 | 122.986231 |
std | 8.552343e+12 | 7.993497e+12 | 6.512992e+12 | 3190.805229 |
min | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 |
25% | 1.192853e+10 | 2.578947e+09 | 1.749995e+09 | 0.000000 |
50% | 2.641139e+11 | 5.726316e+10 | 5.510107e+11 | 0.000000 |
75% | 1.847456e+12 | 6.339474e+11 | 2.911282e+12 | 0.000000 |
max | 7.610295e+13 | 7.610295e+13 | 3.800092e+13 | 231570.000000 |
时间间隔统计:统计各个用户在各个拍卖场每次竞拍的时间间隔的均值、标准差、中位数和0值个数
def auc_features_count(group):
    """Build the feature row for one (bidder_id, auction) group of bids.

    Parameters
    ----------
    group : the rows of ``bids`` for a single bidder in a single auction; the
        "time", "device", "country", "ip", "url" and "merchandise" columns
        are read.

    Returns
    -------
    pandas.Series containing
      * one key per entry of the module-level ``cates`` (row count for that
        merchandise value) and ``countries`` (1 if present, else 0),
      * devices_c / countries_c / ip_c / url_c: number of distinct values,
      * bids_c: number of rows in the group,
      * tmean / tstd / tmedian / tzeros: mean, np.std, median and number of
        zero entries of the first differences of "time" (all 0 when the
        group has fewer than two rows).
    """
    time_diff = np.ediff1d(group["time"])
    if len(time_diff) == 0:
        diff_mean = 0
        diff_std = 0
        diff_median = 0
        diff_zeros = 0
    else:
        diff_mean = np.mean(time_diff)
        diff_std = np.std(time_diff)
        diff_median = np.median(time_diff)
        diff_zeros = time_diff.shape[0] - np.count_nonzero(time_diff)

    row = dict.fromkeys(cates, 0)
    row.update(dict.fromkeys(countries, 0))

    row["devices_c"] = group["device"].unique().shape[0]
    row["countries_c"] = group["country"].unique().shape[0]
    row["ip_c"] = group["ip"].unique().shape[0]
    row["url_c"] = group["url"].unique().shape[0]
    # row["merch_c"] = group["merchandise"].unique().shape[0]
    row["bids_c"] = group.shape[0]

    row["tmean"] = diff_mean
    row["tstd"] = diff_std
    row["tmedian"] = diff_median
    row["tzeros"] = diff_zeros

    # Series.items() works on old and current pandas; iteritems() was
    # removed in pandas 2.0.
    for cat, value in group["merchandise"].value_counts().items():
        row[cat] = value
    for c in group["country"].unique():
        row[c] = 1

    return pd.Series(row)
bidder_auc = bids.groupby(["bidder_id", "auction"]).apply(auc_features_count)
print bidder_auc.shape
(382336, 218)模型设计与参数评估 合并特征
import numpy as np import pandas as pd %matplotlib inline from IPython.display import display
def merge_data():
    """Join every per-bidder feature table and attach it to train/test.

    Reads train.csv, test.csv, tdiff.csv, bids_auc.csv, cnt_bidder.csv,
    coun_cate.csv and bidder_inv.csv from the working directory, and writes
    train_full.csv and test_full.csv.  Nothing is returned.
    """
    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv")
    time_differences = pd.read_csv("tdiff.csv", index_col=0)
    # Per-(bidder, auction) rows are averaged down to one row per bidder.
    bids_auc = pd.read_csv("bids_auc.csv").groupby("bidder_id").mean()
    bidders = pd.read_csv("cnt_bidder.csv", index_col=0)
    country_cate = pd.read_csv("coun_cate.csv", index_col=0)
    bidder_inv = pd.read_csv("bidder_inv.csv", index_col=0)

    # Index-on-index joins with pandas' default join type.
    for extra in (country_cate, bidder_inv, bids_auc, time_differences):
        bidders = bidders.merge(extra, left_index=True, right_index=True)

    for frame, target in ((train, "train_full.csv"), (test, "test_full.csv")):
        full = frame.merge(bidders, left_on="bidder_id", right_index=True)
        full.to_csv(target, index=False)
train_full = pd.read_csv("train_full.csv") test_full = pd.read_csv("test_full.csv") print train_full.shape print test_full.shape
(1983, 445) (4626, 444)
# Split the integer target off the training frame.
train_full["outcome"] = train_full["outcome"].astype(int)
ytrain = train_full["outcome"]
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
train_full.drop("outcome", axis=1, inplace=True)

# Keep the test ids for the submission, then remove the identifier columns
# from both frames so only the feature columns remain.
test_ids = test_full["bidder_id"]
labels = ["payment_account", "address", "bidder_id"]
train_full.drop(labels=labels, axis=1, inplace=True)
test_full.drop(labels=labels, axis=1, inplace=True)

# Section: cross-validation design / model selection
根据之前的分析,由于当前的数据集中存在正负例不均衡的问题,所以考虑选取了RandomForestClassifier, GradientBoostingClassifier, xgboost, lightgbm等四种模型来针对数据集进行训练和预测,确定最终模型的基本思路如下:
import time
from itertools import cycle

import matplotlib.pyplot as plt
# from sklearn.cross_validation import StratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, auc


def kfold_plot(train, ytrain, model):
    """Score ``model`` with 5-fold stratified CV and plot the ROC curves.

    Parameters
    ----------
    train : pandas.DataFrame of features (rows are selected with ``.iloc``).
    ytrain : pandas.Series of labels, positionally aligned with ``train``.
    model : callable ``model(X_train, X_test, y_train)`` that returns the
        predicted scores for ``X_test``.

    Returns
    -------
    (scores, mean_score, mean_time): the per-fold ROC-AUC values, their mean,
    and the mean wall-clock time in seconds of one ``model`` call.

    Side effects: draws one ROC curve per fold plus the averaged curve, shows
    the figure and prints the mean score and mean time.
    """
    # kf = StratifiedKFold(y=ytrain, n_folds=5)
    kf = StratifiedKFold(n_splits=5)
    fold_colors = cycle(["cyan", "indigo", "seagreen", "yellow", "blue"])
    lw = 2

    scores = []
    exe_time = []
    # The averaged ROC curve is accumulated on a fixed FPR grid.
    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = np.zeros_like(mean_fpr)

    folds = zip(kf.split(train, ytrain), fold_colors)
    for i, ((train_index, test_index), color) in enumerate(folds):
        X_train, X_test = train.iloc[train_index], train.iloc[test_index]
        y_train, y_test = ytrain.iloc[train_index], ytrain.iloc[test_index]

        begin_t = time.time()
        predictions = model(X_train, X_test, y_train)
        end_t = time.time()
        exe_time.append(round(end_t - begin_t, 3))

        scores.append(roc_auc_score(y_test.astype(float), predictions))
        fpr, tpr, _ = roc_curve(y_test, predictions)
        # np.interp replaces scipy.interp, which was only an alias of it and
        # is no longer shipped by recent SciPy releases.
        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=lw, color=color,
                 label="ROC fold %d (area = %0.2f)" % (i, roc_auc))

    plt.plot([0, 1], [0, 1], linestyle="--", lw=lw, color="k", label="Luck")

    mean_tpr /= kf.get_n_splits(train, ytrain)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, color="g", linestyle="--",
             label="Mean ROC (area = %0.2f)" % mean_auc, lw=lw)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver operating characteristic")
    plt.legend(loc="lower right")
    plt.show()

    mean_score = np.mean(scores)
    mean_time = np.mean(exe_time)
    # Single parenthesised argument: parses on Python 2 and 3 and prints the
    # same text the original print statements produced.
    print("%s %s" % ("mean scores: ", mean_score))
    print("%s %s %s" % ("mean model process time: ", mean_time, "s"))
    return scores, mean_score, mean_time
dct_scores = {} mean_score = {} mean_time = {}RandomForestClassifier
from sklearn.model_selection import GridSearchCV import time
from sklearn.ensemble import RandomForestClassifier


def forest_model(X_train, X_test, y_train):
    """Fit a fixed-parameter random forest and score ``X_test``.

    Returns column 1 of ``predict_proba`` for every row of ``X_test``.
    """
    forest = RandomForestClassifier(
        n_estimators=160,
        max_features=35,
        max_depth=8,
        random_state=7,
    )
    fitted = forest.fit(X_train, y_train)
    return fitted.predict_proba(X_test)[:, 1]
dct_scores["forest"], mean_score["forest"], mean_time["forest"] = kfold_plot(train_full, ytrain, forest_model) # kfold_plot(train_full, ytrain, model_forest)
mean scores: 0.909571935157 mean model process time: 0.643 s
from sklearn.ensemble import GradientBoostingClassifier


def gradient_model(X_train, X_test, y_train):
    """Fit a fixed-parameter gradient-boosting classifier and score ``X_test``.

    Returns column 1 of ``predict_proba`` for every row of ``X_test``.
    """
    booster = GradientBoostingClassifier(
        n_estimators=200,
        random_state=7,
        max_depth=5,
        learning_rate=0.03,
    )
    fitted = booster.fit(X_train, y_train)
    return fitted.predict_proba(X_test)[:, 1]
dct_scores["gbm"], mean_score["gbm"], mean_time["gbm"] = kfold_plot(train_full, ytrain, gradient_model)
mean scores: 0.911847771023 mean model process time: 4.1948 s
import xgboost as xgb


def xgboost_model(X_train, X_test, y_train):
    """Train a native xgboost booster for 600 rounds and score ``X_test``.

    The frames are handed to xgboost as plain arrays (``.values``).  Returns
    the output of ``Booster.predict`` on the test matrix.
    """
    dtrain = xgb.DMatrix(X_train.values, label=y_train.values)
    dtest = xgb.DMatrix(X_test.values)

    params = dict()
    params["objective"] = "binary:logistic"
    params["eval_metric"] = "auc"
    params["silent"] = 1
    params["seed"] = 7
    params["max_depth"] = 6
    params["eta"] = 0.01

    booster = xgb.train(params, dtrain, num_boost_round=600)
    return booster.predict(dtest)
/home/lancelot/anaconda2/envs/udacity/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20. "This module will be removed in 0.20.", DeprecationWarning)
dct_scores["xgboost"], mean_score["xgboost"], mean_time["xgboost"] = kfold_plot(train_full, ytrain, xgboost_model)
mean scores: 0.915372340426 mean model process time: 3.1482 s
import lightgbm as lgb


def lightgbm_model(X_train, X_test, y_train):
    """Train a native lightgbm booster for 600 rounds and score ``X_test``.

    Training data is passed as plain arrays (``.values``); ``X_test`` is
    handed to ``predict`` unchanged.  Returns the output of ``predict``.
    """
    train_set = lgb.Dataset(X_train.values, label=y_train.values)

    params = dict()
    params["objective"] = "binary"
    params["metric"] = {"auc"}
    params["learning_rate"] = 0.01
    params["max_depth"] = 6
    params["seed"] = 7

    booster = lgb.train(params, train_set, num_boost_round=600)
    return booster.predict(X_test)
dct_scores["lgbm"], mean_score["lgbm"], mean_time["lgbm"] = kfold_plot(train_full, ytrain, lightgbm_model)
mean scores: 0.921512158055 mean model process time: 0.3558 s模型比较
def plot_model_comp(title, y_label, dct_result):
    """Bar-plot one score per model and highlight the highest one in red.

    Parameters
    ----------
    title : figure title.
    y_label : label of the y axis.
    dct_result : dict mapping model name -> score.

    Shows the figure and prints the best score with its model name.  An
    empty ``dct_result`` raises ValueError from ``max``.
    """
    # Materialise keys and values as lists: on Python 3 they are views that
    # support neither indexing nor .index().
    data_source = list(dct_result.keys())
    model_auc = list(dct_result.values())
    y_pos = np.arange(len(data_source))

    barlist = plt.bar(y_pos, model_auc, align="center", alpha=0.5)

    # Position of the highest score (first one on ties).
    max_val = max(model_auc)
    idx = model_auc.index(max_val)
    barlist[idx].set_color("r")

    plt.xticks(y_pos, data_source)
    plt.ylabel(y_label)
    plt.title(title)
    plt.show()
    print("The highest auc score is {0} of model: {1}".format(max_val, data_source[idx]))
plot_model_comp("Model Performance", "roc-auc score", mean_score)
The highest auc score is 0.921512158055 of model: lgbm
def plot_time_comp(title, y_label, dct_result):
    """Bar-plot one timing per model and highlight the shortest one in red.

    Parameters
    ----------
    title : figure title.
    y_label : label of the y axis.
    dct_result : dict mapping model name -> time.

    Shows the figure and prints the shortest time with its model name.  An
    empty ``dct_result`` raises ValueError from ``min``.
    """
    # Materialise keys and values as lists: on Python 3 they are views that
    # support neither indexing nor .index().
    data_source = list(dct_result.keys())
    model_time = list(dct_result.values())
    y_pos = np.arange(len(data_source))

    barlist = plt.bar(y_pos, model_time, align="center", alpha=0.5)

    # Position of the shortest time (first one on ties).
    min_val = min(model_time)
    idx = model_time.index(min_val)
    barlist[idx].set_color("r")

    plt.xticks(y_pos, data_source)
    plt.ylabel(y_label)
    plt.title(title)
    plt.show()
    print("The shortest time is {0} of model: {1}".format(min_val, data_source[idx]))
plot_time_comp("Time of Building Model", "time(s)", mean_time)
The shortest time is 0.3558 of model: lgbm
# Per-fold ROC-AUC values collected by kfold_plot for each base model.
auc_forest = dct_scores["forest"]
auc_gb = dct_scores["gbm"]
auc_xgb = dct_scores["xgboost"]
auc_lgb = dct_scores["lgbm"]

# Spread of the score across folds.  Each print takes a single parenthesised
# argument so it parses on Python 2 and 3 with the same output text.
print("%s %s" % ("std of forest auc score: ", np.std(auc_forest)))
print("%s %s" % ("std of gbm auc score: ", np.std(auc_gb)))
print("%s %s" % ("std of xgboost auc score: ", np.std(auc_xgb)))
print("%s %s" % ("std of lightgbm auc score: ", np.std(auc_lgb)))

data_source = ["roc-fold-1", "roc-fold-2", "roc-fold-3", "roc-fold-4", "roc-fold-5"]
y_pos = np.arange(len(data_source))
plt.plot(y_pos, auc_forest, "b-", label="forest")
plt.plot(y_pos, auc_gb, "r-", label="gbm")
plt.plot(y_pos, auc_xgb, "y-", label="xgboost")
plt.plot(y_pos, auc_lgb, "g-", label="lightgbm")
# The x positions are cross-validation folds, so the fold names are put on
# the axis and the labels say "fold" rather than "epoch".
plt.xticks(y_pos, data_source)
plt.title("roc-auc score of each fold")
plt.xlabel("fold")
plt.ylabel("roc-auc score")
plt.legend()
plt.show()
std of forest auc score: 0.0413757504568 std of gbm auc score: 0.027746291638 std of xgboost auc score: 0.0232931322563 std of lightgbm auc score: 0.0287156755513
stacking: 第三方库mlxtend里的stacking方法对子模型进行聚合得到聚合模型,并采用之前相同的cv方法对该模型进行打分评价
voting: 使用sklearn内置的VotingClassifier进行四个模型的聚合
def choose_xgb_model(X_train, y_train):
    """Grid-search an XGBClassifier on ROC-AUC and return the best estimator.

    Prints the elapsed search time and the winning parameter combination.
    """
    grid = {
        "objective": ["binary:logistic"],
        "learning_rate": [0.01, 0.03, 0.05],
        "n_estimators": [100, 150, 200],
        "max_depth": [4, 6, 8],
    }

    started = time.time()
    search = GridSearchCV(xgb.XGBClassifier(seed=7), [grid], scoring="roc_auc")
    search.fit(X_train, y_train)
    elapsed = time.time() - started

    print("%s %s %s" % ("train time: ", round(elapsed, 3), "s"))
    print("%s %s" % ("current best parameters of xgboost: ", search.best_params_))
    return search.best_estimator_
bst_xgb = choose_xgb_model(train_full, ytrain)
train time: 48.141 s current best parameters of xgboost: {"n_estimators": 150, "objective": "binary:logistic", "learning_rate": 0.05, "max_depth": 4}
def choose_lgb_model(X_train, y_train):
    """Grid-search an LGBMClassifier on ROC-AUC and return the best estimator.

    Prints the elapsed search time and the winning parameter combination.
    """
    grid = {
        "objective": ["binary"],
        "learning_rate": [0.01, 0.03, 0.05],
        "n_estimators": [100, 150, 200],
        "max_depth": [4, 6, 8],
    }

    started = time.time()
    search = GridSearchCV(lgb.LGBMClassifier(seed=7), [grid], scoring="roc_auc")
    search.fit(X_train, y_train)
    elapsed = time.time() - started

    print("%s %s %s" % ("train time: ", round(elapsed, 3), "s"))
    print("%s %s" % ("current best parameters of lgb: ", search.best_params_))
    return search.best_estimator_
bst_lgb = choose_lgb_model(train_full, ytrain)
train time: 12.543 s current best parameters of lgb: {"n_estimators": 150, "objective": "binary", "learning_rate": 0.05, "max_depth": 4}
from mlxtend.classifier import StackingClassifier
from sklearn import linear_model


def stacking_model(X_train, X_test, y_train):
    """Stack the tuned xgboost and lightgbm models under a logistic regression.

    The base models are the module-level ``bst_xgb`` and ``bst_lgb``; their
    class probabilities (not averaged) feed the meta classifier.  Returns
    column 1 of the stack's ``predict_proba`` on ``X_test``.
    """
    meta = linear_model.LogisticRegression(random_state=7)
    stack = StackingClassifier(
        classifiers=[bst_xgb, bst_lgb],
        use_probas=True,
        average_probas=False,
        meta_classifier=meta,
    )
    stack.fit(X_train, y_train)
    return stack.predict_proba(X_test)[:, 1]
dct_scores["stacking_1"], mean_score["stacking_1"], mean_time["stacking_1"] = kfold_plot(train_full, ytrain, stacking_model)
mean scores: 0.92157674772 mean model process time: 0.7022 s
def choose_forest_model(X_train, y_train):
    """Grid-search a RandomForestClassifier on ROC-AUC; return the best one.

    Prints the elapsed search time and the winning parameter combination.
    """
    grid = {
        "n_estimators": [100, 150, 200],
        "max_features": [8, 15, 30],
        "max_depth": [4, 8, 10],
    }

    started = time.time()
    search = GridSearchCV(RandomForestClassifier(random_state=7), [grid], scoring="roc_auc")
    search.fit(X_train, y_train)
    elapsed = time.time() - started

    print("%s %s %s" % ("train time: ", round(elapsed, 3), "s"))
    print("%s %s" % ("current best parameters: ", search.best_params_))
    return search.best_estimator_
bst_forest = choose_forest_model(train_full, ytrain)
train time: 42.201 s current best parameters: {"max_features": 15, "n_estimators": 150, "max_depth": 8}
def choose_gradient_model(X_train, y_train):
    """Grid-search a GradientBoostingClassifier on ROC-AUC; return the best one.

    Prints the elapsed search time and the winning parameter combination.
    """
    grid = {
        "n_estimators": [100, 150, 200],
        "learning_rate": [0.03, 0.05, 0.07],
        "min_samples_leaf": [8, 15, 30],
        "max_depth": [4, 6, 8],
    }

    started = time.time()
    search = GridSearchCV(GradientBoostingClassifier(random_state=7), [grid], scoring="roc_auc")
    search.fit(X_train, y_train)
    elapsed = time.time() - started

    print("%s %s %s" % ("train time: ", round(elapsed, 3), "s"))
    print("%s %s" % ("current best parameters: ", search.best_params_))
    return search.best_estimator_
bst_gradient = choose_gradient_model(train_full, ytrain)
train time: 641.872 s current best parameters: {"n_estimators": 100, "learning_rate": 0.03, "max_depth": 8, "min_samples_leaf": 30}
def stacking_model2(X_train, X_test, y_train):
    """Stack all four tuned base models under a logistic regression.

    The base models are the module-level ``bst_xgb``, ``bst_forest``,
    ``bst_gradient`` and ``bst_lgb``; their class probabilities (not
    averaged) feed the meta classifier.  Returns column 1 of the stack's
    ``predict_proba`` on ``X_test``.
    """
    base_models = [bst_xgb, bst_forest, bst_gradient, bst_lgb]
    meta = linear_model.LogisticRegression(random_state=7)
    stack = StackingClassifier(
        classifiers=base_models,
        use_probas=True,
        average_probas=False,
        meta_classifier=meta,
    )
    stack.fit(X_train, y_train)
    return stack.predict_proba(X_test)[:, 1]
dct_scores["stacking_2"], mean_score["stacking_2"], mean_time["stacking_2"] = kfold_plot(train_full, ytrain, stacking_model2)
mean scores: 0.92686550152 mean model process time: 4.0878 s
from sklearn.ensemble import VotingClassifier


def voting_model(X_train, X_test, y_train):
    """Soft-vote the four tuned base models and score ``X_test``.

    xgboost and lightgbm carry weight 2, the random forest and the gradient
    boosting model weight 1.  Returns column 1 of the ensemble's
    ``predict_proba`` on ``X_test``.
    """
    members = [
        ("xgb", bst_xgb),
        ("rf", bst_forest),
        ("gbm", bst_gradient),
        ("lgb", bst_lgb),
    ]
    ensemble = VotingClassifier(estimators=members, voting="soft", weights=[2, 1, 1, 2])
    ensemble.fit(X_train, y_train)
    return ensemble.predict_proba(X_test)[:, 1]
dct_scores["voting"], mean_score["voting"], mean_time["voting"] = kfold_plot(train_full, ytrain, voting_model)
mean scores: 0.926889564336 mean model process time: 4.055 s
plot_model_comp("Model Performance", "roc-auc score", mean_score)
The highest auc score is 0.926889564336 of model: voting
# Final step: use the combined model to predict the test file.
# predict(train_full, test_full, y_train)
def submit(X_train, X_test, y_train, test_ids):
    """Fit the voting ensemble, predict the test set and write result.csv.

    Parameters
    ----------
    X_train, y_train : training features and labels for ``voting_model``.
    X_test : test features, row-aligned with ``test_ids``.
    test_ids : the ``bidder_id`` of each row of ``X_test``.

    Reads sampleSubmission.csv for the list of bidders to report.  Bidders
    in that file with no row in ``X_test`` receive the mean of all
    predictions.  Writes result.csv with the columns ``bidder_id`` and
    ``prediction``; returns nothing.
    """
    predictions = voting_model(X_train, X_test, y_train)

    sub = pd.read_csv("sampleSubmission.csv")
    result = pd.DataFrame()
    result["bidder_id"] = test_ids
    result["outcome"] = predictions

    # Left join keeps every bidder of the sample file, scored or not.
    sub = sub.merge(result, on="bidder_id", how="left")

    # The placeholder column of the sample file is replaced by our scores.
    # `axis` is passed by keyword: pandas 2.0 rejects the positional form.
    sub = sub.drop("prediction", axis=1)

    # Fill missing values with mean
    mean_pred = np.mean(predictions)
    sub["outcome"] = sub["outcome"].fillna(mean_pred)

    sub.to_csv("result.csv", index=False, header=["bidder_id", "prediction"])
submit(train_full, test_full, ytrain, test_ids)
Chen, K. T., Pao, H. K. K., & Chang, H. C. (2008, October). Game bot identification based on manifold learning. In Proceedings of the 7th ACM SIGCOMM Workshop on Network and System Support for Games (pp. 21-26). ACM.
Alayed, H., Frangoudes, F., & Neuman, C. (2013, August). Behavioral-based cheating detection in online first person shooters using machine learning techniques. In Computational Intelligence in Games (CIG), 2013 IEEE Conference on (pp. 1-8). IEEE.
阅读 1122·2021-11-16 11:45
阅读 3137·2021-10-13 09:40
阅读 728·2019-08-26 13:45
阅读 1229·2019-08-26 13:32
阅读 2183·2019-08-26 13:23
阅读 929·2019-08-26 12:16
阅读 2835·2019-08-26 11:37
阅读 1767·2019-08-26 10:32