>>> from sklearn.datasets import load_iris
>>> from sklearn.feature_selection import SelectKBest
>>> from sklearn.feature_selection import chi2
>>> iris = load_iris()
>>> X, y = iris.data, iris.target
>>> X.shape
(150, 4)
>>> X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
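To see which columns survived, call get_support() on a fitted selector; a small sketch continuing the iris example above (for iris, chi2 keeps the two petal measurements):
>>> selector = SelectKBest(chi2, k=2).fit(X, y)
>>> [name for name, keep in zip(iris.feature_names, selector.get_support()) if keep]
['petal length (cm)', 'petal width (cm)']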
from operator import itemgetter
from statsmodels.stats.outliers_influence import variance_inflation_factor

def check_vif(data, test, target_name, cols_to_drop, verbose=0):
    # Iteratively drop the feature with the highest VIF until every VIF is below 100
    if verbose == 1:
        print('Checking VIF values')
    X_train_multicoll = data.drop([target_name], axis=1).copy()
    # variance_inflation_factor assumes a constant term, so add an explicit intercept
    X_train_multicoll['intercept'] = 1
    max_vif_value = float('inf')
    if verbose == 1:
        print(X_train_multicoll.shape)
    while max_vif_value > 100:
        vif = [variance_inflation_factor(X_train_multicoll.values, i)
               for i in range(X_train_multicoll.shape[1])]
        g = list(zip(X_train_multicoll.columns, vif))
        g = [i for i in g if i[0] != 'intercept']
        max_vif = max(g, key=itemgetter(1))
        if verbose == 1:
            print(max_vif)
        if max_vif[1] < 100:
            if verbose == 1:
                print('Done')
            break
        else:
            # drop the offending column from the working copy, the train data and the test data
            X_train_multicoll.drop([max_vif[0]], axis=1, inplace=True)
            cols_to_drop.append(max_vif[0])
            data.drop([max_vif[0]], axis=1, inplace=True)
            test.drop([max_vif[0]], axis=1, inplace=True)
            if verbose == 1:
                print(X_train_multicoll.shape)
        max_vif_value = max_vif[1]
    return data, test, cols_to_drop
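A usage sketch for the function above; train_df, test_df and the 'target' column name are hypothetical placeholders, not part of the original note:

train_df, test_df, cols_to_drop = check_vif(train_df, test_df, 'target', cols_to_drop=[], verbose=1)
print('dropped for multicollinearity:', cols_to_drop)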
import numpy as np

def permutation_importances(rf, X_train, y_train, metric):
    # baseline score on the original (unshuffled) data
    baseline = metric(rf, X_train, y_train)
    imp = []
    for col in X_train.columns:
        save = X_train[col].copy()
        # shuffle one column at a time and measure how much the metric drops
        X_train[col] = np.random.permutation(X_train[col])
        m = metric(rf, X_train, y_train)
        X_train[col] = save
        imp.append(baseline - m)
    return np.array(imp)
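A usage sketch for the function above, with training-set accuracy as the metric; the metric choice and variable names are assumptions, and X_train is assumed to be a pandas DataFrame:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def accuracy_metric(model, X, y):
    return accuracy_score(y, model.predict(X))

rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=1).fit(X_train, y_train)
imp = permutation_importances(rf, X_train, y_train, accuracy_metric)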
import eli5
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(model, random_state=1).fit(X_train, y_train)  # model: an already-fitted estimator
eli5.show_weights(perm, top=50)
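To work with the raw numbers instead of the HTML table, the fitted PermutationImportance object exposes feature_importances_ (and feature_importances_std_); a small sketch, assuming X_train is a DataFrame:

import pandas as pd
pd.Series(perm.feature_importances_, index=X_train.columns).sort_values(ascending=False)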
https://www.kaggle.com/artgor/eda-feature-engineering-and-model-interpretation
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

# BorutaPy expects numpy arrays, not DataFrames (use X.values / y.values if needed)
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)
# find all relevant features
feat_selector.fit(X, y)
# check selected features (boolean mask)
feat_selector.support_
# check ranking of features
feat_selector.ranking_
# call transform() on X to filter it down to selected features
X_filtered = feat_selector.transform(X)
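If the original data was a DataFrame, the boolean mask can be mapped back to column names; feature_names below stands for those original columns and is an assumption:

selected = [name for name, keep in zip(feature_names, feat_selector.support_) if keep]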
from boostaroota import BoostARoota
br = BoostARoota(metric='logloss')
# fit BoostARoota to determine the subset of important variables
br.fit(x, y)
# the kept variables are returned as a pandas Series
br.keep_vars_
# reduce the dataframe to only the important variables
x1 = br.transform(x)
import pandas as pd

# Adversarial validation: label training rows 0 and validation rows 1,
# then check whether a classifier can tell them apart
features = X_train.columns
X_train['target'] = 0
X_valid['target'] = 1
train_test = pd.concat([X_train, X_valid], axis=0)
target = train_test['target']
# train model
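One way to finish this adversarial-validation check (a sketch; the classifier, CV scheme and variable names are assumptions): if the classifier separates the two sets well (AUC well above 0.5), the highest-importance features are the ones whose distributions shift between train and validation, and they are candidates for dropping.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score

clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=1)
oof = cross_val_predict(clf, train_test[features], target, cv=5, method='predict_proba')[:, 1]
print('adversarial AUC:', roc_auc_score(target, oof))  # ~0.5 means train and valid look alike

clf.fit(train_test[features], target)
adv_imp = pd.Series(clf.feature_importances_, index=features).sort_values(ascending=False)
print(adv_imp.head(10))  # the most "shifted" features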
Guided Regularized Random Forests
A regularized random forest penalizes splits on features not already in the selected set; in the guided variant the per-feature penalty is scaled by importance scores from an ordinary random forest, which yields a compact, non-redundant feature subset (implemented in the R package RRF).
Genetic algorithms
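Genetic algorithms treat a feature subset as a boolean chromosome and evolve it toward a better cross-validation score. A minimal from-scratch sketch; the dataset, population size, mutation rate and every other choice here are illustrative assumptions, not a fixed recipe:

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

X, y = load_breast_cancer(return_X_y=True)
rng = np.random.default_rng(1)
n_features = X.shape[1]
pop_size, n_generations, mutation_rate = 12, 5, 0.05

def fitness(mask):
    # CV score of a model trained only on the features switched on in the mask
    if not mask.any():
        return 0.0
    clf = RandomForestClassifier(n_estimators=30, n_jobs=-1, random_state=1)
    return cross_val_score(clf, X[:, mask], y, cv=3).mean()

# random initial population of boolean feature masks
population = rng.random((pop_size, n_features)) < 0.5
for _ in range(n_generations):
    scores = np.array([fitness(ind) for ind in population])
    # selection: keep the better half as parents
    parents = population[np.argsort(scores)[-pop_size // 2:]]
    children = []
    while len(children) < pop_size - len(parents):
        a, b = parents[rng.integers(len(parents), size=2)]
        point = rng.integers(1, n_features)                   # single-point crossover
        child = np.concatenate([a[:point], b[point:]])
        child ^= rng.random(n_features) < mutation_rate       # bit-flip mutation
        children.append(child)
    population = np.vstack([parents, children])

best = population[np.argmax([fitness(ind) for ind in population])]
print('selected features:', np.flatnonzero(best))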