sklearn-pandas: streamline your model building
Israel Saeta Pérez - PyBCN March 2016
slides.com/israelsaetaperez/sklearn-pandas
scikit-learn transformers
Standardize: mean = 0, stdev = 1
>>> from sklearn import preprocessing
>>> import numpy as np
>>> X = np.array([[ 1., -1.,  2.],
...               [ 2.,  0.,  0.],
...               [ 0.,  1., -1.]])
>>> scaler = preprocessing.StandardScaler().fit(X)
>>> scaler.transform(X)
array([[ 0.  ..., -1.22...,  1.33...],
       [ 1.22...,  0.  ..., -0.26...],
       [-1.22...,  1.22..., -1.06...]])
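The fitted scaler keeps the mean and stdev it learned from X, so new samples get standardized on the same scale (a minimal sketch; digits abbreviated doctest-style):
>>> scaler.mean_                       # per-column means learned from X
array([ 1.  ,  0.  ,  0.33...])
>>> scaler.transform([[-1., 1., 0.]])  # unseen data, same statistics
array([[-2.44...,  1.22..., -0.26...]])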
Dummification (one-hot encoding)
>>> from sklearn.preprocessing import OneHotEncoder
>>> enc = OneHotEncoder()
>>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
OneHotEncoder(categorical_features='all', dtype=<... 'float'>,
              handle_unknown='error', n_values='auto', sparse=True)
>>> enc.transform([[0, 1, 1]]).toarray()
array([[ 1., 0., 0., 1., 0., 0., 1., 0., 0.]])
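Where do the nine output columns come from? The fitted encoder (pre-0.20 API, as shown on this slide) keeps per-feature bookkeeping:
>>> enc.n_values_          # distinct values per input feature: 2, 3 and 4
array([2, 3, 4])
>>> enc.feature_indices_   # output columns 0:2 ↔ 1st feature, 2:5 ↔ 2nd, 5:9 ↔ 3rd
array([0, 2, 5, 9])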
input features must be numbers :(
imputation
>>> import numpy as np
>>> from sklearn.preprocessing import Imputer
>>> imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
>>> imp.fit([[1, 2], [np.nan, 3], [7, 6]])
Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
>>> X = [[np.nan, 2], [6, np.nan], [7, 6]]
>>> print(imp.transform(X))
[[ 4.       2.     ]
 [ 6.       3.666...]
 [ 7.       6.     ]]
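The per-column means learned at fit time live on the fitted imputer; they are the values substituted for each NaN:
>>> imp.statistics_   # mean of [1, 7] and mean of [2, 3, 6]
array([ 4.   ,  3.666...])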
The Titanic: 1912, Southampton - New York
>>> import pandas as pd
>>> df = pd.read_csv('train.csv')
>>> df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB
Missing values: Age, Cabin, Embarked
Index: PassengerId
Binary: Sex
Label: Survived
Categorical: Pclass, Embarked
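A quick check of the missing values (the counts follow from the non-null totals in df.info() above):
>>> df.isnull().sum().sort_values(ascending=False).head(3)
Cabin       687
Age         177
Embarked      2
dtype: int64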
from sklearn.preprocessing import (LabelEncoder, Imputer,
                                   StandardScaler, OneHotEncoder)

# copy original df
dft = df.copy()
# encode string features as numbers
dft['Sex'] = LabelEncoder().fit_transform(dft['Sex'])
dft['Embarked'] = dft['Embarked'].replace({'S': 1, 'C': 2, 'Q': 3})
# impute missing values
dft['Embarked'] = Imputer(strategy='most_frequent').fit_transform(dft[['Embarked']])
dft['Age'] = Imputer(strategy='mean').fit_transform(dft[['Age']])
# standardize continuous variables
to_standardize = ['Age', 'SibSp', 'Parch', 'Fare']
dft[to_standardize] = StandardScaler().fit_transform(dft[to_standardize])
# select input columns for the model
X = dft[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
Xt = OneHotEncoder(categorical_features=[0, 1, 6]).fit_transform(X)
Feature indexes :( you have to track column positions by hand
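Concretely: the output is a bare matrix, so which dummy column came from which feature lives only in those [0, 1, 6] indexes (shape assumes the categories above):
>>> Xt.shape   # 3 (Pclass) + 2 (Sex) + 3 (Embarked) dummies + 4 numeric columns
(891, 12)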
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn import cross_validation
>>> y = dft['Survived'].values
>>> clf = LogisticRegression()
>>> scores = cross_validation.cross_val_score(clf, Xt, y, cv=10)
>>> print('Accuracy: {:0.3f}'.format(scores.mean()))
Accuracy: 0.800
sklearn-pandas: a bridge between Scikit-Learn's machine learning methods and pandas-style Data Frames
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer

dft = df.copy()
dft['Embarked'] = dft['Embarked'].replace({'S': 1, 'C': 2, 'Q': 3})

mapper = DataFrameMapper([
    ('Pclass', LabelBinarizer()),
    (['Embarked'], [Imputer(strategy='most_frequent'), OneHotEncoder()]),
    (['Age'], [Imputer(strategy='mean'), StandardScaler()]),
    (['SibSp', 'Parch', 'Fare'], StandardScaler()),
    ('Sex', LabelBinarizer()),
])
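Fitting the mapper on the DataFrame gives back a plain feature matrix, one block of columns per entry above (a sketch; the column count depends on the categories found):
>>> Xm = mapper.fit_transform(dft)
>>> Xm.shape   # 3 (Pclass) + 3 (Embarked) + 1 (Age) + 3 (SibSp, Parch, Fare) + 1 (Sex)
(891, 11)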
Selector shapes matter: a string like 'Pclass' passes the transformer a 1-D (n_samples,) array, while a list like ['Age'] passes a 2-D (n_samples, n_feats) one. And scikit-learn transformers like OneHotEncoder still don't accept string features :(
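The same distinction in plain pandas, and the reason LabelBinarizer (wants 1-D) gets a string selector while Imputer (wants 2-D) gets a list:
>>> dft['Age'].shape    # string selector → 1-D
(891,)
>>> dft[['Age']].shape  # list selector → 2-D
(891, 1)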
>>> from sklearn.pipeline import make_pipeline
>>> X = dft  # the mapper selects its own columns from the DataFrame
>>> clf = LogisticRegression()
>>> pipe = make_pipeline(mapper, clf)
>>> scores = cross_validation.cross_val_score(pipe, X, y, cv=10)  # *
>>> print('Accuracy: {:0.3f}'.format(scores.mean()))
Accuracy: 0.800
* improved in sklearn>=0.16!
can be used inside pipeline!
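And because the mapper is just a pipeline step, the fitted pipeline consumes raw DataFrame rows end to end (a closing sketch):
pipe.fit(X, y)
predictions = pipe.predict(X.iloc[:3])   # raw DataFrame rows in, class predictions out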