Python Random Forest

This is an example of a Random Forest classifier in Python: it loads a pre-processed Titanic dataset from a pickle, fits a forest, ranks the engineered features by importance, and plots the result.

import pandas as pd
import numpy as np
import re
import pprint
import pickle
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

Load the pickled dataset. The first column is the target (survived); the remaining columns are the features.

input_df = pickle.load(open('titanic.pkl', 'rb'))
features_list = input_df.columns.values[1:]   # all columns except the target
X = input_df.values[:, 1:]                    # feature matrix
y = input_df.values[:, 0]                     # target: survived
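
The pre-processing that produced titanic.pkl is not shown in this post. As a rough, hypothetical sketch of the idea (re-using the pandas and pickle imports above, and assuming a raw CSV with lowercase column names; the actual engineering behind columns such as title_*, deck_*, fare_code_* and USA_based is more involved and not reproduced here):

# Hypothetical pre-processing sketch -- not the actual steps used to build titanic.pkl
raw = pd.read_csv('titanic.csv')                                  # assumed raw file
raw['age'] = raw['age'].fillna(raw['age'].median())               # the forest cannot handle NaNs
dummies = pd.get_dummies(raw[['sex', 'embarked']], prefix=['sex', 'embarked'])
prepared = pd.concat([raw[['survived', 'pclass', 'age', 'sibsp', 'parch']], dummies], axis=1)
pickle.dump(prepared, open('titanic.pkl', 'wb'))                  # target column first, features after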

# Fit a random forest with (mostly) default parameters to determine feature importance
forest = RandomForestClassifier(oob_score=True, n_estimators=10000)
forest.fit(X, y)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10000, n_jobs=1, oob_score=True,
            random_state=None, verbose=0, warm_start=False)
feature_importance = forest.feature_importances_
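
Because oob_score=True was set, the fitted forest also carries an out-of-bag accuracy estimate, which gives a quick sanity check without a separate hold-out set (oob_score_ is the standard scikit-learn attribute):

# Out-of-bag accuracy estimate, computed from each tree's held-out samples
print('OOB accuracy: {:.3f}'.format(forest.oob_score_))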

# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())

# Threshold below which features are dropped from the final data set,
# expressed as a percentage of the most important feature's importance
fi_threshold = 5

# Get the indexes of all features over the importance threshold
important_idx = np.where(feature_importance > fi_threshold)[0]

# Create a list of all the feature names above the importance threshold
important_features = features_list[important_idx]
feature_values=feature_importance[important_idx]
for f,v in zip(important_features,feature_values):
    print('{:20} {:5.2f}'.format(f,v))

# Get the sorted indexes of important features
sorted_idx = np.argsort(feature_importance[important_idx])[::-1]
# sorted_idx indexes into the important_idx subset, so select the important
# columns first and then reorder them
important_features = features_list[important_idx][sorted_idx]
feature_values = feature_importance[important_idx][sorted_idx]
print("\n\nIn Descending Order")
for f, v in zip(important_features, feature_values):
    print('{:20} {:5.2f}'.format(f, v))
pclass               27.13
age                  100.00
sibsp                21.10
parch                15.72
cabin_count           9.33
USA_based            15.09
sex_female           32.36
sex_male             33.61
embarked_C            7.63
embarked_S            6.78
title_Miss            9.91
title_Mr             34.19
title_Mrs            11.75
fare_code_H           7.05
fare_code_L           6.52
fare_code_M           5.68
deck_1               10.11


In Descending Order
age                  100.00
title_Mr             34.19
sex_male             33.61
sex_female           32.36
pclass               27.13
sibsp                21.10
parch                15.72
USA_based            15.09
title_Mrs            11.75
deck_1               10.11
title_Miss            9.91
cabin_count           9.33
embarked_C            7.63
fare_code_H           7.05
embarked_S            6.78
fare_code_L           6.52
fare_code_M           5.68

# Adapted from http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html
pos = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 2, 2)
plt.barh(pos, feature_values[::-1], align='center')  # reversed so the largest bar is drawn at the top
plt.yticks(pos, important_features[::-1])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.draw()
plt.show()

# Remove non-important features from the feature set, and reorder those remaining
#X = X[:, important_idx][:, sorted_idx]
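
If the reduced feature set were actually applied (the line above is left commented out), one way to check its effect would be to re-fit the forest and compare out-of-bag scores; a minimal sketch using the variables already defined:

# Sketch: re-fit on only the important features and compare OOB accuracy
X_reduced = X[:, important_idx][:, sorted_idx]
forest_reduced = RandomForestClassifier(oob_score=True, n_estimators=10000)
forest_reduced.fit(X_reduced, y)
print('OOB accuracy, all features:       {:.3f}'.format(forest.oob_score_))
print('OOB accuracy, important features: {:.3f}'.format(forest_reduced.oob_score_))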

[Figure: 'Variable Importance' horizontal bar chart showing relative importance for each retained feature]

input_df.head()
   survived  pclass  age  sibsp  parch  cabin_count  USA_based  sex_female  sex_male  embarked_C  ...  fare_code_M  deck_1  deck_A  deck_B  deck_C  deck_D  deck_E  deck_F  deck_G  deck_T
0         1       1   29      0      0            1          1           1         0           0  ...            0       0       0       1       0       0       0       0       0       0
1         1       1    0      1      2            2          1           0         1           0  ...            0       0       0       0       1       0       0       0       0       0
2         0       1    2      1      2            2          1           1         0           0  ...            0       0       0       0       1       0       0       0       0       0
3         0       1   30      1      2            2          1           0         1           0  ...            0       0       0       0       1       0       0       0       0       0
4         0       1   25      1      2            2          1           1         0           0  ...            0       0       0       0       1       0       0       0       0       0

5 rows × 43 columns