Python Random Forest
This is an example of using a Random Forest in Python to rank feature importance on the Titanic dataset.
```python
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
```
Load the pickled dataset:

```python
with open('titanic.pkl', 'rb') as f:
    input_df = pickle.load(f)
```
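Equivalently, pandas can read the pickle directly; a small alternative, not in the original:

```python
# pd.read_pickle opens and closes the file itself
input_df = pd.read_pickle('titanic.pkl')
```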
```python
features_list = input_df.columns.values[1:]
X = input_df.values[:, 1:]   # all feature columns
y = input_df.values[:, 0]    # first column is the 'survived' label

# Fit a random forest with (mostly) default parameters to determine feature importance
forest = RandomForestClassifier(oob_score=True, n_estimators=10000)
forest.fit(X, y)
```
```
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10000, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)
```
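Because `oob_score=True` was passed, the fitted forest also carries an out-of-bag accuracy estimate, computed from the samples each tree never saw during its bootstrap draw. A minimal way to read it:

```python
# Out-of-bag accuracy estimate; available only because oob_score=True was set
print('OOB accuracy: {:.4f}'.format(forest.oob_score_))
```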
```python
feature_importance = forest.feature_importances_

# Make importances relative to the max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())

# A threshold below which to drop features from the final data set. Specifically,
# this number represents the percentage of the most important feature's importance value.
fi_threshold = 5

# Get the indexes of all features over the importance threshold
important_idx = np.where(feature_importance > fi_threshold)[0]

# Create a list of all the feature names above the importance threshold
important_features = features_list[important_idx]
feature_values = feature_importance[important_idx]
for f, v in zip(important_features, feature_values):
    print('{:20} {:5.2f}'.format(f, v))

# Get the sorted indexes of the important features. Note that sorted_idx indexes
# into the thresholded subset, so it must be applied after important_idx.
sorted_idx = np.argsort(feature_importance[important_idx])[::-1]
important_features = features_list[important_idx][sorted_idx]
feature_values = feature_importance[important_idx][sorted_idx]
print("\n\nIn Descending Order")
for f, v in zip(important_features, feature_values):
    print('{:20} {:5.2f}'.format(f, v))
```
```
pclass               27.13
age                  100.00
sibsp                21.10
parch                15.72
cabin_count           9.33
USA_based            15.09
sex_female           32.36
sex_male             33.61
embarked_C            7.63
embarked_S            6.78
title_Miss            9.91
title_Mr             34.19
title_Mrs            11.75
fare_code_H           7.05
fare_code_L           6.52
fare_code_M           5.68
deck_1               10.11


In Descending Order
age                  100.00
title_Mr             34.19
sex_male             33.61
sex_female           32.36
pclass               27.13
sibsp                21.10
parch                15.72
USA_based            15.09
title_Mrs            11.75
deck_1               10.11
title_Miss            9.91
cabin_count           9.33
embarked_C            7.63
fare_code_H           7.05
embarked_S            6.78
fare_code_L           6.52
fare_code_M           5.68
```
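The same ranking can be produced more compactly with a pandas Series; a small alternative sketch, not part of the original:

```python
# Rank features by relative importance using a Series indexed by feature name
importances = pd.Series(forest.feature_importances_, index=features_list)
relative = 100.0 * importances / importances.max()
print(relative[relative > fi_threshold].sort_values(ascending=False))
```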
```python
# Adapted from http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html
pos = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 2, 2)
# important_features/feature_values are already sorted descending,
# so reverse them to put the largest bar at the top of the chart
plt.barh(pos, feature_values[::-1], align='center')
plt.yticks(pos, important_features[::-1])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.draw()
plt.show()

# Remove non-important features from the feature set, and reorder those remaining
#X = X[:, important_idx][:, sorted_idx]
```
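The commented-out line above hints at the follow-up step. One way to carry it through and check whether pruning helps, assuming the same data and settings (this refit is not in the original):

```python
# Keep only the columns that cleared the importance threshold, refit,
# and compare the out-of-bag estimate against the full-feature forest
X_reduced = X[:, important_idx]
forest_reduced = RandomForestClassifier(oob_score=True, n_estimators=10000)
forest_reduced.fit(X_reduced, y)
print('OOB accuracy (reduced features): {:.4f}'.format(forest_reduced.oob_score_))
```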
```python
input_df.head()
```
| | survived | pclass | age | sibsp | parch | cabin_count | USA_based | sex_female | sex_male | embarked_C | ... | fare_code_M | deck_1 | deck_A | deck_B | deck_C | deck_D | deck_E | deck_F | deck_G | deck_T |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 29 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | 0 | 1 | 2 | 2 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 1 | 2 | 1 | 2 | 2 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 1 | 30 | 1 | 2 | 2 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 1 | 25 | 1 | 2 | 2 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
5 rows × 43 columns
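The sex_*, embarked_*, title_*, fare_code_*, and deck_* columns are one-hot indicator columns. A hypothetical sketch of how a frame like this could be built with pd.get_dummies, assuming a raw frame `raw_df` with categorical columns sex, embarked, title, fare_code, and deck (none of these names come from the original notebook):

```python
# Hypothetical preprocessing: one-hot encode the categorical columns of a raw
# frame; 'raw_df' and its column names are assumptions for illustration only
cat_cols = ['sex', 'embarked', 'title', 'fare_code', 'deck']
dummies = pd.get_dummies(raw_df[cat_cols], prefix=cat_cols)
input_df = pd.concat([raw_df.drop(columns=cat_cols), dummies], axis=1)
```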