Upload tfdecisiontrees_final.py

tfdecisiontrees_final.py  ADDED  (+274 -0)
@@ -0,0 +1,274 @@

# -*- coding: utf-8 -*-
"""TFDecisionTrees_Final.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1QCdVlNQ8LszC_v3ek10DUeO9V0IvVzpm

# Classification with TF Decision Trees
Source code from https://keras.io/examples/structured_data/classification_with_tfdf/
"""

!pip install huggingface_hub

!pip install numpy==1.20

!pip install folium==0.2.1

!pip install imgaug==0.2.6

!pip install tensorflow==2.8.0

!pip install -U tensorflow_decision_forests

!pip install ipykernel==4.10

!apt-get install -y git-lfs

!pip install wurlitzer

from huggingface_hub import notebook_login
from huggingface_hub.keras_mixin import push_to_hub_keras

notebook_login()

import math
import urllib
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_decision_forests as tfdf
import os
import tempfile

tmpdir = tempfile.mkdtemp()

try:
    from wurlitzer import sys_pipes
except:
    from colabtools.googlelog import CaptureLog as sys_pipes

input_path = "https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income"
input_column_header = "income_level"

# Load data

BASE_PATH = input_path
CSV_HEADER = [l.decode("utf-8").split(":")[0].replace(" ", "_")
              for l in urllib.request.urlopen(f"{BASE_PATH}.names")
              if not l.startswith(b"|")][2:]

CSV_HEADER.append(input_column_header)

train_data = pd.read_csv(f"{BASE_PATH}.data.gz", header=None, names=CSV_HEADER)
test_data = pd.read_csv(f"{BASE_PATH}.test.gz", header=None, names=CSV_HEADER)

train_data["migration_code-change_in_msa"] = train_data["migration_code-change_in_msa"].apply(lambda x: "Unansw" if x == " ?" else x)

test_data["migration_code-change_in_msa"] = test_data["migration_code-change_in_msa"].apply(lambda x: "Unansw" if x == " ?" else x)

print(train_data["migration_code-change_in_msa"].unique())
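
# Quick check (an added sketch, not part of the original notebook): count how
# many training rows had the " ?" placeholder replaced with "Unansw" above.
print((train_data["migration_code-change_in_msa"] == "Unansw").sum(),
      "train rows marked Unansw")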

for i, value in enumerate(CSV_HEADER):
    if value == "fill_inc_questionnaire_for_veteran's_admin":
        CSV_HEADER[i] = "fill_inc_veterans_admin"
    elif value == "migration_code-change_in_msa":
        CSV_HEADER[i] = "migration_code_chx_in_msa"
    elif value == "migration_code-change_in_reg":
        CSV_HEADER[i] = "migration_code_chx_in_reg"
    elif value == "migration_code-move_within_reg":
        CSV_HEADER[i] = "migration_code_move_within_reg"

# Inspect the classes of the label, the input_column_header in this case
classes = train_data["income_level"].unique().tolist()
print(f"Label classes: {classes}")

# Rename columns containing invalid characters
train_data = train_data.rename(columns={
    "fill_inc_questionnaire_for_veteran's_admin": "fill_inc_veterans_admin",
    "migration_code-change_in_msa": "migration_code_chx_in_msa",
    "migration_code-change_in_reg": "migration_code_chx_in_reg",
    "migration_code-move_within_reg": "migration_code_move_within_reg",
})
test_data = test_data.rename(columns={
    "fill_inc_questionnaire_for_veteran's_admin": "fill_inc_veterans_admin",
    "migration_code-change_in_msa": "migration_code_chx_in_msa",
    "migration_code-change_in_reg": "migration_code_chx_in_reg",
    "migration_code-move_within_reg": "migration_code_move_within_reg",
})

# Convert the label from string to integer.
# This stage is necessary if your classification label is represented as a
# string. Note: Keras expects classification labels to be integers.
target_labels = [" - 50000.", " 50000+."]
train_data[input_column_header] = train_data[input_column_header].map(target_labels.index)
test_data[input_column_header] = test_data[input_column_header].map(target_labels.index)

# Observe the shape of the training and test data
print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")
print(train_data.head().T)

# Define metadata

# Target column name.
TARGET_COLUMN_NAME = "income_level"
# Weight column name.
WEIGHT_COLUMN_NAME = "instance_weight"
# Numeric feature names.
NUMERIC_FEATURE_NAMES = [
    "age",
    "wage_per_hour",
    "capital_gains",
    "capital_losses",
    "dividends_from_stocks",
    "num_persons_worked_for_employer",
    "weeks_worked_in_year",
]

# Categorical features and their vocabulary lists.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    feature_name: sorted(
        [str(value) for value in list(train_data[feature_name].unique())]
    )
    for feature_name in CSV_HEADER
    if feature_name
    not in list(NUMERIC_FEATURE_NAMES + [WEIGHT_COLUMN_NAME, TARGET_COLUMN_NAME])
}
# All feature names.
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + list(
    CATEGORICAL_FEATURES_WITH_VOCABULARY.keys()
)
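
# Quick sanity check (an added sketch, not part of the original notebook): every
# CSV column should end up as either a numeric feature, a categorical feature,
# the weight column, or the target column.
print(f"{len(NUMERIC_FEATURE_NAMES)} numeric features, "
      f"{len(CATEGORICAL_FEATURES_WITH_VOCABULARY)} categorical features, "
      f"{len(CSV_HEADER)} CSV columns in total")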

"""Configure hyperparameters for the tree model."""

GROWING_STRATEGY = "BEST_FIRST_GLOBAL"
NUM_TREES = 250
MIN_EXAMPLES = 6
MAX_DEPTH = 5
SUBSAMPLE = 0.65
SAMPLING_METHOD = "RANDOM"
VALIDATION_RATIO = 0.1

# Implement training & evaluation procedure
def prepare_sample(features, target, weight):
    for feature_name in features:
        if feature_name in CATEGORICAL_FEATURES_WITH_VOCABULARY:
            if features[feature_name].dtype != tf.dtypes.string:
                # Convert categorical feature values to string.
                features[feature_name] = tf.strings.as_string(features[feature_name])
    return features, target, weight
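
# Minimal illustration (an added sketch, not part of the original notebook):
# prepare_sample casts integer-coded categorical features to tf.string so that
# TF-DF treats them with CATEGORICAL semantics. Check the cast on a tiny
# synthetic batch built from an arbitrary categorical feature name.
_cat_name = next(iter(CATEGORICAL_FEATURES_WITH_VOCABULARY))
_feats, _, _ = prepare_sample(
    {_cat_name: tf.constant([0, 1, 2])},
    tf.constant([0, 1, 0]),
    tf.constant([1.0, 1.0, 1.0]),
)
assert _feats[_cat_name].dtype == tf.string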


def run_experiment(model, train_data, test_data, num_epochs=1, batch_size=None):

    train_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(
        train_data, label="income_level", weight="instance_weight"
    ).map(prepare_sample, num_parallel_calls=tf.data.AUTOTUNE)
    test_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(
        test_data, label="income_level", weight="instance_weight"
    ).map(prepare_sample, num_parallel_calls=tf.data.AUTOTUNE)

    model.fit(train_dataset, epochs=num_epochs, batch_size=batch_size)
    _, accuracy = model.evaluate(test_dataset, verbose=0)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

# Create model inputs

def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        if feature_name in NUMERIC_FEATURE_NAMES:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.float32
            )
        else:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.string
            )
    return inputs

"""# Experiment 1: Decision Forests with raw features"""

# Decision Forest with raw features
def specify_feature_usages(inputs):
    feature_usages = []

    for feature_name in inputs:
        if inputs[feature_name].dtype == tf.dtypes.float32:
            feature_usage = tfdf.keras.FeatureUsage(
                name=feature_name, semantic=tfdf.keras.FeatureSemantic.NUMERICAL
            )
        else:
            feature_usage = tfdf.keras.FeatureUsage(
                name=feature_name, semantic=tfdf.keras.FeatureSemantic.CATEGORICAL
            )

        feature_usages.append(feature_usage)
    return feature_usages

# Create GB trees model
def create_gbt_model():
    gbt_model = tfdf.keras.GradientBoostedTreesModel(
        features=specify_feature_usages(create_model_inputs()),
        exclude_non_specified_features=True,
        growing_strategy=GROWING_STRATEGY,
        num_trees=NUM_TREES,
        max_depth=MAX_DEPTH,
        min_examples=MIN_EXAMPLES,
        subsample=SUBSAMPLE,
        validation_ratio=VALIDATION_RATIO,
        task=tfdf.keras.Task.CLASSIFICATION,
        loss="DEFAULT",
    )

    gbt_model.compile(metrics=[keras.metrics.BinaryAccuracy(name="accuracy")])
    return gbt_model

# Train and evaluate the model
gbt_model = create_gbt_model()
run_experiment(gbt_model, train_data, test_data)

# Inspect the model: model type, mask, input features, feature importance
print(gbt_model.summary())

inspector = gbt_model.make_inspector()
[field for field in dir(inspector) if not field.startswith("_")]

# Plot the model
tfdf.model_plotter.plot_model_in_colab(gbt_model, tree_idx=0, max_depth=3)

# Display variable importances
inspector.variable_importances()

print("Model type:", inspector.model_type())
print("Number of trees:", inspector.num_trees())
print("Objective:", inspector.objective())
print("Input features:", inspector.features())

inspector.features()
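
# Added sketch (an assumption, not part of the original notebook): print the top
# three features for each available importance measure so the output is easier
# to read than the raw variable_importances() dict.
for importance_name, entries in inspector.variable_importances().items():
    print(importance_name, entries[:3])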

# save_path = os.path.join(tmpdir, "raw/1/")
gbt_model.save("/Users/tdubon/TF_Model")

"""# Creating HF Space"""

from huggingface_hub import KerasModelHubMixin
from huggingface_hub.keras_mixin import push_to_hub_keras

push_to_hub_keras(gbt_model, repo_url="https://huggingface.co/keras-io/TF_Decision_Trees")

# Clone and configure
!git clone https://tdubon:[email protected]/tdubon/TF_Decision_Trees

# Use %cd (rather than !cd) so the directory change persists across cells, and
# point it at the directory the clone above actually creates.
%cd TF_Decision_Trees
!git config --global user.email "[email protected]"
# Tip: using the same email as for your huggingface.co account will link your commits to your profile
!git config --global user.name "tdubon"

!git add .
!git commit -m "Initial commit"
!git push

tf.keras.models.save_model(
    gbt_model, "/Users/tdubon/TFClassificationForest", overwrite=True, include_optimizer=True, save_format=None,
    signatures=None, options=None, save_traces=True)

# Commented out IPython magic to ensure Python compatibility.
gbt_model.make_inspector().export_to_tensorboard("/tmp/tb_logs/model_1")

# %load_ext tensorboard
# %tensorboard --logdir "/tmp/tb_logs"
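
# Added sketch (an assumption, not part of the original notebook): a TF-DF model
# saved with model.save() is a regular Keras SavedModel, so it can be reloaded
# with tf.keras.models.load_model (tensorflow_decision_forests must already be
# imported) and used for inference; the path mirrors the gbt_model.save() call above.
reloaded_model = tf.keras.models.load_model("/Users/tdubon/TF_Model")
reloaded_predictions = reloaded_model.predict(
    tfdf.keras.pd_dataframe_to_tf_dataset(
        test_data, label="income_level", weight="instance_weight"
    ).map(prepare_sample)
)
print(reloaded_predictions[:5])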