from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.python.data import Dataset

tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

california_housing_dataframe = pd.read_csv("https://dl.google.com/mlcc/mledu-datasets/california_housing_train.csv", sep=",")
# Randomize the row order so the head/tail train/validation split below is
# not biased by any ordering in the source file.
california_housing_dataframe = shuffle(california_housing_dataframe)
def preprocess_features(california_housing_dataframe):
  """Prepares input features from California housing data set.
  Args:
    california_housing_dataframe: A Pandas DataFrame expected to contain data
      from the California housing data set.
  Returns:
    A DataFrame that contains the features to be used for the model, including
    synthetic features.
  """
  selected_features = california_housing_dataframe[
    ["latitude",
     "longitude",
     "housing_median_age",
     "total_rooms",
     "total_bedrooms",
     "population",
     "households",
     "median_income"]]
  processed_features = selected_features.copy()
  # Create a synthetic feature.
  processed_features["rooms_per_person"] = (
    california_housing_dataframe["total_rooms"] /
    california_housing_dataframe["population"])
  return processed_features
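# Illustration of the synthetic feature (made-up numbers, not from the data
# set): a block group with total_rooms=1500 and population=500 gets
# rooms_per_person = 1500 / 500 = 3.0.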
def preprocess_targets(california_housing_dataframe):
  """Prepares target features (i.e., labels) from California housing data set.
  Args:
    california_housing_dataframe: A Pandas DataFrame expected to contain data
      from the California housing data set.
  Returns:
    A DataFrame that contains the target feature.
  """
  output_targets = pd.DataFrame()
  # Scale the target to be in units of thousands of dollars.
  output_targets["median_house_value"] = (
    california_housing_dataframe["median_house_value"] / 1000.0)
  return output_targets
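# Illustration (made-up value): a raw median_house_value of 250000.0 becomes
# 250.0 after scaling, so the RMSE reported during training below is in
# thousands of dollars.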

# Choose the first 12000 (out of 17000) examples for training.
training_examples = preprocess_features(california_housing_dataframe.head(12000))
training_targets = preprocess_targets(california_housing_dataframe.head(12000))

# Choose the last 5000 (out of 17000) examples for validation.
validation_examples = preprocess_features(california_housing_dataframe.tail(5000))
validation_targets = preprocess_targets(california_housing_dataframe.tail(5000))

# Double-check that we've done the right thing.
print("Training examples summary:")
display.display(training_examples.describe())
print("Validation examples summary:")
display.display(validation_examples.describe())

print("Training targets summary:")
display.display(training_targets.describe())
print("Validation targets summary:")
display.display(validation_targets.describe())

def construct_feature_columns(input_features):
  """Construct the TensorFlow Feature Columns.
  Args:
    input_features: The names of the numerical input features to use.
  Returns:
    A set of feature columns
  """
  # NOTE: this plain numeric version feeds the first model below; it is
  # intentionally redefined further down with bucketized and crossed columns.
  return set([tf.feature_column.numeric_column(my_feature)
              for my_feature in input_features])
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
  """Feeds features and labels to the model in batches.

  Args:
    features: pandas DataFrame of features
    targets: pandas DataFrame of targets
    batch_size: Size of batches to be passed to the model
    shuffle: True or False. Whether to shuffle the data.
    num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely
  Returns:
    Tuple of (features, labels) for next data batch
  """

  # Convert pandas data into a dict of np arrays.
  features = {key: np.array(value) for key, value in dict(features).items()}

  # Construct a dataset.
  ds = Dataset.from_tensor_slices((features, targets))  # warning: 2GB limit

  # Shuffle at the example level *before* batching, if specified, so each
  # batch is drawn from a well-mixed buffer rather than whole batches being
  # shuffled.
  if shuffle:
    ds = ds.shuffle(10000)

  # Configure batching/repeating.
  ds = ds.batch(batch_size).repeat(num_epochs)

  # Return the next batch of data.
  features, labels = ds.make_one_shot_iterator().get_next()
  return features, labels

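# A minimal usage sketch (illustrative, not part of the original listing):
# the Estimator API expects a zero-argument callable, so my_input_fn is
# wrapped in a lambda, as done inside train_model below, e.g.
#
#   eval_input_fn = lambda: my_input_fn(
#       validation_examples, validation_targets["median_house_value"],
#       num_epochs=1, shuffle=False)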

def train_model(
    learning_rate,
    steps,
    batch_size,
    feature_columns,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets):
  """Trains a linear regression model.

  In addition to training, this function also prints training progress information,
  as well as a plot of the training and validation loss over time.

  Args:
    learning_rate: A `float`, the learning rate.
    steps: A non-zero `int`, the total number of training steps. A training step
      consists of a forward and backward pass using a single batch.
    batch_size: A non-zero `int`, the batch size.
    feature_columns: A `set` specifying the input feature columns to use.
    training_examples: A `DataFrame` containing one or more columns from
      `california_housing_dataframe` to use as input features for training.
    training_targets: A `DataFrame` containing exactly one column from
      `california_housing_dataframe` to use as target for training.
    validation_examples: A `DataFrame` containing one or more columns from
      `california_housing_dataframe` to use as input features for validation.
    validation_targets: A `DataFrame` containing exactly one column from
      `california_housing_dataframe` to use as target for validation.

  Returns:
    A `LinearRegressor` object trained on the training data.
  """

  periods = 10
  # Integer division: Estimator.train expects an integer step count.
  steps_per_period = steps // periods

  # Create a linear regressor object.
  my_optimizer = tf.train.FtrlOptimizer(learning_rate=learning_rate)
  my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
  linear_regressor = tf.estimator.LinearRegressor(
      feature_columns=feature_columns,
      optimizer=my_optimizer
  )

  training_input_fn = lambda: my_input_fn(training_examples,
                                          training_targets["median_house_value"],
                                          batch_size=batch_size)
  predict_training_input_fn = lambda: my_input_fn(training_examples,
                                                  training_targets["median_house_value"],
                                                  num_epochs=1,
                                                  shuffle=False)
  predict_validation_input_fn = lambda: my_input_fn(validation_examples,
                                                    validation_targets["median_house_value"],
                                                    num_epochs=1,
                                                    shuffle=False)

  # Train the model, but do so inside a loop so that we can periodically assess
  # loss metrics.
  print("Training model...")
  print("RMSE (on training data):")
  training_rmse = []
  validation_rmse = []
  for period in range(0, periods):
    # Train the model, starting from the prior state.
    linear_regressor.train(
        input_fn=training_input_fn,
        steps=steps_per_period
    )
    # Take a break and compute predictions.
    training_predictions = linear_regressor.predict(input_fn=predict_training_input_fn)
    training_predictions = np.array([item['predictions'][0] for item in training_predictions])
    validation_predictions = linear_regressor.predict(input_fn=predict_validation_input_fn)
    validation_predictions = np.array([item['predictions'][0] for item in validation_predictions])

    # Compute training and validation loss (sklearn convention: y_true first,
    # then y_pred).
    training_root_mean_squared_error = math.sqrt(
        metrics.mean_squared_error(training_targets, training_predictions))
    validation_root_mean_squared_error = math.sqrt(
        metrics.mean_squared_error(validation_targets, validation_predictions))
    # Occasionally print the current loss.
    print("  period %02d : %0.2f" % (period, training_root_mean_squared_error))
    # Add the loss metrics from this period to our list.
    training_rmse.append(training_root_mean_squared_error)
    validation_rmse.append(validation_root_mean_squared_error)
  print("Model training finished.")

  # Output a graph of loss metrics over periods.
  plt.ylabel("RMSE")
  plt.xlabel("Periods")
  plt.title("Root Mean Squared Error vs. Periods")
  plt.tight_layout()
  plt.plot(training_rmse, label="training")
  plt.plot(validation_rmse, label="validation")
  plt.legend()

  return linear_regressor

_ = train_model(
    learning_rate=1.0,
    steps=500,
    batch_size=100,
    feature_columns=construct_feature_columns(training_examples),
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets)

def get_quantile_based_boundaries(feature_values, num_buckets):
  """Returns the num_buckets - 1 quantile boundaries that split
  feature_values into num_buckets equally populated buckets."""
  boundaries = np.arange(1.0, num_buckets) / num_buckets
  quantiles = feature_values.quantile(boundaries)
  return [quantiles[q] for q in quantiles.keys()]
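# A quick worked example (illustrative values, not from the housing data):
# with num_buckets=4 the function evaluates the 0.25, 0.50 and 0.75
# quantiles, so
#
#   get_quantile_based_boundaries(pd.Series(range(101)), 4)
#
# returns [25.0, 50.0, 75.0]: three boundaries defining four equally
# populated buckets.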
def construct_feature_columns():
  """Construct the TensorFlow Feature Columns.

  Note: this intentionally redefines the earlier construct_feature_columns;
  this version returns bucketized columns plus a longitude x latitude cross
  instead of raw numeric columns.

  Returns:
    A set of feature columns
  """
  households = tf.feature_column.numeric_column("households")
  longitude = tf.feature_column.numeric_column("longitude")
  latitude = tf.feature_column.numeric_column("latitude")
  housing_median_age = tf.feature_column.numeric_column("housing_median_age")
  median_income = tf.feature_column.numeric_column("median_income")
  rooms_per_person = tf.feature_column.numeric_column("rooms_per_person")

  # Divide households into 7 buckets.
  bucketized_households = tf.feature_column.bucketized_column(
    households, boundaries=get_quantile_based_boundaries(
      training_examples["households"], 7))

  # Divide longitude into 10 buckets.
  bucketized_longitude = tf.feature_column.bucketized_column(
    longitude, boundaries=get_quantile_based_boundaries(
      training_examples["longitude"], 10))

  # Divide latitude into 10 buckets.
  bucketized_latitude = tf.feature_column.bucketized_column(
    latitude, boundaries=get_quantile_based_boundaries(
      training_examples["latitude"], 10))

  # Divide housing_median_age into 7 buckets.
  bucketized_housing_median_age = tf.feature_column.bucketized_column(
    housing_median_age, boundaries=get_quantile_based_boundaries(
      training_examples["housing_median_age"], 7))

  # Divide median_income into 7 buckets.
  bucketized_median_income = tf.feature_column.bucketized_column(
    median_income, boundaries=get_quantile_based_boundaries(
      training_examples["median_income"], 7))

  # Divide rooms_per_person into 7 buckets.
  bucketized_rooms_per_person = tf.feature_column.bucketized_column(
    rooms_per_person, boundaries=get_quantile_based_boundaries(
      training_examples["rooms_per_person"], 7))

  # Cross the bucketized longitude and latitude columns so the model can
  # learn a separate weight per (longitude bucket, latitude bucket) cell;
  # crossed values are hashed into 1000 buckets.
  long_x_lat = tf.feature_column.crossed_column(
    set([bucketized_longitude, bucketized_latitude]), hash_bucket_size=1000)

  feature_columns = set([
    bucketized_longitude,
    bucketized_latitude,
    bucketized_housing_median_age,
    bucketized_households,
    bucketized_median_income,
    bucketized_rooms_per_person,
    long_x_lat,
  ])

  return feature_columns
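# A sketch of how these columns would be used (the hyperparameters are
# assumptions carried over from the earlier run, not part of this listing):
#
#   _ = train_model(
#       learning_rate=1.0,
#       steps=500,
#       batch_size=100,
#       feature_columns=construct_feature_columns(),
#       training_examples=training_examples,
#       training_targets=training_targets,
#       validation_examples=validation_examples,
#       validation_targets=validation_targets)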