Scale to large datasets, Find good features, Preprocess with Cloud MLE
Raw Data to Features
What makes a good feature?
Related to the objective
TensorFlow
How to write and deploy a model with TensorFlow
Embedding function
def create_embed(sparse_col):
    dim = 10  # default embedding dimension
    if hasattr(sparse_col, 'bucket_size'):
        nbins = sparse_col.bucket_size
        if nbins is not None:
            # rule of thumb: embedding dimension ~ log2 of the number of categories
            dim = 1 + int(round(np.log2(nbins)))
    return tflayers.embedding_column(sparse_col, dimension=dim)
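Both models below call get_features(), which is not shown in this excerpt. A minimal, hypothetical sketch, limited to the columns the later code actually references and assuming tflayers is the TF 1.x tf.contrib.layers feature-column module (np being NumPy):

import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as tflayers

def get_features():
    # Real-valued columns referenced by the wide-and-deep model; the chapter's
    # full feature set (departure delay, taxi-out time, distance, ...) is omitted here.
    real = {
        colname: tflayers.real_valued_column(colname)
        for colname in ['dep_lat', 'dep_lon', 'arr_lat', 'arr_lon']
    }
    # Sparse (categorical) columns; the hash bucket size is a placeholder.
    sparse = {
        colname: tflayers.sparse_column_with_hash_bucket(colname, hash_bucket_size=1000)
        for colname in ['origin', 'dest']
    }
    return real, sparse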
DNN model
def dnn_model(output_dir):
    real, sparse = get_features()
    all = {}
    all.update(real)
    # create embeddings of the sparse columns
    embed = {
        colname: create_embed(col)
        for colname, col in sparse.items()
    }
    all.update(embed)
    estimator = tflearn.DNNClassifier(
        model_dir=output_dir,
        feature_columns=all.values(),
        hidden_units=[64, 16, 4])
    # report RMSE in addition to the classifier's default metrics
    estimator = tf.contrib.estimator.add_metrics(estimator, my_rmse)
    return estimator
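The my_rmse function passed to add_metrics is likewise assumed rather than shown. A hedged sketch, assuming a binary classifier head whose predictions dict exposes class probabilities and the TF 1.x tf.metrics API:

import tensorflow as tf

def my_rmse(labels, predictions):
    # probability of the positive class from the classifier head
    prob = predictions['probabilities'][:, 1]
    # labels arrive as [batch_size, 1]; flatten and cast before comparing
    labels = tf.cast(tf.squeeze(labels), tf.float32)
    return {'rmse': tf.metrics.root_mean_squared_error(labels, prob)}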
## Logs
| Severity | Timestamp | Replica | Message |
| --- | --- | --- | --- |
|  | 2021-02-25 11:57:47.592 IST | ps-replica-1 | "Cancellation requested for RunGraph." |
| Info | 2021-02-25 11:57:47.630 IST | worker-replica-2 | "loss = 169.66252, step = 1002" |
| Info | 2021-02-25 11:57:47.674 IST | worker-replica-2 | "Loss for final step: 169.66252." |
| Info | 2021-02-25 11:57:47.837 IST | worker-replica-1 | "loss = 169.63147, step = 1002" |
| Info | 2021-02-25 11:57:47.868 IST | worker-replica-2 | "model dir gs://qwiklabs-gcp-00-dfadb4cd0b7b/flights/chapter9/output" |
| Info | 2021-02-25 11:57:47.881 IST | worker-replica-1 | "Loss for final step: 169.63147." |
| Info | 2021-02-25 11:57:47.898 IST | worker-replica-2 | "Module completed; cleaning up." |
| Info | 2021-02-25 11:57:47.898 IST | worker-replica-2 | "Clean up finished." |
| Info | 2021-02-25 11:57:47.899 IST | worker-replica-2 | "Task completed successfully." |
| Warning | 2021-02-25 11:57:47.948 IST | worker-replica-0 | "Training with estimator made no steps. Perhaps input is empty or misspecified." |
| Info | 2021-02-25 11:57:47.949 IST | worker-replica-0 | "Loss for final step: None." |
| Info | 2021-02-25 11:57:48.080 IST | worker-replica-1 | "model dir gs://qwiklabs-gcp-00-dfadb4cd0b7b/flights/chapter9/output" |
Wide and Deep model
Extend the model with additional features: create features that associate airports with broad geographic zones and, from those, derive simplified air traffic corridors via feature crossing. You start by creating location buckets for an n×n grid covering the USA and then assign each departure and arrival airport to its grid cell.
def parse_hidden_units(s):
    return [int(item) for item in s.split(',')]

def wide_and_deep_model(output_dir, nbuckets=5,
                        hidden_units='64,32', learning_rate=0.01):
    real, sparse = get_features()
    # lat/lon cols can be discretized to "air traffic corridors"
    latbuckets = np.linspace(20.0, 50.0, nbuckets).tolist()
    lonbuckets = np.linspace(-120.0, -70.0, nbuckets).tolist()
    disc = {}
    disc.update({
        'd_{}'.format(key): tflayers.bucketized_column(real[key], latbuckets)
        for key in ['dep_lat', 'arr_lat']
    })
    disc.update({
        'd_{}'.format(key): tflayers.bucketized_column(real[key], lonbuckets)
        for key in ['dep_lon', 'arr_lon']
    })
    # cross columns that make sense in combination
    sparse['dep_loc'] = tflayers.crossed_column(
        [disc['d_dep_lat'], disc['d_dep_lon']],
        nbuckets * nbuckets)   # departure grid cell: n*n possibilities
    sparse['arr_loc'] = tflayers.crossed_column(
        [disc['d_arr_lat'], disc['d_arr_lon']],
        nbuckets * nbuckets)   # arrival grid cell: n*n possibilities
    sparse['dep_arr'] = tflayers.crossed_column(
        [sparse['dep_loc'], sparse['arr_loc']],
        nbuckets ** 4)         # departure-arrival "corridor": n^4 possibilities
    sparse['ori_dest'] = tflayers.crossed_column(
        [sparse['origin'], sparse['dest']],
        hash_bucket_size=1000)
    # create embeddings of all the sparse columns
    embed = {
        colname: create_embed(col)
        for colname, col in sparse.items()
    }
    real.update(embed)
    # lin_opt = tf.train.FtrlOptimizer(learning_rate=learning_rate)
    # l_rate = learning_rate * 0.25
    # dnn_opt = tf.train.AdagradOptimizer(learning_rate=l_rate)
    estimator = tflearn.DNNLinearCombinedClassifier(
        model_dir=output_dir,
        linear_feature_columns=sparse.values(),
        dnn_feature_columns=real.values(),
        dnn_hidden_units=parse_hidden_units(hidden_units))
        # linear_optimizer=lin_opt,
        # dnn_optimizer=dnn_opt)
    estimator = tf.contrib.estimator.add_metrics(estimator, my_rmse)
    return estimator
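A hedged usage sketch of the estimator above, assuming the tf.estimator train-and-evaluate loop and that train_input_fn and eval_input_fn are defined elsewhere in the trainer package (the bucket path and step count here are placeholders):

import tensorflow as tf

estimator = wide_and_deep_model('gs://BUCKET/flights/chapter9/output',
                                nbuckets=5, hidden_units='64,32')
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=1000)
eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)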