#!/usr/bin/env python
import h2o
from h2o.estimators import H2OGradientBoostingEstimator

h2o.init()
air = h2o.import_file("allyears_tiny.csv")
print(air.head())

# Predictor columns and response column
x_air = ['Origin', 'Dest', 'Distance', 'UniqueCarrier', 'Month', 'DayofMonth', 'DayOfWeek']
y_air = 'IsDepDelayed'

# 80/10/10 train/validation/test split
air_train, air_valid, air_test = air.split_frame(ratios=[0.8, 0.1])

# GBM on the airlines data
my_gbm = H2OGradientBoostingEstimator(distribution="multinomial", ntrees=10,
                                      max_depth=3, learn_rate=0.01)
my_gbm.train(x=x_air, y=y_air, training_frame=air_train, validation_frame=air_valid)
print(my_gbm)
print(my_gbm.varimp(use_pandas=True))  # variable importances

pred = my_gbm.predict(air_test)
print(pred.head())

import h2o
h2o.init() # Will set up H2O cluster using all available cores
h2o.init(ip="123.45.67.89", port=54321)  # Or connect to an existing H2O cluster at a specific IP and port
# To create an H2OFrame object from a python tuple:
df = h2o.H2OFrame(list(zip(*((1, 2, 3),
                             ('a', 'b', 'c'),
                             (0.1, 0.2, 0.3)))))
df
# To create an H2OFrame object from a python list:
df = h2o.H2OFrame(list(zip(*[[1, 2, 3],
                             ['a', 'b', 'c'],
                             [0.1, 0.2, 0.3]])))
df
# To create an H2OFrame object from a python dict (or collections.OrderedDict):
df = h2o.H2OFrame({'A': [1, 2, 3],
'B': ['a', 'b', 'c'],
'C': [0.1, 0.2, 0.3]})
df
# To create an H2OFrame object from a dict with specified column types:
df2 = h2o.H2OFrame.from_python({'A': [1, 2, 3],
'B': ['a', 'a', 'b'],
'C': ['hello', 'all', 'world'],
'D': ['12MAR2015:11:00:00', '13MAR2015:12:00:00', '14MAR2015:13:00:00']},
column_types=['numeric', 'enum', 'string', 'time'])
df2
df2.types
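# A small follow-up sketch (an assumption, not part of the original example):
# inspect the parsed frame and pull it back into pandas (requires pandas).
df2.describe()                  # per-column summary statistics
local_df = df2.as_data_frame()  # download the H2OFrame as a pandas DataFrame
print(local_df.dtypes)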
import numpy as np
import h2o
from h2o.estimators import H2OAutoEncoderEstimator
from h2o.estimators import H2ODeepLearningEstimator
from dataprocessor import ProcessData, Filter
from featureeng import Measures
from parser import DataFrameParser
# Initialize server
h2o.init()
# AutoEncoder anomaly removal process
p_train = ProcessData.trainData(moving_k_closest_average=True, standard_deviation=True, probability_distribution=True, bin_classification=True)
p_test = ProcessData.testData(moving_k_closest_average=True, standard_deviation=True, probability_from_file=True, bin_classification=True)
# Convert to H2O frames
h_test = h2o.H2OFrame(p_test)
h_test.set_names(list(p_test.columns))
h_train = h2o.H2OFrame(p_train)
h_train.set_names(list(p_train.columns))
# Define autoencoder
anomaly_model = H2OAutoEncoderEstimator(
activation="Rectifier",
hidden=[25, 12, 25],
sparse=True,
l1=1e-4,
epochs=100
)
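# Sketch of the steps that would typically follow (an assumption; this fragment
# ends before them): train the autoencoder on the feature columns selected just
# below, then score per-row reconstruction error (MSE) to flag anomalies, e.g.
#   anomaly_model.train(x=anomaly_train_columns, training_frame=h_train)
#   reconstruction_error = anomaly_model.anomaly(h_test)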
# Select relevant features
anomaly_train_columns = list(p_train.columns)

hTrain, hValidate = hData.split_frame(ratios=[_validation_ratio_2])
hTest = h2o.H2OFrame(pTest)
hTest.set_names(list(pTest.columns))
# Training model
print "\nTraining Model"
print "----------------------------------------------------------------------------------------------------------------"
training_columns = list(pData.columns)
training_columns.remove(response_column)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
# Create h2o frame using filtered pandas frame
filtered = h2o.H2OFrame(filtered_train)
filtered.set_names(list(filtered_train.columns))
model = H2ODeepLearningEstimator(hidden=[64, 64, 64], score_each_iteration=True, variable_importances=True, epochs=100, activation='Tanh')
model.train(x=training_columns, y=response_column, training_frame=filtered, validation_frame=hValidate)
print "\nModel Performance"
print "----------------------------------------------------------------------------------------------------------------"
# Evaluate model
print(model.model_performance(test_data=hTest))

df1 = pTrain.iloc[i, :]
filtered_train = filtered_train.append(df1, ignore_index=True)
count += 1
Progress.printProgress(iteration=(i+1), total=hTrain.nrow, decimals=1, prefix="Progress", suffix="Complete")
print(filtered_train)
print("Original Size :", hTrain.nrow)
print("Filtered Size :", len(filtered_train))
print("Removed Rows :", (hTrain.nrow - len(filtered_train)))
# Feature Engineering
pTrain = ProcessData.trainDataToFrame(filtered_train, moving_k_closest_average=True, standard_deviation=True, probability_distribution=True)
pTest = ProcessData.testData(moving_k_closest_average=True, standard_deviation=True, probability_from_file=True)
# Convert pandas to h2o frame - for model training
hValidate = h2o.H2OFrame(pValidate)
hValidate.set_names(list(pValidate.columns))
hTrain = h2o.H2OFrame(pTrain)
hTrain.set_names(list(pTrain.columns))
hTest = h2o.H2OFrame(pTest)
hTest.set_names(list(pTest.columns))
# Training model
print "\nTraining Model"
print "----------------------------------------------------------------------------------------------------------------"
training_columns = list(pData.columns)
training_columns.remove(response_column)
training_columns.remove('UnitNumber')
training_columns.remove('Time')

anomaly_series = list(set(anomaly_series))
print(anomaly_series)
print(len(anomaly_series))
# Remove anomalies
df = pData.drop(pData.index[anomaly_series])
# Feature engineering
data_frame = ProcessData.trainDataToFrame(df, moving_k_closest_average=True, standard_deviation=True)
testing_frame = ProcessData.testData(moving_k_closest_average=True, standard_deviation=True)
# Create h2o frame
hData = h2o.H2OFrame(data_frame)
hData.set_names(list(data_frame.columns))
hTesting = h2o.H2OFrame(testing_frame)
hTesting.set_names(list(testing_frame.columns))
# Split data into training and validation sets
hTrain, hValidate = hData.split_frame(ratios=[0.8])
h2o.export_file(hTrain, "hTrainMy.csv", force=True)
h2o.export_file(hValidate, "hValidateMy.csv", force=True)
h2o.export_file(hTesting, "hTestingMy.csv", force=True)
training_columns = list(pData.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')
response_column = 'RUL'

train_w = h2o.H2OFrame(sample_weight,
                       column_names=[self.weight],
                       column_types=['numeric'])
train_frame = train_frame.cbind(train_w)
valid_frame = None
valid_X = None
valid_y = None
model = None
if eval_set is not None:
valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
valid_y = h2o.H2OFrame(eval_set[0][1],
column_names=[self.target],
column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
valid_frame = valid_X.cbind(valid_y)
if sample_weight is not None:
if sample_weight_eval_set is None:
sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
column_names=[self.weight],
column_types=['numeric'])
valid_frame = valid_frame.cbind(valid_w)
try:
train_kwargs = dict()
params = copy.deepcopy(self.params)
if not isinstance(self, H2OAutoMLModel):
# AutoML needs max_runtime_secs in initializer, all others in train() method
max_runtime_secs = params.pop('max_runtime_secs')
train_kwargs = dict(max_runtime_secs=max_runtime_secs)
if valid_frame is not None:
train_kwargs['validation_frame'] = valid_frame
if sample_weight is not None:
train_kwargs['weights_column'] = self.weight
model = self.make_instance(**params)

# df = pd.read_csv(args.test_csv)
df = pd.read_csv(args.test_csv, dtype=model_config['dtypes'],
parse_dates=model_config['datetime_cols'])
print('Dataset read, shape {}'.format(df.shape))
print('time elapsed: {}'.format(time.time()-start_time))
# preprocessing
df, df_pred = preprocess(df, model_config, type='test')
print('time elapsed: {}'.format(time.time()-start_time))
# final data shape
print('final df shape {}'.format(df.shape))
# convert data to h2o format
print('convert data to h2o format..')
test = h2o.H2OFrame(df)
print('time elapsed: {}'.format(time.time()-start_time))
# make prediction
aml = h2o.load_model(model_config['model_path'])
if model_config['mode'] == 'regression':
df_pred['prediction'] = aml.predict(test).as_data_frame().squeeze()
if model_config['mode'] == 'classification':
df_pred['prediction'] = aml.predict(test)['p1'].as_data_frame().squeeze()
df_pred[['line_id', 'prediction']].to_csv(args.prediction_csv, index=False)
print('Prediction time: {}'.format(time.time() - start_time))

"""
Create a partial dependence plot, which gives a graphical depiction of the marginal effect of a variable on the
response. The effect of a variable is measured as the change in the mean response.
:param H2OFrame data: An H2OFrame object used for scoring and constructing the plot.
:param cols: Feature(s) for which partial dependence will be calculated.
:param destination_key: A key reference to the created partial dependence tables in H2O.
:param nbins: Number of bins used. For categorical columns, make sure the number of bins exceeds the level count.
:param plot: A boolean specifying whether to plot the partial dependence table.
:param plot_stddev: A boolean specifying whether to add the standard error to the partial dependence plot.
:param figsize: Dimension/size of the returned plots; adjust to fit your output cells.
:param server: A boolean specifying whether to run matplotlib in "server" mode; if True, plots are saved rather than displayed.
:returns: Plot and list of calculated mean response tables for each feature requested.
"""
if not isinstance(data, h2o.H2OFrame): raise ValueError("data must be an instance of H2OFrame")
assert_is_type(cols, [str])
assert_is_type(destination_key, None, str)
assert_is_type(nbins, int)
assert_is_type(plot, bool)
assert_is_type(figsize, (int, int))
# Check cols specified exist in frame data
for xi in cols:
if xi not in data.names:
raise H2OValueError("Column %s does not exist in the training frame" % xi)
kwargs = {}
kwargs["cols"] = cols
kwargs["model_id"] = self.model_id
kwargs["frame_id"] = data.frame_id
kwargs["nbins"] = nbinsif isinstance(self, H2ONBModel):
# NB can only handle weights of 0 / 1
if sample_weight is not None:
sample_weight = (sample_weight != 0).astype(int)
if sample_weight_eval_set is not None:
sample_weight_eval_set = [(sample_weight_eval_set[0] != 0).astype(int)]
train_X = h2o.H2OFrame(X.to_pandas())
self.col_types = train_X.types
train_y = h2o.H2OFrame(y,
column_names=[self.target],
column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
train_frame = train_X.cbind(train_y)
if sample_weight is not None:
train_w = h2o.H2OFrame(sample_weight,
column_names=[self.weight],
column_types=['numeric'])
train_frame = train_frame.cbind(train_w)