#!/usr/bin/env python
import h2o
from h2o.estimators import H2OGradientBoostingEstimator

h2o.init()

# Import the airlines sample data and inspect the first rows
air = h2o.import_file("allyears_tiny.csv")
print(air.head())

# Predictor columns and response column
x_air = ['Origin', 'Dest', 'Distance', 'UniqueCarrier', 'Month', 'DayofMonth', 'DayOfWeek']
y_air = 'IsDepDelayed'

# Split into train / validation / test frames (80% / 10% / 10%)
air_train, air_valid, air_test = air.split_frame(ratios=[0.8, 0.1], seed=1234)

# Train a GBM with 10 trees, depth 3 and a 0.01 learning rate
air_gbm = H2OGradientBoostingEstimator(distribution="multinomial", ntrees=10,
                                       max_depth=3, learn_rate=0.01)
air_gbm.train(x=x_air, y=y_air, training_frame=air_train, validation_frame=air_valid)
print(air_gbm)

# Predict on the held-out test frame
pred = air_gbm.predict(air_test)
print(pred.head())
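Not part of the original snippet, but as a hedged follow-up the same held-out split can also be scored explicitly; `model_performance` is standard h2o-py API, and the frame names follow the split above.
# Sketch: evaluate the trained GBM on the test frame created above
perf = air_gbm.model_performance(test_data=air_test)
print(perf)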
import h2o
h2o.init()  # Starts (or connects to) a local H2O cluster using all available cores
# Alternatively, connect to an already-running cluster at a specific address:
h2o.init(ip="123.45.67.89", port=54321)
# To create an H2OFrame object from a python tuple:
df = h2o.H2OFrame(list(zip(*((1, 2, 3),
                             ('a', 'b', 'c'),
                             (0.1, 0.2, 0.3)))))
df
# To create an H2OFrame object from a python list:
df = h2o.H2OFrame(list(zip(*[[1, 2, 3],
                             ['a', 'b', 'c'],
                             [0.1, 0.2, 0.3]])))
df
# To create an H2OFrame object from a python dict (or collections.OrderedDict):
df = h2o.H2OFrame({'A': [1, 2, 3],
'B': ['a', 'b', 'c'],
'C': [0.1, 0.2, 0.3]})
df
# To create an H2OFrame object from a dict with specified column types:
df2 = h2o.H2OFrame.from_python({'A': [1, 2, 3],
'B': ['a', 'a', 'b'],
'C': ['hello', 'all', 'world'],
'D': ['12MAR2015:11:00:00', '13MAR2015:12:00:00', '14MAR2015:13:00:00']},
column_types=['numeric', 'enum', 'string', 'time'])
df2
df2.types
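Going the other way is often handy as well. A minimal sketch, assuming pandas is installed (`as_data_frame()` is the standard H2OFrame-to-pandas conversion):
# Sketch: pull the H2OFrame created above back into pandas for local inspection
pdf = df2.as_data_frame()
print(pdf.dtypes)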
import h2o
import numpy as np
from h2o.estimators import H2OAutoEncoderEstimator
from h2o.estimators import H2ODeepLearningEstimator
from dataprocessor import ProcessData, Filter
from featureeng import Measures
from parser import DataFrameParser
# Initialize server
h2o.init()
# AutoEncoder anomaly removal process
p_train = ProcessData.trainData(moving_k_closest_average=True, standard_deviation=True, probability_distribution=True, bin_classification=True)
p_test = ProcessData.testData(moving_k_closest_average=True, standard_deviation=True, probability_from_file=True, bin_classification=True)
# Converting to h2o frame
h_test = h2o.H2OFrame(p_test)
h_test.set_names(list(p_test.columns))
h_train = h2o.H2OFrame(p_train)
h_train.set_names(list(p_train.columns))
# Define autoencoder
anomaly_model = H2OAutoEncoderEstimator(
    activation="Rectifier",
    hidden=[25, 12, 25],
    sparse=True,
    l1=1e-4,
    epochs=100
)
# Select relevant features
anomaly_train_columns = list(p_train.columns)
hTrain, hValidate = hData.split_frame(ratios=[_validation_ratio_2])
hTest = h2o.H2OFrame(pTest)
hTest.set_names(list(pTest.columns))
# Training model
print("\nTraining Model")
print("----------------------------------------------------------------------------------------------------------------")
training_columns = list(pData.columns)
training_columns.remove(response_column)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
# Create h2o frame using filtered pandas frame
filtered = h2o.H2OFrame(filtered_train)
filtered.set_names(list(filtered_train.columns))
model = H2ODeepLearningEstimator(hidden=[64, 64, 64], score_each_iteration=True, variable_importances=True, epochs=100, activation='Tanh')
model.train(x=training_columns, y=response_column, training_frame=filtered, validation_frame=hValidate)
print("\nModel Performance")
print("----------------------------------------------------------------------------------------------------------------")
# Evaluate model
print(model.model_performance(test_data=hTest))
df1 = pTrain.iloc[i, :]
filtered_train = filtered_train.append(df1, ignore_index=True)
count += 1
Progress.printProgress(iteration=(i+1), total=hTrain.nrow, decimals=1, prefix="Progress", suffix="Complete")
print(filtered_train)
print("Original Size :", hTrain.nrow)
print("Filtered Size :", len(filtered_train))
print("Removed Rows :", hTrain.nrow - len(filtered_train))
# Feature Engineering
pTrain = ProcessData.trainDataToFrame(filtered_train, moving_k_closest_average=True, standard_deviation=True, probability_distribution=True)
pTest = ProcessData.testData(moving_k_closest_average=True, standard_deviation=True, probability_from_file=True)
# Convert pandas to h2o frame - for model training
hValidate = h2o.H2OFrame(pValidate)
hValidate.set_names(list(pValidate.columns))
hTrain = h2o.H2OFrame(pTrain)
hTrain.set_names(list(pTrain.columns))
hTest = h2o.H2OFrame(pTest)
hTest.set_names(list(pTest.columns))
# Training model
print("\nTraining Model")
print("----------------------------------------------------------------------------------------------------------------")
training_columns = list(pData.columns)
training_columns.remove(response_column)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
anomaly_series = list(set(anomaly_series))
print(anomaly_series)
print(len(anomaly_series))
# Remove anomalies
df = pData.drop(pData.index[anomaly_series])
# Feature engineering
data_frame = ProcessData.trainDataToFrame(df, moving_k_closest_average=True, standard_deviation=True)
testing_frame = ProcessData.testData(moving_k_closest_average=True, standard_deviation=True)
# Create h2o frame
hData = h2o.H2OFrame(data_frame)
hData.set_names(list(data_frame.columns))
hTesting = h2o.H2OFrame(testing_frame)
hTesting.set_names(list(testing_frame.columns))
# Split data into training and validation sets
hTrain, hValidate = hData.split_frame(ratios=[0.8])
h2o.export_file(hTrain, "hTrainMy.csv", force=True)
h2o.export_file(hValidate, "hValidateMy.csv", force=True)
h2o.export_file(hTesting, "hTestingMy.csv", force=True)
training_columns = list(pData.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')
response_column = 'RUL'
train_w = h2o.H2OFrame(sample_weight,
                       column_names=[self.weight],
                       column_types=['numeric'])
train_frame = train_frame.cbind(train_w)
valid_frame = None
valid_X = None
valid_y = None
model = None
if eval_set is not None:
    valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
    valid_y = h2o.H2OFrame(eval_set[0][1],
                           column_names=[self.target],
                           column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
    valid_frame = valid_X.cbind(valid_y)
    if sample_weight is not None:
        if sample_weight_eval_set is None:
            sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
        valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
                               column_names=[self.weight],
                               column_types=['numeric'])
        valid_frame = valid_frame.cbind(valid_w)
try:
    train_kwargs = dict()
    params = copy.deepcopy(self.params)
    if not isinstance(self, H2OAutoMLModel):
        # AutoML needs max_runtime_secs in initializer, all others in train() method
        max_runtime_secs = params.pop('max_runtime_secs')
        train_kwargs = dict(max_runtime_secs=max_runtime_secs)
    if valid_frame is not None:
        train_kwargs['validation_frame'] = valid_frame
    if sample_weight is not None:
        train_kwargs['weights_column'] = self.weight
    model = self.make_instance(**params)
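The excerpt stops before the actual training call; the sketch below shows how the assembled `train_kwargs` would typically be passed to h2o-py's `train()` method. The predictor list `features` is a hypothetical placeholder, not taken from the original code.
    # Sketch only: `features` is a hypothetical list of predictor column names
    model.train(x=features, y=self.target,
                training_frame=train_frame,
                **train_kwargs)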
df = pd.read_csv(args.test_csv, dtype=model_config['dtypes'],
                 parse_dates=model_config['datetime_cols'])
print('Dataset read, shape {}'.format(df.shape))
print('time elapsed: {}'.format(time.time()-start_time))
# preprocessing
df, df_pred = preprocess(df, model_config, type='test')
print('time elapsed: {}'.format(time.time()-start_time))
# final data shape
print('final df shape {}'.format(df.shape))
# convert data to h2o format
print('convert data to h2o format..')
test = h2o.H2OFrame(df)
print('time elapsed: {}'.format(time.time()-start_time))
# make prediction
aml = h2o.load_model(model_config['model_path'])
if model_config['mode'] == 'regression':
    df_pred['prediction'] = aml.predict(test).as_data_frame().squeeze()
if model_config['mode'] == 'classification':
    df_pred['prediction'] = aml.predict(test)['p1'].as_data_frame().squeeze()
df_pred[['line_id', 'prediction']].to_csv(args.prediction_csv, index=False)
print('Prediction time: {}'.format(time.time() - start_time))
"""
Create a partial dependence plot, which gives a graphical depiction of the marginal effect of a variable on the
response. The effect of a variable is measured as the change in the mean response.
:param H2OFrame data: An H2OFrame object used for scoring and constructing the plot.
:param cols: Feature(s) for which partial dependence will be calculated.
:param destination_key: A key reference to the created partial dependence tables in H2O.
:param nbins: Number of bins used. For categorical columns, make sure the number of bins exceeds the level count.
:param plot: A boolean specifying whether to plot the partial dependence table.
:param plot_stddev: A boolean specifying whether to add the standard error to the partial dependence plot.
:param figsize: Dimension/size of the returned plots; adjust to fit your output cells.
:param server: ?
:returns: Plot and list of calculated mean response tables for each feature requested.
"""
if not isinstance(data, h2o.H2OFrame): raise ValueError("data must be an instance of H2OFrame")
assert_is_type(cols, [str])
assert_is_type(destination_key, None, str)
assert_is_type(nbins, int)
assert_is_type(plot, bool)
assert_is_type(figsize, (int, int))
# Check cols specified exist in frame data
for xi in cols:
    if xi not in data.names:
        raise H2OValueError("Column %s does not exist in the training frame" % xi)
kwargs = {}
kwargs["cols"] = cols
kwargs["model_id"] = self.model_id
kwargs["frame_id"] = data.frame_id
kwargs["nbins"] = nbins
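For context, a minimal usage sketch of the partial dependence API documented above, assuming a hypothetical already-trained model `gbm` and H2OFrame `train`:
# Sketch: compute partial dependence tables (and plots) for two hypothetical columns
pdp = gbm.partial_plot(data=train, cols=["Origin", "Distance"],
                       nbins=20, plot=True, plot_stddev=True)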
if isinstance(self, H2ONBModel):
    # NB can only handle weights of 0 / 1
    if sample_weight is not None:
        sample_weight = (sample_weight != 0).astype(int)
    if sample_weight_eval_set is not None:
        sample_weight_eval_set = [(sample_weight_eval_set[0] != 0).astype(int)]
train_X = h2o.H2OFrame(X.to_pandas())
self.col_types = train_X.types
train_y = h2o.H2OFrame(y,
                       column_names=[self.target],
                       column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
train_frame = train_X.cbind(train_y)
if sample_weight is not None:
    train_w = h2o.H2OFrame(sample_weight,
                           column_names=[self.weight],
                           column_types=['numeric'])
    train_frame = train_frame.cbind(train_w)
valid_frame = None
valid_X = None
valid_y = None
model = None
if eval_set is not None:
    valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
    valid_y = h2o.H2OFrame(eval_set[0][1],
                           column_names=[self.target],
                           column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
    valid_frame = valid_X.cbind(valid_y)
    if sample_weight is not None:
        if sample_weight_eval_set is None:
            sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]