def test_exec2_operators(self):
h2o.beta_features = True
bucket = ''
csvPathname = 'testdata/airlines/year2013.csv'
hexKey = 'i.hex'
parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
for resultKey, execExpr in initList:
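# note: the resultKey unpacked from initList is deliberately not forwarded here;
# exec2 takes its destination from the lhs of execExpr itself (an assumption
# based on how these execExpr strings are written)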
h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=4)
start = time.time()
# h2e.exec_expr_list_rand(len(h2o.nodes), exprList, 'r1.hex', maxTrials=200, timeoutSecs=10)
h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10)
h2o.check_sandbox_for_errors()
print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
def test_cs_training(self):
h2o.beta_features = True
parseResult = h2i.import_parse(bucket='smalldata', path='kaggle/creditsample-training.csv.gz', schema='put', timeoutSecs=120)
h2o_cmd.runRF(parseResult=parseResult, ntrees=100, max_depth=100, timeoutSecs=500,
response='SeriousDlqin2yrs')
# h2b.browseJsonHistoryAsUrlLastMatch("RFView")
def test_c7_rel(self):
print "Running with h2o.beta_features=True for all"
h2o.beta_features = True
DO_INSPECT = True
print "Since the python is not necessarily run as user=0xcust..., can't use a schema='put' here"
print "Want to be able to run python as jenkins"
print "I guess for big 0xcust files, we don't need schema='put'"
print "For files that we want to put (for testing put), we can get non-private files"
csvFilename = 'part-00000b'
if getpass.getuser() == 'kevin':
importFolderPath = '/home/hduser/data/'
else:
importFolderPath = '/mnt/0xcustomer-datasets/c2'
csvPathname = importFolderPath + "/" + csvFilename
# FIX! does 'separator=' take ints or ?? hex format
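# The import/parse and first Inspect are truncated in this excerpt; origInspect
# and destination_key presumably come from something like (hedged sketch):
#   parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500)
#   destination_key = parseResult['destination_key']
#   origInspect = h2o_cmd.runInspect(key=destination_key, timeoutSecs=30)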
num_rows = origInspect['num_rows']
num_cols = origInspect['num_cols']
lenNodes = len(h2o.nodes)
for trial in range(10):
h2p.green_print("\nTrial", trial)
# we want to hit the boundary conditions, so use two levels of random choice
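# good_choices isn't defined in this excerpt; a plausible sketch (assumption):
#   def good_choices(n):
#       # first level: boundary vs. interior; second level: the actual value
#       if random.randint(0, 1) == 0:
#           return random.choice([0, 1, n - 1])
#       return random.randint(0, n - 1)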
offset = good_choices(num_rows)
view = good_choices(num_cols)
# randomize the node used
nodeX = random.randint(0, lenNodes - 1)
print "nodeX:", nodeX, "offset:", offset, "view:", view
h2o.beta_features = False
inspect_and_check(nodeX, destination_key, offset, view, origInspect)
print "trying Inspect2 by flipping h2o.nodes[0].beta_features"
h2o.beta_features = True
# delay between the two inspects...bug around not getting autoframe in storeview?
time.sleep(1)
inspect_and_check(nodeX, destination_key, offset, view, origInspect)
h2o.beta_features = False
# a fvec frame should have been created in the storeView
time.sleep(1)
# loop looking for the autoframe to show up
# o = len(origStoreViewResult['keys'])
o = h2i.count_keys_at_all_nodes()
retry = 0
okay = False
while retry < 10 and not okay:
    newStoreViewResult = h2o_cmd.runStoreView(offset=0, view=1024, timeoutSecs=60)
    # the autoframe has shown up once the key count grows past the original count
    p = h2i.count_keys_at_all_nodes()
    okay = p > o
    if not okay:
        time.sleep(1)
    retry += 1
def test_rf_hhp_2_fvec(self):
h2o.beta_features = True
# NAs cause the CM to zero... don't run for now
csvPathname = 'hhp_9_17_12.predict.data.gz'
parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put')
h2o_cmd.runRF(parseResult=parseResult, ntrees=6, timeoutSecs=30)
(importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=50)
parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key='c.hex',
timeoutSecs=500, noPoll=True, doSummary=False) # can't do summary until parse result is correct json
h2o.check_sandbox_for_errors()
print "\nparseResult", h2o.dump_json(parseResult)
# wait for it to show up in jobs?
time.sleep(2)
# no pattern waits for all
h2o_jobs.pollWaitJobs(pattern=None, timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
# hack it because no response from Parse2
if h2o.beta_features:
parseResult = {'destination_key': 'c.hex'}
else:
print csvPathname, 'parse time:', parseResult['response']['time']
print "Parse result['destination_key']:", parseResult['destination_key']
inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=30)
h2o.beta_features = True
inspect = h2o_cmd.runInspect(key='c.hex', timeoutSecs=30)
h2o.check_sandbox_for_errors()
# have to avoid this on nflx data. colswap with exec
# Exception: rjson error in gbm: Argument 'response' error: Only integer or enum/factor columns can be classified
if importFolderPath == 'manyfiles-nflx-gz':
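# this exec rewrites col 378 in place as a 0/1 indicator (threshold 15), so the
# response becomes an integer column that GBM classification will accept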
execExpr = 'c.hex=colSwap(c.hex,378,(c.hex[378]>15 ? 1 : 0))'
resultExec = h2o_cmd.runExec(expression=execExpr)
def test_four_billion_rows(self):
h2o.beta_features = True
timeoutSecs = 1500
importFolderPath = "billions"
csvFilenameList = [
"four_billion_rows.csv",
]
for csvFilename in csvFilenameList:
csvPathname = importFolderPath + "/" + csvFilename
start = time.time()
# Parse*********************************
parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
timeoutSecs=timeoutSecs, pollTimeoutSecs=180)
elapsed = time.time() - start
print "Parse result['destination_key']:", parseResult['destination_key']
print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_KMeans_covtype_fvec(self):
h2o.beta_features = True
csvFilenameList = [
('covtype.data', 800),
]
importFolderPath = "standard"
for csvFilename, timeoutSecs in csvFilenameList:
# creates csvFilename.hex from file in importFolder dir
csvPathname = importFolderPath + "/" + csvFilename
parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
timeoutSecs=2000, pollTimeoutSecs=60)
inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
print "\n" + csvPathname, \
" numRows:", "{:,}".format(inspect['numRows']), \
" numCols:", "{:,}".format(inspect['numCols'])
for trial in range(3):
# hack
if h2o.beta_features:
h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
print "Filling in the parseTestResult['destination_key'] for h2o"
parseTestResult['destination_key'] = testKey
elapsed = time.time() - start
print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
"%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
print "test parse result:", parseTestResult['destination_key']
# GBM (train iterate)****************************************
inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key'])
paramsDict = define_gbm_params()
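# define_gbm_params isn't shown; a plausible sketch of the dict of candidate
# values that pickRandGbmParams draws from (assumption):
#   def define_gbm_params():
#       return {
#           'learn_rate': [0.1, 0.2],
#           'max_depth': [5, 10, 20],
#           'min_rows': [1, 10],
#       }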
for trial in range(3):
h2o.beta_features = True
# translate it (only really need to do this once; move it out of the loop?)
h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])
# use this to set any defaults you want if the pick doesn't set
params = {
'response': 54,
'ignored_cols_by_name': '0,1,2,3,4',
'ntrees': 2,
'validation': parseTestResult['destination_key'],
}
h2o_gbm.pickRandGbmParams(paramsDict, params)
print "Using these parameters for GBM: ", params
kwargs = params.copy()
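# The GBM call itself is truncated; the picked params are typically forwarded
# like this in h2o-2 tests (hedged sketch):
#   start = time.time()
#   gbmResult = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, **kwargs)
#   print "GBM trial", trial, "took", time.time() - start, "seconds"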
def test_rf_big_rand_tree_fvec(self):
h2o.beta_features = True
SYNDATASETS_DIR = h2o.make_syn_dir()
csvFilename = "syn.csv"
csvPathname = SYNDATASETS_DIR + '/' + csvFilename
rowCount = 5000
colCount = 1000
write_syn_dataset(csvPathname, rowCount, colCount)
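# write_syn_dataset isn't shown; a hedged sketch that satisfies the
# "all 0's and all 1's" requirement noted below (assumption):
#   def write_syn_dataset(csvPathname, rowCount, colCount):
#       dsf = open(csvPathname, 'w')
#       dsf.write(','.join(['0'] * (colCount + 1)) + '\n')  # all-0 row
#       dsf.write(','.join(['1'] * (colCount + 1)) + '\n')  # all-1 row
#       for i in range(rowCount - 2):
#           rowData = [str(random.randint(0, 1)) for j in range(colCount + 1)]
#           dsf.write(','.join(rowData) + '\n')
#       dsf.close()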
for trial in range(1):
# make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
src_key = csvFilename + "_" + str(trial)
hex_key = csvFilename + "_" + str(trial) + ".hex"
seed = random.randint(0, sys.maxint)
# some cols can be dropped due to constant 0 or 1. make sure data set has all 0's and all 1's above
# to guarantee no dropped cols!
# kwargs = {'ntree': 3, 'depth': 50, 'seed': seed}
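# The put/parse and RF call are truncated here; a hedged sketch of how the
# trial body typically continues (note the commented kwargs above uses the
# older non-fvec names 'ntree'/'depth'; fvec RF takes ntrees/max_depth, as in
# the runRF calls earlier in this file):
#   parseResult = h2i.import_parse(path=csvPathname, schema='put',
#       src_key=src_key, hex_key=hex_key, timeoutSecs=60)
#   kwargs = {'ntrees': 3, 'max_depth': 50, 'seed': seed}
#   h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=600, **kwargs)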