def test_R_C_kmeans_prostate(self):
    print "\nStarting prostate.csv"
    rScript = h2o.find_file('R/tests/test_R_C_kmeans_prostate.R')
    rLibrary = h2o.find_file('R/h2o-package/R/H2O.R')
    # Run k-means with k = 5 on column 2 (Age)
    # Loop to see if we get the same centers
    shCmdString = "R -f " + rScript + " --args " + rLibrary + " " + \
        h2o.nodes[0].http_addr + ":" + str(h2o.nodes[0].port)
    (ps, outpath, errpath) = h2o.spawn_cmd('rtest_with_h2o', shCmdString.split())
    h2o.spawn_wait(ps, outpath, errpath, timeout=10)
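    # h2o.spawn_cmd / spawn_wait fork the R process and poll it to completion;
    # a rough stdlib-only equivalent (a sketch, not h2o's implementation):
    #
    #   import subprocess
    #   with open('rtest.out', 'w') as out, open('rtest.err', 'w') as err:
    #       rc = subprocess.call(shCmdString.split(), stdout=out, stderr=err)
    #       assert rc == 0, "R script failed with exit code %s" % rc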

# pick one dataset; only the last assignment takes effect
# csvPathname = '1B/reals_100000x1000_15f.data'
# csvPathname = '1B/reals_1B_15f.data'
csvPathname = '1B/reals_1000000x1000_15f.data'
hex_key = 'r1'
# 'bucket' is assumed to be defined earlier in the enclosing test
parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
    hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2)
inspect = h2o_cmd.runInspect(key=hex_key)
print "numRows:", inspect['numRows']
print "numCols:", inspect['numCols']
inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
print "inspect offset = -1:", h2o.dump_json(inspect)
for execExpr in exprList:
    start = time.time()
    execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
    print 'exec took', time.time() - start, 'seconds'
    print "result:", result
h2o.check_sandbox_for_errors()

def test_randomFilter(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    # use SEED in the filename so the file isn't cached?
    csvFilenameAll = [
        ('syn_1mx8_' + str(SEED) + '.csv', 'cA', 5),
    ]
    ### csvFilenameList = random.sample(csvFilenameAll, 1)
    csvFilenameList = csvFilenameAll
    ### h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)
    for (csvFilename, key2, timeoutSecs) in csvFilenameList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random 1mx8 csv"
        write_syn_dataset(csvPathname, 1000000, SEEDPERFILE)
        # creates csvFilename.hex from the file in the importFolder dir
        parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=2000)
        print csvFilename, 'parse time:', parseKey['response']['time']
        print "Parse result['destination_key']:", parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvFilename
        h2e.exec_zero_list(zeroList)
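
# write_syn_dataset is defined elsewhere in the real test file; a minimal
# sketch of what it presumably does -- seeded random rows so each file is
# reproducible (the 8-column layout matches the 'syn_1mx8' name, but the
# value ranges are assumptions):
def write_syn_dataset(csvPathname, rowCount, seed):
    r = random.Random(seed)
    dsf = open(csvPathname, 'w')
    for i in range(rowCount):
        rowData = [r.randint(0, 7) for j in range(8)]
        dsf.write(','.join(map(str, rowData)) + '\n')
    dsf.close()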
# does n+1 so use maxCol 6

def test_exec2_multi_node(self):
    for node in h2o.nodes:
        # get this key known to this node
        execExpr = "r0 = c(0); r1 = c(0); r2 = c(0);"
        print "Sending request to node: %s" % node
        h2e.exec_expr(node=node, execExpr=execExpr, timeoutSecs=30)
        # test the store expression
        execExpr = "(r1==0) ? c(0) : c(1)"
        print "Sending request to node: %s" % node
        h2e.exec_expr(node=node, execExpr=execExpr, timeoutSecs=30)

    global OUTSTANDING
    if not OUTSTANDING:
        OUTSTANDING = min(10, len(h2o.nodes))
    execTrial = 0
    worker_resultq = multiprocessing.Queue()
    while execTrial <= TRIALMAX:
        start = time.time()
        workers = []
        for o in range(OUTSTANDING):
            np = execTrial % len(h2o.nodes)
            retryDelaySecs = 5
            timeoutSecs = 60
            bucket = None
            csvPathname = None
            src_key = None
            hex_key = 'a'
            tmp = multiprocessing.Process(target=function_no_keyboard_intr,
                args=(worker_resultq, execit, np, bucket, csvPathname, src_key,
                      hex_key, timeoutSecs, retryDelaySecs))
            # the excerpt ends before the workers are started; starting each
            # one and reaping the batch (below) is the assumed completion
            workers.append(tmp)
            tmp.start()
            execTrial += 1
        for worker in workers:
            worker.join()
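
# function_no_keyboard_intr is not shown in this excerpt; a plausible sketch
# (the name comes from the call above, the body is an assumption): ignore
# SIGINT in the child so a Ctrl-C in the parent doesn't spray tracebacks from
# every worker, and hand the target's result (or exception) back on the queue.
def function_no_keyboard_intr(resultq, function, *args):
    import signal
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    try:
        resultq.put(function(*args))
    except Exception, e:
        resultq.put(e)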

start = time.time()
hex_key = "a.hex"
csvPathname = "datasets/" + csvFilename
parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, header=0, timeoutSecs=1000)
print "hdfs parse of", csvPathname, "took", time.time() - start, 'secs'

start = time.time()
print "Saving", csvFilename, 'to HDFS'
print "Using /tmp2 to avoid the '.'-prefixed files in /tmp (they kill import)"
print "Unique per-user to avoid permission issues"
username = getpass.getuser()
# reuse the file name to avoid running out of space
csvPathname = "tmp2/a%s.%s.csv" % ('_h2o_export_files', username)
path = "hdfs://" + h2o.nodes[0].hdfs_name_node + "/" + csvPathname
h2o.nodes[0].export_files(src_key=hex_key, path=path, force=1, timeoutSecs=timeoutSecs)
print "export_files of", hex_key, "to", path, "took", time.time() - start, 'secs'
trial += 1

print "Re-Loading", csvFilename, 'from HDFS'
start = time.time()
hex_key = "a2.hex"
time.sleep(2)
d = h2i.import_only(path=csvPathname, schema='hdfs', timeoutSecs=1000)
print h2o.dump_json(d)
parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, header=0, timeoutSecs=1000)
print "hdfs re-parse of", csvPathname, "took", time.time() - start, 'secs'

csvPathname = csvDirname + "/" + csvFilename
trialStart = time.time()

# import *****************************************
hex_key = csvFilename + "_" + str(trial) + ".hex"
start = time.time()
# the import has to overwrite existing keys. no parse
h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key,
    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
elapsed = time.time() - start
print "import", trial, "end", 'took', elapsed, 'seconds', \
    "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

# STOREVIEW ***************************************
print "\nTrying StoreView after the import"
for node in h2o.nodes:
    h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

# exec does a read lock on all existing keys
if DO_EXEC:
    # fails
    execExpr = "A.hex=c(0,1)"
    # execExpr = "A.hex=0;"
    h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)
    h2o_cmd.runInspect(key='A.hex')

print "\nTrying StoreView after the exec"
h2o_cmd.runStoreView(timeoutSecs=30, view=10000)
# for node in h2o.nodes:
#     h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)
print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."

def predict_and_compare_csvs(model_key, translate=None):
    # hexKey, predictHexKey, csvPredictPathname, csvFullname and skipSrcHeader
    # are assumed to be defined by the enclosing test
    start = time.time()
    predict = h2o.nodes[0].generate_predictions(model_key=model_key,
        data_key=hexKey, destination_key=predictHexKey)
    print "generate_predictions end on", hexKey, "took", time.time() - start, 'seconds'
    h2o.check_sandbox_for_errors()
    inspect = h2o_cmd.runInspect(key=predictHexKey)
    h2o_cmd.infoFromInspect(inspect, 'predict.hex')
    h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
    h2o.check_sandbox_for_errors()

    print "Do a check of the original output col against the predicted output"
    (rowNum1, originalOutput) = compare_csv(csvFullname, col=-1,
        msg="Original", translate=translate, skipHeader=skipSrcHeader)
    (rowNum2, predictOutput) = compare_csv(csvPredictPathname, col=0,
        msg="Predicted", skipHeader=True)
    # both source and predict have headers, so no expected mismatch?
    expHeaderMismatch = 0 if skipSrcHeader else 1
    if (rowNum1 + expHeaderMismatch) != rowNum2:
        raise Exception("original rowNum1: %s + %s not the same as downloaded predict rowNum2: %s" \
            % (rowNum1, expHeaderMismatch, rowNum2))

    wrong = 0
    for rowNum, (o, p) in enumerate(zip(originalOutput, predictOutput)):
        # the excerpt ends here; counting o != p mismatches into 'wrong' is
        # the assumed continuation
        if str(o) != str(p):
            wrong += 1

def parse_file(f):
    v = h2o.nodes[0].import_files(f)['succeeded'][0]
    return h2o.nodes[0].parse(v['key'], timeoutSecs=3600)['destination_key']
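
# compare_csv (used by predict_and_compare_csvs above) is not defined in this
# excerpt; a minimal sketch inferred from its call sites -- the exact header
# accounting in the real helper may differ:
def compare_csv(csvPathname, col=0, msg=None, translate=None, skipHeader=False):
    import csv
    output = []
    f = open(csvPathname, 'r')
    for rowNum, row in enumerate(csv.reader(f)):
        if skipHeader and rowNum == 0:
            continue
        value = row[col]
        if translate:
            value = translate.get(value, value)
        output.append(value)
    f.close()
    print msg, "column", col, "has", len(output), "values"
    return (len(output), output)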
print "minLeaves:", treeStats['minLeaves']
print "meanLeaves:", treeStats['meanLeaves']
print "meanDepth:", treeStats['meanDepth']
print "errs[0]:", errs[0]
print "errs[-1]:", errs[-1]
print "errs:", errs
(classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
# we iterate over params, so can't really do this check
# self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
print "classErrorPctList:", classErrorPctList
self.assertEqual(len(classErrorPctList), 7, "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict")
# FIX! should update this expected classification error
predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)
eList.append(classErrorPctList[4])
fList.append(trainElapsed)
if DO_PLOT:
if TRY == 'max_depth':
xLabel = 'max_depth'
elif TRY == 'ntrees':
xLabel = 'ntrees'
elif TRY == 'nbins':
xLabel = 'nbins'
else:
raise Exception("huh? %s" % TRY)
xList.append(paramDict[xLabel])
if DO_PLOT:
eLabel = 'class 4 pctWrong'
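    # the excerpt stops before the plotting call; a minimal matplotlib sketch
    # of what DO_PLOT presumably drives (matplotlib here is an assumption,
    # not necessarily the helper the original test used):
    import matplotlib.pyplot as plt
    fig, ax1 = plt.subplots()
    ax1.plot(xList, eList, 'b-')
    ax1.set_xlabel(xLabel)
    ax1.set_ylabel(eLabel, color='b')
    ax2 = ax1.twinx()
    ax2.plot(xList, fList, 'r-')
    ax2.set_ylabel('train elapsed (secs)', color='r')
    plt.show()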

row.update({'trainViewTime': trainViewTime})
h2o_rf.simpleCheckRFView(None, rfView, **kwargs)
modelKey = rfView['model_key']

#Test File Parsing#
testParseWallStart = time.time()
print "Testing file is:", files['test']
csvPathname = files['test']
destKey = files['test'] + '.hex'
parseKey = h2i.parseImportFolderFile(None, csvPathname,
    importFolderPath, key2=destKey,
    timeoutSecs=300, retryDelaySecs=5, pollTimeoutSecs=120)
testParseWallTime = time.time() - testParseWallStart
#End Test File Parse#

inspect = h2o.nodes[0].inspect(parseKey['destination_key'])
row.update({'nTestRows': inspect['num_rows']})
row.update({'testParseWallTime': testParseWallTime})
modelKey = rfView['model_key']

#RFView (score on test)#
kwargs = configs.copy()
testRFStart = time.time()
kwargs.update({'model_key': modelKey, 'ntree': 10})
rfView = h2o_cmd.runRFView(data_key=destKey, timeoutSecs=180,
    doSimpleCheck=False, **kwargs)
testViewTime = time.time() - testRFStart
#End RFView (score on test)#

pprint(rfView)
errRate = rfView['confusion_matrix']['classification_error']
row.update({'testViewTime': testViewTime})
overallWallTime = time.time() - overallWallStart
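
# The excerpt ends as the overall wall time is computed; presumably it joins
# the other timings in 'row'. A hypothetical sketch of persisting the row
# (the CSV filename and the use of csv.DictWriter are assumptions):
row.update({'overallWallTime': overallWallTime})
import csv
f = open('rf_perf_results.csv', 'ab')
writer = csv.DictWriter(f, fieldnames=sorted(row.keys()))
writer.writerow(row)
f.close()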