# Assumed h2o-2 test-harness imports (not shown in the original snippets):
import time
import h2o, h2o_cmd, h2o_glm
import h2o_import as h2i
import h2o_exec as h2e

pollTimeoutSecs = 120
retryDelaySecs = 10
for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
    csvPathname = importFolderPath + "/" + csvFilepattern
    if DO_DOUBLE_IMPORT:
        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
        importFullList = importResult['files']
        importFailList = importResult['fails']
        print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

    # this accumulates performance stats into a benchmark log over multiple runs
    # good for tracking whether we're getting slower or faster
    h2o.cloudPerfH2O.change_logfile(csvFilename)
    h2o.cloudPerfH2O.message("")
    h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

    start = time.time()
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
        hex_key="A.hex", timeoutSecs=timeoutSecs,
        retryDelaySecs=retryDelaySecs,
        pollTimeoutSecs=pollTimeoutSecs,
        benchmarkLogging=benchmarkLogging)
    elapsed = time.time() - start

    print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
    print "Parse result['destination_key']:", parseResult['destination_key']
    h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)
    fileMBS = (totalBytes/1e6)/elapsed
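# csvFilenameList isn't defined in these snippets; each entry unpacks as
# (csvFilepattern, csvFilename, totalBytes, timeoutSecs). A purely
# hypothetical example of its shape:
# csvFilenameList = [
#     ('file_1.dat', 'file_1.dat', 100e6, 700),
#     ('file_[1-2].dat', 'file_12.dat', 200e6, 700),
# ]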
retryDelaySecs = 10
for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
    csvPathname = importFolderPath + "/" + csvFilepattern
    if DO_DOUBLE_IMPORT:
        (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
        importFullList = importResult['files']
        importFailList = importResult['fails']
        print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

    # this accumulates performance stats into a benchmark log over multiple runs
    # good for tracking whether we're getting slower or faster
    h2o.cloudPerfH2O.change_logfile(csvFilename)
    h2o.cloudPerfH2O.message("")
    h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

    start = time.time()
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
        hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs,
        retryDelaySecs=retryDelaySecs,
        pollTimeoutSecs=pollTimeoutSecs,
        benchmarkLogging=benchmarkLogging)
    elapsed = time.time() - start

    print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
    print "Parse result['destination_key']:", parseResult['destination_key']
    h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)
    if totalBytes is not None:
        fileMBS = (totalBytes/1e6)/elapsed
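        # fileMBS is the parse throughput in MB/sec; it feeds the
        # cloudPerfH2O benchmark-log message shown later in this file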
for trial in range(trialMax):
    # (importResult, importPattern) = h2i.import_only(path=importFolderPath+"/*")
    if DO_IMPORT_CHECK:
        for i in range(2):
            csvPathname = importFolderPath + "/" + csvFilepattern
            (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets',
                path=csvPathname, schema='local', timeoutSecs=timeoutSecs)
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

    # creates csvFilename.hex from file in importFolder dir
    h2o.cloudPerfH2O.change_logfile(csvFilename)
    h2o.cloudPerfH2O.message("")
    h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")
    csvPathname = importFolderPath + "/" + csvFilepattern
    start = time.time()
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
        hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs,
        retryDelaySecs=retryDelaySecs,
        pollTimeoutSecs=pollTimeoutSecs,
        noPoll=noPoll,
        benchmarkLogging=benchmarkLogging)
    elapsed = time.time() - start
    print "Parse#", trial, parseResult['destination_key'], "took", elapsed, "seconds", \
        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
    h2o_cmd.infoFromInspect(inspect, csvPathname)
    # second timed parse of the same data, this time into the fixed key "A.hex"
    # (call head reconstructed from the identical import_parse pattern above)
    start = time.time()
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
        hex_key="A.hex", timeoutSecs=timeoutSecs,
        retryDelaySecs=retryDelaySecs,
        pollTimeoutSecs=pollTimeoutSecs,
        benchmarkLogging=benchmarkLogging)
    elapsed = time.time() - start
    print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
    print "Parse result['destination_key']:", parseResult['destination_key']
    h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)
    fileMBS = (totalBytes/1e6)/elapsed
    msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
    print msg
    h2o.cloudPerfH2O.message(msg)
    h2o_cmd.checkKeyDistribution()

    # are the unparsed keys slowing down exec?
    h2i.delete_keys_at_all_nodes(pattern="manyfile")

    execExpr = 'B.hex=A.hex'
    h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
    h2o_cmd.checkKeyDistribution()

    execExpr = 'C.hex=B.hex'
    h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
    h2o_cmd.checkKeyDistribution()

    execExpr = 'D.hex=C.hex'
    h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
    h2o_cmd.checkKeyDistribution()
csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
csvPathname = SYNDATASETS_DIR + '/' + csvFilename
print "Creating random", csvPathname
write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
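# write_syn_dataset isn't defined in this snippet; a minimal sketch of a
# compatible helper (comma-separated random values; the real generator's
# value distribution is an assumption):
import random
def write_syn_dataset(csvPathname, rowCount, colCount, SEED):
    r = random.Random(SEED)
    dsf = open(csvPathname, "w+")
    for _ in range(rowCount):
        # colCount feature columns plus a trailing 0/1 response column,
        # so that column index colCount (used as 'response' below) exists
        rowData = [str(r.randint(0, 1)) for _ in range(colCount)]
        rowData.append(str(r.randint(0, 1)))
        dsf.write(",".join(rowData) + "\n")
    dsf.close()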
start = time.time()
parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=60)
elapsed = time.time() - start
print "Parse result['destination_key']:", parseResult['destination_key']
algo = "Parse"
l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
    len(h2o.nodes), tryHeap, algo, csvFilename, elapsed)
print l
h2o.cloudPerfH2O.message(l)
# We should be able to see the parse result?
inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
print "\n" + csvFilename
y = colCount
# just limit to 2 iterations... assume it scales with more iterations
kwargs = {
    'response': y,
    'max_iter': 2,
    'family': 'binomial',
    'lambda': 1.e-4,
    'alpha': 0.6,
    'n_folds': 1,
    'beta_epsilon': 1.e-4,
}
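# Sketch: how these kwargs are typically consumed (mirrors the runGLM call at
# the end of this file; timeoutSecs=300 here is an assumption):
glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=300, **kwargs)
h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)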
# Parse (train)****************************************
start = time.time()
parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', header=0,
    hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
elapsed = time.time() - start
print "train parse end on ", csvPathname, 'took', elapsed, 'seconds', \
    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
print "train parse result:", parseTrainResult['destination_key']

# Logging to a benchmark file
algo = "Parse"
l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
print l
h2o.cloudPerfH2O.message(l)

inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
print "\n" + csvPathname, \
    " numRows:", "{:,}".format(inspect['numRows']), \
    " numCols:", "{:,}".format(inspect['numCols'])
numRows = inspect['numRows']
numCols = inspect['numCols']
### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])
# GBM (train iterate)****************************************
ntrees = 10
for max_depth in [5, 10, 20, 40]:
    params = {
        'learn_rate': .2,
        'nbins': 1024,
        'ntrees': ntrees,
        'max_depth': max_depth,  # this entry and the dict close reconstructed from the loop variable
    }
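    # Sketch: a typical GBM call consuming these params (the runGBM usage and
    # the params.copy() idiom are assumptions based on similar h2o-2 benchmarks):
    kwargs = params.copy()
    start = time.time()
    gbm = h2o_cmd.runGBM(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, **kwargs)
    elapsed = time.time() - start
    print "GBM max_depth", max_depth, "took", elapsed, "seconds"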
# pop open a browser on the cloud
### h2b.browseTheCloud()
# to avoid sticky ports?
### base_port += 2
for trial in range(trialMax):
    csvPathname = importFolderPath + "/" + csvFilepattern
    (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
    importFullList = importResult['files']
    importFailList = importResult['fails']
    print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

    # creates csvFilename.hex from file in importFolder dir
    h2o.cloudPerfH2O.change_logfile(csvFilename)
    h2o.cloudPerfH2O.message("")
    h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")
    csvPathname = importFolderPath + "/" + csvFilepattern
    start = time.time()
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
        hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs,
        retryDelaySecs=retryDelaySecs,
        pollTimeoutSecs=pollTimeoutSecs,
        noPoll=noPoll,
        benchmarkLogging=benchmarkLogging)

    if noPoll:
        # 'i' is the index of the current file in csvFilenameList; it comes
        # from an enclosing enumerate loop not shown in this snippet
        if (i+1) < len(csvFilenameList):
            time.sleep(1)
            h2o.check_sandbox_for_errors()
            (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1]
            csvPathname = importFolderPath + "/" + csvFilepattern
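        # with noPoll=True the parse above returns immediately; the full
        # benchmark blocks on the outstanding parse jobs later (e.g. via
        # h2o_jobs.pollWaitJobs -- the exact wait helper is an assumption)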
retryDelaySecs = 10
for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
    csvPathname = importFolderPath + "/" + csvFilepattern
    # double import still causing problems?
    # (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
    # importFullList = importResult['files']
    # importFailList = importResult['fails']
    # print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

    # this accumulates performance stats into a benchmark log over multiple runs
    # good for tracking whether we're getting slower or faster
    h2o.cloudPerfH2O.change_logfile(csvFilename)
    h2o.cloudPerfH2O.message("")
    h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

    start = time.time()
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
        hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs,
        retryDelaySecs=retryDelaySecs,
        pollTimeoutSecs=pollTimeoutSecs,
        benchmarkLogging=benchmarkLogging)
    elapsed = time.time() - start
    print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
        "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
    print "Parse result['destination_key']:", parseResult['destination_key']
    h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)
    if totalBytes is not None:
        fileMBS = (totalBytes/1e6)/elapsed
    # GLM parameters (dict head reconstructed; GLMkwargs is the name used below)
    GLMkwargs = {
        'n_folds': 1,
        'family': 'binomial',
        'alpha': 0.2,
        'lambda': 1e-5,
    }
    start = time.time()
    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs)
    elapsed = time.time() - start
    h2o.check_sandbox_for_errors()
    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)

    msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
    print msg
    h2o.cloudPerfH2O.message(msg)
    h2o_cmd.checkKeyDistribution()