import math
import os
import random
from subprocess import check_call, CalledProcessError

# Project-internal helpers used throughout these snippets (cactus_call, logger,
# getLogLevelString2, getDockerImage, getHostName, getTempDirectory, runGetChunks,
# DBServerProcess, blockUntilDBserverIsRunning, CactusBarEndAlignerWrapper, ...)
# are assumed importable from the surrounding Cactus codebase.

# The def line was truncated in this snippet; the signature below is inferred
# from the parameters used in the body.
def runCactusSetup(cactusDiskDatabaseString, seqMap, newickTreeString,
                   logLevel=None, outgroupEvents=None,
                   makeEventHeadersAlphaNumeric=False):
    logLevel = getLogLevelString2(logLevel)
    # We pass in the genome->sequence map as a series of paired arguments: [genome, faPath]*N.
    pairs = [[genome, faPath] for genome, faPath in seqMap.items()]
    args = [item for sublist in pairs for item in sublist]
    args += ["--speciesTree", newickTreeString, "--cactusDisk", cactusDiskDatabaseString,
             "--logLevel", logLevel]
    if makeEventHeadersAlphaNumeric:
        args += ["--makeEventHeadersAlphaNumeric"]
    if outgroupEvents:
        args += ["--outgroupEvents", " ".join(outgroupEvents)]
    masterMessages = cactus_call(check_output=True,
                                 parameters=["cactus_setup"] + args)
    logger.info("Ran cactus setup okay")
    return [i for i in masterMessages.split("\n") if i != '']
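
# Example invocation (values are illustrative, not taken from the source): each
# genome name in seqMap is paired with its FASTA path, and the species tree is
# supplied as a Newick string.
# runCactusSetup(cactusDiskDatabaseString,
#                {"human": "human.fa", "mouse": "mouse.fa"},
#                "(human:0.1,mouse:0.1);",
#                outgroupEvents=["mouse"])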

def findOccupiedPorts():
    """Attempt to find all currently taken TCP ports.
    Returns a set of ints, representing taken ports."""
    netstatOutput = cactus_call(parameters=["netstat", "-tuplen"], check_output=True)
    ports = set()
    for line in netstatOutput.split("\n"):
        fields = line.split()
        if len(fields) != 9:
            # Header or other garbage line
            continue
        port = int(fields[3].split(':')[-1])
        ports.add(port)
    logger.debug('Detected ports in use: %s' % repr(ports))
    return ports
                     fileStore=fileStore, calculateWhichEndsToComputeSeparately=True):
    endToAlign, sequencesInEndAlignment, basesInEndAlignment = line.split()
    sequencesInEndAlignment = int(sequencesInEndAlignment)
    basesInEndAlignment = int(basesInEndAlignment)
    # Sanity check: the two batch lists must stay in step with each other.
    assert len(endsToAlign) == len(endSizes), "End alignments and size lists out of sync"
    # If we have a really big end, align it separately in its own child job.
    if basesInEndAlignment >= veryLargeEndSize:
        alignmentID = self.addChild(CactusBarEndAlignerWrapper(self.phaseNode, self.constantsNode,
                                                               self.cactusDiskDatabaseString, self.flowerNames,
                                                               self.flowerSizes, True, [endToAlign], [basesInEndAlignment],
                                                               cactusWorkflowArguments=self.cactusWorkflowArguments)).rv()
        precomputedAlignmentIDs.append(alignmentID)
        logger.info("Precomputing very large end alignment for %s with %i caps and %i bases" %
                    (endToAlign, sequencesInEndAlignment, basesInEndAlignment))
    else:
        # Otherwise batch this end with others until the batch is big enough to be worth a job.
        endsToAlign.append(endToAlign)
        endSizes.append(basesInEndAlignment)
        if sum(endSizes) >= maxFlowerGroupSize:
            alignmentID = self.addChild(CactusBarEndAlignerWrapper(self.phaseNode,
                                                                   self.constantsNode,
                                                                   self.cactusDiskDatabaseString,
                                                                   self.flowerNames, self.flowerSizes, False,
                                                                   endsToAlign, endSizes,
                                                                   cactusWorkflowArguments=self.cactusWorkflowArguments)).rv()
            precomputedAlignmentIDs.append(alignmentID)
            endsToAlign = []
            endSizes = []
# Any ends left over after the loop form one final (smaller) batch.
if len(endsToAlign) > 0:
    precomputedAlignmentIDs.append(self.addChild(CactusBarEndAlignerWrapper(
    # image is pulled.
    # NOTE: singularity writes images in the current directory only
    #       when SINGULARITY_CACHEDIR is not set
    oldCWD = os.getcwd()
    os.chdir(os.path.dirname(imgPath))
    # --size is deprecated starting in 2.4, but is needed for 2.3 support. Keeping it in for now.
    try:
        check_call(["singularity", "pull", "--size", "2000", "--name", os.path.basename(imgPath),
                    "docker://" + getDockerImage()])
    except CalledProcessError:
        # The call failed; retry without --size, which is required for singularity 3+.
        check_call(["singularity", "pull", "--name", os.path.basename(imgPath),
                    "docker://" + getDockerImage()])
    os.chdir(oldCWD)
else:
    logger.info("Using pre-built singularity image: '{}'".format(imgPath))
    down the DB and save the results. After finishing, the data will
    eventually be written to snapshotFile.

    Returns a tuple of the launched server process, an updated version of the
    database config dbElem, and the path to the log file.
    """
    logPath = fileStore.getLocalTempFile()
    dbElem.setDbHost(getHostName())
    # Find a suitable port to run on.
    try:
        occupiedPorts = findOccupiedPorts()
        unoccupiedPorts = set(xrange(1025, _MAX_DBSERVER_PORT)) - occupiedPorts
        port = random.choice(list(unoccupiedPorts))
    except Exception:
        logger.warning("Can't find which ports are occupied--likely netstat is not installed."
                       " Choosing a random port to start the DB on, good luck!")
        port = random.randint(1025, _MAX_DBSERVER_PORT)
    dbElem.setDbPort(port)
    process = DBServerProcess(dbElem, logPath, fileStore, existingSnapshotID, snapshotExportID)
    process.daemon = True
    process.start()
    if not blockUntilDBserverIsRunning(dbElem):
        raise RuntimeError("Unable to launch DBserver in time.")
    return process, dbElem, logPath
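
# For reference, a minimal alternative to the netstat-based scan above is to let
# the kernel hand out a free ephemeral port by binding to port 0. Hypothetical
# sketch, not part of Cactus; note the small race window between closing the
# socket and the DB server binding the same port.
import socket

def pickFreePort():
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind(("", 0))
    port = s.getsockname()[1]  # the ephemeral port the kernel assigned
    s.close()
    return port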

def run(self, fileStore):
    logger.info("Raising sampling rate from %f to %f" % (self.oldSamplingRate, self.newSamplingRate))
    coveredSeedsFiles = [os.path.basename(fileStore.readGlobalFile(coveredSeedsID))
                         for coveredSeedsID in self.coveredSeedsIDs]
    samplingRates = fileStore.readGlobalFile(self.samplingRatesID)
    newSamplingRates = os.path.basename(fileStore.getLocalTempFile())
    logger.info("Work dir = %s" % os.path.dirname(samplingRates))
    cactus_call(outfile=newSamplingRates,
                parameters=["cactus_adjustSamplingRates",
                            "--oldSamplingRate", str(self.oldSamplingRate),
                            "--newSamplingRate", str(self.newSamplingRate),
                            "--oldSamplingRates", os.path.basename(samplingRates),
                            "--coveredSeedsFiles", ",".join(coveredSeedsFiles)])
    return fileStore.writeGlobalFile(newSamplingRates)
else:
    # Too many children, so we have to build a tree to avoid
    # bottlenecking on consistently serializing all the jobs.
    for job in self.queuedChildJobs:
        job.prepareForPromiseRegistration(fileStore.jobStore)
    curLevel = self.queuedChildJobs
    while len(curLevel) > self.maxChildrenPerJob:
        curLevel = [curLevel[i:i + self.maxChildrenPerJob]
                    for i in xrange(0, len(curLevel), self.maxChildrenPerJob)]
    # curLevel is now a nested list (of lists, of lists...)
    # representing a tree of out-degree no higher than
    # maxChildrenPerJob. We can pass that to SpawnChildren
    # instances, which will run down the tree and eventually
    # spawn the jobs we're actually interested in running.
    for sublist in curLevel:
        logger.debug(sublist)
        assert isinstance(sublist, list)
        super(ChildTreeJob, self).addChild(SpawnChildren(sublist))
return ret
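
# Standalone sketch of the fan-out technique used above (names are illustrative,
# not part of Cactus): repeatedly grouping a flat list into sublists of at most
# maxChildren bounds every node's out-degree, so no single job has to serialize
# thousands of children at once.
def buildFanOutTree(items, maxChildren):
    level = list(items)
    while len(level) > maxChildren:
        level = [level[i:i + maxChildren] for i in range(0, len(level), maxChildren)]
    return level

# e.g. buildFanOutTree(range(10), 3) ->
#      [[[0, 1, 2], [3, 4, 5], [6, 7, 8]], [[9]]]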

inSequence = fileStore.readGlobalFile(self.inSequenceID)
if self.prepOptions.chunkSize <= 0:
    # In this first case we don't need to break up the sequence
    chunked = False
    inChunkList = [inSequence]
else:
    # Chunk it up
    chunked = True
    inChunkDirectory = getTempDirectory(rootDir=fileStore.getLocalTempDir())
    inChunkList = runGetChunks(sequenceFiles=[inSequence], chunksDir=inChunkDirectory,
                               chunkSize=self.prepOptions.chunkSize,
                               overlapSize=0)
    inChunkList = [os.path.abspath(path) for path in inChunkList]
logger.info("Chunks = %s" % inChunkList)
inChunkIDList = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in inChunkList]
outChunkIDList = []
# For each input chunk we create an output chunk; it is the output chunks that get concatenated together.
if not self.chunksToCompute:
    self.chunksToCompute = range(len(inChunkList))
for i in self.chunksToCompute:
    # Calculate the number of input chunks to sample for this output chunk
    inChunkNumber = int(max(1, math.ceil(len(inChunkList) * self.prepOptions.proportionToSample)))
    assert inChunkNumber <= len(inChunkList) and inChunkNumber > 0
    # Now get the list of chunks flanking and including the current chunk
    j = max(0, i - inChunkNumber // 2)
    inChunkIDs = inChunkIDList[j:j + inChunkNumber]
    if len(inChunkIDs) < inChunkNumber:  # This logic effectively makes the list circular
        inChunkIDs += inChunkIDList[:inChunkNumber - len(inChunkIDs)]
    assert len(inChunkIDs) == inChunkNumber
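
# Worked example of the circular window above (illustrative numbers, not from
# the source): with len(inChunkList) == 10 and inChunkNumber == 4, chunk i == 8
# gives j == 6 and the slice [6:10] already holds 4 chunks; chunk i == 9 gives
# j == 7, the slice [7:11] only yields 3 chunks, so chunk 0 is appended to wrap
# the window around the end of the list.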

def run(self, fileStore):
    samplingRates = fileStore.readGlobalFile(self.samplingRatesID)
    chunk1 = fileStore.readGlobalFile(self.chunkID1)
    chunk2 = fileStore.readGlobalFile(self.chunkID2)
    lastZSequenceHandling = ['%s[multiple,unmask][nameparse=darkspace]' % os.path.basename(chunk1),
                             '%s[nameparse=darkspace][unmask]' % os.path.basename(chunk2)]
    alignments = fileStore.getLocalTempFile()
    logger.info("Work dir = %s" % os.path.dirname(chunk1))
    cactus_call(outfile=alignments,
                parameters=["cPecanLastz"] + lastZSequenceHandling +
                           ["--samplingRates=%s" % os.path.basename(samplingRates),
                            "--notrivial",
                            "--format=general:name1,zstart1,end1,name2,zstart2+,end2+", "--markend"])
    coveredSeeds = fileStore.getLocalTempFile()
    cactus_call(outfile=coveredSeeds,
                parameters=["cactus_coveredSeeds",
                            "--seq", chunk1,
                            "--alignments", alignments])
    return fileStore.writeGlobalFile(coveredSeeds)