Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
redundancy_threshold = get_value('redundancy_threshold')
window_size = get_value('window_size')
fragment_size = get_value('fragment_size')
effective_genome_fraction = get_value('effective_genome_fraction', 'reference_effective_genome_fraction')
gap_size = get_value('gap_size')
fdr = get_value('fdr')
genome_build = get_value('genome_build', 'reference_genome_build')
outdir, basebed = os.path.split(snakemake.output.bed)
label = snakemake.params.block['label']
tmpdir = tempfile.mkdtemp()
cwd = os.getcwd()
# SICER expects bed input format, not bam as in other peak callers
shell(
'bamToBed -i {snakemake.input.ip} > {tmpdir}/ip.bed ; '
'bamToBed -i {snakemake.input.control} > {tmpdir}/in.bed '
)
# SICER emits a single hard-coded file that does not respect output directory.
# So move each run into its own temp directory to avoid collisions with
# other processes.
os.chdir(tmpdir)
shell(
# there is a CI-specific bug, in which the python symlink is not correctly resolved to python2.7;
# so as a really desperate hack, modify SICER's python calls to directly touch 2.7
"""sed 's/^python/$CONDA_PREFIX\/bin\/python2.7/' """
"""$CONDA_PREFIX/share/sicer*/SICER.sh > {tmpdir}/SICER.sh && chmod u+x {tmpdir}/SICER.sh """
)
shell(
lock_file.close()
break
cmds = (
'macs2 predictd '
'-i {ip_bam} '
'{genome_count_flag} '
'-m {mfold_lower} {mfold_upper} '
'-f BAM '
)
shell(cmds + ' {log}')
inlog = log.split(' ')[-2]
d_estimate_temporary_file = tempfile.NamedTemporaryFile().name
cmds = ('awk \'/predicted fragment length is/ {{print "{mfold_lower} {mfold_upper} "$(NF-1)}}\' '
'< {inlog} > {d_estimate_temporary_file}')
shell(cmds)
with open(d_estimate_temporary_file, "r") as f:
lines = f.readlines()
# try to get access to d estimate file
with open(d_estimate_file, "a") as d_file:
# if there was less than one matching line from the awk extraction,
# it's because there was an MFOLD failure. Fall back to default extsize.
if len(lines) < 1:
d_file.write("{0} {1} {2}\n".format(mfold_lower, mfold_upper, d_estimate_default_value))
else:
d_file.write("{0} {1} {2}\n".format(mfold_lower, mfold_upper, lines[0].strip()))
cmds = ('awk \'/tag size is determined as/ {{print $(NF-1)}}\' < {inlog} > {read_length_file}')
shell(cmds)
shell("rm {0}".format(d_estimate_temporary_file))
# release the d estimate lock
fcntl.flock(lock_file, fcntl.LOCK_UN)
def shell(
cmd,
remove_spaces=True,
async=False,
iterable=False,
read=False,
):
if remove_spaces:
# print("removing spaces from command")
cmd = re.sub(r'[ \t\f\v]+', ' ', cmd).strip()
return snakemake.shell(
cmd=cmd,
async=async,
iterable=iterable,
read=read,
)
from sequana import FastQ
from sequana import FastA
S = 0
for this in FastQ(options.file1):
S += len(this['sequence'])
if options.file2:
for this in FastQ(options.file2):
S += len(this['sequence'])
ref = FastA(options.reference)
coverage = float(S) / len(ref.sequences[0])
print('Theoretical Depth of Coverage : %s' % coverage)
params = {"reference": reference, "fastq": fastq, "thread": options.thread}
# indexing
shellcmd("bwa index %(reference)s " % params)
cmd = "samtools faidx %(reference)s " % params
# mapping
cmd = "bwa mem -M " # mark shorter split read as secondary; -M is not compulsary but recommended
if options.pacbio:
cmd += "-x pacbio "
cmd += r" -t %(thread)s -R @RG\\tID:1\\tSM:1\\tPL:illumina -T 30 %(reference)s %(fastq)s "
# Samtools options:
# S:ignore input format
# h:include header
# b:bam output
if options.sambamba is False:
cmd += "| samtools view -Sbh | "
# sorting BAM
cmd += "samtools sort -@ %(thread)s -o %(reference)s.sorted.bam -"
cmd = """
"{assembler}" \
-i "{i}" \
-o "{o}" \
-x "{x}" \
-k {k} \
""".format(
assembler="../../bin/assembler",
i='" -i "'.join(input_files),
o='" -o "'.join(output_files),
x=intersection_file,
k=k,
)
logger.info('Cmd to be run: {}'.format(cmd))
snakemake.shell(cmd)
logger.info('Finished assembly')
def _to_fastX(self, mode, output_filename, threads=2):
"""
:param mode: fastq or fasta
"""
# for now, we use samtools
# can use bamtools as well but as long and output 10% larger (sequences
# are split on 80-characters length)
from snakemake import shell
cmd = "samtools %s -@ %s %s > %s" % (mode, threads,
self.filename, output_filename)
logger.info("Please be patient")
logger.info("This may be long depending on your input data file: ")
logger.info("typically, a minute per 500,000 reads")
shell(cmd)
logger.info("done")
def _control_script(igvcommands, igv_fp, igv_prefs):
igvscript = tempfile.NamedTemporaryFile()
igvscript.writelines(map(lambda x: bytes(x+'\n', 'ascii'), igvcommands))
igvscript.flush()
igvprefsfile = _write_prefs(igv_prefs)
shell("xvfb-run -a -s '-screen 1 1920x1080x24' %s -o %s -b %s" % (igv_fp, igvprefsfile.name, igvscript.name))
# "library of raw redundancy-removed reads on significant islands
redundancy_removed = glob.glob(os.path.join(tmpdir, '*-W{0}-G{1}-FDR*-islandfiltered.bed'.format(window_size, gap_size)))
if len(redundancy_removed) == 1:
redundancy_removed = redundancy_removed[0]
else:
raise ValueError("SICER redundancy removed library file not found")
# "wig file for the island-filtered redundancy-removed reads
normalized_postfilter_wig = glob.glob(os.path.join(tmpdir, '*-W{0}-G{1}-FDR*-islandfiltered-normalized.wig'.format(window_size, gap_size)))
if len(normalized_postfilter_wig) == 1:
normalized_postfilter_wig = normalized_postfilter_wig[0]
else:
raise ValueError("SICER normalized postfilter wig file not found")
shell(
"export LC_COLLATE=C; "
# format the output in broadPeak format
return [
fastafile.get() for fastafile in smbl.fasta.get_registered_fastas()
]
def all_programs():
return [
plugin.get_installation_files() for plugin in smbl.prog.plugins.get_registered_plugins()
]
def all_compatible_programs():
return [
plugin.get_installation_files() for plugin in smbl.prog.plugins.get_registered_plugins()
if plugin.is_platform_supported()
]
snakemake.shell(
"""
mkdir -p "{}" "{}" "{}"
""".format(bin_dir,fa_dir,src_dir)
)
'-l {d_estimate} '
'-g {read_length} '
'-o {bed} '
)
shell(cmds + '{log}')
bed_cleaning_intermediate = tempfile.NamedTemporaryFile().name
# Fix the output file so that it doesn't have negative numbers and so it fits
# inside the genome
shell(
"""awk -F "\\t" '{{OFS="\\t"; print $1, "0", $2}}' """
"{snakemake.input.chromsizes} "
"> {bed_cleaning_intermediate}"
)
unsorted_bed_intermediate = tempfile.NamedTemporaryFile().name
shell(
"export LC_COLLATE=C; "
"""awk -F "\\t" '{{OFS="\\t"; if (($2>0) && ($3>0)) print $0}}' {bed} | """
"bedtools intersect -a - -b {bed_cleaning_intermediate} > {unsorted_bed_intermediate} "
"&& bedSort {unsorted_bed_intermediate} {bed}"
)
shell("rm {0} {1} {2}".format(qvalue_intermediate,
bed_cleaning_intermediate,
unsorted_bed_intermediate))