import logging
import os

import pysam


class AnnotationError(Exception):
    # stand-in so the snippet runs standalone; the original module
    # defines this exception elsewhere
    pass


def ensureIndexed(bedPath, preset="bed", trySorting=True):
    # bgzf-compress the file first if it is not already compressed
    if not bedPath.endswith(".gz"):
        if not os.path.exists(bedPath + ".gz"):
            logging.info("bgzf compressing {}".format(bedPath))
            pysam.tabix_compress(bedPath, bedPath + ".gz")
            if not os.path.exists(bedPath + ".gz"):
                raise Exception("Failed to create compressed {preset} file for {file}; make sure the {preset} "
                                "file is sorted and the directory is writeable".format(preset=preset, file=bedPath))
        bedPath += ".gz"

    # build the tabix index if it is missing
    if not os.path.exists(bedPath + ".tbi"):
        logging.info("creating tabix index for {}".format(bedPath))
        pysam.tabix_index(bedPath, preset=preset)
        if not os.path.exists(bedPath + ".tbi"):
            raise Exception("Failed to create tabix index file for {file}; make sure the {preset} file is "
                            "sorted and the directory is writeable".format(preset=preset, file=bedPath))

    # sanity-check the first record for the expected number of fields
    line = next(pysam.Tabixfile(bedPath).fetch())
    if len(line.strip().split("\t")) < 6 and preset == "bed":
        raise AnnotationError("BED files need to have at least 6 (tab-delimited) fields (including "
                              "chrom, start, end, name, score, strand; score is unused)")
    if len(line.strip().split("\t")) < 9 and preset == "gff":
        raise AnnotationError("GFF/GTF files need to have at least 9 tab-delimited fields")

    return bedPath
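# Usage sketch (an assumption, not part of the original snippet): query a
# BED file once ensureIndexed() has compressed and indexed it. The file
# name and region below are hypothetical examples.
import pysam

bed_gz = ensureIndexed("annotations.bed", preset="bed")
with pysam.TabixFile(bed_gz) as tbx:
    # fetch() yields the raw tab-delimited lines overlapping the region
    for line in tbx.fetch("chr1", 10000, 20000):
        chrom, start, end, name, score, strand = line.split("\t")[:6]
        print(name, strand)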
# Excerpt from a repository-manager class (it references self.repo and a
# variants module from the surrounding project); requires os, shutil, pysam.
def addVariantSet(
        self, variantFileName, dataset, referenceSet,
        ontology, biosamples):
    inputVcf = os.path.join(
        self.inputDirectory, variantFileName)
    outputVcf = os.path.join(
        self.outputDirectory, variantFileName)
    # copy the VCF into the output directory, then compress and index it
    # there; tabix_index with preset="vcf" leaves outputVcf + ".gz" and
    # outputVcf + ".gz.tbi" on disk
    shutil.copy(inputVcf, outputVcf)
    pysam.tabix_index(outputVcf, preset="vcf")
    variantSet = variants.HtslibVariantSet(
        dataset, variantFileName.split('_')[1])
    variantSet.setReferenceSet(referenceSet)
    variantSet.populateFromFile(
        [os.path.abspath(outputVcf + ".gz")],
        [os.path.abspath(outputVcf + ".gz.tbi")])
    variantSet.checkConsistency()
    # link each call set to the biosample with the same local id
    for callSet in variantSet.getCallSets():
        for biosample in biosamples:
            if biosample.getLocalId() == callSet.getLocalId():
                callSet.setBiosampleId(biosample.getId())
    self.repo.insertVariantSet(variantSet)
    for annotationSet in variantSet.getVariantAnnotationSets():
        annotationSet.setOntology(ontology)
        self.repo.insertVariantAnnotationSet(annotationSet)
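# Minimal standalone sketch (assumed, not from the code above) of the same
# compress-and-index side effect: indexing a plain-text VCF produces the
# ".gz" and ".gz.tbi" files that the method above passes on. The path and
# region are hypothetical.
import pysam

compressed = pysam.tabix_index("example.vcf", preset="vcf")
# compressed == "example.vcf.gz"; "example.vcf.gz.tbi" now exists too
vf = pysam.VariantFile(compressed)
for rec in vf.fetch("1", 100000, 200000):
    print(rec.chrom, rec.pos, rec.alleles)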
import sys

import pysam


def indexFile(options):
    sys.stdout.write('Compressing output file ... ')
    sys.stdout.flush()
    pysam.tabix_compress(options.output, options.output + '.gz', force=True)
    sys.stdout.write('OK\n')
    sys.stdout.write('Indexing output file ... ')
    sys.stdout.flush()
    # column numbers are given explicitly instead of a preset; in pysam
    # they are 0-based, so seq_col=1 is the second tab-separated column
    pysam.tabix_index(options.output + '.gz', seq_col=1, start_col=2, end_col=2, meta_char='#', force=True)
    sys.stdout.write('OK\n')
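# Read-back sketch (an assumption, not part of the snippet above): query
# the custom-indexed file with a tuple parser. The file name, sequence
# name, and coordinates are hypothetical.
import pysam

with pysam.TabixFile("output.txt.gz", parser=pysam.asTuple()) as tbx:
    for row in tbx.fetch("chr2", 5000, 6000):
        print(row[1], row[2])  # the indexed name and position columns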
# Excerpt from a method that appears to drive snpEff's "ann" subcommand
# and write an indexed VCF; requires os, subprocess, and pysam.
if not genomeToUse:
    raise Exception("no snpEff genome database specified")
args = [
    '-treatAllAsProteinCoding', 'false', '-t', '-noLog', '-ud', '0', '-noStats', '-noShiftHgvs', genomeToUse,
    os.path.realpath(inVcf)
]
command_ps = self.execute('ann', args, JVMmemory=JVMmemory)
if command_ps.returncode == 0:
    with open(tmpVcf, 'wt') as outf:
        outf.write(command_ps.stdout.decode("utf-8"))

    if outVcf.endswith('.vcf.gz'):
        # bgzf-compress the temporary plain-text VCF to the final path,
        # index it, then remove the temporary file
        pysam.tabix_compress(tmpVcf, outVcf, force=True)
        pysam.tabix_index(outVcf, force=True, preset='vcf')
        os.unlink(tmpVcf)
else:
    raise subprocess.CalledProcessError(cmd=command_ps.args, returncode=command_ps.returncode, output=command_ps.stdout)
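# The two-step pattern above, as a hypothetical standalone helper (an
# assumption, not code from the original project): tabix_compress writes
# the bgzf copy under the final name, tabix_index builds the .tbi, and
# the plain-text temporary is removed.
import os

import pysam


def compress_and_index_vcf(plain_vcf, out_vcf_gz):
    pysam.tabix_compress(plain_vcf, out_vcf_gz, force=True)
    pysam.tabix_index(out_vcf_gz, preset="vcf", force=True)
    os.unlink(plain_vcf)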
def index_vcf(filename):
    pysam.tabix_index(filename, preset='vcf', force=True)


def index_vcf_gz(vcf_gz):
    pysam.tabix_index(vcf_gz, force=True, preset='vcf')
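# Assumed behavior of the two wrappers above (file names hypothetical):
# index_vcf("calls.vcf") leaves calls.vcf.gz plus calls.vcf.gz.tbi, while
# index_vcf_gz("calls.vcf.gz") only (re)builds calls.vcf.gz.tbi, since
# tabix_index skips compression for input that is already bgzf-compressed.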
# Excerpt from a variant-calling step; the surrounding code defines f (an
# open output file) and tmp_vcf, and requires os, pysam, plus the
# project's vcfcall_ariba module and Error exception.
print(pysam.mpileup(
    '-t', 'INFO/AD,INFO/ADF,INFO/ADR',
    '-L', '99999999',
    '-A',
    '-f', self.ref_fa,
    '-u',
    '-v',
    self.bam,
), end='', file=f)

got = vcfcall_ariba.vcfcall_ariba(tmp_vcf, self.outprefix, self.min_var_read_depth, self.min_second_var_read_depth, self.max_allele_freq)
if got != 0:
    raise Error('Error parsing vcf file. Cannot continue')

# compress and index the per-position read depths (0-based columns:
# sequence name in column 0, position in column 1), then clean up
pysam.tabix_compress(self.outprefix + '.read_depths', self.read_depths_file)
pysam.tabix_index(self.read_depths_file, seq_col=0, start_col=1, end_col=1)
os.unlink(self.outprefix + '.read_depths')
os.unlink(tmp_vcf)
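# Read-back sketch (assumed, not from the original code): the read-depths
# file indexed above can be queried per reference sequence. The file and
# sequence names are hypothetical.
import pysam

with pysam.TabixFile("out.read_depths.gz") as tbx:
    for line in tbx.fetch("cluster_1"):
        print(line.split("\t"))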
# Excerpt from a VCF-writing routine (record attributes and write_record
# match the PyVCF API); the loop that fills vcf_records and the branch
# this else completes are not shown.
    else:
        vcf_writer.write_record(vcf_record)

if sort:
    if reference_contigs:
        # order records by the contig order of the reference, then position
        contigs_order_dict = {contig.name: index for (index, contig) in enumerate(reference_contigs)}
        vcf_records.sort(key=lambda rec: (contigs_order_dict[rec.CHROM], rec.POS))
    else:
        vcf_records.sort(key=lambda rec: (rec.CHROM, rec.POS))
    for vcf_record in vcf_records:
        vcf_writer.write_record(vcf_record)

vcf_writer.close()
if out_vcf and index:
    pysam.tabix_index(out_vcf, force=True, preset='vcf')
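# Follow-up sketch (an assumption): after tabix_index() the sorted VCF is
# available as out_vcf + ".gz" with a ".tbi" alongside, so indexed region
# queries work. The path and region below are hypothetical.
import pysam

vf = pysam.VariantFile("sorted.vcf.gz")
for rec in vf.fetch("chr1", 0, 50000):
    print(rec.chrom, rec.pos)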