if not bedPath.endswith(".gz"):
    if not os.path.exists(bedPath+".gz"):
        logging.info("bgzf compressing {}".format(bedPath))
        pysam.tabix_compress(bedPath, bedPath+".gz")
        if not os.path.exists(bedPath+".gz"):
            raise Exception("Failed to create compressed {preset} file for {file}; make sure the {preset} file is "
                            "sorted and the directory is writeable".format(preset=preset, file=bedPath))
    bedPath += ".gz"

if not os.path.exists(bedPath+".tbi"):
    logging.info("creating tabix index for {}".format(bedPath))
    pysam.tabix_index(bedPath, preset=preset)
    if not os.path.exists(bedPath+".tbi"):
        raise Exception("Failed to create tabix index file for {file}; make sure the {preset} file is "
                        "sorted and the directory is writeable".format(preset=preset, file=bedPath))

line = next(pysam.Tabixfile(bedPath).fetch())
if len(line.strip().split("\t")) < 6 and preset == "bed":
    raise AnnotationError("BED files need to have at least 6 (tab-delimited) fields (including "
                          "chrom, start, end, name, score, strand; score is unused)")
if len(line.strip().split("\t")) < 9 and preset == "gff":
    raise AnnotationError("GFF/GTF files need to have at least 9 tab-delimited fields")

return bedPath
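# A minimal, self-contained usage sketch of the same compress/index/fetch pattern
# ("regions.bed" is a hypothetical, coordinate-sorted plain-text BED file):
import os
import pysam

bed = "regions.bed"
if not os.path.exists(bed + ".gz"):
    pysam.tabix_compress(bed, bed + ".gz")          # bgzf-compress the sorted BED
if not os.path.exists(bed + ".gz.tbi"):
    pysam.tabix_index(bed + ".gz", preset="bed")    # build the .tbi index
for row in pysam.Tabixfile(bed + ".gz").fetch("chr1", 0, 1000000):
    print(row)                                      # raw BED lines overlapping chr1:0-1,000,000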
def _annotate_variants(args, conn, metadata, get_val_fn, col_names=None, col_types=None, col_ops=None):
    """Generalized annotation of variants with a new column.

    get_val_fn takes a list of annotations in a region and returns
    the value for that region to update the database with.

    Separates selection and identification of values from update,
    to avoid concurrent database access errors from sqlite, especially on
    NFS systems. The retained to_update list is small, but batching
    could help if memory issues emerge.
    """
    # For each variant, use tabix to detect overlaps with the user-defined
    # annotation file. Update the variant row with T/F if overlaps are found.
    anno = pysam.Tabixfile(args.anno_file)
    naming = guess_contig_naming(anno)

    cursor = conn.bind.connect()
    add_requested_columns(args, cursor, col_names, col_types)
    conn.commit()
    cursor.close()

    conn, metadata = database.get_session_metadata(str(conn.bind.url))
    cursor = conn.bind.connect()

    total = 0
    update_size = 5000
    to_update = []

    select_res = cursor.execution_options(stream_results=True).execute(
        '''SELECT chrom, start, end, ref, alt, variant_id FROM variants''')
    for row in select_res:
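# Hedged sketch of the select-then-batched-update pattern described in the docstring
# above, using plain sqlite3; the "anno" column and the annotate_row() helper are
# hypothetical stand-ins, not part of the original code.
import sqlite3

def batched_annotate(db_path, annotate_row, update_size=5000):
    conn = sqlite3.connect(db_path)
    # read everything first, so no UPDATE runs while a SELECT cursor is live
    rows = conn.execute("SELECT chrom, start, end, variant_id FROM variants").fetchall()
    to_update = []
    for chrom, start, end, variant_id in rows:
        to_update.append((annotate_row(chrom, start, end), variant_id))
        if len(to_update) >= update_size:
            conn.executemany("UPDATE variants SET anno = ? WHERE variant_id = ?", to_update)
            conn.commit()
            to_update = []
    if to_update:
        conn.executemany("UPDATE variants SET anno = ? WHERE variant_id = ?", to_update)
        conn.commit()
    conn.close()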
print("Generating pileup...");
# instantiate temp file to dump to
pileup_out = tempfile.NamedTemporaryFile(delete=False);
#A unzip the VCF and convert to BED use samtools to produce a read pileup
if args.vcf.endswith(".gz"):
subprocess.check_output("gunzip -c "+args.vcf+" | awk 'BEGIN { OFS=\"\t\"; FS=\"\t\"; } { if (index($0, \"#\") == 0) { print($1,$2-1,$2,$3,$6,$4,$5,$7,$8,$9); } }' | samtools mpileup -d "+str(args.max_depth)+" -I -B -q 0 -Q 0 -s -l - -f "+args.ref+" "+args.bam+" > "+pileup_out.name, shell=True);
else:
print("FATAL ERROR: input VCF must be gzipped and indexed.")
quit();
# 2 process the mpileup result
# A load the VCF
#vcf_reader = vcf.Reader(filename=args.vcf);
vcf_reader = pysam.Tabixfile(args.vcf,"r");
vcf_map = sample_column_map(args.vcf);
# B go through the pileup result line by line
out_stream = open(args.o, "w");
print("Processing pileup...");
out_stream.write("contig position variantID refAllele altAllele refCount altCount totalCount lowMAPQDepth lowBaseQDepth rawDepth otherBases\n");
stream_in = open(pileup_out.name, "r");
for line in stream_in:
cols = line.replace("\n","").split("\t");
#chr pos REF count reads
chr = cols[0];
pos = int(cols[1]);
ref = "";
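# Hedged sketch (not from the original script): with "-s", samtools mpileup lines carry
# chrom, 1-based pos, ref base, depth, read bases, base qualities, mapping qualities.
# A naive ref-allele counter over one such line ("." / "," mark forward/reverse matches);
# it ignores indel runs and read start/end markers, which a real parser must handle.
def count_ref_bases(pileup_line):
    cols = pileup_line.rstrip("\n").split("\t")
    chrom, pos, ref_base = cols[0], int(cols[1]), cols[2]
    depth, read_bases = int(cols[3]), cols[4]
    ref_count = read_bases.count(".") + read_bases.count(",")
    return chrom, pos, ref_base, depth, ref_count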
def load_contig(contig):
    """contig is #chrm, from 1 to 23, X and Y"""
    tabix = pysam.Tabixfile(cadd_file_path)
    data = fetch_generator(tabix, contig)
    for doc in data:
        yield doc
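# Hedged sketch: fetch_generator and cadd_file_path are defined elsewhere in the source
# project; one plausible shape for fetch_generator, yielding a dict per tabix record
# (the field layout shown is an assumption, not the real CADD column mapping):
def fetch_generator_sketch(tabix, contig):
    for line in tabix.fetch(contig):
        fields = line.split("\t")
        yield {"chrom": fields[0], "pos": int(fields[1]), "raw": line}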
def parse_tabix_file_subset(tabix_filenames, subset_i, subset_n, record_parser):
    """
    Returns a generator of parsed record objects (as returned by record_parser) for the i'th out of n subsets of
    records across all the given tabix_file(s). The records are split by files and contigs within files, with 1/n
    of all contigs from all files being assigned to the i'th subset.

    Args:
        tabix_filenames: a list of one or more tabix-indexed files. These will be opened using pysam.Tabixfile
        subset_i: zero-based subset number
        subset_n: total number of subsets
        record_parser: a function that takes a file-like object and returns a generator of parsed records
    """
    start_time = time.time()
    print(tabix_filenames)
    open_tabix_files = [pysam.Tabixfile(tabix_filename) for tabix_filename in tabix_filenames]
    tabix_file_contig_pairs = [(tabix_file, contig) for tabix_file in open_tabix_files for contig in tabix_file.contigs]
    # get every n'th tabix_file/contig pair
    tabix_file_contig_subset = tabix_file_contig_pairs[subset_i::subset_n]
    short_filenames = ", ".join(map(os.path.basename, tabix_filenames))
    print(short_filenames)
    num_file_contig_pairs = len(tabix_file_contig_subset)
    print("Loading subset %(subset_i)s of %(subset_n)s total: %(num_file_contig_pairs)s contigs from %(short_filenames)s" % locals())
    counter = 0
    for tabix_file, contig in tabix_file_contig_subset:
        header_iterator = tabix_file.header
        records_iterator = tabix_file.fetch(contig, 0, 10**9, multiple_iterators=True)
        for parsed_record in record_parser(itertools.chain(header_iterator, records_iterator)):
            counter += 1
            yield parsed_record

            if counter % 100000 == 0:
                seconds_elapsed = int(time.time() - start_time)
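# Hedged usage sketch: split the load across 4 workers, each taking every 4th
# (file, contig) pair. The record parser is a stand-in that skips header lines;
# "sites.vcf.gz" is a hypothetical tabix-indexed file.
def simple_record_parser(line_iter):
    for line in line_iter:
        if not line.startswith("#"):
            yield line.rstrip("\n")

# for record in parse_tabix_file_subset(["sites.vcf.gz"], subset_i=0, subset_n=4,
#                                        record_parser=simple_record_parser):
#     pass  # e.g. insert the parsed record into a database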
def __init__(self, options):
    # Opening the tabix file representing the dbSNP database
    self.tabixfile = pysam.Tabixfile(options.args['dbsnp'])
access a given handle and fetch data from it
as follows:

dbsnp_handle = annotations.annos['dbsnp']
hits = dbsnp_handle.fetch(chrom, start, end)
"""
anno_files = get_anno_files(args)
for anno in anno_files:
    try:
        # .gz denotes Tabix files.
        if anno_files[anno].endswith(".gz"):
            if anno == "clinvar":
                annos[anno] = pysam.Tabixfile(anno_files[anno],
                                              encoding='utf8')
            else:
                annos[anno] = pysam.Tabixfile(anno_files[anno])
        elif anno_files[anno].endswith(".bcf"):
            annos[anno] = cyvcf2.VCF(anno_files[anno])
        # .bw denotes BigWig files.
        elif anno_files[anno].endswith(".bw"):
            from bx.bbi.bigwig_file import BigWigFile
            annos[anno] = BigWigFile(open(anno_files[anno]))
    except IOError:
        raise IOError("Gemini cannot open this annotation file: %s. \n"
                      "Have you installed the annotation files? If so, "
                      "have they been moved or deleted? Exiting...\n\n"
                      "For more details:\n\t"
                      "http://gemini.readthedocs.org/en/latest/content/"
                      "#installation.html#installing-annotation-files\n"
                      % anno_files[anno])
return None
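# Standalone equivalent of the access pattern shown in the docstring above
# ("dbsnp.vcf.gz" and the region are hypothetical); pysam regions are 0-based, half-open:
import pysam

dbsnp_handle = pysam.Tabixfile("dbsnp.vcf.gz")
for hit in dbsnp_handle.fetch("chr1", 10000, 10100):
    fields = hit.split("\t")   # raw tab-delimited record from the annotation file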
try:
    mutbase = mut(refbase, altbase)
    mutbase_list.append(mutbase)
except ValueError as e:
    logger.warning(mutid_list[n] + " " + ' '.join(("skipped site:", chrom, str(hc[n]['start']), str(hc[n]['end']), "due to N base:", str(e), "\n")))
    return None

mutstr_list.append(refbase + "-->" + str(mutbase))

# optional CNV file
cnv = None
if args.cnvfile:
    cnv = pysam.Tabixfile(args.cnvfile, 'r')

hapstr = "_".join(('haplo', chrom, str(min(mutpos_list)), str(max(mutpos_list))))
log = open('addsnv_logs_' + os.path.basename(args.outBamFile) + '/' + os.path.basename(args.outBamFile) + "." + hapstr + ".log", 'w')

tmpoutbamname = args.tmpdir + "/" + hapstr + ".tmpbam." + str(uuid4()) + ".bam"
logger.info("%s creating tmp bam: %s" % (hapstr, tmpoutbamname))
outbam_muts = pysam.Samfile(tmpoutbamname, 'wb', template=bamfile)

mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate(args, log, bamfile, bammate, chrom, min(mutpos_list), max(mutpos_list)+1, mutpos_list, avoid=avoid, mutid_list=mutid_list, is_snv=True, mutbase_list=mutbase_list, reffile=reffile)

if mutfail:
    outbam_muts.close()
    os.remove(tmpoutbamname)
    return None

# pick reads to change
def create_tabix_infile(file_name):
    return pysam.Tabixfile(file_name)
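# Hedged usage sketch ("annotations.bed.gz" is a hypothetical tabix-indexed BED file):
import pysam

tbx = create_tabix_infile("annotations.bed.gz")
for row in tbx.fetch("chr1", 0, 50000, parser=pysam.asBed()):
    print(row.contig, row.start, row.end)   # parsed BED fields instead of raw strings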