Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def bam_split(in_file, nfiles=2):
out_files = _get_split_files(in_file, nfiles)
out_bases = [os.path.splitext(x)[0] for x in out_files]
tmp_files = [x + "tmp" for x in out_files]
samfile = pysam.Samfile(in_file, "rb")
out_handles = [pysam.Samfile(x, "wb", template=samfile) for x in tmp_files]
for read in samfile:
out_handles[random.randint(0, nfiles-1)].write(read)
samfile.close()
[x.close() for x in out_handles]
[pysam.sort(tmp_files[x], out_bases[x]) for x in range(nfiles)]
[pysam.index(x) for x in out_files]
return out_files
def sortBam(inputBam, outputBamName, outputDir):
logger.info("sortbamByCoordinate: Trying to sort BAM file by Coordinate")
if(os.path.isdir(outputDir)):
logger.info("sortbamByCoordinate: The output directory %s exists", outputDir)
outputFile = outputDir + "/" + outputBamName
else:
logger.info("sortbamByCoordinate:The output directory %s does not exists !!", outputDir)
sys.exit()
if(os.path.isfile(inputBam)):
try:
pysam.sort(inputBam, outputFile)
except IndexError as err:
exception = "Index error({0}): {1}".format(err.errno, err.strerror)
logger.info("%s", exception)
except IOError as err:
exception = "I/O error({0}): {1}".format(err.errno, err.strerror)
logger.info("%s", exception)
else:
logger.info("sortbamByCoordinate: bam File %s does not exists !!", inputBam)
sys.exit()
logger.info("sortbamByCoordinate: Finished sorting BAM file by Coordinate.")
return(outputFile)
outFolderReferenceFiles = parentFolder + hubFastaDir + "/"
outFolderBamFiles = outFolderReferenceFiles + "bamFiles/"
# Create hierarchical reference and bamfile folders
if not os.path.exists(outFolderReferenceFiles):
os.mkdir(outFolderReferenceFiles)
if not os.path.exists(outFolderBamFiles):
os.mkdir(outFolderBamFiles)
# Check and create bam, sorted bam, and indexed bam files
bamFile = os.path.join(resultsDir, "mapping.bam")
sortedbamFile = os.path.join(resultsDir, "mapping.sorted")
if not os.path.isfile(os.path.join(resultsDir, "mapping.bam")):
try:
samToBamFile(os.path.join(resultsDir, "mapping.sam"), bamFile)
pysam.sort(bamFile, sortedbamFile)
pysam.index(sortedbamFile + ".bam")
except:
continue
# Copy files
try:
system("cp %s %s" % (os.path.join(resultsDir, "mapping.sorted.bam"), outFolderBamFiles + experiment + ".sorted.bam"))
system("cp %s %s" % (os.path.join(resultsDir, "mapping.sorted.bam.bai"), outFolderBamFiles + experiment + ".sorted.bam.bai"))
except:
continue
genomes = open(parentFolder + "genomes.txt", "a")
for referenceFastaFile in self.referenceFastaFiles:
if referenceFastaFile.endswith(".fa") or referenceFastaFile.endswith(".fasta"):
header = referenceFastaFile.split("/")[-1].split(".fa")[0]
system("cp %s %s" % (referenceFastaFile, parentFolder + header + "/"))
stats_post_df_dict['UMI'].extend(umis)
stats_post_df_dict['counts'].extend(umi_counts)
average_distance = umi_methods.get_average_umi_distance(post_cluster_umis)
post_cluster_stats.append(average_distance)
cluster_size = len(post_cluster_umis)
random_umis = read_gn.getUmis(cluster_size)
average_distance_null = umi_methods.get_average_umi_distance(random_umis)
post_cluster_stats_null.append(average_distance_null)
outfile.close()
if not options.no_sort_output:
# sort the output
pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
os.unlink(out_name) # delete the tempfile
if options.stats:
# generate the stats dataframe
stats_pre_df = pd.DataFrame(stats_pre_df_dict)
stats_post_df = pd.DataFrame(stats_post_df_dict)
# tally the counts per umi per position
pre_counts = collections.Counter(stats_pre_df["counts"])
post_counts = collections.Counter(stats_post_df["counts"])
counts_index = list(set(pre_counts.keys()).union(set(post_counts.keys())))
counts_index.sort()
with U.openFile(options.stats + "_per_umi_per_position.tsv", "w") as outf:
outf.write("counts\tinstances_pre\tinstances_post\n")
for count in counts_index:
nwithout_dups = ninput - nduplicates
logs.write("duplicates\t%i\t%5.2f%%\n" %
(nduplicates, 100.0 * nduplicates / ninput))
logs.write("without duplicates\t%i\t%5.2f%%\n" %
(nwithout_dups, 100.0 * nwithout_dups / ninput))
logs.write("target\t%i\t%5.2f%%\n" %
(min_reads, 100.0 * min_reads / nwithout_dups))
logs.write("noutput\t%i\t%5.2f%%\n" %
(noutput, 100.0 * noutput / nwithout_dups))
logs.close()
# if more than one samfile: sort
if len(samfiles) > 1:
tmpfilename = P.getTempFilename()
pysam.sort(outfile, tmpfilename)
shutil.move(tmpfilename + ".bam", outfile)
os.unlink(tmpfilename)
pysam.index(outfile)
E.info("buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" %
(ninput, noutput, 100.0 * noutput / ninput, min_reads))
gain_HET_ALT_RE_PAIR_SAMPLED = "/".join([splittmpbams_hap, chr + event +'_HET_ALT_RE_PAIR_SAMPLED.bam'])
gain_HET_ALT_RE_PAIR_SAMPLED_RENAMED = "/".join([splittmpbams_hap , chr +'_GAIN_HET_ALT_RE_PAIR_SAMPLED_RENAMED.bam'])
gain_NONHET_RE_PAIR = "/".join([splittmpbams_hap, chr + event +'_NONHET_REPAIR.bam'])
gain_NONHET_RE_PAIR_SAMPLED = "/".join([splittmpbams_hap, chr + event +'_NONHET_SAMPLED.bam'])
gain_NONHET_RE_PAIR_SAMPLED_RENAMED = "/".join([splittmpbams_hap, chr + '_GAIN_NONHET_RE_PAIR_SAMPLED_RENAMED.bam'])
gain_NONHET_FINAL = "/".join([finalbams_hap, chr.upper() +'_GAIN_NH'])
gain_HET_FINAL = "/".join([finalbams_hap, chr.upper() +'_GAIN_H'])
if(os.path.isfile(HET_ALT)):
samapleratehet = splitAndRePairHET(HET_ALT, gain_HET_ALT_RE_PAIR, chr )
subsample(gain_HET_ALT_RE_PAIR,gain_HET_ALT_RE_PAIR_SAMPLED, str(samapleratehet)) # we need to keep a bit more (by 15-20%)
renamereads(gain_HET_ALT_RE_PAIR_SAMPLED, gain_HET_ALT_RE_PAIR_SAMPLED_RENAMED )
pysam.sort(gain_HET_ALT_RE_PAIR_SAMPLED_RENAMED, gain_HET_FINAL)
#os.remove(gain_HET_ALT_RE_PAIR_SAMPLED_RENAMED)
#logger.debug('sampling rate for HET'+chr +' = '+ str(samapleratehet))
else:
print('HET_ALT FILE NOT EXISTING')
if(os.path.isfile(NONHET)):
samapleratenonhet = splitAndRePairNONHET(NONHET, gain_NONHET_RE_PAIR, chr)
subsample(gain_NONHET_RE_PAIR, gain_NONHET_RE_PAIR_SAMPLED, str(samapleratenonhet))
renamereads(gain_NONHET_RE_PAIR_SAMPLED, gain_NONHET_RE_PAIR_SAMPLED_RENAMED)
pysam.sort(gain_NONHET_RE_PAIR_SAMPLED_RENAMED, gain_NONHET_FINAL)
#os.remove(gain_NONHET_RE_PAIR_SAMPLED_RENAMED)
else:
print('NON_HET FILE NOT EXISTING')
def bam_sort_index(unsorted_bam_path):
sorted_bam_path = unsorted_bam_path.replace(".bam", "") + ".sorted.bam"
pysam.sort("-o", sorted_bam_path, unsorted_bam_path)
pysam.index(sorted_bam_path)
os.remove(unsorted_bam_path)
return sorted_bam_path
def index_bam(bam_fname, sort_if_needed=False):
ps.index(str(bam_fname))
if not bam_index_exists(bam_fname):
sorted_bam_fname = str(bam_fname).replace('.bam', '.sorted.tmp.bam')
ps.sort('-o', sorted_bam_fname, str(bam_fname))
rename(sorted_bam_fname, bam_fname)
ps.index(str(bam_fname))
logs.write("# min_reads=%i, threshold= %5.2f\n" % \
(min_reads, threshold))
logs.write("set\tcounts\tpercent\n")
logs.write("ninput\t%i\t%5.2f%%\n" % (ninput, 100.0) )
nwithout_dups = ninput - nduplicates
logs.write("duplicates\t%i\t%5.2f%%\n" % (nduplicates,100.0*nduplicates/ninput))
logs.write("without duplicates\t%i\t%5.2f%%\n" % (nwithout_dups,100.0*nwithout_dups/ninput))
logs.write("target\t%i\t%5.2f%%\n" % (min_reads,100.0*min_reads/nwithout_dups))
logs.write("noutput\t%i\t%5.2f%%\n" % (noutput,100.0*noutput/nwithout_dups))
logs.close()
# if more than one samfile: sort
if len(samfiles) > 1:
tmpfilename = P.getTempFilename()
pysam.sort( outfile, tmpfilename )
shutil.move( tmpfilename + ".bam", outfile )
os.unlink( tmpfilename )
pysam.index( outfile )
P.info( "buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" % (ninput, noutput, 100.0*noutput/ninput, min_reads ))
:return:
"""
logger.debug('Writing to {} ...'.format(bam_fname))
t0 = time.time()
fp = pysam.AlignmentFile(bam_fname, 'wb', header=bam_hdr)
ref_dict = {k['SN']: n for n, k in enumerate(bam_hdr['SQ'])}
cnt = 0
for cnt, (qname, read_data) in enumerate(iter(in_queue.get, __process_stop_code__)):
write_perfect_reads(qname, rg_id, long_qname_table, ref_dict, read_data, cigar_v2, fp)
fp.close()
t1 = time.time()
logger.debug('... {}: {} reads in {:0.2f}s ({:0.2f} t/s)'.format(bam_fname, cnt, t1 - t0, cnt/(t1 - t0)))
logger.debug('Sorting {} -> {}'.format(bam_fname, bam_fname + '.sorted'))
t0 = time.time()
pysam.sort('-m', '1G', '-o', bam_fname + '.sorted', bam_fname)
os.remove(bam_fname)
t1 = time.time()
logger.debug('... {:0.2f}s'.format(t1 - t0))
logger.debug('Shutting down thread for {}'.format(bam_fname))