Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def bed_tofasta(bed, ref_fasta, min_size=50, stranded=True, include_name=False, out=sys.stdout):
if not os.path.exists('%s.fai' % ref_fasta):
pysam.faidx(ref_fasta)
fasta = pysam.Fastafile(ref_fasta)
refs = set()
with open('%s.fai' % ref_fasta) as f:
for line in f:
refs.add(line.split('\t')[0].strip())
name = ''
for region in bed:
if include_name:
name = '%s|' % (region.name.strip())
if region.end - region.start >= min_size and region.chrom in refs:
seq = fasta.fetch(region.chrom, region.start, region.end)
if stranded and region.strand:
if not options[arg]:
miss_parameters.append(arg)
if miss_parameters:
sys.exit('Miss required option: ' + ' '.join(miss_parameters))
if options['--fusion'] and not options['--junc']:
try:
fusion_bam = pysam.AlignmentFile(options['--fusion'], 'rb')
except:
sys.exit('Please make sure %s is BAM file!' % options['--fusion'])
elif not options['--junc'] and not options['--fusion']:
sys.exit('--fusion or --junc should be used!')
elif options['--junc'] and options['--fusion']:
sys.exit('Could not use --fusion and --junc simultaneously!')
if not os.path.isfile(options['--genome'] + '.fai'):
pysam.faidx(options['--genome'])
genome_fa = pysam.FastaFile(options['--genome'])
ref_f = options['--ref']
output_prefix = options['--output']
output = output_prefix + '_circ.txt'
print('Start CIRCexplorer %s' % __version__)
if options['--junc']:
temp1 = options['--junc']
temp_dir, temp2 = create_temp(options['--tmp'], output_prefix, 0)
else:
temp_dir, temp1, temp2 = create_temp(options['--tmp'], output_prefix)
convert_fusion(fusion_bam, temp1)
annotate_fusion(ref_f, temp1, temp2)
fix_fusion(ref_f, genome_fa, temp2, output, options['--no-fix'])
if not options['--tmp']:
if options['--junc']:
delete_temp(temp_dir, temp1, temp2, 0)
def get_seq(reference, chr, start, end):
seq = ""
for item in pysam.faidx(reference, chr + ":" + str(start) + "-" + str(end)):
# if item[0] == ">": continue
seq = seq + item.rstrip('\n')
seq = seq.replace('>', '')
seq = seq.replace(chr + ":" + str(start) + "-" + str(end), '')
if re.search(r'[^ACGTUWSMKRYBDHVNacgtuwsmkrybdhvn]', seq) is not None:
print("The return value in get_seq function includes non-nucleotide characters:", file = sys.stderr)
print(seq, file = sys.stderr)
sys.exit(1)
return seq
for line in ens_id_f:
iso, gene = line.split()
ens_iso[iso] = gene
with gzip.open('ensGene.txt.gz', 'rb') as ens_f:
with open(sys.argv[3], 'w') as outf:
for line in ens_f:
entry = line.split()
iso = entry[1]
outf.write('\t'.join([ens_iso[iso]] + entry[1:11]) + '\n')
elif sys.argv[2] == 'fa': # Genome sequences
urllib.urlretrieve(path + 'bigZips/chromFa.tar.gz', 'chromFa.tar.gz')
with tarfile.open('chromFa.tar.gz', 'r:gz') as seq:
with open(sys.argv[3], 'w') as outf:
for f in seq:
outf.write(seq.extractfile(f).read())
pysam.faidx(sys.argv[3])
else:
sys.exit('Only support ref/kg/ens/fa!')
def _create_sam_reference_index(fpath):
'It creates a sam index for a reference sequence file'
index_fpath = fpath + '.fai'
if os.path.exists(index_fpath):
return
pysam.faidx(fpath)
def __init__(self, filename):
if not os.path.exists(filename + '.fai'):
import pysam
pysam.faidx(filename)
self.fasta = open(filename)
self.index = self.load_index(filename + '.fai')
@files( "%s.fasta" % PARAMS["genome"], "%s.fa" % PARAMS["genome"])
def indexGenome( infile, outfile ):
'''index the genome for samtools.
Samtools does not like long lines, so create a new file
with split lines (what a waste).
'''
# statement = '''fold %(infile)s | perl -p -e "s/chr//" > %(outfile)s'''
statement = '''fold %(infile)s > %(outfile)s'''
P.run()
pysam.faidx( outfile )
def index_file(original_file, file_format="vcf", overwrite=False):
"""
:param original_file:
:param new_file:
:param file_format:
:return:
"""
if overwrite or not has_index_file(original_file, file_format=file_format):
if file_format.lower() == 'fa':
pysam.faidx(original_file)
elif file_format.lower() == 'vcf':
pysam.tabix_index(original_file, preset="vcf", force=True)
elif file_format.lower() == 'vci':
pysam.tabix_index(original_file, seq_col=0, start_col=1, end_col=1, force=True)
else:
raise G2GValueError("Unknown file format: {0}".format(file_format))
def pysam_faidx(n):
print('timings for pysam.faidx')
ti = []
tf = []
for _ in range(n):
t = time.time()
pysam.faidx(fa_file.name)
ti.append(time.time() - t)
t = time.time()
read_pysam(fa_file.name, headers)
tf.append(time.time() - t)
os.remove(index)
# profile memory usage and report timings
tracemalloc.start()
pysam.faidx(fa_file.name)
read_pysam(fa_file.name, headers)
os.remove(index)
print(tracemalloc.get_traced_memory())
print(mean(ti))
print(mean(tf)/nreads/10*1000*1000)
tracemalloc.stop()
@files("%s.fasta" % PARAMS["genome"], "%s.fa" % PARAMS["genome"])
def indexGenome(infile, outfile):
'''index the genome for samtools.
Samtools does not like long lines, so create a new file
with split lines (what a waste).
'''
# statement = '''fold %(infile)s | perl -p -e "s/chr//" > %(outfile)s'''
statement = '''fold %(infile)s > %(outfile)s'''
P.run()
pysam.faidx(outfile)