# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
action='store_true')
# hidden argument: location of the contigs key file (help suppressed from -h)
parser.add_argument('-contig_key',
                    help=argparse.SUPPRESS,
                    default='{}/parus_indel/annotation/contigs_key.txt'.format(os.getenv('HOME')))
args = parser.parse_args()
# submission loop
# With -sub set, resubmit this same command line to the job queue (minus the
# -sub and -evolgen flags so the queued copy runs directly) and exit.
if args.sub is True:
    command_line = [' '.join([x for x in sys.argv if x != '-sub' and x != '-evolgen'])]
    # output stem derived from the CDS fasta file name
    out_stem = args.cds_fa.split('/')[-1].replace('.fna.gz', '')
    q_sub(command_line, out=args.out_dir + out_stem, evolgen=args.evolgen, t=96)
    sys.exit()
# variables
fa = args.cds_fa
wga = pysam.TabixFile(args.wga_bed)
out_dir = args.out_dir
# create the output directory if it does not already exist
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
contig_data = contig_dict(args.contig_key)
# loop through fasta
# per-record state; ``skip`` flags that the current header was rejected so
# its sequence lines are ignored until the next '>' header
sequence, chromo, coords, skip, trans_name, method = '', '', [], False, '', ''
# NOTE(review): gzip.open yields bytes on Python 3, so startswith('>') implies
# Python 2 or a text-mode wrapper — confirm against the rest of the file
for line in gzip.open(fa):
    # skip sequence lines following skipped header
    if skip is True:
        if not line.startswith('>'):
            continue
        else:
            skip = False
def run_from_args(args):
    """Cluster copy-number data per variant component.

    Loads the per-variant info table from the merged VCF (with an optional
    dry run that just dumps that table to CSV and stops), opens the
    tabix-indexed copy-number file, writes a shared header to the main and
    diagnostics output files, then iterates over the components.
    """
    #cp = cProfile.Profile()
    # info: per-variant table; carriers: carrier data from the merged VCF
    info, carriers=get_info(args.lmerged_vcf, args.chrom, args.sample_list)
    if args.dry_run_info_file!='':
        # dry run: write the unpruned info table and stop
        info.to_csv(args.dry_run_info_file)
        exit(1)
    info=prune_info(info)
    # one row per component: chromosome plus overall variant start/stop span
    component_pos=info.groupby(['comp']).agg({'chr': 'first', 'varstart': np.min, 'varstop': np.max}).reset_index()
    cntab=pysam.TabixFile(args.cnfile)
    # prime with component 0 so the number of individuals can be counted
    cn=get_cndata(0, component_pos, info, cntab, args.verbose)
    if args.verbose:
        sys.stderr.write("Memory usage info: %s\n" % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
    # number of distinct individuals with copy-number data in component 0
    nind=np.unique(cn.loc[cn.comp==0, 'id']).shape[0]
    print(str(nind))
    outf1=open(args.outfile, "w")
    outf2=open(args.diag_outfile, "w")
    # identical header for both the main and the diagnostics output
    header='\t'.join(['#comp', 'cluster', 'dist_cluster', 'start', 'stop', 'nocl', 'bic', 'mean_sep', 'mean_offset', 'cov', 'wts', 'cn_med', 'cn_mad', 'info_ncarriers', 'is_rare', 'mm_corr', 'dist','nvar', 'score', 'ptspos', 'ptsend', 'prpos', 'prend', 'is_winner'])
    outf1.write(header+"\n")
    outf2.write(header+"\n")
    for comp in component_pos.comp.unique():
        # restrict to the requested component unless test_comp is -1 (all)
        if(comp==args.test_comp) or (args.test_comp==-1):
            cn=get_cndata(comp, component_pos, info, cntab, args.verbose)
            if args.verbose:
                sys.stderr.write("Memory usage info: %s\n" % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
def process(self):
    """Process the tab-delimited input file line by line.

    Echoes the '#chr' header to the output — extended with the
    'PIDD_GENE', 'Inheritance' and 'Phenotype' columns when a gene list
    is configured — and drops '##' meta lines.
    """
    if self.genelist:
        # tabix-indexed gene list used for the extra annotation columns
        gene_list = pysam.TabixFile(self.genelist)
    with open(self.input, 'r') as fin:
        with open(self.output, 'w') as fout:
            for line in fin:
                line = line.strip()
                if line.startswith('#chr'):
                    # column-header line: optionally append gene-list columns
                    header = line.split('\t')
                    if self.genelist:
                        header += ['PIDD_GENE', 'Inheritance', 'Phenotype']
                    fout.write('\t'.join(header) + '\n')
                    continue
                elif line.startswith('##'):
                    # meta lines are not copied to the output
                    continue
                try:
def vcf_query(chrom=None, pos=None, ref=None, alt=None, variant_str=None, individual=None, verbose=False, limit=100, release='mainset_July2016'):
    """Look up a variant in a per-chromosome release VCF via tabix.

    Either pass chrom/pos/ref/alt individually or a single ``variant_str``
    of the form 'chrom-pos-ref-alt' (underscores are accepted and
    converted to dashes).
    """
    if variant_str:
        # normalise 'chrom_pos_ref_alt' to dash-separated, then split
        variant_str=str(variant_str).strip().replace('_','-')
        chrom, pos, ref, alt = variant_str.split('-')
    #tb=pysam.TabixFile('/slms/UGI/vm_exports/vyp/phenotips/uclex_files/current/chr%s.vcf.gz' % chrom,)
    tb=pysam.TabixFile('/SAN/vyplab/UCLex/current/%s_chr%s.vcf.gz' % (release, chrom,))
    #mainset_February2016_chrX_filtered.vcf.gz
    # single-position region: start and end are both ``pos``
    region=str('%s:%s-%s'%(chrom, pos, int(pos),))
    # the last header line holds the VCF column names (CHROM, POS, ..., samples)
    headers=[h for h in tb.header]
    headers=(headers[len(headers)-1]).strip().split('\t')
    records=tb.fetch(region=region)
    records=[r.split('\t') for r in records]
def response(POS, REF, ALT, index, geno, chrom, pos):
    """Build a per-allele variant dict keyed by a 'chrom-POS-REF-ALT' id."""
    # REF plus the comma-separated ALT alleles; ``index`` selects one of them
    alleles=[geno['REF']]+geno['ALT'].split(',')
    # genotype strings for this allele: 'i/i' (hom) and '0/i' (het)
    homozygous_genotype='/'.join([str(index),str(index)])
    heterozygous_genotype='/'.join(['0',str(index)])
    variant=dict()
    variant['POS']=POS
    variant['REF']=REF
    variant['ALT']=ALT
    variant['index']=index
    # stable identifier in the same dash-separated form accepted by callers
    variant['variant_id']='-'.join([str(chrom),str(POS),variant['REF'],variant['ALT']])
def run_from_args(args):
    """Component loop with memory-usage logging at every stage.

    Loads the info table from the merged VCF, builds per-component spans,
    opens the tabix copy-number file, writes the shared output header to
    both output files and walks the components, logging peak RSS.
    """
    #cp = cProfile.Profile()
    sys.stderr.write("Memory usage start: %s\n" % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
    info, carriers=get_info(args.lmerged_vcf, args.chrom, args.sample_list)
    # one row per component: chromosome plus overall variant start/stop span
    component_pos=info.groupby(['comp']).agg({'chr': 'first', 'varstart': np.min, 'varstop': np.max}).reset_index()
    sys.stderr.write("Memory usage info: %s\n" % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
    sys.stderr.write("cntab\n")
    cntab=pysam.TabixFile(args.cnfile)
    # prime with component 0 so the number of individuals can be counted
    cn=get_cndata(0, component_pos, info, cntab)
    sys.stderr.write("Memory usage cntab: %s\n" % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
    nind=np.unique(cn.loc[cn.comp==0, 'id']).shape[0]
    print(str(nind))
    outf1=open(args.outfile, "w")
    outf2=open(args.diag_outfile, "w")
    # identical header for both the main and the diagnostics output
    header='\t'.join(['#comp', 'cluster', 'dist_cluster', 'start', 'stop', 'nocl', 'bic', 'mm', 'kk', 'cn_med', 'cn_mad', 'info_ncarriers', 'is_rare', 'mm_corr', 'dist','nvar', 'score', 'ptspos', 'ptsend', 'prpos', 'prend', 'is_winner'])
    outf1.write(header+"\n")
    outf2.write(header+"\n")
    for comp in component_pos.comp.unique():
        # NOTE(review): cn is only refreshed every 100th component here —
        # looks like profiling/debug code; confirm before relying on output
        if (comp>0) and (comp%100==0):
            cn=get_cndata(comp, component_pos, info, cntab)
            sys.stderr.write("comp="+str(comp)+"\n")
            sys.stderr.write("Memory usage: %s\n" % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
def bed_regions(bed_file, chromo):
    """Yield (start, end) pairs for every record of *chromo* in a BED file.

    :param bed_file: path to a bgzipped, tabix-indexed BED file
    :param chromo: chromosome/contig name to fetch
    :return: generator of (int start, int end) tuples (BED half-open coords)
    """
    open_bed = pysam.TabixFile(bed_file)
    try:
        for line in open_bed.fetch(chromo, parser=pysam.asTuple()):
            # BED columns: chrom, start, end, ...
            yield int(line[1]), int(line[2])
    finally:
        # the original leaked the tabix handle; release it even if the
        # caller abandons the generator early
        open_bed.close()
def load_file(self):
    """Open ``self.properties['file']`` for later queries.

    A ``.bgz`` file is accessed through its tabix index (a tabix handle
    cannot report the global min/max); any other file — or a ``.bgz``
    whose index cannot be opened — is loaded fully into an interval tree.
    """
    self.tbx = None
    file_path = self.properties['file']
    if not file_path.endswith(".bgz"):
        # plain file: materialise it as an interval tree
        self.interval_tree, __, __ = file_to_intervaltree(file_path,
                                                          self.properties['region'])
    else:
        try:
            self.tbx = pysam.TabixFile(file_path)
        except IOError:
            # index missing or unreadable: fall back to the interval tree
            self.interval_tree, __, __ = file_to_intervaltree(file_path,
                                                              self.properties['region'])
    self.num_fields = None
def get_data(filename, region):
    """Fetch tabix records from *filename*, optionally limited to *region*.

    :param filename: path to a tabix-indexed file
    :param region: dict with 'chr', 'start' and 'end' keys, or a falsy
        value to fetch every record
    :return: all matching rows concatenated into one string with no
        separator (preserving the original behaviour callers may rely on)
    """
    tb = pysam.TabixFile(filename)
    try:
        if region:
            range_string = region['chr'] + ":" + str(region['start']) + "-" + str(region['end'])
            it = tb.fetch(range_string)
        else:
            it = tb.fetch()
        # join at C speed instead of the original quadratic ``data += row``
        return "".join(it)
    finally:
        # the original leaked the tabix handle; always release it
        tb.close()
def _open_file(self) -> pysam.TabixFile:
    """Open the tabix-indexed file at ``self._file_path`` and yield it.

    Generator-style context helper: the handle is always closed once the
    caller is done with it, even if an exception propagates.
    """
    handle = pysam.TabixFile(self._file_path, parser=self._parser)
    try:
        yield handle
    finally:
        # guarantee the underlying file descriptor is released
        handle.close()
# Column indices into a 6-field VCI record; the deleted/inserted columns
# swap when mapping in the reverse direction.
_chrom = 0
_pos = 1
_shared = 2
_deleted = 3 if not reverse else 4
_inserted = 4 if not reverse else 3
_fragment = 5
# progress counters and running positions for the interval construction
num_lines_chrom = 0
num_lines_processed = 0
pos_from = 0
pos_to = 0
tree = IntervalTree()
tabix_file = pysam.TabixFile(filename)
iterator = tabix_file.fetch(contig, parser=pysam.asTuple())
LOG.info("Parsing VCI, contig: {}".format(contig))
for rec in iterator:
    num_lines_chrom += 1
    # every VCI data line must have exactly 6 tab-separated fields
    if len(rec) != 6:
        raise exceptions.G2GError("Unexpected line in VCI file: {0}".format(rec))
    # '.' in the shared-base column marks a line carrying no variant
    if rec[2] == '.':
        continue
    """
    1 3000019 G . A 3000019
    1 3003170 A . T 3151