Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
cov_corr,
cov_perc):
genomic_signature = GenomicSignature(0)
# make sure distributions have been loaded
self.read_distributions()
# find keys into GC and TD distributions
# gc -> [mean GC][scaffold length][percentile]
# td -> [scaffold length][percentile]
gs = genome_stats[genome_id]
closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
d = self.gc_dist[closest_gc][sample_seq_len]
gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)
td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)
outlying_stats = {}
outlying_dists = defaultdict(list)
for scaffold_id in scaffold_ids:
base_scaffold_id = scaffold_id
if '-#' in scaffold_id:
base_scaffold_id = base_scaffold_id[0:base_scaffold_id.rfind('-#')]
stats = scaffold_stats.stats[base_scaffold_id]
# find GC and TD bounds
closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), stats.length)
gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]
link_scaffold_ids,
xlabel, ylabel)
_, ymax = axes_scatter.get_ylim()
xmin, xmax = axes_scatter.get_xlim()
# draw vertical line at x=0
axes_scatter.plot([0, 0], [0, ymax], linestyle='dashed', color=self.axes_colour, lw=1.0, zorder=0)
# plot reference distributions
closest_gc = find_nearest(list(gc_dist.keys()), mean_gc / 100)
for percentile in percentiles_to_plot:
# find closest distribution values
temp_scaffold_len = list(gc_dist[closest_gc].keys())[0]
d = gc_dist[closest_gc][temp_scaffold_len]
gc_lower_bound_key = find_nearest(list(d.keys()), (100 - percentile) / 2.0)
gc_upper_bound_key = find_nearest(list(d.keys()), (100 + percentile) / 2.0)
xL = []
xU = []
y = []
for window_size in gc_dist[closest_gc]:
xL.append(gc_dist[closest_gc][window_size][gc_lower_bound_key] * 100)
xU.append(gc_dist[closest_gc][window_size][gc_upper_bound_key] * 100)
y.append(window_size / 1000.0)
# sort by y-values
sort_indexY = np.argsort(y)
xL = np.array(xL)[sort_indexY]
xU = np.array(xU)[sort_indexY]
y = np.array(y)[sort_indexY]
axes_scatter.plot(xL, y, 'r--', lw=1.0, zorder=0)
pts = self.data_pts(genome_scaffold_stats, mean_gc)
scatter, x_pts, y_pts, plot_labels = self.scatter(axes_scatter,
pts,
highlight_scaffold_ids,
link_scaffold_ids,
xlabel, ylabel)
_, ymax = axes_scatter.get_ylim()
xmin, xmax = axes_scatter.get_xlim()
# draw vertical line at x=0
axes_scatter.plot([0, 0], [0, ymax], linestyle='dashed', color=self.axes_colour, lw=1.0, zorder=0)
# plot reference distributions
closest_gc = find_nearest(list(gc_dist.keys()), mean_gc / 100)
for percentile in percentiles_to_plot:
# find closest distribution values
temp_scaffold_len = list(gc_dist[closest_gc].keys())[0]
d = gc_dist[closest_gc][temp_scaffold_len]
gc_lower_bound_key = find_nearest(list(d.keys()), (100 - percentile) / 2.0)
gc_upper_bound_key = find_nearest(list(d.keys()), (100 + percentile) / 2.0)
xL = []
xU = []
y = []
for window_size in gc_dist[closest_gc]:
xL.append(gc_dist[closest_gc][window_size][gc_lower_bound_key] * 100)
xU.append(gc_dist[closest_gc][window_size][gc_upper_bound_key] * 100)
y.append(window_size / 1000.0)
# sort by y-values
xlabel, ylabel)
_, ymax = axes_scatter.get_ylim()
xmin, xmax = axes_scatter.get_xlim()
# draw vertical line at x=0
axes_scatter.plot([0, 0], [0, ymax], linestyle='dashed', color=self.axes_colour, lw=1.0, zorder=0)
# plot reference distributions
closest_gc = find_nearest(list(gc_dist.keys()), mean_gc / 100)
for percentile in percentiles_to_plot:
# find closest distribution values
temp_scaffold_len = list(gc_dist[closest_gc].keys())[0]
d = gc_dist[closest_gc][temp_scaffold_len]
gc_lower_bound_key = find_nearest(list(d.keys()), (100 - percentile) / 2.0)
gc_upper_bound_key = find_nearest(list(d.keys()), (100 + percentile) / 2.0)
xL = []
xU = []
y = []
for window_size in gc_dist[closest_gc]:
xL.append(gc_dist[closest_gc][window_size][gc_lower_bound_key] * 100)
xU.append(gc_dist[closest_gc][window_size][gc_upper_bound_key] * 100)
y.append(window_size / 1000.0)
# sort by y-values
sort_indexY = np.argsort(y)
xL = np.array(xL)[sort_indexY]
xU = np.array(xU)[sort_indexY]
y = np.array(y)[sort_indexY]
axes_scatter.plot(xL, y, 'r--', lw=1.0, zorder=0)
axes_scatter.plot(xU, y, 'r--', lw=1.0, zorder=0)
processed_scaffolds,
len(scaffold_stats.stats),
processed_scaffolds * 100.0 / len(scaffold_stats.stats)))
sys.stdout.flush()
if scaffold_id not in scaffolds_of_interest:
continue
for genome_id, gs in genome_stats.items():
# find keys into GC and TD distributions
# gc -> [mean GC][scaffold length][percentile]
# td -> [scaffold length][percentile]
closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
d = self.gc_dist[closest_gc][sample_seq_len]
gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)
td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)
# find GC and TD bounds
closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), ss.length)
gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]
closest_seq_len = find_nearest(list(self.td_dist.keys()), ss.length)
td_bound = self.td_dist[closest_seq_len][td_bound_key]
# find changes from mean
delta_gc = (ss.gc - gs.median_gc) / 100.0
delta_td = genomic_signature.manhattan(ss.signature, gs.mean_signature)
len(scaffold_stats.stats),
processed_scaffolds * 100.0 / len(scaffold_stats.stats)))
sys.stdout.flush()
if scaffold_id not in scaffolds_of_interest:
continue
for genome_id, gs in genome_stats.items():
# find keys into GC and TD distributions
# gc -> [mean GC][scaffold length][percentile]
# td -> [scaffold length][percentile]
closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
d = self.gc_dist[closest_gc][sample_seq_len]
gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)
td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)
# find GC and TD bounds
closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), ss.length)
gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]
closest_seq_len = find_nearest(list(self.td_dist.keys()), ss.length)
td_bound = self.td_dist[closest_seq_len][td_bound_key]
# find changes from mean
delta_gc = (ss.gc - gs.median_gc) / 100.0
delta_td = genomic_signature.manhattan(ss.signature, gs.mean_signature)
# determine if scaffold compatible
genomic_signature = GenomicSignature(0)
# make sure distributions have been loaded
self.read_distributions()
# find keys into GC and TD distributions
# gc -> [mean GC][scaffold length][percentile]
# td -> [scaffold length][percentile]
gs = genome_stats[genome_id]
closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
d = self.gc_dist[closest_gc][sample_seq_len]
gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)
td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)
outlying_stats = {}
outlying_dists = defaultdict(list)
for scaffold_id in scaffold_ids:
base_scaffold_id = scaffold_id
if '-#' in scaffold_id:
base_scaffold_id = base_scaffold_id[0:base_scaffold_id.rfind('-#')]
stats = scaffold_stats.stats[base_scaffold_id]
# find GC and TD bounds
closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), stats.length)
gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]
closest_seq_len = find_nearest(list(self.td_dist.keys()), stats.length)
td_bound = self.td_dist[closest_seq_len][td_bound_key]
processed_scaffolds += 1
if not self.logger.is_silent:
sys.stdout.write(' Processed {:,} of {:,} ({:.1f}%) scaffolds.\r'.format(
processed_scaffolds,
len(scaffold_stats.stats),
processed_scaffolds * 100.0 / len(scaffold_stats.stats)))
sys.stdout.flush()
if scaffold_id not in scaffolds_of_interest:
continue
for genome_id, gs in genome_stats.items():
# find keys into GC and TD distributions
# gc -> [mean GC][scaffold length][percentile]
# td -> [scaffold length][percentile]
closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
d = self.gc_dist[closest_gc][sample_seq_len]
gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)
td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)
# find GC and TD bounds
closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), ss.length)
gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]
closest_seq_len = find_nearest(list(self.td_dist.keys()), ss.length)
td_bound = self.td_dist[closest_seq_len][td_bound_key]
# find changes from mean
cov_perc):
genomic_signature = GenomicSignature(0)
# make sure distributions have been loaded
self.read_distributions()
# find keys into GC and TD distributions
# gc -> [mean GC][scaffold length][percentile]
# td -> [scaffold length][percentile]
gs = genome_stats[genome_id]
closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
d = self.gc_dist[closest_gc][sample_seq_len]
gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)
td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)
outlying_stats = {}
outlying_dists = defaultdict(list)
for scaffold_id in scaffold_ids:
base_scaffold_id = scaffold_id
if '-#' in scaffold_id:
base_scaffold_id = base_scaffold_id[0:base_scaffold_id.rfind('-#')]
stats = scaffold_stats.stats[base_scaffold_id]
# find GC and TD bounds
closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), stats.length)
gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]
pts = self.data_pts(genome_scaffold_stats, mean_signature)
scatter, x_pts, y_pts, plot_labels = self.scatter(axes_scatter,
pts,
highlight_scaffold_ids,
link_scaffold_ids,
xlabel, ylabel)
_, ymax = axes_scatter.get_ylim()
xmin, xmax = axes_scatter.get_xlim()
# plot reference distributions
for percentile in percentiles_to_plot:
# find closest distribution values
first_key = list(td_dist.keys())[0]
td_bound_key = find_nearest(list(td_dist[first_key].keys()), percentile)
x = []
y = []
for window_size in td_dist:
x.append(td_dist[window_size][td_bound_key])
y.append(window_size / 1000.0)
# sort by y-values
sort_indexY = np.argsort(y)
x = np.array(x)[sort_indexY]
y = np.array(y)[sort_indexY]
# make sure x-values are strictly decreasing as y increases
# as this is conservative and visually satisfying
for i in range(0, len(x) - 1):
for j in range(i + 1, len(x)):