message("Intersecting...")
if no_strandness:
prom_with_tss_bo = promoter_bo.intersect(tss_bo,
wb=True,
s=False,
S=False)
else:
prom_with_tss_bo = promoter_bo.intersect(tss_bo,
wb=True,
s=False,
S=True)
tmp_file = make_tmp_file("promoter_slop", ".bed")
promoter_bo.saveas(tmp_file.name)
tmp_file = make_tmp_file("promoter_intersection_with_tss_as_", ".bed")
prom_with_tss_bo.saveas(tmp_file.name)
for i in prom_with_tss_bo:
tx_id_tss, gn_id_tss = i.fields[9].split("||")
tx_id_prom, gene_id_prom = i.fields[3].split("||")
if gene_id_prom != gn_id_tss:
if tx_id_prom in tx_with_divergent:
dist = abs(tss_pos[tx_id_prom] - tss_pos[tx_id_tss])
if dist < dist_to_divergent[tx_id_prom]:
dist_to_divergent[tx_id_prom] = dist
tx_with_divergent[tx_id_prom] = tx_id_tss
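
# A minimal, self-contained sketch of the strandedness switch above (not
# pygtftk code; the interval strings are hypothetical test data). With
# pybedtools, s=True keeps same-strand overlaps, S=True keeps
# opposite-strand overlaps, and s=False/S=False ignores strand entirely.
from pybedtools import BedTool

a = BedTool("chr1\t100\t200\ttx1\t0\t+", from_string=True)
b = BedTool("chr1\t150\t250\ttx2\t0\t-", from_string=True)

same_strand = a.intersect(b, wb=True, s=True)      # empty: strands differ
opposite_strand = a.intersect(b, wb=True, S=True)  # reports the overlap
any_strand = a.intersect(b, wb=True, s=False, S=False)
print(len(same_strand), len(opposite_strand), len(any_strand))  # 0 1 1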
    else:
        message("Getting TTS (using %d bins)." % around_bin_nb)
        dws_region_bo = tmp.get_tts(name=["transcript_id"]
                                    ).slop(s=True,
                                           l=-1,
                                           r=downstream,
                                           g=chrom_info.name).cut(bed_col)
else:
    message(
        "Getting downstream regions (%d bins)." %
        around_bin_nb)
    dws_region_bo = main_region_bo.flank(s=True,
                                         l=0,
                                         r=downstream,
                                         g=chrom_info.name)
dws_bed_file = make_tmp_file(prefix="downstream_region" + ft_type,
                             suffix=".bed")
dws_region_bo.saveas(dws_bed_file.name)

tmp_file = bw_profile_mp(in_bed_file=dws_bed_file.name,
                         nb_proc=nb_proc,
                         big_wig=[x.name for x in bigwiglist],
                         bin_nb=around_bin_nb,
                         pseudo_count=pseudo_count,
                         stranded=not no_stranded,
                         type="downstream",
                         labels=labels,
                         outputfile=outputfile.name,
                         zero_to_na=zero_to_na,
                         verbose=pygtftk.utils.VERBOSITY)
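
# A hedged, standalone sketch of how flank() builds the downstream window
# (hypothetical chromosome sizes; not pygtftk code). With s=True, l=0 and
# r=N, pybedtools/bedtools flank returns the N bases past the strand-aware
# 3' end of each feature, as in the call above.
import tempfile
from pybedtools import BedTool

chrom_sizes = tempfile.NamedTemporaryFile(mode="w", suffix=".genome", delete=False)
chrom_sizes.write("chr1\t1000\n")
chrom_sizes.close()

tx = BedTool("chr1\t100\t200\ttx1\t0\t-", from_string=True)
downstream_bo = tx.flank(s=True, l=0, r=50, g=chrom_sizes.name)
print(downstream_bo)  # chr1 50 100 tx1 0 -  (3' side of a '-' transcript)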
cat_label = [str(x).replace(", ", "_") for x in cat_label]
# Commas inside the interval labels would be problematic later
# (e.g. in comma-separated output), hence the replacement above.
breaks.categories = cat_label
message("Categories: " + str(list(breaks.categories)),
        type="INFO",
        force=True)
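
# The `breaks` object above is a pandas Categorical (from pandas.cut); a
# small standalone sketch of relabelling interval categories so they no
# longer contain ", " (hypothetical data; rename_categories() is the
# non-deprecated spelling of the assignment used above).
import pandas as pd

breaks = pd.cut([1, 5, 9], bins=[0, 4, 10])
cat_label = [str(x).replace(", ", "_") for x in breaks.categories]
breaks = breaks.rename_categories(cat_label)
print(list(breaks))  # ['(0_4]', '(4_10]', '(4_10]']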
# -------------------------------------------------------------------------
#
# Write to disk
#
# -------------------------------------------------------------------------
tmp_file = make_tmp_file(prefix="discretized_keys", suffix=".txt")

with tmp_file as tp_file:
    for p, v in zip(dest_pos, breaks):
        tp_file.write(str(p) + "\t" + str(v) + '\n')

gtf.add_attr_to_pos(tmp_file,
                    new_key=dest_key).write(outputfile,
                                            gc_off=True)

close_properly(outputfile, inputfile)
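
# Standalone sketch of the "write pairs, then re-inject" pattern above,
# using only the standard library (dest_pos and the labels below are
# hypothetical stand-ins); the resulting two-column file is the format
# passed to GTF.add_attr_to_pos() above.
import tempfile

dest_pos = [0, 3, 7]                    # line indices in the GTF
labels = ['(0_4]', '(4_10]', '(4_10]']  # one discretized value per line

with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as tp_file:
    for p, v in zip(dest_pos, labels):
        tp_file.write(str(p) + "\t" + str(v) + "\n")
key_file = tp_file.name  # this path would then be handed to add_attr_to_pos()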
                                  # Ignore features in B that overlap A.
                                  io=True,
                                  # In addition to the closest feature in B, report the
                                  # distance; negative distances denote upstream features.
                                  # The distance is reported with respect to A: when A is
                                  # on the '-' strand, "upstream" means B has higher
                                  # (start, stop) coordinates.
                                  D="a",
                                  # Ignore features in B that are upstream of features in A.
                                  iu=True,
                                  # How ties are handled: "first" reports the first tie.
                                  t="first",
                                  # Require that the query and the closest hit have
                                  # different names/gene_ids.
                                  N=True)
basal_reg_bed_downstream_file = make_tmp_file(prefix='basal_reg_bed_downstream',
                                              suffix='.bed')
basal_reg_bed_downstream.saveas(basal_reg_bed_downstream_file.name)

for line in basal_reg_bed_downstream:
    gene_id = line.name
    strand = line.strand
    end = line.end
    start = line.start
    gene_id = "|".join([gene_id, str(start), str(end), strand])
    chroms[gene_id] = line.chrom
    strands[gene_id] = strand
    if strand == '+':
        # If the chromosome of the closest feature in B is '.',
        # no hit was found: we have reached the start of the chromosome.
        if line.fields[6] == '.':
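
# Runnable sketch of BedTool.closest() with the flags documented above;
# pybedtools forwards them to `bedtools closest` as -io, -D a, -iu,
# -t first and -N (both inputs must be position-sorted; the intervals
# below are hypothetical).
from pybedtools import BedTool

a = BedTool("chr1\t500\t600\tgeneA\t0\t+", from_string=True).sort()
b = BedTool("chr1\t100\t200\tgeneB\t0\t+\n"
            "chr1\t700\t800\tgeneC\t0\t+", from_string=True).sort()

closest = a.closest(b, io=True, D="a", iu=True, t="first", N=True)
print(closest)  # geneB is ignored (upstream); geneC is reported with a
                # positive, i.e. downstream, distance in the last column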
del gtf

# -------------------------------------------------------------------------
# Check whether TSSs intersect a H3K4me3 peak
#
# -------------------------------------------------------------------------
message("Checking TSS intersection with merged BED file.")
tx_to_peak = defaultdict()
merged_peaks_bo = BedTool(merged_peaks)
tss_bo_x_bf = tss_bo.intersect(merged_peaks_bo, wa=True, wb=True)

tmp_bed = make_tmp_file(prefix="TSS_intersect",
                        suffix=".bed")
tss_bo_x_bf.saveas(tmp_bed.name)

for line in tss_bo_x_bf:
    tx_to_peak[line[3]] = line[9]
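
# Small standalone sketch of the wa/wb bookkeeping above: with wa=True and
# wb=True each reported line is the full A interval followed by the full B
# interval, so with 6-column BEDs field 3 is the A name and field 9 the B
# name (hypothetical intervals; the real code maps TSSs to peak names).
from pybedtools import BedTool

tss = BedTool("chr1\t100\t101\ttx1\t0\t+", from_string=True)
peaks = BedTool("chr1\t50\t150\tpeak7\t0\t+", from_string=True)

tx_to_peak = {}
for line in tss.intersect(peaks, wa=True, wb=True):
    tx_to_peak[line[3]] = line[9]
print(tx_to_peak)  # {'tx1': 'peak7'}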
# -------------------------------------------------------------------------
# Compute the coverage of the requested regions
# (each worker will send back a file)
# -------------------------------------------------------------------------
outputfile_list = {}
message("Starting coverage analysis.")
bed_cov_file = make_tmp_file(prefix="merged_peaks", suffix=".cov")
bed_cov = bw_cov_mp(bw_list=[x.name for x in bigwig_list],
                    minibatch_size=minibatch_size, minibatch_nb=minibatch_nb,
                    bed_excl=bed_excl, use_markov_shuffling=use_markov,
                    nb_threads=nb_threads)
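
# bw_cov_mp() is pygtftk's multiprocessing coverage wrapper; a minimal
# single-process sketch of the same idea with pyBigWig (assumed installed),
# writing a tiny hypothetical bigWig and averaging a region.
import tempfile
import pyBigWig

bw_path = tempfile.NamedTemporaryFile(suffix=".bw", delete=False).name
bw = pyBigWig.open(bw_path, "w")
bw.addHeader([("chr1", 1000)])
bw.addEntries("chr1", [0, 100, 200], values=[1.0, 2.0, 4.0], span=100)
bw.close()

bw = pyBigWig.open(bw_path)
print(bw.stats("chr1", 0, 300, type="mean"))  # [~2.33]
bw.close()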
# Initialize result dict
hits = dict()
if not no_gtf:
    if not no_basic_feature:
        for feat_type in gtf.get_feature_list(nr=True):
            message("Processing " + str(feat_type), type="INFO")
            gtf_sub = gtf.select_by_key("feature", feat_type, 0)
            gtf_sub_bed = gtf_sub.to_bed(name=["transcript_id",
                                               "gene_id",
                                               "exon_id"]).sort().merge()  # merging the BED file!
            tmp_file = make_tmp_file(prefix="ologram_" + str(feat_type),
                                     suffix='.bed')
            gtf_sub_bed.saveas(tmp_file.name)
            del gtf_sub
            hits[feat_type] = overlap_partial(bedA=peak_file,
                                              bedsB=gtf_sub_bed,
                                              ft_type=feat_type)

        nb_gene_line = len(gtf.select_by_key(key="feature", value="gene"))
        nb_tx_line = len(gtf.select_by_key(key="feature", value="transcript"))

        if nb_gene_line and nb_tx_line:
            # -------------------------------------------------------------
            # Get the intergenic regions
            # -------------------------------------------------------------
            message("Processing intergenic regions", type="INFO")
            gtf_sub_bed = gtf.get_intergenic(chrom_info,
                                             0,
else:
    message("Loading regions")
    if ft_type == 'user_regions':
        main_region_bo = BedTool(inputfile.name).cut(bed_col)
    elif ft_type == 'single_nuc':
        main_region_bo = BedTool(inputfile.name).cut(bed_col).slop(s=True,
                                                                   l=upstream,
                                                                   r=downstream,
                                                                   g=chrom_info.name)
    else:
        message("Unknown method.")

# Save for traceability
main_region_bed = make_tmp_file(prefix="region" + ft_type, suffix=".bed")
main_region_bo.saveas(main_region_bed.name)
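
# Standalone sketch of the single_nuc branch above (hypothetical sizes;
# not pygtftk code): slop() widens each interval by l bases upstream and
# r bases downstream, strand-aware, bounded by the chromosome size file.
import tempfile
from pybedtools import BedTool

g = tempfile.NamedTemporaryFile(mode="w", suffix=".genome", delete=False)
g.write("chr1\t1000\n")
g.close()

nuc = BedTool("chr1\t500\t501\tsite1\t0\t+", from_string=True)
print(nuc.slop(s=True, l=10, r=20, g=g.name))  # chr1 490 521 site1 0 +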
# -------------------------------------------------------------------------
#
# Print a header in the output file
#
# -------------------------------------------------------------------------
message("Preparing comments")
comments = "#"
comments += "ft_type:" + ft_type + ";"
comments += "from:" + str(upstream) + ";"
comments += "to:" + str(downstream) + ";"
comments += "labels:" + ",".join(labels) + ";"
# -------------------------------------------------------------------------
tss = gtf.get_tss(name=["transcript_id"], as_dict=True)
tx_to_gn = gtf.get_tx_to_gn()
for k in tss:
    gn_id = tx_to_gn[k]
    gn_tss_dist[gn_id][k] = int(tss[k])

# If as_dict_of_dict is True, get_gn_to_tx() returns a dict of dicts
# mapping each gene_id to its transcript_ids and each transcript_id to
# its TSS number (1 for the most 5' TSS, then 2...). Transcripts sharing
# the same TSS get the same number.
gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)

message("Numbering TSSs.")
tss_number_file = make_tmp_file(prefix='tx_to_tss_number', suffix='.txt')
gn_how_many_tss = dict()

for gn_id in gn_to_tx_to_tss:
    for tx_id in gn_to_tx_to_tss[gn_id]:
        tss_num = str(gn_to_tx_to_tss[gn_id][tx_id])
        tss_number_file.write(tx_id + "\t" + tss_num + "\n")
        if gn_id not in gn_how_many_tss:
            gn_how_many_tss[gn_id] = tss_num
        elif int(tss_num) > int(gn_how_many_tss[gn_id]):
            gn_how_many_tss[gn_id] = tss_num

tss_number_file.close()
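
# Plain-dict sketch of the numbering logic above, with a hypothetical gene
# carrying three transcripts: each transcript holds its gene-level TSS rank,
# and the gene keeps the highest rank seen, i.e. its number of distinct TSSs.
gn_to_tx_to_tss = {"gene1": {"tx1": 1, "tx2": 1, "tx3": 2}}

gn_how_many_tss = {}
for gn_id, tx_to_num in gn_to_tx_to_tss.items():
    for tx_id, tss_num in tx_to_num.items():
        if tss_num > gn_how_many_tss.get(gn_id, 0):
            gn_how_many_tss[gn_id] = tss_num
print(gn_how_many_tss)  # {'gene1': 2}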
gtf = gtf.add_attr_from_file(feat='transcript',
as_gz_ext = [True for x in genome if x.name.endswith(".gz")]

if any(as_gz_ext):
    message("Genome in gz format is not currently supported.", type="ERROR")

if len(genome) == 1:
    message("Checking fasta file chromosome list")
    genome = genome[0]
    with genome as genome_file:
        for i in genome_file:
            if i.startswith(">"):
                i = i.rstrip("\n")
                genome_chr_list += [i[1:]]
else:
    message("Merging fasta files")
    tmp_genome = make_tmp_file(prefix="genome", suffix=".fa")
    with tmp_genome as tg:
        for curr_file in genome:
            message("Merging %s" % curr_file.name)
            with curr_file as cf:
                shutil.copyfileobj(cf, tg, 1024 * 1024 * 100)
    message("Checking fasta file chromosome list")
    genome = open(tmp_genome.name, "r")
    with genome as genome_file:
        for i in genome_file:
            if i.startswith(">"):
                i = i.rstrip("\n")
                genome_chr_list += [i[1:]]

rev_comp = not no_rev_comp
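
# Self-contained version of the header scan above: chromosome names are the
# FASTA header lines with the leading '>' stripped (hypothetical content).
import tempfile

fasta = tempfile.NamedTemporaryFile(mode="w", suffix=".fa", delete=False)
fasta.write(">chr1\nACGT\n>chr2\nGGCC\n")
fasta.close()

genome_chr_list = []
with open(fasta.name) as genome_file:
    for i in genome_file:
        if i.startswith(">"):
            genome_chr_list += [i.rstrip("\n")[1:]]
print(genome_chr_list)  # ['chr1', 'chr2']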
def get_ceas_records(
        inputfile=None,
        outputfile=None,
        show_tables=False,
        target_table='GeneTable'):
    """
    Convert a CEAS sqlite file back into a flat file.
    """
    # ----------------------------------------------------------------------
    # Load the CEAS file
    # ----------------------------------------------------------------------

    if inputfile.name.endswith('gz'):
        tmp_file = make_tmp_file(prefix='ceas_gunzip', suffix='.txt')
        with gzip.open(inputfile.name, 'rb') as f_in:
            with open(tmp_file.name, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        inputfile = open(tmp_file.name)

    conn = sqlite3.connect(inputfile.name)
    cursor = conn.cursor()

    # ----------------------------------------------------------------------
    # A function to get the list of tables
    # ----------------------------------------------------------------------

    def get_tables(cursor):
        out_list = list()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
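
# Usage sketch for get_tables(): the same sqlite_master query against a
# small in-memory database (the table name below is hypothetical).
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE GeneTable (gene_id TEXT)")
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print([row[0] for row in cursor.fetchall()])  # ['GeneTable']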