# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
downstream=1500,
chrom_info=None,
no_strandness=False,
no_annotation=False):
"""
Find transcript with divergent promoters.
"""
message("Using -u " + str(upstream) + ".")
message("Using -d " + str(downstream) + ".")
tx_with_divergent = dict()
dist_to_divergent = dict()
tss_pos = dict()
message("Loading GTF.")
gtf = GTF(inputfile)
message("Getting transcript coordinates.")
tx_feat = gtf.select_by_key("feature",
"transcript")
message("Getting tss coordinates.")
tss_bo = tx_feat.get_tss(name=["transcript_id", "gene_id"],
sep="||")
# get tss position
for i in tss_bo:
tx_id_tss, gn_id_tss = i.name.split("||")
tss_pos[tx_id_tss] = int(i.start)
for line in sorted(lines, key=operator.itemgetter(3)):
tmp_file.write('\t'.join(line))
tmp_file.close()
tss_bo = BedTool(tmp_file.name)
# ----------------------------------------------------------------------
# Get the list of non redundant TSSs
# ----------------------------------------------------------------------
gene_dict = defaultdict(dict)
to_delete = []
message("Looking for redundant TSS (gene-wise).")
for line in tss_bo:
tss = line.start
name = line.name
gene_id, tx_id = name.split("|")
if gene_id in gene_dict:
if tss not in gene_dict[gene_id]:
gene_dict[gene_id][tss] = tx_id
else:
to_delete += [tx_id]
else:
gene_dict[gene_id][tss] = tx_id
message("Deleted transcripts: " + ",".join(to_delete[1:min(10,
# -------------------------------------------------------------------------
#
# Colors orders
#
# -------------------------------------------------------------------------
if color_order is None:
if group_by == 'bwig':
color_order = ",".join(input_file_bwig)
elif group_by == 'tx_classes':
color_order = ",".join(class_list)
elif group_by == 'chrom':
color_order = ",".join(list(input_file_chrom))
else:
message("color_order is undefined.", type="ERROR")
color_order = color_order.split(",")
else:
color_order = color_order.split(",")
color_order_pb = False
if group_by == 'bwig':
if len(color_order) != len(input_file_bwig):
color_order_pb = True
if len(set(color_order)) != len(set(input_file_bwig)):
color_order_pb = True
for co in color_order:
if co not in input_file_bwig:
color_order_pb = True
elif group_by == 'tx_classes':
Takes a GTF as input to search for genes with alternative promoters.
"""
# -------------------------------------------------------------------------
# Create a list of labels.
# Take user input in account
# -------------------------------------------------------------------------
bed_list = [x.name for x in bed_list]
if len(bed_list) != len(set(bed_list)):
message("Found the same BED file several times.",
type="ERROR")
if len(bed_list) < 2:
message("At least two bed files are needed.",
type="ERROR")
message('Checking labels.')
if labels is not None:
labels = labels.split(",")
# Ensure the number of labels is the same as the number of bed files.
if len(labels) != len(bed_list):
message("The number of labels should be the same as the number of"
" bed files.", type="ERROR")
# Ensure labels are non-redondant
if len(labels) > len(set(labels)):
message("Labels must be unique.", type="ERROR")
else:
labels = []
for i in range(len(bed_list)):
warnings.filterwarnings("ignore", category=matplotlib.cbook.MatplotlibDeprecationWarning)
def fxn():
warnings.warn("deprecated", DeprecationWarning)
# -------------------------------------------------------------------------
# Saving
# -------------------------------------------------------------------------
with warnings.catch_warnings():
warnings.simplefilter("ignore")
fxn()
message("Saving diagram to file : " + pdf_file.name)
message("Be patient. This may be long for large datasets.")
# NOTE : We must manually specify figure size with save_as_pdf_pages
save_as_pdf_pages(filename=pdf_file.name,
plots=[p1 + theme(figure_size=figsize),
p2 + theme(figure_size=figsize),
p3 + theme(figure_size=figsize)],
width=pdf_width,
height=pdf_height)
gc.disable()
def bed_to_gtf(
inputfile=None,
outputfile=None,
ft_type="transcript",
source="Unknown"):
"""
Convert a bed file to a gtf. This will make the poor bed feel as if it was a
nice gtf (but with lots of empty fields...). May be helpful sometimes...
"""
message("Converting the bed file into GTF file.")
if inputfile.name == '':
tmp_file = make_tmp_file(prefix="input_bed", suffix=".bed")
for i in inputfile:
write_properly(chomp(str(i)), tmp_file)
tmp_file.close()
inputfile.close()
bed_obj = BedTool(tmp_file.name)
else:
bed_obj = BedTool(inputfile.name)
n = 1
for i in bed_obj:
message("The number of labels should be the same as the number of levels.",
type="ERROR")
if len(labels) != len(set(labels)):
message("Redundant labels not allowed.", type="ERROR")
# -------------------------------------------------------------------------
#
# Load GTF. Retrieve values for src-key
#
# -------------------------------------------------------------------------
gtf = GTF(inputfile, check_ensembl_format=False)
src_values = gtf.extract_data(src_key, as_list=True)
if len([x for x in src_values if x not in ['.', '?']]) == 0:
message('The key was not found in this GTF.',
type="ERROR")
min_val = None
max_val = None
dest_values = []
dest_pos = []
for p, v in enumerate(src_values):
try:
a = float(v)
if min_val is not None:
if a > max_val:
max_val = a
if a < min_val:
min_val = a
message("Successfully change directory to " + ensembl_collection)
except:
message("Unable to change directory to '%s'." % ensembl_collection,
type="ERROR")
try:
all_releases = ftp.listdir(ftp.curdir)
except Exception as e:
print(str(e))
message("Unable to list directory.",
type="ERROR")
if release is not None:
release_dir = "release-" + release
if release_dir not in all_releases:
message("This release number could not be found. Aborting",
type="ERROR")
else:
version_list = []
for ver in all_releases:
regexp = re.compile("release-(\d+)")
hit = regexp.search(ver)
if hit:
version_list += [int(hit.group(1))]
release = max(version_list)
release_dir = "release-" + str(release)
message("Latest version is %d." % release)
try:
ftp.chdir(release_dir)
for i in file_dataset:
print("\t" + i)
else:
if format == "gtf":
example_file = get_example_file(datasetname=dataset,
ext=format)
if example_file:
example_file = example_file[0]
else:
example_file = get_example_file(datasetname=dataset,
ext=format + ".gz")
if example_file:
example_file = example_file[0]
else:
message("No GTF file found for this dataset.",
type="ERROR")
GTF(example_file, check_ensembl_format=False).write(outputfile, gc_off=True)
elif format in ["fa", "join", "join_mat", "genome", "chromInfo", "genes", "geneList"]:
try:
infile = open(get_example_file(datasetname=dataset,
ext=format)[0], "r")
except:
message("Unable to find example file.", type="ERROR")
for line in infile:
outputfile.write(line)
elif format == "*":
file_path = glob.glob(os.path.join(pygtftk.__path__[0],
def select_5p_transcript(self):
    """Select the most 5' transcript of each gene.

    Note that the genomic boundaries for each gene could differ from that
    displayed by its associated transcripts. The gene features may be
    subsequently deleted using select_by_key.

    :Example:

    >>> from pygtftk.utils import get_example_file
    >>> from pygtftk.gtf_interface import GTF
    >>> a_file = get_example_file()[0]
    >>> a_gtf = GTF(a_file)
    """
    message("Calling select_5p_transcript.", type="DEBUG")
    # Mode 3 requests the most-5' transcript per gene from the native library;
    # wrap the returned handle in a fresh GTF object.
    selected = self._dll.select_transcript(self._data, 3)
    return self._clone(selected)