Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Argument checks. Without a file of values, both a key and a value must be given.
elif file_with_values is None:
if key is None or value is None:
message("Key and value are mandatory. Alternatively use -e/t/g/f or -f with -k.",
type="ERROR")
# With a file of values, the key is still required and an explicit value is rejected.
elif file_with_values is not None:
if key is None:
message("Please set -k.", type="ERROR")
if value is not None:
message("The -f and -v arguments are mutually exclusive.", type="ERROR")
# ----------------------------------------------------------------------
# Load file with value
# ----------------------------------------------------------------------
gtf = GTF(inputfile, check_ensembl_format=False)
# NOTE(review): all_values is not read again in the lines visible here - confirm it is used further down.
all_values = gtf.extract_data(key, as_list=True, no_na=True, nr=True)
# When logging is requested, record len(gtf) before any selection is applied.
if log:
feat_before = len(gtf)
# Value(s) passed directly as a comma-separated string.
if not file_with_values:
# NOTE(review): value_list is built here, but the selection below receives the raw `value` string.
value_list = value.split(",")
gtf = gtf.select_by_key(key, value, invert_match)
else:
# Value(s) read from a file: take column `col` (1-based, hence col - 1) of each tab-separated line.
value_list = []
for line in file_with_values:
cols = line.split("\t")
# NOTE(review): the line is not stripped, so a value taken from the last column presumably keeps its trailing newline - confirm.
value_list += [cols[col - 1]]
# Close the handle, then open the same path again; open() returns a fresh handle positioned at the start of the file.
file_with_values.close()
file_with_values = open(file_with_values.name)
def del_attr(
inputfile=None,
outputfile=None,
key="transcript_id",
reg_exp=False,
invert_match=False):
"""
Delete extended attributes in the target gtf file. attr_list can be a
comma-separated list of attributes.
"""
# NOTE(review): the docstring talks about "attr_list", but the parameter holding the attribute name(s) is `key`.
gtf = GTF(inputfile, check_ensembl_format=False)
# With reg_exp set, `key` is treated as one regular expression and kept as a
# single-element list; otherwise it is split on commas into several keys.
if reg_exp:
try:
rgxp = re.compile(key)
# NOTE(review): a bare except also traps KeyboardInterrupt/SystemExit; re.error is what re.compile raises for a bad pattern.
except:
message("Check the regular expression please.", type="ERROR")
key_list = [key]
else:
key_list = key.split(",")
# Walk over every line of the GTF and fetch the attribute names it carries.
for i in gtf:
feature_keys = i.get_attr_names()
# Non-inverted mode: loop over the requested keys (the loop body lies beyond this excerpt).
if not invert_match:
for k in key_list:
# -----------------------------------------------------------
# Check argument consistency: --matrix and --new-key cannot be combined,
# and --new-key becomes mandatory when --matrix is not set.
if matrix is True:
if new_key is not None:
message("--new-key and --matrix are mutually exclusive.",
type="ERROR")
else:
if new_key is None:
message("--new-key is required when --matrix is False.",
type="ERROR")
# -----------------------------------------------------------
# load the GTF
# -----------------------------------------------------------
gtf = GTF(inputfile, check_ensembl_format=False)
# -----------------------------------------------------------
# Check target feature
# -----------------------------------------------------------
feat_list = gtf.get_feature_list(nr=True)
# A user-supplied target_feature is a comma-separated string; every item must
# be one of the features returned above or the literal "*".
if target_feature is not None:
target_feature_list = target_feature.split(",")
for i in target_feature_list:
if i not in feat_list + ["*"]:
message("Feature " + i + " not found.",
type="ERROR")
else:
# No target given: fall back to all features of the GTF, comma-joined.
target_feature = ",".join(feat_list)
downstream=1500,
chrom_info=None):
"""
Find transcript with convergent tts.
"""
# Report the upstream (-u) and downstream (-d) settings in use.
message("Using -u " + str(upstream) + ".")
message("Using -d " + str(downstream) + ".")
# Result containers; only tts_pos is filled in the lines visible here.
tx_to_convergent_nm = dict()
dist_to_convergent = dict()
tts_pos = dict()
message("Loading GTF.")
gtf = GTF(inputfile)
message("Getting transcript coordinates.")
# Select on key "feature" with value "transcript".
tx_feat = gtf.select_by_key("feature", "transcript")
message("Getting tts coordinates.")
# Each returned interval is named "<transcript_id>||<gene_id>" (see the split below).
tts_bo = tx_feat.get_tts(name=["transcript_id", "gene_id"],
sep="||")
# get tts position
for i in tts_bo:
tx_id_ov, gn_id_ov = i.name.split("||")
# Map the transcript id to the interval start, stored as an int.
tts_pos[tx_id_ov] = int(i.start)
# NOTE(review): same text as the message emitted before get_tts() - possibly a copy/paste leftover; confirm the intended wording.
message("Getting tts coordinates.")
# Reject the input when token[0] is not made of digits (`token` is defined above this excerpt).
# NOTE(review): the error text reads "intput" - likely a typo for "input"; it is a runtime string, so it is left as is here.
if not token[0].isdigit():
raise GTFtkError("Column 1 of intput file should be an int.")
# Hand the current data, the input file path and the new key to the
# add_attr_to_pos routine of self._dll; both strings go through native_str().
new_data = self._dll.add_attr_to_pos(self._data,
native_str(input_file.name),
native_str(new_key))
# Return the result of self._clone() applied to the data produced above.
return self._clone(new_data)
if __name__ == "__main__":
    # Small demo, run only when the module is executed as a script: load the
    # first path returned by get_example_file() and write every item obtained
    # from gtf["feature", "transcript"] to standard output.
    from pygtftk.utils import get_example_file

    a = get_example_file()
    gtf = GTF(a[0])

    for transcript_line in gtf["feature", "transcript"]:
        transcript_line.write(sys.stdout)
no_strandness=False,
no_annotation=False):
"""
Find transcript with divergent promoters.
"""
# Report the upstream (-u) and downstream (-d) settings in use.
message("Using -u " + str(upstream) + ".")
message("Using -d " + str(downstream) + ".")
# Result containers; only tss_pos is filled in the lines visible here.
tx_with_divergent = dict()
dist_to_divergent = dict()
tss_pos = dict()
message("Loading GTF.")
gtf = GTF(inputfile)
message("Getting transcript coordinates.")
# Select on key "feature" with value "transcript".
tx_feat = gtf.select_by_key("feature",
"transcript")
message("Getting tss coordinates.")
# Each returned interval is named "<transcript_id>||<gene_id>" (see the split below).
tss_bo = tx_feat.get_tss(name=["transcript_id", "gene_id"],
sep="||")
# get tss position
for i in tss_bo:
tx_id_tss, gn_id_tss = i.name.split("||")
# Map the transcript id to the interval start, stored as an int.
tss_pos[tx_id_tss] = int(i.start)
message("Getting promoter coordinates.")
def select_by_max_exon_nb(inputfile=None,
                          outputfile=None):
    """
    Keep, for each gene, the transcript with the highest number of exons.

    The input is loaded as a GTF with the Ensembl format check disabled,
    filtered with ``GTF.select_by_max_exon_nb()`` and the result is written
    to ``outputfile`` (``gc_off=True`` is forwarded to ``write``).
    """
    message("Selecting transcript with the highest number of exon for each gene.")

    # The loader and the selection stay chained so the unfiltered GTF is not
    # bound to a local name.
    gtf = GTF(inputfile, check_ensembl_format=False).select_by_max_exon_nb()

    gtf.write(outputfile, gc_off=True)
def tss_numbering(
inputfile=None,
outputfile=None,
compute_dist=False,
key_name='tss_number',
key_name_dist='dist_to_first_tss',
add_nb_tss_to_gene=False,
gene_key='nb_tss'):
"""
Computes the distance between TSS of gene transcripts.
"""
# Here the GTF is loaded with the Ensembl format check enabled.
gtf = GTF(inputfile, check_ensembl_format=True)
# Two-level mapping filled below: gene id -> {transcript key: TSS as int}.
gn_tss_dist = defaultdict(dict)
message("Getting TSSs.")
# With as_dict=True the result is used as a mapping; its keys are looked up in
# tx_to_gn below, so they are presumably transcript ids - confirm against get_tss().
tss = gtf.get_tss(name=["transcript_id"], as_dict=True)
tx_to_gn = gtf.get_tx_to_gn()
# Group the TSS values by the gene each key belongs to.
for k in tss:
gn_id = tx_to_gn[k]
gn_tss_dist[gn_id][k] = int(tss[k])
# if as_dict_of_dict is true, get_gn_to_tx() returns a dict of dict
# that maps gene_id to transcript_id and transcript_id to TSS
# numbering (1 for most 5', then 2...). For transcripts having
# the same TSSs, the tss number will be the same.
gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)
def splicing_site(inputfile=None,
outputfile=None,
exon_numbering_key=False,
names="exon_id,transcript_id,gene_id",
separator="\t"):
"""
Compute the locations of splice donor are acceptor sites. You may extend them in 3' and 5' depending on your needs.
"""
gtf = GTF(inputfile)
# NOTE(review): nb_exons is not read in the lines visible here - confirm it is used further down.
nb_exons = gtf.nb_exons()
# Comma-separated list of the columns/keys to extract, in this order:
# feature, seqid, start, end, transcript_id, <exon numbering key>, strand, then `names`.
# NOTE(review): with the default exon_numbering_key=False this str + bool
# concatenation raises TypeError; callers have to pass the key as a string.
info = "feature,seqid,start,end,transcript_id," + exon_numbering_key
info += ",strand," + names
exon_info = gtf.extract_data_iter_list(info)
# Assumes each yielded item follows the order of `info`, so index 0 is the
# feature and index 5 the exon numbering value - TODO confirm against extract_data_iter_list().
for i in exon_info:
if i[0] == "exon":
# A "." at the numbering position is reported as an exon line without numbering.
if i[5] == ".":
message("Some exon lines do not contain any numbering. "
"Use add_exon_nb or set --exon-numbering-key to the proper key.",
type="ERROR")
def intergenic(
        inputfile=None,
        outputfile=None,
        chrom_info=None):
    """
    Extract the intergenic regions of a GTF and write them to ``outputfile``.

    Regions come from ``GTF.get_intergenic(chrom_info)``. Each one is named
    ``region_<n>``, numbered from 1 in iteration order, converted to a
    string, passed through ``chomp`` and handed to ``write_properly``.
    """
    message("Searching for intergenic regions.")

    gtf = GTF(inputfile)

    # enumerate() supplies the 1-based counter used in the region names.
    for region_rank, region in enumerate(gtf.get_intergenic(chrom_info), start=1):
        region.name = "region_" + str(region_rank)
        write_properly(chomp(str(region)), outputfile)

    # The garbage collector is switched off before the files are closed.
    gc.disable()
    close_properly(outputfile, inputfile)