Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
"""
if keys is None:
raise GTFtkError("Please provide a key.")
if [as_list, as_dict, as_dict_of_lists,
as_list_of_list,
as_dict_of_values, as_dict_of_merged_list].count(True) > 1:
msg = "Choose between as_list, as_dict_of_values, as_dict_of_merged_list, as_dict_of_list or as_dict"
raise GTFtkError(msg)
if not isinstance(keys, list):
if isinstance(keys, str):
keys = keys.split(",")
else:
raise GTFtkError("Please provide a key as str or list.")
if zero_based:
base = 0
else:
base = 1
if nr:
nr = 1
else:
nr = 0
keys = [x if x not in ['chrom', 'chr'] else 'seqid' for x in keys]
keys_csv = ",".join(keys)
message("Calling extract_data (" + ",".join(keys) + ").", type="DEBUG")
>>> assert a_gtf.select_by_key("feature", "transcript").extract_data("bla", as_list=True, nr=False, hide_undef=False).count('?') == 15
>>> assert a_gtf.select_by_key("feature", "transcript").extract_data("bla", as_list=True, nr=True, hide_undef=False).count('?') == 1
>>> assert len(a_gtf.select_by_key("feature", "transcript").extract_data("start", as_dict=True)) == 11
>>> assert len(a_gtf.select_by_key("feature", "transcript").extract_data("seqid", as_dict=True)) == 1
>>> assert [len(x) for x in a_gtf.select_by_key("feature", "transcript").extract_data("seqid,start", as_list_of_list=True)].count(2) == 15
>>> assert len(a_gtf.select_by_key("feature", "transcript").extract_data("seqid,start", as_list_of_list=True, nr=True)) == 11
"""
if keys is None:
raise GTFtkError("Please provide a key.")
if [as_list, as_dict, as_dict_of_lists,
as_list_of_list,
as_dict_of_values, as_dict_of_merged_list].count(True) > 1:
msg = "Choose between as_list, as_dict_of_values, as_dict_of_merged_list, as_dict_of_list or as_dict"
raise GTFtkError(msg)
if not isinstance(keys, list):
if isinstance(keys, str):
keys = keys.split(",")
else:
raise GTFtkError("Please provide a key as str or list.")
if zero_based:
base = 0
else:
base = 1
if nr:
nr = 1
else:
nr = 0
>>> from pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> a_dict = a_gtf.nb_exons()
>>> a_gtf = a_gtf.add_attr_from_dict(feat="transcript", a_dict=a_dict, new_key="exon_nb")
>>> b_dict = a_gtf.select_by_key("feature", "transcript").extract_data("transcript_id,exon_nb", as_dict_of_values=True)
>>> assert a_dict['G0006T001'] == int(b_dict['G0006T001'])
>>> assert a_dict['G0008T001'] == int(b_dict['G0008T001'])
"""
message("Calling add_attr_from_dict", type="DEBUG")
if len(a_dict) == 0:
raise GTFtkError(
"Need some data to join.")
tmp_file = make_tmp_file("add_attr_from_dict", ".txt")
for i, j in list(a_dict.items()):
if isinstance(j, list):
j = ",".join([str(x) for x in j])
tmp_file.write("\t".join([str(i), str(j)]) + "\n")
tmp_file.close()
new_data = self._dll.add_attributes(self._data,
native_str(feat),
native_str(key),
native_str(new_key),
native_str(tmp_file.name))
>>> from pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> chr_info_path = get_example_file(ext="chromInfo")[0]
>>> chr_info_file = open(chr_info_path, "r")
>>> a_gtf = GTF(a_file)
>>> a_bed = a_gtf.get_intergenic(chrom_file=chr_info_file)
>>> assert len(a_bed) == 10
"""
message("Calling 'get_intergenic'.", type="DEBUG")
if not isinstance(chrom_file, io.IOBase):
raise GTFtkError('chrom_file should be a file object.')
if not os.path.exists(chrom_file.name):
raise GTFtkError('chrom_file could not be found.')
gtf = self.select_by_key("feature",
"transcript")
tx_bo = gtf.to_bed(name=["gene_id",
"transcript_id"]).slop(s=True,
l=upstream,
r=downstream,
g=chrom_file.name).cut([0, 1,
2, 3,
4, 5])
if chr_list is None:
parsed_exp_str = flatten_list_recur(parsed_exp.asList())
result = []
pos = 0
for i in tab:
if not any([True if x in na_omit else False for x in i]):
try:
[float(x) for x in i]
if eval(parsed_exp_str):
result += [pos]
except:
msg = "Found non numeric values in: '%s'." % ",".join(
i)
GTFtkError(msg)
pos += 1
# Call C function
if len(result) < 1:
tmp_f = make_tmp_file()
a_gtf = GTF(tmp_f.name, check_ensembl_format=False)
a_gtf.fn = self.fn
return a_gtf
return self.select_by_positions(result)
group_1 = identifier + comparison_operator + value
group_2 = value + comparison_operator + identifier
comparison = group_1 | group_2
boolean_expr = operatorPrecedence(comparison,
[(and_operator, 2, opAssoc.LEFT),
(or_operator, 2, opAssoc.LEFT)])
boolean_expr_par = lparen + boolean_expr + rparen
expression = Forward()
expression << boolean_expr | boolean_expr_par
try:
parsed_exp = expression.parseString(bool_exp, parseAll=True)
except:
raise GTFtkError("Expression not supported.")
# Strip the leading 'float(i.' (8 characters) and the trailing ')' from each key to recover the bare attribute name.
attr_used = [x[8:-1] for x in _find_keys(parsed_exp, res=[])]
for i in attr_used:
if i not in [x for x in attr_list]:
GTFtkError("Your expression seems to contain an unknow key.")
tab = self.extract_data(",".join(attr_used), hide_undef=False)
parsed_exp_str = flatten_list_recur(parsed_exp.asList())
result = []
pos = 0
for i in tab:
:param feat: The target features.
:param keys: The source keys.
:param new_key: The destination key.
:param sep: The separator.
>>> from pygtftk.utils import get_example_file
>>> from pygtftk.gtf_interface import GTF
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> a_list = a_gtf.merge_attr(feat="exon,transcript,CDS", keys="gene_id,transcript_id", new_key="merge").extract_data("merge", hide_undef=True, as_list=True, nr=True)
>>> assert a_list[0] == 'G0001|G0001T002'
"""
if sep == "\t":
raise GTFtkError("Tabulation is not allowed as a separator.")
if new_key in self.get_attr_list(add_basic=False, as_dict=True):
tmp_file = make_tmp_file(prefix="merge_attr",
suffix=".txt")
if feat == "*":
self.extract_data(keys,
no_na=False,
hide_undef=False).write(tmp_file,
sep=sep)
self = self.del_attr("*", new_key, force=True)
self = self.add_attr_column(tmp_file, new_key)
return self
else:
>>> from pygtftk.gtf_interface import GTF
>>> from pygtftk.utils import TAB
>>> a_file = get_example_file()[0]
>>> a_gtf = GTF(a_file)
>>> b_gtf = a_gtf.add_attr_from_list(feat="gene", key="gene_id", key_value=("G0001", "G0002"), new_key="coding_pot", new_key_value=("0.5", "0.8"))
>>> assert b_gtf.extract_data(keys="coding_pot", as_list=True, no_na=True, hide_undef=True) == ['0.5', '0.8']
>>> b_gtf = a_gtf.add_attr_from_list(feat="gene", key="gene_id", key_value=("G0002", "G0001"), new_key="coding_pot", new_key_value=("0.8", "0.5"))
>>> assert b_gtf.extract_data(keys="coding_pot", as_list=True, no_na=True, hide_undef=True) == ['0.5', '0.8']
>>> key_value = tuple(a_gtf.extract_data("transcript_id", no_na=True, as_list=True, nr=True))
>>> b=a_gtf.add_attr_from_list(None, key="transcript_id", key_value=key_value, new_key="bla", new_key_value=tuple([str(x) for x in range(len(key_value))]))
"""
message("Calling add_attr_from_list", type="DEBUG")
if not isinstance(key_value, tuple) or not isinstance(new_key_value, tuple):
raise GTFtkError("key_value and new_key_value should be tuple.")
if feat is None:
feat = ",".join(self.get_feature_list(nr=True))
if len(set(key_value)) != len(key_value):
raise GTFtkError("Each key should appear once in key_value.")
if len(key_value) != len(new_key_value):
raise GTFtkError(
"key_value and new_key_value should have the same length.")
if len(key_value) == 0:
raise GTFtkError(
"Need some data to join.")
tmp_file = make_tmp_file("add_attr", ".txt")
if format not in ['bed6', 'bed', 'bed3']:
raise GTFtkError('Unsupported bed format')
if pygtftk.utils.ADD_CHR == 1:
chrom_out = "chr" + self.chrom
else:
chrom_out = self.chrom
token = [chrom_out,
str(int(self.get_5p_end()) - 1),
str(self.get_5p_end())]
if format == 'bed6' or format == 'bed':
if name is None:
raise GTFtkError("Need a name (column 4) to write a BED6 format.")
token += [name,
str(self.score),
self.strand]
pygtftk.utils.write_properly('\t'.join(token), outputfile)
if input_obj == '-':
self.fn = "-"
else:
if input_obj != '':
check_file_or_dir_exists(input_obj)
self.fn = input_obj
self._data = 0
else:
self.fn = "-"
self._data = 0
elif isinstance(input_obj, GTF):
self.fn = input_obj.fn
self._data = input_obj._data
else:
raise GTFtkError("Unsupported input type.")
message("Instantiating a GTF.")
if new_data is None:
self._data = self._dll.load_GTF(native_str(self.fn))
if check_ensembl_format:
tab = self.extract_data_iter_list("feature")
not_found = True
n = 0
for i in tab:
n += 1