How to use the gtdbtk.biolib_lite.newick.parse_label function in gtdbtk

To help you get started, we’ve selected a few gtdbtk examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github Ecogenomics / GTDBTk / gtdbtk / infer_ranks.py View on Github external
def _find_ingroup_taxon(self, ingroup_taxon, tree):
        """Locate the single tree node whose label names the ingroup taxon.

        Every node is visited in postorder. A node matches when the taxon
        portion of its label, split on ';' with surrounding whitespace
        removed from each piece, contains `ingroup_taxon`. The traversal
        always runs to completion so that a second match can be detected.

        Parameters
        ----------
        ingroup_taxon : str
            Taxon name to search for (presumably a rank-prefixed name;
            verify against caller).
        tree : Dendropy Tree
            Tree whose node labels are searched; must provide
            `postorder_node_iter()`.

        Returns
        -------
        Node
            The one node whose label lists the ingroup taxon.

        Raises
        ------
        GTDBTkExit
            If the taxon is listed on more than one node, or on none.
        """

        matched_node = None
        for candidate in tree.postorder_node_iter():
            # only the taxon portion of the parsed label is needed here
            _support, label_taxa, _aux_info = parse_label(candidate.label)
            if not label_taxa:
                continue

            # a single label may carry several ';'-separated taxa
            if ingroup_taxon not in map(str.strip, label_taxa.split(';')):
                continue

            if matched_node is not None:
                raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} '
                                 f'identified multiple times.')
            matched_node = candidate

        if matched_node is None:
            raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} not found in tree.')

        return matched_node
github Ecogenomics / GTDBTk / gtdbtk / relative_distance.py View on Github external
Returns
        -------
        dict : d[rank_index][taxon] -> relative divergence
        """

        # calculate relative distance for all nodes
        self.decorate_rel_dist(tree)

        # assign internal nodes with ranks from
        rel_dists = defaultdict(dict)
        for node in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            if not node.label or node.is_leaf():
                continue

            # check for support value
            _support, taxon_name, _auxiliary_info = parse_label(node.label)

            if not taxon_name:
                continue

            # get most-specific rank if a node represents multiple ranks
            if ';' in taxon_name:
                taxon_name = taxon_name.split(';')[-1].strip()

            most_specific_rank = taxon_name[0:3]
            rel_dists[Taxonomy.rank_index[most_specific_rank]
                      ][taxon_name] = node.rel_dist

        return rel_dists
github Ecogenomics / GTDBTk / gtdbtk / decorate.py View on Github external
Parameters
        ----------
        leaf : Node
          Node in tree.
          
        Returns
        -------
        list
          Taxa for leaf in rank order.
        """

        leaf_taxa = []

        parent = leaf
        while parent:
            _support, taxon, _aux_info = parse_label(parent.label)

            if taxon:
                for t in taxon.split(';')[::-1]:
                    leaf_taxa.append(t.strip())

            parent = parent.parent_node

        ordered_taxa = leaf_taxa[::-1]

        # fill in missing ranks
        last_rank = ordered_taxa[-1][0:3]
        for i in range(Taxonomy.rank_prefixes.index(last_rank) + 1, len(Taxonomy.rank_prefixes)):
            ordered_taxa.append(Taxonomy.rank_prefixes[i])

        return ordered_taxa

github Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github external
count += 1
                    taxa = []
                    cur_node = leaf
                    current_rel_dist = 1.0
                    while cur_node.parent_node:
                        if hasattr(cur_node, 'rel_dist') and current_rel_dist == 1.0 and cur_node.rel_dist < 1.0:
                            current_rel_dist = cur_node.rel_dist
                        if cur_node.is_internal():
                            child_genomes = [nd.taxon.label for nd in cur_node.leaf_nodes(
                            ) if nd.taxon.label not in user_genome_ids]
                            if len(child_genomes) == 1:
                                is_on_terminal_branch = True
                                term_branch_taxonomy = self.gtdb_taxonomy.get(
                                    child_genomes[0])

                        _support, taxon, _aux_info = parse_label(
                            cur_node.label)
                        if taxon:
                            for t in taxon.split(';')[::-1]:
                                taxa.append(t.strip())
                        cur_node = cur_node.parent_node

                    taxa_str = ';'.join(taxa[::-1])
                    pplacer_tax = str(taxa_str)

                    if is_on_terminal_branch:
                        tax_of_leaf = term_branch_taxonomy[term_branch_taxonomy.index(
                            taxa_str.split(';')[-1]) + 1:-1]
                        #print ('tax_of_leaf', tax_of_leaf)
                        taxa_str = self._classify_on_terminal_branch(
                            tax_of_leaf, current_rel_dist, taxa_str.split(';')[-1][0:3], term_branch_taxonomy, marker_dict)
                    else:
github Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github external
Parameters
        ----------
        tree : Dendropy Tree
            Phylogenetic tree.

        Returns
        -------
        list
            List of phyla level lineages.
        """
        phyla = []
        for node in tree.preorder_node_iter():
            if not node.label or node.is_leaf():
                continue

            _support, taxon_name, _auxiliary_info = parse_label(node.label)
            if taxon_name:
                taxa = [x.strip() for x in taxon_name.split(';')]
                if taxa[-1].startswith('p__'):
                    phyla.append(taxa[-1])

        return phyla
github Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github external
for r in rel_dists.keys():
                rel_dists[r].pop(p, None)

            for t in children:
                for r in rel_dists.keys():
                    rel_dists[r].pop(t, None)

            phylum_rel_dists[phylum] = rel_dists

            # calculate relative distance to all nodes
            rd.decorate_rel_dist(cur_tree)

            # determine which lineage represents the 'ingroup'
            ingroup_subtree = None
            for c in cur_tree.seed_node.child_node_iter():
                _support, taxon_name, _auxiliary_info = parse_label(c.label)
                if not taxon_name or p not in taxon_name:
                    ingroup_subtree = c
                    break

            # do a preorder traversal of 'ingroup' and record relative
            # divergence to nodes
            for n in ingroup_subtree.preorder_iter():
                rel_node_dists[n.id].append(n.rel_dist)

        sys.stdout.write(
            '==> Inference for RED distributions finished.                         ')
        sys.stdout.flush()
        sys.stdout.write('\n')

        return phylum_rel_dists, rel_node_dists
github Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github external
taxa_for_dist_inference.intersection_update(valid_taxa)

            # explicitly add in the species since they have no
            # children and thus are absent from the taxon_child dictionary
            taxa_for_dist_inference.update(species)

        # restrict taxa used for inferring distribution to those with
        # sufficient support
        if min_support > 0:
            for node in tree.preorder_node_iter():
                if not node.label or node.is_leaf():
                    continue

                # check for support value
                support, taxon_name, _auxiliary_info = parse_label(node.label)

                if not taxon_name:
                    continue

                if support and float(support) < min_support:
                    taxa_for_dist_inference.difference_update([taxon_name])
                elif not support and min_support > 0:
                    # no support value, so inform user if they were trying to
                    # filter on this property
                    print(
                        '[Error] Tree does not contain support values. As such, --min_support should be set to 0.')
                    continue

        # restrict taxa used for inferring distribution to the trusted set
        if trusted_taxa:
            taxa_for_dist_inference = trusted_taxa.intersection(