How to use the biotite.sequence function in biotite

To help you get started, we’ve selected a few biotite examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github biotite-dev / biotite / tests / application / test_blast.py View on Github external
import biotite.application.blast as blast
import numpy as np
from requests.exceptions import ConnectionError
import pytest
import os.path
from ..util import data_dir, cannot_connect_to


BLAST_URL = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"


# Start of E. coli lacZ ORF (UID: AJ308295)
dna_seq = seq.NucleotideSequence("ATGACCATGATTACGCCAAGCTTTCCGGGGAATTCA")

# Start of E. coli lacZ, translated dna_seq (UID: AJ308295)
prot_seq = seq.ProteinSequence("MTMITPSFPGNS")


@pytest.mark.skipif(
    cannot_connect_to(BLAST_URL),
    reason="NCBI BLAST is not available"
)
def test_blastn():
    app = blast.BlastWebApp("blastn", dna_seq, obey_rules=False)
    app.set_max_expect_value(100)
    app.start()
    app.join(timeout=300)
    alignments = app.get_alignments()
    # BLAST should find original sequence as best hit
    assert dna_seq == alignments[0].sequences[0]
    assert dna_seq == alignments[0].sequences[1]
github biotite-dev / biotite / tests / sequence / test_alphabet.py View on Github external
def test_encoding(alphabet_symbols, symbols, exp_code, use_letter_alphabet):
    if use_letter_alphabet:
        alph = seq.LetterAlphabet(alphabet_symbols)
    else:
        alph = seq.Alphabet(alphabet_symbols)
    
    if len(symbols) == 1:
        assert alph.encode(symbols[0]) == exp_code[0]
    else:
        assert list(alph.encode_multiple(symbols)) == list(exp_code)
github biotite-dev / biotite / tests / sequence / test_gff.py View on Github external
def test_genbank_consistency(path):
    """
    Test whether the same annotation (if reasonable) can be read from a
    GFF3 file and a GenBank file.
    """
    file = gb.GenBankFile.read(join(data_dir("sequence"), path))
    ref_annot = gb.get_annotation(file)

    file = gff.GFFFile.read(join(data_dir("sequence"), path[:-3] + ".gff3"))
    test_annot = gff.get_annotation(file)
    
    # Remove qualifiers, since they will be different
    # in GFF3 and GenBank
    ref_annot = seq.Annotation(
        [seq.Feature(feature.key, feature.locs) for feature in ref_annot]
    )
    test_annot = seq.Annotation(
        [seq.Feature(feature.key, feature.locs) for feature in test_annot]
    )
    for feature in test_annot:
        # Only CDS, gene, intron and exon should be equal
        # in GenBank and GFF3
        if feature.key in ["CDS", "gene", "intron", "exon"]:
            try:
                assert feature in test_annot
            except AssertionError:
                print(feature.key)
                for loc in feature.locs:
                    print(loc)
                raise
github biotite-dev / biotite / tests / sequence / test_genbank.py View on Github external
def test_genbank_utility_gb():
    """
    Check whether the high-level utility functions return the expected
    content of a known GenBank file. 
    """
    gb_file = gb.GenBankFile.read(join(data_dir("sequence"), "ec_bl21.gb"))
    assert gb.get_locus(gb_file) \
        == ("CP001509", 4558953, "DNA", True, "BCT", "16-FEB-2017")
    assert gb.get_definition(gb_file) \
        == ("Escherichia coli BL21(DE3), complete genome.")
    assert gb.get_version(gb_file) == "CP001509.3"
    assert gb.get_gi(gb_file) == 296142109
    assert gb.get_db_link(gb_file) \
        == {"BioProject" : "PRJNA20713", "BioSample" : "SAMN02603478"}
    annotation = gb.get_annotation(gb_file, include_only=["CDS"])
    feature = seq.Feature(
        "CDS",
        [seq.Location(5681, 6457, seq.Location.Strand.REVERSE)],
        {"gene": "yaaA", "transl_table": "11"}
    )
    in_annotation = False
    for f in annotation:
        if f.key == feature.key and f.locs == feature.locs and \
           all([(key, val in f.qual.items())
                for key, val in feature.qual.items()]):
                    in_annotation = True
    assert in_annotation
    assert len(gb.get_sequence(gb_file, format="gb")) == 4558953
github biotite-dev / biotite / tests / sequence / test_alphabet.py View on Github external
def test_error(alphabet_symbols, use_letter_alphabet, is_single_val):
    if use_letter_alphabet:
        alph = seq.LetterAlphabet(alphabet_symbols)
    else:
        alph = seq.Alphabet(alphabet_symbols)

    if is_single_val:
        with pytest.raises(seq.AlphabetError):
            alph.encode("G")
        with pytest.raises(seq.AlphabetError):
            alph.encode(42)
        with pytest.raises(seq.AlphabetError):
            alph.decode(len(alphabet_symbols))
        with pytest.raises(seq.AlphabetError):
            alph.decode(-1)
    else:
        with pytest.raises(seq.AlphabetError):
            alph.encode_multiple("G")
        with pytest.raises(seq.AlphabetError):
github biotite-dev / biotite / doc / tutorial / src / sequence.py View on Github external
"""
From A to T - The Sequence subpackage
=====================================

.. currentmodule:: biotite.sequence

:mod:`biotite.sequence` is a *Biotite* subpackage concerning maybe the
most popular type of data in bioinformatics: sequences.
The instantiation can be quite simple as
"""

import biotite.sequence as seq

dna = seq.NucleotideSequence("AACTGCTA")
print(dna)

########################################################################
# This example shows :class:`NucleotideSequence` which is a subclass of
# the abstract base class :class:`Sequence`.
# A :class:`NucleotideSequence` accepts an iterable object of strings,
# where each string can be ``'A'``, ``'C'``, ``'G'`` or ``'T'``.
# Each of these letters is called a *symbol*.
# 
# In general the sequence implementation in *Biotite* allows for
# *sequences of anything*.
# This means any immutable and hashable *Python* object can be used as
# a symbol in a sequence, as long as the object is part of the
# :class:`Alphabet` of the particular :class:`Sequence`.
# An :class:`Alphabet` object simply represents a list of objects that
# are allowed to occur in a :class:`Sequence`.
github biotite-dev / biotite / doc / examples / scripts / sequence / hcn_hydropathy.py View on Github external
ax = figure.add_subplot(111)

# Plot hydropathy
ax.plot(
    np.arange(1+ma_radius, len(hcn1)-ma_radius+1), hydropathies,
    color=biotite.colors["dimorange"]
)
ax.axhline(0, color="gray", linewidth=0.5)
ax.set_xlim(1, len(hcn1)+1)
ax.set_xlabel("HCN1 sequence position")
ax.set_ylabel("Hydropathy (15 residues moving average)")

# Draw boxes for annotated transmembrane helices for comparison
# with hydropathy plot
annotation = gb.get_annotation(gp_file, include_only=["Region"])
transmembrane_annotation = seq.Annotation(
    [feature for feature in annotation
     if feature.qual["region_name"] == "Transmembrane region"]
)
for feature in transmembrane_annotation:
    first, last = feature.get_location_range()
    ax.axvspan(first, last, color=(0.0, 0.0, 0.0, 0.2), linewidth=0)

# Plot similarity score as measure for conservation
ax2 = ax.twinx()
ax2.plot(
    np.arange(1+ma_radius, len(hcn1)-ma_radius+1), scores,
    color=biotite.colors["brightorange"]
)
ax2.set_ylabel("Similarity score (15 residues moving average)")

ax.legend(
github biotite-dev / biotite / doc / examples / scripts / sequence / codon_usage.py View on Github external
# Obtain the sequence of the CDS
    cds_seq = k12_genome[cds]
    if len(cds_seq) % 3 != 0:
        # A CDS' length should be a multiple of 3,
        # otherwise the CDS is malformed
        continue
    # Iterate over the sequence in non-overlapping frames of 3
    # and count the occurence of each codon
    for i in range(0, len(cds_seq), 3):
        codon_code = tuple(cds_seq.code[i:i+3])
        codon_counter[codon_code] += 1

# Convert the total frequencies into relative frequencies
# for each amino acid
# The NCBI codon table with ID 11 is the bacterial codon table
table = seq.CodonTable.load(11)
# As the script uses symbol codes, each amino acid is represented by a
# number between 0 and 19, instead of the single letter code
for amino_acid_code in range(20):
    # Find all codons coding for the amino acid
    # The codons are also in symbol code format, e.g. ATG -> (0, 3, 2)
    codon_codes_for_aa = table[amino_acid_code]
    # Get the total amount of codon occurrences for the amino acid
    total = 0
    for codon_code in codon_codes_for_aa:
        total += codon_counter[codon_code]
    # Replace the total frequencies with relative frequencies
    # and print it
    for codon_code in codon_codes_for_aa:
        # Convert total frequencies into relative frequencies
        codon_counter[codon_code] /= total
        # The rest of this code block prints the codon usage table