Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
import biotite.application.blast as blast
import numpy as np
from requests.exceptions import ConnectionError
import pytest
import os.path
from ..util import data_dir, cannot_connect_to
BLAST_URL = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"
# Start of E. coli lacZ ORF (UID: AJ308295)
dna_seq = seq.NucleotideSequence("ATGACCATGATTACGCCAAGCTTTCCGGGGAATTCA")
# Start of E. coli lacZ, translated dna_seq (UID: AJ308295)
prot_seq = seq.ProteinSequence("MTMITPSFPGNS")
@pytest.mark.skipif(
cannot_connect_to(BLAST_URL),
reason="NCBI BLAST is not available"
)
def test_blastn():
app = blast.BlastWebApp("blastn", dna_seq, obey_rules=False)
app.set_max_expect_value(100)
app.start()
app.join(timeout=300)
alignments = app.get_alignments()
# BLAST should find original sequence as best hit
assert dna_seq == alignments[0].sequences[0]
assert dna_seq == alignments[0].sequences[1]
def test_encoding(alphabet_symbols, symbols, exp_code, use_letter_alphabet):
if use_letter_alphabet:
alph = seq.LetterAlphabet(alphabet_symbols)
else:
alph = seq.Alphabet(alphabet_symbols)
if len(symbols) == 1:
assert alph.encode(symbols[0]) == exp_code[0]
else:
assert list(alph.encode_multiple(symbols)) == list(exp_code)
def test_genbank_consistency(path):
"""
Test whether the same annotation (if reasonable) can be read from a
GFF3 file and a GenBank file.
"""
file = gb.GenBankFile.read(join(data_dir("sequence"), path))
ref_annot = gb.get_annotation(file)
file = gff.GFFFile.read(join(data_dir("sequence"), path[:-3] + ".gff3"))
test_annot = gff.get_annotation(file)
# Remove qualifiers, since they will be different
# in GFF3 and GenBank
ref_annot = seq.Annotation(
[seq.Feature(feature.key, feature.locs) for feature in ref_annot]
)
test_annot = seq.Annotation(
[seq.Feature(feature.key, feature.locs) for feature in test_annot]
)
for feature in test_annot:
# Only CDS, gene, intron and exon should be equal
# in GenBank and GFF3
if feature.key in ["CDS", "gene", "intron", "exon"]:
try:
assert feature in test_annot
except AssertionError:
print(feature.key)
for loc in feature.locs:
print(loc)
raise
def test_genbank_utility_gb():
"""
Check whether the high-level utility functions return the expected
content of a known GenBank file.
"""
gb_file = gb.GenBankFile.read(join(data_dir("sequence"), "ec_bl21.gb"))
assert gb.get_locus(gb_file) \
== ("CP001509", 4558953, "DNA", True, "BCT", "16-FEB-2017")
assert gb.get_definition(gb_file) \
== ("Escherichia coli BL21(DE3), complete genome.")
assert gb.get_version(gb_file) == "CP001509.3"
assert gb.get_gi(gb_file) == 296142109
assert gb.get_db_link(gb_file) \
== {"BioProject" : "PRJNA20713", "BioSample" : "SAMN02603478"}
annotation = gb.get_annotation(gb_file, include_only=["CDS"])
feature = seq.Feature(
"CDS",
[seq.Location(5681, 6457, seq.Location.Strand.REVERSE)],
{"gene": "yaaA", "transl_table": "11"}
)
in_annotation = False
for f in annotation:
if f.key == feature.key and f.locs == feature.locs and \
all([(key, val in f.qual.items())
for key, val in feature.qual.items()]):
in_annotation = True
assert in_annotation
assert len(gb.get_sequence(gb_file, format="gb")) == 4558953
def test_error(alphabet_symbols, use_letter_alphabet, is_single_val):
if use_letter_alphabet:
alph = seq.LetterAlphabet(alphabet_symbols)
else:
alph = seq.Alphabet(alphabet_symbols)
if is_single_val:
with pytest.raises(seq.AlphabetError):
alph.encode("G")
with pytest.raises(seq.AlphabetError):
alph.encode(42)
with pytest.raises(seq.AlphabetError):
alph.decode(len(alphabet_symbols))
with pytest.raises(seq.AlphabetError):
alph.decode(-1)
else:
with pytest.raises(seq.AlphabetError):
alph.encode_multiple("G")
with pytest.raises(seq.AlphabetError):
"""
From A to T - The Sequence subpackage
=====================================
.. currentmodule:: biotite.sequence
:mod:`biotite.sequence` is a *Biotite* subpackage concerning maybe the
most popular type of data in bioinformatics: sequences.
The instantiation can be quite simple as
"""
import biotite.sequence as seq
dna = seq.NucleotideSequence("AACTGCTA")
print(dna)
########################################################################
# This example shows :class:`NucleotideSequence` which is a subclass of
# the abstract base class :class:`Sequence`.
# A :class:`NucleotideSequence` accepts an iterable object of strings,
# where each string can be ``'A'``, ``'C'``, ``'G'`` or ``'T'``.
# Each of these letters is called a *symbol*.
#
# In general the sequence implementation in *Biotite* allows for
# *sequences of anything*.
# This means any immutable and hashable *Python* object can be used as
# a symbol in a sequence, as long as the object is part of the
# :class:`Alphabet` of the particular :class:`Sequence`.
# An :class:`Alphabet` object simply represents a list of objects that
# are allowed to occur in a :class:`Sequence`.
ax = figure.add_subplot(111)
# Plot hydropathy
ax.plot(
np.arange(1+ma_radius, len(hcn1)-ma_radius+1), hydropathies,
color=biotite.colors["dimorange"]
)
ax.axhline(0, color="gray", linewidth=0.5)
ax.set_xlim(1, len(hcn1)+1)
ax.set_xlabel("HCN1 sequence position")
ax.set_ylabel("Hydropathy (15 residues moving average)")
# Draw boxes for annotated transmembrane helices for comparison
# with hydropathy plot
annotation = gb.get_annotation(gp_file, include_only=["Region"])
transmembrane_annotation = seq.Annotation(
[feature for feature in annotation
if feature.qual["region_name"] == "Transmembrane region"]
)
for feature in transmembrane_annotation:
first, last = feature.get_location_range()
ax.axvspan(first, last, color=(0.0, 0.0, 0.0, 0.2), linewidth=0)
# Plot similarity score as measure for conservation
ax2 = ax.twinx()
ax2.plot(
np.arange(1+ma_radius, len(hcn1)-ma_radius+1), scores,
color=biotite.colors["brightorange"]
)
ax2.set_ylabel("Similarity score (15 residues moving average)")
ax.legend(
# Obtain the sequence of the CDS
cds_seq = k12_genome[cds]
if len(cds_seq) % 3 != 0:
# A CDS' length should be a multiple of 3,
# otherwise the CDS is malformed
continue
# Iterate over the sequence in non-overlapping frames of 3
# and count the occurence of each codon
for i in range(0, len(cds_seq), 3):
codon_code = tuple(cds_seq.code[i:i+3])
codon_counter[codon_code] += 1
# Convert the total frequencies into relative frequencies
# for each amino acid
# The NCBI codon table with ID 11 is the bacterial codon table
table = seq.CodonTable.load(11)
# As the script uses symbol codes, each amino acid is represented by a
# number between 0 and 19, instead of the single letter code
for amino_acid_code in range(20):
# Find all codons coding for the amino acid
# The codons are also in symbol code format, e.g. ATG -> (0, 3, 2)
codon_codes_for_aa = table[amino_acid_code]
# Get the total amount of codon occurrences for the amino acid
total = 0
for codon_code in codon_codes_for_aa:
total += codon_counter[codon_code]
# Replace the total frequencies with relative frequencies
# and print it
for codon_code in codon_codes_for_aa:
# Convert total frequencies into relative frequencies
codon_counter[codon_code] /= total
# The rest of this code block prints the codon usage table