Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_custom_substitution_matrix(sequences, app_cls):
alph = seq.ProteinSequence.alphabet
# Strong identity matrix
score_matrix = np.identity(len(alph)) * 1000
matrix = align.SubstitutionMatrix(alph, alph, score_matrix)
exp_ali = (
"BI-QTITE\n"
"TITANITE\n"
"BI-SMITE\n"
"-I-QLITE"
)
app = app_cls(sequences, matrix=matrix)
app.start()
app.join()
alignment = app.get_alignment()
assert str(alignment) == exp_ali
query = entrez.SimpleQuery("luxA", "Gene Name") \
& entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
uids = entrez.search(query, db_name="protein")
fasta_file = fasta.FastaFile.read(entrez.fetch_single_file(
uids, None, db_name="protein", ret_type="fasta"
))
ids = []
sequences = []
for header, seq_str in fasta_file.items():
# Extract the UniProt Entry name from header
identifier = header.split("|")[-1].split()[0]
ids.append(identifier)
sequences.append(seq.ProteinSequence(seq_str))
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, tree, distances = align.align_multiple(
sequences, matrix, gap_penalty=(-10,-1), terminal_penalty=False
)
# Order alignment according to the guide tree
alignment = alignment[:, order]
ids = [ids[i] for i in order]
fig = plt.figure(figsize=(8.0, 20.0))
ax = fig.add_subplot(111)
graphics.plot_alignment_type_based(
ax, alignment, labels=ids, show_numbers=True, spacing=2.0
)
fig.tight_layout()
plt.show()
# Internally the sequences and the matrix are converted into protein
# sequences/matrix.
# Then the masquerading sequences are aligned via the software and
# finally the sequences are mapped back into the original sequence type.
# Let's show this on the example of a nonsense alphabet.
import numpy as np
import biotite.application.mafft as mafft
import biotite.sequence.align as align
alphabet = seq.Alphabet(("foo", "bar", 42))
sequences = [seq.GeneralSequence(alphabet, sequence) for sequence in [
["foo", "bar", 42, "foo", "foo", 42, 42],
["foo", 42, "foo", "bar", "foo", 42, 42],
]]
matrix = align.SubstitutionMatrix(
alphabet, alphabet, np.array([
[ 100, -100, -100],
[-100, 100, -100],
[-100, -100, 100]
])
)
alignment = mafft.MafftApp.align(sequences, matrix=matrix)
# As the alphabet do not has characters as symbols
# the alignment cannot be directly printed
# However, we can print the trace
print(alignment.trace)
########################################################################
# Secondary structure annotation
# ------------------------------
#
of amino acids in the *BLOSUM62* substitution matrix.
The amino acids are clustered with the UPGMA method.
"""
# Code source: Patrick Kunzmann
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.phylo as phylo
import biotite.sequence.graphics as graphics
# Obtain BLOSUM62
matrix = align.SubstitutionMatrix.std_protein_matrix()
print(matrix)
########################################################################
# The original *BLOSUM62* contains symbols for ambiguous amino acids and
# the stop signal.
# As these are not actual amino acids, a new substitution matrix is
# created, where these symbols are are removed.
# Matrix should not contain ambiguous symbols or stop signal
matrix = align.SubstitutionMatrix(
seq.Alphabet(matrix.get_alphabet1().get_symbols()[:-4]),
seq.Alphabet(matrix.get_alphabet2().get_symbols()[:-4]),
matrix.score_matrix()[:-4, :-4]
)
similarities = matrix.score_matrix()
print(matrix)
# :class:`ndarray` storing the similarity scores.
# You can choose one of many predefined matrices from an internal
# database or you can create a custom matrix on your own.
#
# So much for theory.
# Let's start by showing different ways to construct a
# :class:`SubstitutionMatrix`, in our case for protein sequence
# alignments:
import biotite.sequence as seq
import biotite.sequence.align as align
import numpy as np
alph = seq.ProteinSequence.alphabet
# Load the standard protein substitution matrix, which is BLOSUM62
matrix = align.SubstitutionMatrix.std_protein_matrix()
print("\nBLOSUM62\n")
print(matrix)
# Load another matrix from internal database
matrix = align.SubstitutionMatrix(alph, alph, "BLOSUM50")
# Load a matrix dictionary representation,
# modify it, and create the SubstitutionMatrix
# (The dictionary could be alternatively loaded from a string containing
# the matrix in NCBI format)
matrix_dict = align.SubstitutionMatrix.dict_from_db("BLOSUM62")
matrix_dict[("P","Y")] = 100
matrix = align.SubstitutionMatrix(alph, alph, matrix_dict)
# And now create a matrix by directly provding the ndarray
# containing the similarity scores
# (identity matrix in our case)
scores = np.identity(len(alph), dtype=int)
matrix = align.SubstitutionMatrix(alph, alph, scores)
) % 360 - 180
)**2,
axis=-1
)
# Chose PB, where the RMSDA to the reference angle is lowest
# Due to the definition of Biotite symbol codes
# the index of the chosen PB is directly the symbol code
pb_seq_code = np.argmin(rmsda, axis=0)
# Put the array of symbol codes into actual sequence objects
pb_sequence = seq.GeneralSequence(pb_alphabet)
pb_sequence.code = pb_seq_code
pb_seqs.append(pb_sequence)
# Perfrom a multiple sequence alignment of the PB sequences
matrix_dict = align.SubstitutionMatrix.dict_from_str(matrix_str)
matrix = align.SubstitutionMatrix(pb_alphabet, pb_alphabet, matrix_dict)
alignment, order, _, _ = align.align_multiple(
pb_seqs, matrix, gap_penalty=(-500,-100), terminal_penalty=False
)
# Visualize the alignment
# Order alignment according to guide tree
alignment = alignment[:, order.tolist()]
labels = [organisms[i] for i in order]
fig = plt.figure(figsize=(8.0, 4.0))
ax = fig.add_subplot(111)
# The color scheme was generated with the 'Gecos' software
graphics.plot_alignment_type_based(
ax, alignment, labels=labels, symbols_per_line=45, spacing=2,
show_numbers=True, color_scheme="flower"
)
# Organism names in italic
# sequence
consensus_sequence.code = consensus_code
return consensus_sequence
drug_type_consensus = create_consensus(
[sequences[strain] for strain in (1, 10, 13, 20, 53, 54)]
)
fiber_type_consensus = create_consensus(
[sequences[strain] for strain in (9, 5, 11, 45, 66, 68, 78)]
)
# Create an alignment for visualization purposes
# No insertion/deletions -> Align ungapped
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment = align.align_ungapped(
drug_type_consensus, fiber_type_consensus, matrix=matrix
)
# A colormap for hightlighting sequence dissimilarity:
# At low similarity the symbols are colored red,
# at high similarity the symbols are colored white
cmap = LinearSegmentedColormap.from_list(
"custom", colors=[(1.0, 0.3, 0.3), (1.0, 1.0, 1.0)]
# ^ reddish ^ white
)
fig = plt.figure(figsize=(8.0, 6.0))
ax = fig.add_subplot(111)
graphics.plot_alignment_similarity_based(
ax, alignment, matrix=matrix, symbols_per_line=50,
seq.NucleotideSequence("tttacggctagctcagtcctaggtacaatgctagc"),
seq.NucleotideSequence("ttgacggctagctcagtcctaggtatagtgctagc"),
seq.NucleotideSequence("ctgatagctagctcagtcctagggattatgctagc"),
seq.NucleotideSequence("ctgatggctagctcagtcctagggattatgctagc"),
seq.NucleotideSequence("tttatggctagctcagtcctaggtacaatgctagc"),
seq.NucleotideSequence("tttatagctagctcagcccttggtacaatgctagc"),
seq.NucleotideSequence("ttgacagctagctcagtcctagggactatgctagc"),
seq.NucleotideSequence("ttgacagctagctcagtcctagggattgtgctagc"),
seq.NucleotideSequence("ttgacggctagctcagtcctaggtattgtgctagc")]
# Sequences do not need to be aligned
# -> Create alignment with trivial trace
# [[0 0 0 ...]
# [1 1 1 ...]
# [2 2 2 ...]
# ... ]
alignment = align.Alignment(
sequences = seqs,
trace = np.tile(np.arange(len(seqs[0])), len(seqs)) \
.reshape(len(seqs), len(seqs[0])) \
.transpose(),
score = 0
)
# Create sequence logo from alignment
fig = plt.figure(figsize=(8.0, 1.5))
ax = fig.add_subplot(111)
graphics.plot_sequence_logo(ax, alignment)
# Remove the entire frame
ax.axis("off")
fig.tight_layout()
plt.show()