Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def sequences():
    """
    Provide the 10 Cas9 sequences stored in ``cas9.fasta`` of the
    ``sequence`` data directory as a list of ``ProteinSequence``
    objects.
    """
    cas9_path = join(data_dir("sequence"), "cas9.fasta")
    cas9_file = fasta.FastaFile.read(cas9_path)
    # Each FASTA entry string is converted into one protein sequence
    return list(map(seq.ProteinSequence, cas9_file.values()))
def test_fetch(common_name, as_file_like):
    """
    Fetch the entry ``1L2Y_A`` in FASTA format from Entrez and convert
    the parsed file into a sequence.

    Parameters
    ----------
    common_name : bool
        If true, the database is addressed as ``"Protein"``,
        otherwise as ``"protein"``.
    as_file_like : bool
        If true, ``None`` is passed as target path, otherwise the
        Biotite temporary directory is used.
        (Presumably ``fetch`` then returns a file-like object instead
        of a file name -- confirm against the Entrez API docs.)
    """
    if as_file_like:
        target_path = None
    else:
        target_path = biotite.temp_dir()
    database = "Protein" if common_name else "protein"

    fetched = entrez.fetch(
        "1L2Y_A", target_path, "fa", database, "fasta", overwrite=True
    )
    parsed_file = fasta.FastaFile.read(fetched)
    # The result is not inspected in this snippet;
    # the conversion itself must simply succeed
    fasta.get_sequence(parsed_file)
def test_access():
    """
    Check dictionary-style access to a ``FastaFile`` read from
    ``nuc.fasta``: reading entries, overwriting, deleting and adding
    an entry.
    """
    fasta_path = os.path.join(data_dir("sequence"), "nuc.fasta")
    nuc_file = fasta.FastaFile.read(fasta_path)

    # Entries expected directly after reading the file
    initial_entries = {
        "dna sequence" : "ACGCTACGT",
        "another dna sequence" : "A",
        "third dna sequence" : "ACGT",
    }
    for header, sequence_string in initial_entries.items():
        assert nuc_file[header] == sequence_string
    assert dict(nuc_file.items()) == initial_entries

    # Overwrite one entry, remove one entry and add a new one
    nuc_file["another dna sequence"] = "AA"
    del nuc_file["dna sequence"]
    nuc_file["yet another sequence"] = "ACGT"

    modified_entries = {
        "another dna sequence" : "AA",
        "third dna sequence" : "ACGT",
        "yet another sequence" : "ACGT",
    }
    assert dict(nuc_file.items()) == modified_entries
# Single sequence conversion of the nucleotide test file
nuc_path = os.path.join(data_dir("sequence"), "nuc.fasta")
nuc_file = fasta.FastaFile.read(nuc_path)
expected_nuc_seq = seq.NucleotideSequence("ACGCTACGT")
assert expected_nuc_seq == fasta.get_sequence(nuc_file)

# Sequences written into an empty file must be read back unchanged
original_sequences = fasta.get_sequences(nuc_file)
roundtrip_file = fasta.FastaFile()
fasta.set_sequences(roundtrip_file, original_sequences)
assert original_sequences == fasta.get_sequences(roundtrip_file)

# A single sequence set without explicit header
# is stored under the header "sequence"
single_entry_file = fasta.FastaFile()
fasta.set_sequence(single_entry_file, seq.NucleotideSequence("AACCTTGG"))
assert single_entry_file["sequence"] == "AACCTTGG"

# Single sequence conversion of the protein test file
prot_path = os.path.join(data_dir("sequence"), "prot.fasta")
prot_file = fasta.FastaFile.read(prot_path)
assert seq.ProteinSequence("YAHGFRTGS") == fasta.get_sequence(prot_file)

# The content of the invalid file must be rejected with a ValueError
invalid_path = os.path.join(data_dir("sequence"), "invalid.fasta")
invalid_file = fasta.FastaFile.read(invalid_path)
with pytest.raises(ValueError):
    seq.NucleotideSequence(fasta.get_sequence(invalid_file))
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez
# Search the "protein" database for UniProtKB/Swiss-Prot entries
# whose gene name is luxA
gene_query = entrez.SimpleQuery("luxA", "Gene Name")
source_query = entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
query = gene_query & source_query
uids = entrez.search(query, db_name="protein")

# Download all hits into a single FASTA file
fetched_file = entrez.fetch_single_file(
    uids, None, db_name="protein", ret_type="fasta"
)
fasta_file = fasta.FastaFile.read(fetched_file)

ids = []
sequences = []
for header, seq_str in fasta_file.items():
    # The entry name is the first whitespace-delimited token
    # after the last '|' in the header
    identifier = header.rpartition("|")[2].split(maxsplit=1)[0]
    ids.append(identifier)
    sequences.append(seq.ProteinSequence(seq_str))

# Multiple sequence alignment with the standard protein matrix,
# terminal gaps are not penalized
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, tree, distances = align.align_multiple(
    sequences, matrix, gap_penalty=(-10, -1), terminal_penalty=False
)
# Order alignment according to the guide tree
import warnings
import numpy as np
import matplotlib.pyplot as plt
import biotite
import biotite.database.entrez as entrez
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.sequence.io.fasta as fasta
import biotite.application.clustalo as clustalo
# Entrez UIDs and the display names assigned to them (same order)
uids = ["5JHB_A", "5LUQ_A", "5FLC_B", "5YZ0_A", "5NP0_A", "4FUL_A"]
names = ["PI3K", "DNA-PKcs", "mTOR", "ATR", "ATM", "hSMG-1"]

# Download all entries into a single FASTA file
fetched_file = entrez.fetch_single_file(
    uids, None, db_name="protein", ret_type="fasta"
)
file = fasta.FastaFile.read(fetched_file)
# The headers are not required, only the sequence strings are converted
sequences = [seq.ProteinSequence(seq_str) for seq_str in file.values()]

# Align the sequences using Clustal Omega
alignment = clustalo.ClustalOmegaApp.align(sequences)
########################################################################
# Since the sequences are relatively long, a display of the entire
# alignment via :func:`plot_alignment` would be too big.
# Instead we use a heat map, which highlights the similarity in the
# alignment column using a color map.
#
# Like the :class:`LetterSimilarityPlotter`, we will use the
# *average normalized similarity* as the measure.
Since we want to perform a six-frame translation, we have to look at
the complementary strand of the genome as well.
"""
# Code source: Patrick Kunzmann
# License: BSD 3 clause
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import matplotlib.pyplot as plt
# Download Porcine circovirus genome
file = entrez.fetch("KP282147", None, "fa", "nuccore", "fasta")
fasta_file = fasta.FastaFile.read(file)
genome = fasta.get_sequence(fasta_file)

# Translate the forward strand and list each protein
# together with the two values of its position entry
proteins, positions = genome.translate()
print("Forward strand:")
for protein, position in zip(proteins, positions):
    print(
        "{:4d} - {:4d}: {:}".format(position[0], position[1], str(protein))
    )
print("\n")

# Repeat the translation for the complementary strand,
# i.e. the reverse complement of the genome
genome_rev = genome.reverse().complement()
proteins, positions = genome_rev.translate()
print("Reverse strand:")
for protein, position in zip(proteins, positions):
    print(
        "{:5d} - {:5d}: {:}".format(position[0], position[1], str(protein))
    )
# Redraw the y tick labels of both axes in white
for ax in (axes, twin):
    tick_labels = ax.get_yticklabels()
    ax.set_yticklabels(tick_labels, fontdict={"color":"white"})
# Dark background for the figure that contains the axes
figure = axes.get_figure()
figure.patch.set_facecolor("#181818")
# Using cyclotide sequences as example
# '&' binds tighter than '^' in Python, hence the "Precursor" term
# is combined with the conjunction of the first three terms
query = (
    (
        entrez.SimpleQuery("Cyclotide")
        & entrez.SimpleQuery("cter")
        & entrez.SimpleQuery("srcdb_swiss-prot", field="Properties")
    )
    ^ entrez.SimpleQuery("Precursor")
)
uids = entrez.search(query, "protein")
fetched_file = entrez.fetch_single_file(uids, None, "protein", "fasta")
fasta_file = fasta.FastaFile.read(fetched_file)

sequence_dict = fasta.get_sequences(fasta_file)
headers = list(sequence_dict)
sequences = list(sequence_dict.values())
# Each sequence is labeled with the last character of its header
labels = [header[-1] for header in headers]

# Perform a multiple sequence alignment
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, _, _ = align.align_multiple(sequences, matrix)
# Bring alignment and labels into the order given by the guide tree
alignment = alignment[:, order.tolist()]
labels = [labels[i] for i in order]

# Visualize the alignment using the new alignment plotter
fig = plt.figure(figsize=(8.0, 3.7))
# Again, symbol codes are used here
opt_codons = {}
for amino_acid_code in range(20):
    codon_codes_for_aa = table[amino_acid_code]
    # Only codons with a count above zero are candidates
    observed_codon_codes = [
        codon_code for codon_code in codon_codes_for_aa
        if codon_counter[codon_code] > 0
    ]
    # Map the amino acid to the codon with maximum frequency:
    # on ties the first codon wins,
    # without any candidate the amino acid is mapped to None
    opt_codons[amino_acid_code] = max(
        observed_codon_codes,
        key=lambda codon_code: codon_counter[codon_code],
        default=None,
    )
# Fetch the streptavidin protein sequence from Streptomyces avidinii
fetched_file = entrez.fetch("P22629", None, "fasta", "protein", "fasta")
fasta_file = fasta.FastaFile.read(fetched_file)
strep_prot_seq = fasta.get_sequence(fasta_file)

# Look up the optimal codon for each symbol code of the protein
# sequence and join the codons into the code of a new DNA sequence
optimal_codon_codes = [opt_codons[code] for code in strep_prot_seq.code]
strep_dna_seq = seq.NucleotideSequence()
strep_dna_seq.code = np.concatenate(optimal_codon_codes)
# Add stop codon
stop_codon = seq.NucleotideSequence("TAA")
strep_dna_seq += stop_codon

# Put the DNA sequence into a new FASTA file and print its contents
fasta_file = fasta.FastaFile()
fasta_file["Codon optimized streptavidin"] = str(strep_dna_seq)
print(fasta_file)