Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_TP53_translation_from_cdna():
tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
cdna = tp53_001.coding_sequence
amino_acids = translate_cdna(cdna, first_codon_is_start=True)
eq_(amino_acids, tp53_001.protein_sequence)
def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr():
# Delete second codon of TP53-001, the surrounding context
# includes nucleotides from the 5' UTR. Since TP53 is on the negative
# strand we have to take the reverse complement of the variant which turns
# it into 'CTC'>''
tp53_deletion = Variant(
"17", 7676589, "CTC", "", ensembl_grch38)
tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
# Sequence of TP53 around second codon with 10 context nucleotides:
# In [51]: t.sequence[193-10:193+13]
# Out[51]: 'CACTGCCATGGAGGAGCCGCAGT'
# Which can be split into the following parts:
# last 7 nt of 5' UTR: CACTGCC
# start codon: ATG (translates to M)
# 2nd codon: GAG <---- variant occurs here
# 3rd codon: GAG
# 4th codon: CCG
# 5th codon: CAG
# first nt of 6th codon: T
result = ReferenceCodingSequenceKey.from_variant_and_transcript(
variant=tp53_deletion,
transcript=tp53_001,
def test_gene_ids_grch38_hla_a():
# chr6:29,945,884 is a position for HLA-A
# Gene ID = ENSG00000206503
# based on:
# http://useast.ensembl.org/Homo_sapiens/Gene/
# Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884
ids = ensembl_grch38.gene_ids_at_locus(6, 29945884)
expected = "ENSG00000206503"
assert ids == ["ENSG00000206503"], \
"Expected HLA-A, gene ID = %s, got: %s" % (expected, ids)
def test_gene_ids_of_gene_name_hla_grch38():
hla_a_gene_ids = ensembl_grch38.gene_ids_of_gene_name("HLA-A")
assert 'ENSG00000206503' in hla_a_gene_ids, hla_a_gene_ids
hla_b_gene_ids = ensembl_grch38.gene_ids_of_gene_name("HLA-B")
assert 'ENSG00000234745' in hla_b_gene_ids, hla_b_gene_ids
hla_c_gene_ids = ensembl_grch38.gene_ids_of_gene_name("HLA-C")
assert 'ENSG00000204525' in hla_c_gene_ids, hla_c_gene_ids
def test_partition_variant_reads_deletion():
alignment_file = load_bam("data/cancer-wgs-primary.chr12.bam")
chromosome = "12"
base1_location = 70091490
ref = "TTGTAGATGCTGCCTCTCC"
alt = ""
variant = Variant(
contig=chromosome,
start=base1_location,
ref=ref,
alt=alt,
ensembl=ensembl_grch38)
read_collector = ReadCollector()
read_evidence = read_collector.read_evidence_for_variant(
alignment_file=alignment_file,
variant=variant)
assert len(read_evidence.alt_reads) > 1
for variant_read in read_evidence.alt_reads:
eq_(variant_read.allele, alt)
def test_serialization():
variants = [
Variant(
1, start=10, ref="AA", alt="AAT", genome=ensembl_grch38),
Variant(10, start=15, ref="A", alt="G"),
Variant(20, start=150, ref="", alt="G"),
]
for original in variants:
# This causes the variant's ensembl object to make a SQL connection,
# which makes the ensembl object non-serializable. By calling this
# method, we are checking that we don't attempt to directly serialize
# the ensembl object.
original.effects()
# Test pickling.
serialized = pickle.dumps(original)
reconstituted = pickle.loads(serialized)
eq_(original, reconstituted)
eq_(original.contig, reconstituted.contig)
def test_most_common_nucleotides_for_chr12_deletion():
samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
chromosome = "chr12"
base1_location = 70091490
ref = "TTGTAGATGCTGCCTCTCC"
alt = ""
variant = Variant(
chromosome,
base1_location,
ref,
alt,
ensembl=ensembl_grch38)
read_creator = ReadCollector()
variant_reads = read_creator.allele_reads_supporting_variant(
alignments=samfile,
variant=variant)
consensus_sequence, chosen_counts, other_counts = most_common_nucleotides(
variant_reads)
print(chosen_counts)
print(other_counts)
eq_(len(chosen_counts), len(consensus_sequence))
eq_(len(other_counts), len(consensus_sequence))
assert other_counts.sum() < chosen_counts.sum(), \
"Counts for alternate nucleotides should not exceed the chosen sequence"
number_matching_reads = 0
for variant_read in variant_reads:
full_seq = variant_read.prefix + variant_read.allele + variant_read.suffix
def make_decoy_set(hits, multiple=10):
from collections import Counter
import pyensembl
proteins_dict = pyensembl.ensembl_grch38.protein_sequences.fasta_dictionary
protein_list = list(proteins_dict.values())
lengths = Counter()
for hit in hits:
lengths[len(hit)] += 1
decoys = set([])
n_proteins = len(protein_list)
for length, count in lengths.items():
for protein_idx in np.random.randint(low=0, high=n_proteins, size=count * multiple):
protein = protein_list[protein_idx]
if len(protein) < length:
continue
i = np.random.randint(low=0, high=len(protein) - length + 1, size=1)[0]
peptide = protein[i:i + length]
if "X" in peptide or "U" in peptide or "*" in peptide:
def make_decoy_set(hits, multiple=10):
from collections import Counter
import pyensembl
proteins_dict = pyensembl.ensembl_grch38.protein_sequences.fasta_dictionary
protein_list = list(proteins_dict.values())
lengths = Counter()
for hit in hits:
lengths[len(hit)] += 1
decoys = set([])
n_proteins = len(protein_list)
for length, count in lengths.items():
for protein_idx in np.random.randint(low=0, high=n_proteins, size=count * multiple):
protein = protein_list[protein_idx]
if len(protein) < length:
continue
i = np.random.randint(low=0, high=len(protein) - length + 1, size=1)[0]
peptide = protein[i:i + length]
if "X" in peptide or "U" in peptide or "*" in peptide:
def full_proteome_sequence(cls, genome=ensembl_grch38):
if genome not in cls._genome_to_sequence:
# make a single string out of all the proteins
concatenated_proteins = "".join(
ensembl_grch38.protein_sequences.fasta_dictionary.values())
cls._genome_to_sequence[genome] = concatenated_proteins
else:
concatenated_proteins = cls._genome_to_sequence[genome]
return concatenated_proteins