Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_sequence_key_for_variant_on_transcript_deletion_reverse_strand():
    """Check the reference key for a deletion of TP53-001's start codon.

    TP53 is on the negative strand, so removing the start codon "ATG" from
    the transcript is written on the genome as a deletion of "CAT".
    """
    tp53_deletion = Variant(
        "17", 7676592, "CAT", "", ensembl_grch38)
    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around start codon with 10 context nucleotides:
    #   In [51]: t.sequence[190-10:190+13]
    #   Out[51]: 'GGTCACTGCC_ATG_GAGGAGCCGC'
    eq_(tp53_001.sequence[190 - 10:190 + 13], "GGTCACTGCCATGGAGGAGCCGC")

    # Ask for 10 nucleotides of context on each side of the variant locus.
    sequence_key = ReferenceSequenceKey.from_variant_and_transcript(
        variant=tp53_deletion,
        transcript=tp53_001,
        context_size=10)
    expected_sequence_key = ReferenceSequenceKey(
        strand="-",
        sequence_before_variant_locus="GGTCACTGCC",
        sequence_at_variant_locus="ATG",
        sequence_after_variant_locus="GAGGAGCCGC")

    # Compare the computed key with the expectation; without this the test
    # would pass regardless of what from_variant_and_transcript returns.
    eq_(sequence_key, expected_sequence_key)
def test_reference_coding_sequence_key_insertion_inside_start_codon():
    """An insertion inside TP53-001's start codon must yield no coding key.

    The variant inserts a "C" in the middle of the start codon; on the
    genome (reverse complement of the transcript) that is written 'T'>'TG'.
    Only a single nucleotide of context is requested.
    """
    start_codon_insertion = Variant(
        "17", 7676592, "T", "TG", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    coding_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=start_codon_insertion,
        transcript=transcript,
        context_size=1)

    assert coding_key is None, \
        "Expected result to be None when variant affects start codon"
def test_sequence_key_with_reading_frame_substitution_with_five_prime_utr():
    """Check the coding key for a substitution of TP53-001's second codon.

    The second codon is replaced with 'CCC' and the surrounding context
    includes nucleotides from the 5' UTR. Since TP53 is on the negative
    strand the variant is written on the genome as its reverse complement,
    CTC>GGG.
    """
    tp53_substitution = Variant(
        "17", 7676589, "CTC", "GGG", ensembl_grch38)
    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around second codon with 10 context nucleotides:
    #   In [51]: t.sequence[193-10:193+13]
    #   Out[51]: 'CACTGCCATGGAGGAGCCGCAGT'
    # Which can be split into the following parts:
    #   last 7 nt of 5' UTR: CACTGCC
    #   start codon: ATG (translates to M)
    #   2nd codon: GAG    <---- variant occurs here
    #   3rd codon: GAG
    #   4th codon: CCG
    #   5th codon: CAG
    #   first nt of 6th codon: T
    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=tp53_substitution,
        transcript=tp53_001,
        context_size=10)

    # The original test computed `result` and never checked it. Verify the
    # sequence pieces implied by the breakdown above.
    # NOTE(review): assumes the key exposes its constructor keywords as
    # attributes; extend to a full-object comparison once the remaining
    # ReferenceCodingSequenceKey fields are confirmed.
    assert result is not None, \
        "Expected a coding sequence key for a variant in the second codon"
    eq_(result.strand, "-")
    eq_(result.sequence_before_variant_locus, "CACTGCCATG")
    eq_(result.sequence_at_variant_locus, "GAG")
    eq_(result.sequence_after_variant_locus, "GAGCCGCAGT")
def test_sequence_key_for_variant_on_transcript_insertion():
    """Check the reference key for an insertion into BRCA2-001's 5' UTR.

    'CCC' is inserted after the 6th nucleotide of the transcript, so the
    variant locus itself covers no reference nucleotides.
    """
    brca2_variant_insertion = Variant(
        "13", 32315479, "T", "TCCC", ensembl_grch38)
    brca2_001 = ensembl_grch38.transcripts_by_name("BRCA2-001")[0]

    # first 50 characters of BRCA2-001:
    #   "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG"
    brca2_ref_seq = brca2_001.sequence[:50]
    eq_(brca2_ref_seq, "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG")
    print(brca2_ref_seq)

    # Ask for 10 nucleotides of context on each side; only the first 6
    # nucleotides of the transcript lie before the insertion point.
    sequence_key = ReferenceSequenceKey.from_variant_and_transcript(
        variant=brca2_variant_insertion,
        transcript=brca2_001,
        context_size=10)

    # expecting nothing at the variant locus since we're inserting between
    # two reference nucleotides
    expected_sequence_key = ReferenceSequenceKey(
        strand="+",
        sequence_before_variant_locus=brca2_ref_seq[:6],
        sequence_at_variant_locus="",
        sequence_after_variant_locus=brca2_ref_seq[6:16])
    eq_(sequence_key, expected_sequence_key)
def test_interbase_range_for_brca2_utr_deletion():
    """Deleting the 6th nucleotide of BRCA2-001's 5' UTR affects (5, 6)."""
    deletion = Variant("13", 32315479, "T", "", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("BRCA2-001")[0]

    affected_range = interbase_range_affected_by_variant_on_transcript(
        variant=deletion,
        transcript=transcript)
    print(affected_range)

    # Interbase coordinates: the deleted base sits between offsets 5 and 6.
    expected_range = (5, 6)
    eq_(affected_range, expected_range)
def test_interbase_range_for_brca2_utr_insertion():
    """A T>TC insertion after the 6th UTR nucleotide affects (6, 6)."""
    insertion = Variant("13", 32315479, "T", "TC", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("BRCA2-001")[0]

    affected_range = interbase_range_affected_by_variant_on_transcript(
        variant=insertion,
        transcript=transcript)
    print(affected_range)

    # The expected range is empty: start and end are both offset 6.
    expected_range = (6, 6)
    eq_(affected_range, expected_range)
def test_interbase_range_for_brca2_utr_substitution():
    """rs769125639 (T>A at the 6th UTR nucleotide) affects range (5, 6).

    The variant is a simple single-nucleotide substitution in BRCA2-001's
    5' UTR.
    """
    rs769125639 = Variant(
        "13", 32315479, "T", "A", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("BRCA2-001")[0]

    affected_range = interbase_range_affected_by_variant_on_transcript(
        variant=rs769125639,
        transcript=transcript)
    print(affected_range)

    expected_range = (5, 6)
    eq_(affected_range, expected_range)
def test_sequence_key_with_reading_frame_insertion_context_6nt_contains_start():
# Insert nucleotide "T" after second codon of TP53-001,
# but in this test we're going to only keep enough context to see
# the start codon but none of the 5' UTR. In the reverse complement this
# variant becomes CTC>CTCA
tp53_insertion = Variant(
"17", 7676586, "CTC", "CTCA", ensembl_grch38)
tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
# Sequence of TP53 around boundary of 2nd/3rd codons
# with 6 context nucleotides:
# start codon: ATG (translates to M)
# 2nd codon: GAG (translates to E)
# <---- insertion variant occurs between these two codons
# 3rd codon: GAG
# 4th codon: CCG
result = ReferenceCodingSequenceKey.from_variant_and_transcript(
variant=tp53_insertion,
transcript=tp53_001,
context_size=6)
expected = ReferenceCodingSequenceKey(
strand="-",
sequence_before_variant_locus="ATGGAG",