How to use the pyensembl.ensembl_grch38 function in pyensembl

To help you get started, we’ve selected a few pyensembl examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github openvax / isovar / test / test_translation_helpers.py View on Github external
def test_TP53_translation_from_cdna():
    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]
    cdna = tp53_001.coding_sequence
    amino_acids = translate_cdna(cdna, first_codon_is_start=True)
    eq_(amino_acids, tp53_001.protein_sequence)
github openvax / isovar / test / test_reference_coding_sequence_key.py View on Github external
def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr():
    # Delete second codon of TP53-001, the surrounding context
    # includes nucleotides from the 5' UTR. Since TP53 is on the negative
    # strand we have to take the reverse complement of the variant which turns
    # it into 'CTC'>''
    tp53_deletion = Variant(
        "17", 7676589, "CTC", "", ensembl_grch38)
    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around second codon with 10 context nucleotides:
    # In [51]: t.sequence[193-10:193+13]
    # Out[51]: 'CACTGCCATGGAGGAGCCGCAGT'
    # Which can be split into the following parts:
    #  last 7 nt of 5' UTR: CACTGCC
    #  start codon: ATG (translates to M)
    #  2nd codon: GAG    <---- variant occurs here
    #  3rd codon: GAG
    #  4th codon: CCG
    #  5th codon:  CAG
    #  first nt of 6th codon: T

    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=tp53_deletion,
        transcript=tp53_001,
github openvax / pyensembl / test / test_gene_ids.py View on Github external
def test_gene_ids_grch38_hla_a():
    # chr6:29,945,884  is a position for HLA-A
    # Gene ID = ENSG00000206503
    # based on:
    # http://useast.ensembl.org/Homo_sapiens/Gene/
    # Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884
    ids = ensembl_grch38.gene_ids_at_locus(6, 29945884)
    expected = "ENSG00000206503"
    assert ids == ["ENSG00000206503"], \
        "Expected HLA-A, gene ID = %s, got: %s" % (expected, ids)
github openvax / pyensembl / test / test_gene_ids.py View on Github external
def test_gene_ids_of_gene_name_hla_grch38():
    hla_a_gene_ids = ensembl_grch38.gene_ids_of_gene_name("HLA-A")
    assert 'ENSG00000206503' in hla_a_gene_ids, hla_a_gene_ids

    hla_b_gene_ids = ensembl_grch38.gene_ids_of_gene_name("HLA-B")
    assert 'ENSG00000234745' in hla_b_gene_ids, hla_b_gene_ids

    hla_c_gene_ids = ensembl_grch38.gene_ids_of_gene_name("HLA-C")
    assert 'ENSG00000204525' in hla_c_gene_ids, hla_c_gene_ids
github openvax / isovar / test / test_gather_variant_reads_wgs.py View on Github external
def test_partition_variant_reads_deletion():
    alignment_file = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "12"
    base1_location = 70091490
    ref = "TTGTAGATGCTGCCTCTCC"
    alt = ""
    variant = Variant(
        contig=chromosome,
        start=base1_location,
        ref=ref,
        alt=alt,
        ensembl=ensembl_grch38)
    read_collector = ReadCollector()
    read_evidence = read_collector.read_evidence_for_variant(
        alignment_file=alignment_file,
        variant=variant)
    assert len(read_evidence.alt_reads) > 1
    for variant_read in read_evidence.alt_reads:
        eq_(variant_read.allele, alt)
github openvax / varcode / test / test_variant.py View on Github external
def test_serialization():
    variants = [
        Variant(
            1, start=10, ref="AA", alt="AAT", genome=ensembl_grch38),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ]
    for original in variants:
        # This causes the variant's ensembl object to make a SQL connection,
        # which makes the ensembl object non-serializable. By calling this
        # method, we are checking that we don't attempt to directly serialize
        # the ensembl object.
        original.effects()

        # Test pickling.
        serialized = pickle.dumps(original)
        reconstituted = pickle.loads(serialized)
        eq_(original, reconstituted)

        eq_(original.contig, reconstituted.contig)
github openvax / isovar / test / test_nucleotide_counts.py View on Github external
def test_most_common_nucleotides_for_chr12_deletion():
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    base1_location = 70091490
    ref = "TTGTAGATGCTGCCTCTCC"
    alt = ""
    variant = Variant(
        chromosome,
        base1_location,
        ref,
        alt,
        ensembl=ensembl_grch38)
    read_creator = ReadCollector()
    variant_reads = read_creator.allele_reads_supporting_variant(
        alignments=samfile,
        variant=variant)
    consensus_sequence, chosen_counts, other_counts = most_common_nucleotides(
        variant_reads)
    print(chosen_counts)
    print(other_counts)
    eq_(len(chosen_counts), len(consensus_sequence))
    eq_(len(other_counts), len(consensus_sequence))
    assert other_counts.sum() < chosen_counts.sum(), \
        "Counts for alternate nucleotides should not exceed the chosen sequence"

    number_matching_reads = 0
    for variant_read in variant_reads:
        full_seq = variant_read.prefix + variant_read.allele + variant_read.suffix
github openvax / pepnet / experiments / eval_architectures_class2_mhc_binding.py View on Github external
def make_decoy_set(hits, multiple=10):
    from collections import Counter
    import pyensembl
    proteins_dict = pyensembl.ensembl_grch38.protein_sequences.fasta_dictionary
    protein_list = list(proteins_dict.values())
    lengths = Counter()
    for hit in hits:
        lengths[len(hit)] += 1

    decoys = set([])
    n_proteins = len(protein_list)
    for length, count in lengths.items():
        for protein_idx in np.random.randint(low=0, high=n_proteins, size=count * multiple):
            protein = protein_list[protein_idx]
            if len(protein) < length:
                continue

            i = np.random.randint(low=0, high=len(protein) - length + 1, size=1)[0]
            peptide = protein[i:i + length]
            if "X" in peptide or "U" in peptide or "*" in peptide:
github openvax / pepnet / experiments / eval_conv_saturation.py View on Github external
def make_decoy_set(hits, multiple=10):
    from collections import Counter
    import pyensembl
    proteins_dict = pyensembl.ensembl_grch38.protein_sequences.fasta_dictionary
    protein_list = list(proteins_dict.values())
    lengths = Counter()
    for hit in hits:
        lengths[len(hit)] += 1

    decoys = set([])
    n_proteins = len(protein_list)
    for length, count in lengths.items():
        for protein_idx in np.random.randint(low=0, high=n_proteins, size=count * multiple):
            protein = protein_list[protein_idx]
            if len(protein) < length:
                continue

            i = np.random.randint(low=0, high=len(protein) - length + 1, size=1)[0]
            peptide = protein[i:i + length]
            if "X" in peptide or "U" in peptide or "*" in peptide:
github openvax / mhcdouble / mhc2 / decoys.py View on Github external
def full_proteome_sequence(cls, genome=ensembl_grch38):
        if genome not in cls._genome_to_sequence:
            # make a single string out of all the proteins
            concatenated_proteins = "".join(
                ensembl_grch38.protein_sequences.fasta_dictionary.values())
            cls._genome_to_sequence[genome] = concatenated_proteins
        else:
            concatenated_proteins = cls._genome_to_sequence[genome]
        return concatenated_proteins