How to use the pyensembl.Genome class in pyensembl

To help you get started, we’ve selected a few pyensembl examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github openvax / pyensembl / test / test_missing_genome_sources.py View on Github external
def test_protein_fasta_only():
    """
    Build a Genome from a protein FASTA alone and check that protein
    sequences are available while GTF- and transcript-backed queries
    raise ValueError.
    """
    genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        protein_fasta_paths_or_urls=[
            MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH,
        ],
    )
    genome.index()

    protein_dict = genome.protein_sequences.fasta_dictionary
    eq_(4, len(protein_dict))

    # Each captured context is handed to a helper (no_gtf_ / no_transcript_,
    # defined outside this snippet) for further checks on the error.
    with assert_raises(ValueError) as gtf_error:
        genome.genes()
    no_gtf_(gtf_error)

    with assert_raises(ValueError) as transcript_error:
        genome.transcript_sequence("DOES_NOT_EXIST")
    no_transcript_(transcript_error)
github openvax / pyensembl / test / test_missing_genome_sources.py View on Github external
def test_gtf_only():
    """
    Build a Genome from a GTF alone and check that genes are available
    while transcript- and protein-sequence queries raise ValueError.
    """
    genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
    )
    genome.index()

    gene_count = len(genome.genes())
    eq_(1, gene_count)

    with assert_raises(ValueError) as transcript_error:
        genome.transcript_sequence("DOES_NOT_EXIST")
    no_transcript_(transcript_error)

    # NOTE(review): the ID passed here is the literal string
    # "genome_only_gtf" rather than the "DOES_NOT_EXIST" placeholder used
    # above -- presumably any ID triggers the error; confirm.
    with assert_raises(ValueError) as protein_error:
        genome.protein_sequence("genome_only_gtf")
    no_protein_(protein_error)
github openvax / pyensembl / test / test_missing_genome_sources.py View on Github external
def test_gtf_transcript_only():
    """
    Build a Genome from a GTF plus a transcript FASTA and check that genes
    and transcript sequences are available while the protein sequence of a
    transcript raises ValueError.
    """
    genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        transcript_fasta_paths_or_urls=[
            MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
        ],
    )
    genome.index()

    gene_count = len(genome.genes())
    eq_(1, gene_count)

    first_transcript = genome.transcripts()[0]
    ok_(first_transcript.sequence)

    with assert_raises(ValueError) as protein_error:
        # The attribute access itself is what is expected to raise.
        first_transcript.protein_sequence
    no_protein_(protein_error)
github openvax / varcode / test / test_mouse.py View on Github external
from .data import data_path

# Ensembl release whose mouse annotation files are referenced below.
MOUSE_ENSEMBL_RELEASE = 95
SERVER = "ftp://ftp.ensembl.org"

# Each URL template embeds the release number via %-formatting. Note that
# `%` binds tighter than `+`, so the template is formatted before SERVER is
# prepended.
MOUSE_GTF_PATH = \
    SERVER + "/pub/release-%d/gtf/mus_musculus/Mus_musculus.GRCm38.%d.gtf.gz" % (
        MOUSE_ENSEMBL_RELEASE, MOUSE_ENSEMBL_RELEASE)
# The cDNA template was previously left unformatted, leaving a literal "%d"
# in the URL; it is now filled in like its siblings.
MOUSE_TRANSCRIPT_FASTA_PATH = \
    SERVER + "/pub/release-%d/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz" % (
        MOUSE_ENSEMBL_RELEASE,)
MOUSE_PROTEIN_FASTA_PATH = \
    SERVER + "/pub/release-%d/fasta/mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz" % (
        MOUSE_ENSEMBL_RELEASE,)

# VCF file loaded by the tests below, located via data_path.
MOUSE_VCF = data_path("mouse_vcf_dbsnp_chr1_partial.vcf")

# Genome described entirely by explicit GTF / FASTA locations.
explicit_url_genome = Genome(
    reference_name="GRCm38",
    annotation_name="ensembl",
    annotation_version=MOUSE_ENSEMBL_RELEASE,
    gtf_path_or_url=MOUSE_GTF_PATH,
    transcript_fasta_paths_or_urls=[
        MOUSE_TRANSCRIPT_FASTA_PATH,
    ],
    protein_fasta_paths_or_urls=[
        MOUSE_PROTEIN_FASTA_PATH,
    ],
)

# The same release requested through EnsemblRelease for comparison.
ensembl_mouse_genome = EnsemblRelease(
    MOUSE_ENSEMBL_RELEASE,
    species="mouse",
)

def test_load_vcf_mouse_with_explicit_urls():
    """Loading the mouse VCF against explicit_url_genome yields 217 variants."""
    loaded = load_vcf(MOUSE_VCF, genome=explicit_url_genome)
    variant_count = len(loaded)
    eq_(variant_count, 217)

def test_load_vcf_mouse_with_ensembl_release():
    """Loading the mouse VCF against ensembl_mouse_genome yields 217 variants."""
    loaded = load_vcf(MOUSE_VCF, genome=ensembl_mouse_genome)
    variant_count = len(loaded)
    eq_(variant_count, 217)
github openvax / pyensembl / test / test_missing_genome_sources.py View on Github external
def test_transcript_fasta_only():
    genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        transcript_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH])
    genome.index()

    eq_(2, len(genome.transcript_sequences.fasta_dictionary))

    with assert_raises(ValueError) as cm:
        genome.genes()
    no_gtf_(cm)

    with assert_raises(ValueError) as cm:
        genome.gene_ids()
    no_gtf_(cm)

    with assert_raises(ValueError) as cm:
github openvax / pyensembl / test / data.py View on Github external
# grep "ENSMUSG00000017167" Mus_musculus.GRCm38.pep.all.fa -A 50

# Tested against:
# http://useast.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=ENSMUSG00000017167

# Paths to the partial mouse Ensembl 81 data files, each resolved through
# data_path (defined outside this snippet).

# GTF annotation file.
MOUSE_ENSMUSG00000017167_PATH = data_path(
    "mouse.ensembl.81.partial.ENSMUSG00000017167.gtf")
# Transcript FASTA file.
MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH = data_path(
    "mouse.ensembl.81.partial.ENSMUSG00000017167.fa")
# ncRNA FASTA file.
# NOTE(review): the constant name mentions ENSMUSG00000088969 while the file
# name mentions ENSMUSG00000017167 -- confirm which gene ID is intended.
MOUSE_ENSMUSG00000088969_NCRNA_FASTA_PATH = data_path(
    "mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa")
# Protein FASTA file.
MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH = data_path(
    "mouse.ensembl.81.partial.ENSMUSG00000017167.pep")


# Genome assembled from the GTF, transcript FASTA and protein FASTA paths
# defined above.
custom_mouse_genome_grcm38_subset = Genome(
    reference_name="GRCm38",
    annotation_name="_test_mouse_ensembl81_subset",
    gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
    transcript_fasta_paths_or_urls=[
        MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    ],
    protein_fasta_paths_or_urls=[
        MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH,
    ],
)


def setup_init_custom_mouse_genome():
    """
    Test-setup hook for unit tests that must begin with a cleared cache:
    clears the cache of the custom mouse genome, then indexes it again.
    """
    genome = custom_mouse_genome_grcm38_subset
    # Order matters: clear the cache first, then index.
    genome.clear_cache()
    genome.index()
github openvax / pyensembl / test / test_missing_genome_sources.py View on Github external
def test_gtf_protein_only():
    """
    Build a Genome from a GTF plus a protein FASTA and check that genes and
    protein sequences are available while the nucleotide sequence of a
    transcript raises ValueError.
    """
    genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        protein_fasta_paths_or_urls=[
            MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH,
        ],
    )
    genome.index()

    gene_count = len(genome.genes())
    eq_(1, gene_count)

    first_transcript = genome.transcripts()[0]
    ok_(first_transcript.protein_sequence)

    with assert_raises(ValueError) as transcript_error:
        # The attribute access itself is what is expected to raise.
        first_transcript.sequence
    no_transcript_(transcript_error)
github openvax / pyensembl / test / test_ucsc_gtf.py View on Github external
def test_ucsc_refseq_genome():
    """
    Build a Genome from a small RefSeq GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables and check that every gene and
    transcript has an ID and that exactly 2 genes are found.
    """
    with TemporaryDirectory() as cache_dir:
        refseq_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_REFSEQ_PATH,
            cache_directory_path=cache_dir,
        )
        refseq_genome.index()

        gene_list = refseq_genome.genes()
        for g in gene_list:
            # The GTF dataframe is only loaded if the assertion fails.
            assert g.id, "Gene with missing ID in %s" % (
                refseq_genome.db._load_gtf_as_dataframe(),)

        n_genes = len(gene_list)
        assert n_genes == 2, "Expected 2 genes, got %d: %s" % (
            n_genes, gene_list)

        for t in refseq_genome.transcripts():
            assert t.id, "Transcript with missing ID in %s" % (
                refseq_genome.db._load_gtf_as_dataframe(),)
github openvax / pyensembl / test / test_ucsc_gtf.py View on Github external
def test_ucsc_gencode_genome():
    """
    Build a Genome from a small GENCODE GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables and check that every gene and
    transcript has an ID and that exactly 7 genes are found.
    """
    with TemporaryDirectory() as cache_dir:
        gencode_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_GENCODE_PATH,
            cache_directory_path=cache_dir,
        )
        gencode_genome.index()

        gene_list = gencode_genome.genes()
        for g in gene_list:
            # NOTE(review): the failure message here goes through
            # genome.gtf.dataframe(), whereas test_ucsc_refseq_genome uses
            # genome.db._load_gtf_as_dataframe() -- confirm both accessors
            # exist. The expression is only evaluated if the assertion fails.
            assert g.id, "Gene with missing ID in %s" % (
                gencode_genome.gtf.dataframe(),)

        n_genes = len(gene_list)
        assert n_genes == 7, "Expected 7 genes, got %d: %s" % (
            n_genes, gene_list)

        for t in gencode_genome.transcripts():
            assert t.id, "Transcript with missing ID in %s" % (
                gencode_genome.gtf.dataframe(),)
github openvax / varcode / varcode / reference.py View on Github external
def infer_genome(genome_object_string_or_int):
    """
    Resolve a loosely specified genome into a PyEnsembl Genome.

    Accepted inputs, checked in this order:
      - a PyEnsembl Genome, which is returned unchanged;
      - an integer Ensembl version, mapped to the associated human
        EnsemblRelease;
      - a string reference name, mapped to the latest EnsemblRelease which
        has a reference of the same name.

    Raises TypeError for any other kind of input.
    """
    candidate = genome_object_string_or_int

    # Already a Genome: nothing to resolve.
    if isinstance(candidate, Genome):
        return candidate

    if is_integer(candidate):
        return cached_release(candidate)

    if not is_string(candidate):
        message = (
            "Expected genome to be an int, string, or pyensembl.Genome "
            "instance, got %s : %s")
        raise TypeError(message % (str(candidate), type(candidate)))

    # Canonicalize the reference name first (e.g. hg19 -> GRCh37), then look
    # up the PyEnsembl Genome object associated with it.
    canonical_name = infer_reference_name(candidate)
    return genome_for_reference_name(canonical_name)