How to use the genomepy.provider.ProviderBase class in genomepy

To help you get started, we’ve selected a few genomepy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github vanheeringen-lab / genomepy / tests / test_02_links.py View on Github external
def test_ensemblgenomes_genome_download_links():
    """Check that Ensembl download links can be looked up for several genomes.

    The genomes used here are hosted on ftp.ensemblgenomes.org.
    """
    ensembl = genomepy.provider.ProviderBase.create("Ensembl")
    genome_names = ("Amel_4.5", "WBcel235")

    # The returned link is not inspected; each lookup only has to complete
    # without raising.
    for genome_name in genome_names:
        ensembl.get_genome_download_link(genome_name)
github vanheeringen-lab / genomepy / genomepy / provider.py View on Github external
Parameters
        ----------
        name : str
            Genome name
        """
        genome = self.genomes[safe(name)]
        link = genome["ftp_path"]
        link = link.replace("ftp://", "https://")
        link += "/" + link.split("/")[-1] + "_genomic.gff.gz"

        if check_url(link):
            return link


@register_provider("URL")
class UrlProvider(ProviderBase):
    """
    Genome provider for plain URLs.

    Lets a genome be downloaded straight from a user-supplied url.
    """

    def __init__(self):
        # This provider starts out with an empty genome mapping.
        self.genomes = dict()
        self.name = "URL"

    def genome_taxid(self, genome):
        """Return the placeholder ``"na"``; ``genome`` is not inspected."""
        return "na"

    def assembly_accession(self, genome):
        """Return the placeholder ``"na"``; ``genome`` is not inspected."""
        return "na"
github vanheeringen-lab / genomepy / genomepy / annotation.py View on Github external
try:
            genome = Genome(genome_name)
            search_term = genome.tax_id
        except FileNotFoundError:
            logger.info(f"Genome {genome_name} not installed locally")
            p = ProviderBase.create("Ensembl")
            for name, *_rest in p.search(genome_name):
                if name == genome_name:
                    logger.info(
                        f"It can be downloaded from Ensembl: genomepy install {name} Ensembl --annotation"
                    )
                    return None
            return None

    # search Ensembl by taxonomy_id or by specific Ensembl name (if we know it)
    p = ProviderBase.create("Ensembl")
    name, accession, species, tax_id, *rest = [row for row in p.search(search_term)][0]

    # Check if the assembly_id of the current Ensembl genome is the same as the
    # local genome. If it is identical, we can correctly assume that the genomes
    # sequences are identical.
    # For the genomes in the lookup table, we already know they match.
    if genome_name in common_names or accession == genome.assembly_accession:
        return name, accession, tax_id
    else:
        print(f"Could not find a matching genome in Ensembl")
        return None
github vanheeringen-lab / genomepy / genomepy / genome.py View on Github external
def _update_metadata(self, metadata):
        """
        Fill in metadata entries that are missing and can be looked up.

        Parameters
        ----------
        metadata : dict
            Metadata mapping, passed on to the ``_update_*`` helpers.
            Keys read here are "provider", "original name", "tax_id"
            and "assembly_accession".
        """
        print("Updating metadata in README.txt", file=sys.stderr)

        # An absent provider and the placeholder "na" are treated alike.
        if metadata.get("provider", "na") == "na":
            self._update_provider(metadata)

        # NOTE(review): assumes _update_provider has set metadata["provider"];
        # the lookup below raises KeyError otherwise.
        provider_is_known = metadata["provider"] in ("Ensembl", "UCSC", "NCBI")
        original_name = safe(metadata.get("original name", ""))
        missing_info = (
            "tax_id" not in metadata or "assembly_accession" not in metadata
        )

        # Only instantiate a provider when its genome record can actually
        # help to fill in one of the missing fields.
        provider = None
        genome_record = None
        if provider_is_known and original_name and missing_info:
            provider = ProviderBase.create(metadata["provider"])
            genome_record = provider.genomes.get(original_name)

        if "tax_id" not in metadata:
            self._update_tax_id(metadata, provider, genome_record)
        if "assembly_accession" not in metadata:
            self._update_assembly_accession(metadata, provider, genome_record)
github vanheeringen-lab / genomepy / genomepy / provider.py View on Github external
Parameters
        ----------
        name : str
            Genome name
        """
        ucsc_gene_url = f"http://hgdownload.cse.ucsc.edu/goldenPath/{name}/database/"
        annot_files = ["knownGene.txt.gz", "ensGene.txt.gz", "refGene.txt.gz"]

        for file in annot_files:
            link = ucsc_gene_url + file
            if check_url(link):
                return link


@register_provider("NCBI")
class NcbiProvider(ProviderBase):
    """
    NCBI genome provider.

    Uses the assembly reports page to search and list genomes.
    """

    # Location of the NCBI assembly reports directory.
    assembly_url = "https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/"

    def __init__(self):
        """Set the provider name, load the genomes and set the lookup fields."""
        # Necessary for bucketcache, otherwise methods with identical names
        # from different classes will use the same cache :-O!
        self.name = "NCBI"
        # Populate on init, so that methods can be cached
        # (_get_genomes is defined outside this excerpt).
        self.genomes = self._get_genomes()
        # Field names for accessions and taxonomy ids; presumably keys of
        # the per-genome records in self.genomes — confirm against their use.
        self.accession_fields = ["assembly_accession", "gbrs_paired_asm"]
        self.taxid_fields = ["species_taxid", "taxid"]
github vanheeringen-lab / genomepy / genomepy / provider.py View on Github external
for name in genomes:
                if self._search_taxids(genome=genomes[name], term=term):
                    yield self._genome_info_tuple(name)

        else:
            term = safe(term).lower()
            for name in genomes:
                if self._search_descriptions(genome=genomes[name], term=term):
                    yield self._genome_info_tuple(name)


# Module-level alias, so that provider classes can be registered with a
# plain ``@register_provider(...)`` decorator.
register_provider = ProviderBase.register_provider


@register_provider("Ensembl")
class EnsemblProvider(ProviderBase):
    """
    Ensembl genome provider.

    Will search both ensembl.org as well as ensemblgenomes.org.
    The bacteria division is not yet supported.
    """

    # Base URL of the Ensembl REST server.
    rest_url = "http://rest.ensembl.org/"

    def __init__(self):
        """Set the provider name, load the genomes and set the lookup fields."""
        # Necessary for bucketcache, otherwise methods with identical names
        # from different classes will use the same cache :-O!
        self.name = "Ensembl"
        # Populate on init, so that methods can be cached
        # (_get_genomes is defined outside this excerpt).
        self.genomes = self._get_genomes()
        # Field name for accessions; presumably a key of the per-genome
        # records in self.genomes — confirm against its use.
        self.accession_fields = ["assembly_accession"]
github vanheeringen-lab / genomepy / genomepy / provider.py View on Github external
base_url = ftp_site + "/release-{}/gtf/{}/{}.{}.{}.gtf.gz"
        safe_name = re.sub(r"\.p\d+$", "", name)
        link = base_url.format(
            version,
            genome["url_name"].lower(),
            genome["url_name"].capitalize(),
            safe_name,
            version,
        )

        if check_url(link):
            return link


@register_provider("UCSC")
class UcscProvider(ProviderBase):
    """
    UCSC genome provider.

    The UCSC API REST server is used to search and list genomes.
    """

    base_url = "http://hgdownload.soe.ucsc.edu/goldenPath"
    ucsc_url = base_url + "/{0}/bigZips/chromFa.tar.gz"
    ucsc_url_masked = base_url + "/{0}/bigZips/chromFaMasked.tar.gz"
    alt_ucsc_url = base_url + "/{0}/bigZips/{0}.fa.gz"
    alt_ucsc_url_masked = base_url + "/{0}/bigZips/{0}.fa.masked.gz"
    rest_url = "http://api.genome.ucsc.edu/list/ucscGenomes"

    def __init__(self):
        # Necessary for bucketcache, otherwise methods with identical names
        # from different classes will use the same cache :-O!
github vanheeringen-lab / genomepy / genomepy / functions.py View on Github external
def list_available_genomes(provider=None):
    """
    Iterate over all available genomes.

    Parameters
    ----------
    provider : str, optional
        Name of a single provider to list genomes from. When not
        specified, the genomes of all providers are listed.

    Yields
    ------
    list
        The provider name, followed by the fields of one genome row as
        reported by that provider.
    """
    # Without an explicit provider, fall back to every listed provider.
    provider_names = [provider] if provider else ProviderBase.list_providers()

    # Instantiate all providers up front, before the first row is produced.
    instances = [ProviderBase.create(name) for name in provider_names]

    for instance in instances:
        for row in instance.list_available_genomes():
            yield [instance.name, *row]