Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_ensemblgenomes_genome_download_links():
    """Smoke-test Ensembl download links for a couple of genomes.

    The genomes checked here are hosted on ftp.ensemblgenomes.org. The test
    makes no assertion on the returned link; it only requires that looking
    the link up does not raise.
    """
    ensembl = genomepy.provider.ProviderBase.create("Ensembl")
    genome_names = ("Amel_4.5", "WBcel235")
    for genome_name in genome_names:
        ensembl.get_genome_download_link(genome_name)
Parameters
----------
name : str
Genome name
"""
genome = self.genomes[safe(name)]
link = genome["ftp_path"]
link = link.replace("ftp://", "https://")
link += "/" + link.split("/")[-1] + "_genomic.gff.gz"
if check_url(link):
return link
@register_provider("URL")
class UrlProvider(ProviderBase):
    """
    URL genome provider.

    Simply download a genome directly through an url.
    """

    def __init__(self):
        # The genome table starts out empty for this provider.
        self.genomes = {}
        self.name = "URL"

    def genome_taxid(self, genome):
        """Return the placeholder "na"; the `genome` argument is not used."""
        return "na"

    def assembly_accession(self, genome):
        """Return the placeholder "na"; the `genome` argument is not used."""
        return "na"
try:
genome = Genome(genome_name)
search_term = genome.tax_id
except FileNotFoundError:
logger.info(f"Genome {genome_name} not installed locally")
p = ProviderBase.create("Ensembl")
for name, *_rest in p.search(genome_name):
if name == genome_name:
logger.info(
f"It can be downloaded from Ensembl: genomepy install {name} Ensembl --annotation"
)
return None
return None
# search Ensembl by taxonomy_id or by specific Ensembl name (if we know it)
p = ProviderBase.create("Ensembl")
name, accession, species, tax_id, *rest = [row for row in p.search(search_term)][0]
# Check if the assembly_id of the current Ensembl genome is the same as the
# local genome. If it is identical, we can correctly assume that the genomes
# sequences are identical.
# For the genomes in the lookup table, we already know they match.
if genome_name in common_names or accession == genome.assembly_accession:
return name, accession, tax_id
else:
print(f"Could not find a matching genome in Ensembl")
return None
def _update_metadata(self, metadata):
    """
    Fill in metadata entries that are still missing.

    Parameters
    ----------
    metadata : dict
        Metadata mapping; it is handed to the `_update_*` helpers, which
        receive the same object.
    """
    print("Updating metadata in README.txt", file=sys.stderr)

    # An absent provider and the placeholder "na" are treated alike.
    if metadata.get("provider", "na") == "na":
        self._update_provider(metadata)

    provider_name = metadata["provider"]
    known_provider = provider_name in ("Ensembl", "UCSC", "NCBI")
    name = safe(metadata.get("original name", ""))
    missing_info = not all(
        key in metadata for key in ("tax_id", "assembly_accession")
    )

    # Only build a provider object when it can actually be used for a lookup;
    # otherwise the helpers below receive None for both arguments.
    p = None
    genome = None
    if known_provider and name and missing_info:
        p = ProviderBase.create(provider_name)
        genome = p.genomes.get(name)

    if "tax_id" not in metadata:
        self._update_tax_id(metadata, p, genome)
    if "assembly_accession" not in metadata:
        self._update_assembly_accession(metadata, p, genome)
Parameters
----------
name : str
Genome name
"""
ucsc_gene_url = f"http://hgdownload.cse.ucsc.edu/goldenPath/{name}/database/"
annot_files = ["knownGene.txt.gz", "ensGene.txt.gz", "refGene.txt.gz"]
for file in annot_files:
link = ucsc_gene_url + file
if check_url(link):
return link
@register_provider("NCBI")
class NcbiProvider(ProviderBase):
    """
    NCBI genome provider.

    Uses the assembly reports page to search and list genomes.
    """

    # Location of the NCBI assembly reports.
    assembly_url = "https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/"

    def __init__(self):
        # The name must be set first: bucketcache needs it, otherwise
        # identically named methods of different classes would end up
        # sharing one cache.
        self.name = "NCBI"

        # Load the genomes at construction time, so that methods can be cached.
        self.genomes = self._get_genomes()

        # Names of the fields holding accession / taxonomy id values.
        self.accession_fields = ["assembly_accession", "gbrs_paired_asm"]
        self.taxid_fields = ["species_taxid", "taxid"]
for name in genomes:
if self._search_taxids(genome=genomes[name], term=term):
yield self._genome_info_tuple(name)
else:
term = safe(term).lower()
for name in genomes:
if self._search_descriptions(genome=genomes[name], term=term):
yield self._genome_info_tuple(name)
# Module-level alias for ProviderBase.register_provider, used as the
# @register_provider("...") class decorator on the provider classes.
register_provider = ProviderBase.register_provider
@register_provider("Ensembl")
class EnsemblProvider(ProviderBase):
    """
    Ensembl genome provider.

    Will search both ensembl.org as well as ensemblgenomes.org.
    The bacteria division is not yet supported.
    """

    # Base address of the Ensembl REST server.
    rest_url = "http://rest.ensembl.org/"

    def __init__(self):
        # The name must be set first: bucketcache needs it, otherwise
        # identically named methods of different classes would end up
        # sharing one cache.
        self.name = "Ensembl"

        # Load the genomes at construction time, so that methods can be cached.
        self.genomes = self._get_genomes()

        # Name of the field holding the assembly accession.
        self.accession_fields = ["assembly_accession"]
base_url = ftp_site + "/release-{}/gtf/{}/{}.{}.{}.gtf.gz"
safe_name = re.sub(r"\.p\d+$", "", name)
link = base_url.format(
version,
genome["url_name"].lower(),
genome["url_name"].capitalize(),
safe_name,
version,
)
if check_url(link):
return link
@register_provider("UCSC")
class UcscProvider(ProviderBase):
"""
UCSC genome provider.
The UCSC API REST server is used to search and list genomes.
"""
base_url = "http://hgdownload.soe.ucsc.edu/goldenPath"
ucsc_url = base_url + "/{0}/bigZips/chromFa.tar.gz"
ucsc_url_masked = base_url + "/{0}/bigZips/chromFaMasked.tar.gz"
alt_ucsc_url = base_url + "/{0}/bigZips/{0}.fa.gz"
alt_ucsc_url_masked = base_url + "/{0}/bigZips/{0}.fa.masked.gz"
rest_url = "http://api.genome.ucsc.edu/list/ucscGenomes"
def __init__(self):
# Necessary for bucketcache, otherwise methods with identical names
# from different classes will use the same cache :-O!
def list_available_genomes(provider=None):
    """
    Iterate over all available genomes.

    Parameters
    ----------
    provider : str, optional
        Restrict the listing to this provider. When not specified, the
        genomes of every registered provider are listed.

    Yields
    ------
    list
        The provider name followed by the fields of one row from that
        provider's own ``list_available_genomes()``.
    """
    # Without an explicit provider, fall back to every known provider.
    provider_names = [provider] if provider else ProviderBase.list_providers()

    # Instantiate all providers up front, before the first row is produced.
    sources = [ProviderBase.create(provider_name) for provider_name in provider_names]

    for source in sources:
        for row in source.list_available_genomes():
            yield [source.name, *row]