Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_ncbi_genome_download_links(masking):
    """Resolve NCBI download links for a handful of assemblies.

    The ``masking`` value is forwarded as ``mask``; per the original test
    notes it should be ignored by the NCBI provider.

    These genomes are hosted on ftp://ftp.ncbi.nlm.nih.gov.
    """
    ncbi = genomepy.provider.ProviderBase.create("NCBI")
    assemblies = ("Charlie1.0", "GRCh38.p13")
    for assembly in assemblies:
        # The call itself is the check: no assertion is made on the result.
        ncbi.get_genome_download_link(assembly, mask=masking)
def test_ensemblgenomes_annotation(localname=None):
    """Test Ensembl annotation

    This annotation is hosted on ftp.ensemblgenomes.org.

    The annotation is downloaded into a fresh temporary directory, after
    which the resulting gzipped GTF and BED files are validated. The
    temporary directory is removed even when the download or a validation
    step raises, so failed runs do not leave files behind.

    Parameters
    ----------
    localname : str, optional
        Custom name passed to the provider and used to locate the output
        files. When None, the name is derived via
        ``genomepy.utils.get_localname``.
    """
    tmp = mkdtemp()
    try:
        p = genomepy.provider.ProviderBase.create("Ensembl")
        for name, version in [("TAIR10", 45)]:
            p.download_annotation(name, tmp, localname=localname, version=version)

            # Keep the resolved name in its own variable: rebinding the
            # ``localname`` parameter would leak the first genome's name into
            # every later iteration.
            resolved_name = genomepy.utils.get_localname(name, localname)
            out_dir = os.path.join(tmp, resolved_name)

            gtf = os.path.join(out_dir, resolved_name + ".annotation.gtf.gz")
            validate_gzipped_gtf(gtf)

            bed = os.path.join(out_dir, resolved_name + ".annotation.bed.gz")
            validate_gzipped_bed(bed)
    finally:
        # mkdtemp() leaves deletion to the caller; always clean up.
        shutil.rmtree(tmp)
# Fast lookup for some common queries
common_names = {
"danRer11": "GRCz11",
"hg38": "GRCh38",
"mm10": "GRCm38",
"dm6": "BDGP6.28",
}
if genome_name in common_names:
search_term = common_names[genome_name]
else:
try:
genome = Genome(genome_name)
search_term = genome.tax_id
except FileNotFoundError:
logger.info(f"Genome {genome_name} not installed locally")
p = ProviderBase.create("Ensembl")
for name, *_rest in p.search(genome_name):
if name == genome_name:
logger.info(
f"It can be downloaded from Ensembl: genomepy install {name} Ensembl --annotation"
)
return None
return None
# search Ensembl by taxonomy_id or by specific Ensembl name (if we know it)
p = ProviderBase.create("Ensembl")
name, accession, species, tax_id, *rest = [row for row in p.search(search_term)][0]
# Check if the assembly_id of the current Ensembl genome is the same as that of
# the local genome. If they are identical, we can safely assume that the genome
# sequences are identical as well.
# For the genomes in the lookup table, we already know they match.
to_annotation : text , optional
URL only: direct link to annotation file.
Required if this is not the same directory as the fasta.
"""
genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
localname = get_localname(name, localname)
out_dir = os.path.join(genomes_dir, localname)
# Check if genome already exists, or if downloading is forced
genome_found = (
len([f for f in glob_ext_files(out_dir) if f"{localname}.fa" in f]) >= 1
)
if (not genome_found or force) and not only_annotation:
# Download genome from provider
p = ProviderBase.create(provider)
p.download_genome(
name,
genomes_dir,
mask=mask,
regex=regex,
invert_match=invert_match,
localname=localname,
bgzip=bgzip,
**kwargs,
)
genome_found = True
# Export installed genome(s)
generate_env()
# Generates a Fasta object, index, gaps and sizes file
term : str
Search term, case-insensitive.
provider : str , optional
Provider name
Yields
------
tuple
genome information (name/identifier and description)
"""
if provider:
providers = [ProviderBase.create(provider)]
else:
# if provider is not specified search all providers
providers = [ProviderBase.create(p) for p in ProviderBase.list_providers()]
for p in providers:
for row in p.search(term):
yield [
x.encode("latin-1") for x in list(row[:1]) + [p.name] + list(row[1:])
]