Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
Parameters
----------
name : str
NCBI genome name
localname : str
Custom name for your genome
out_dir : str
Output directory
mask : str , optional
masking level: soft/hard/none, default=soft
"""
# Create mapping of accessions to names
genome = self.genomes[safe(name)]
url = genome["ftp_path"]
url += f"/{url.split('/')[-1]}_assembly_report.txt"
url = url.replace("ftp://", "https://")
tr = {}
urlcleanup()
with urlopen(url) as response:
for line in response.read().decode("utf-8").splitlines():
if line.startswith("#"):
continue
vals = line.strip().split("\t")
tr[vals[6]] = vals[0]
# mask sequence if required
if mask == "soft":
def get_annotation_download_link(self, name, **kwargs):
    """
    Build the link to the NCBI annotation file and return it if it checks out.

    The link is the genome's "ftp_path" (with ftp:// replaced by https://),
    followed by "/<last path component>_genomic.gff.gz".

    Parameters
    ----------
    name : str
        Genome name
    **kwargs
        Accepted, but not used by this method.

    Returns
    -------
    str or None
        The link if check_url() accepts it, otherwise None.
    """
    entry = self.genomes[safe(name)]
    base = entry["ftp_path"].replace("ftp://", "https://")
    # the annotation file is named after the last component of the path
    basename = base.split("/")[-1]
    link = f"{base}/{basename}_genomic.gff.gz"
    if not check_url(link):
        return None
    return link
def _get_genomes(self):
sys.stderr.write("Downloading assembly summaries from Ensembl\n")
genomes = {}
divisions = self._request_json("info/divisions?")
for division in divisions:
if division == "EnsemblBacteria":
continue
division_genomes = self._request_json(
"info/genomes/division/{}?".format(division)
)
for genome in division_genomes:
genomes[safe(genome["assembly_name"])] = genome
return genomes
regex : str , optional
Regular expression to select specific chromosome / scaffold names.
invert_match : bool , optional
Set to True to select all chromosomes that don't match the regex.
bgzip : bool , optional
If set to True the genome FASTA file will be compressed using bgzip.
If not specified, the setting from the configuration file will be used.
"""
self.check_name(name)
link = self.get_genome_download_link(name, mask=mask, **kwargs)
original_name = name
name = safe(name)
localname = get_localname(name, localname)
genomes_dir = os.path.expanduser(genomes_dir)
out_dir = os.path.join(genomes_dir, localname)
if not os.path.exists(out_dir):
mkdir_p(out_dir)
sys.stderr.write(f"Downloading genome from {link}...\n")
# download to tmp dir. Move genome on completion.
# tmp dir is in genome_dir to prevent moving the genome between disks
with TemporaryDirectory(dir=out_dir) as tmp_dir:
fname = os.path.join(tmp_dir, f"{localname}.fa")
# actual download
urlcleanup()
def check_name(self, name):
"""check if genome name can be found for provider"""
if self.name == "URL":
return
if not safe(name) in self.genomes:
raise GenomeDownloadError(
f"Could not download genome {name} from {self.name}.\n\n"
"Check for typos or try\n"
def get_url(level="toplevel"):
    """
    Compose the URL of the gzipped genome FASTA for the given sequence level.

    Relies on `mask`, `url` and `genome` from the enclosing scope.

    Parameters
    ----------
    level : str , optional
        Level inserted into the file name, default=toplevel

    Returns
    -------
    str
        <url>/<Url_name>.<assembly name>.<mask pattern>.fa.gz
    """
    mask_patterns = {"soft": "dna_sm.{}", "hard": "dna_rm.{}", "none": "dna.{}"}
    # an unknown mask raises a KeyError here
    sequence_type = mask_patterns[mask].format(level)

    species = genome["url_name"].capitalize()
    # drop a trailing ".p<digits>" from the assembly name
    # (presumably the patch version -- confirm against the Ensembl naming scheme)
    assembly = re.sub(r"\.p\d+$", "", safe(genome["assembly_name"]))

    return "{}/{}.{}.{}.fa.gz".format(url, species, assembly, sequence_type)
def _search_descriptions(self, genome, term):
"""check if search term corresponds to the provider's description field(s)"""
for field in self.description_fields:
if term in safe(genome[field].lower()):
return True
def _update_metadata(self, metadata):
    """
    Check for missing metadata entries and hand them to the update helpers.

    Steps, in order:
    1. if "provider" is absent or "na", call self._update_provider(metadata)
    2. if the provider is Ensembl, UCSC or NCBI, an "original name" is present
       and "tax_id" or "assembly_accession" is missing, create that provider
       and look the genome up by its safe() name
    3. call self._update_tax_id / self._update_assembly_accession for each of
       those keys that is still missing, passing the provider and genome
       (both None if step 2 was skipped)

    Parameters
    ----------
    metadata : dict
        Genome metadata; passed on to the helper methods
        (presumably updated in place by them -- confirm in the helpers).
    """
    print("Updating metadata in README.txt", file=sys.stderr)

    if metadata.get("provider", "na") == "na":
        self._update_provider(metadata)

    provider_name = metadata["provider"]
    known_provider = provider_name in ("Ensembl", "UCSC", "NCBI")
    name = safe(metadata.get("original name", ""))
    complete = all(key in metadata for key in ("tax_id", "assembly_accession"))

    # the provider is only created when there is something to look up
    p, genome = None, None
    if known_provider and name and not complete:
        p = ProviderBase.create(provider_name)
        genome = p.genomes.get(name)

    # each key is re-checked right before its own helper is called
    if "tax_id" not in metadata:
        self._update_tax_id(metadata, p, genome)
    if "assembly_accession" not in metadata:
        self._update_assembly_accession(metadata, p, genome)
Yields
------
tuples with name and metadata
"""
term = str(term)
genomes = self.genomes
if safe(term) in genomes:
yield self._genome_info_tuple(term)
elif is_number(term):
for name in genomes:
if self._search_taxids(genome=genomes[name], term=term):
yield self._genome_info_tuple(name)
else:
term = safe(term).lower()
for name in genomes:
if self._search_descriptions(genome=genomes[name], term=term):
yield self._genome_info_tuple(name)