Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
mask : str , optional
Masking level. Options: soft, hard or none. Default is soft.
Returns
------
str with the http/ftp download link.
"""
genome = self.genomes[safe(name)]
# only soft masked genomes available. can be (un)masked in _post_process_download
link = genome["ftp_path"]
link = link.replace("ftp://", "https://")
link += "/" + link.split("/")[-1] + "_genomic.fna.gz"
if check_url(link):
return link
raise GenomeDownloadError(
f"Could not download genome {name} from {self.name}.\n"
"URL is broken. Select another genome or provider.\n"
def get_annotation_download_link(self, name, **kwargs):
    """
    Build the link to the NCBI annotation (GFF) file and test it.

    Parameters
    ----------
    name : str
        Genome name

    Returns
    -------
    str or None
        The https link to the gzipped GFF file when check_url accepts it,
        otherwise None.
    """
    entry = self.genomes[safe(name)]
    # request the file over https rather than ftp
    base_url = entry["ftp_path"].replace("ftp://", "https://")
    # the file name is prefixed with the last component of the directory path
    prefix = base_url.split("/")[-1]
    candidate = base_url + "/" + prefix + "_genomic.gff.gz"
    if not check_url(candidate):
        return None
    return candidate
name + ".gff3",
name + ".gff3.gz",
)
):
fname = split
break
else:
raise FileNotFoundError(
"Could not parse the remote directory. "
"Please supply a URL using --url-to-annotation.\n"
)
# set variables for downloading
link = urldir + "/" + fname
if check_url(link):
return link
"""
Parse and test the link to the UCSC annotation file.
Will check UCSC, Ensembl and RefSeq annotation, respectively.
Parameters
----------
name : str
Genome name
"""
ucsc_gene_url = f"http://hgdownload.cse.ucsc.edu/goldenPath/{name}/database/"
annot_files = ["knownGene.txt.gz", "ensGene.txt.gz", "refGene.txt.gz"]
for file in annot_files:
link = ucsc_gene_url + file
if check_url(link):
return link
def get_annotation_download_link(self, name, **kwargs):
"""
Check if the linked annotation file is of a supported file type
and test the link.

The link is read from the ``to_annotation`` keyword argument.
Accepted extensions are .gtf, .gff, .gff3 and .bed.

Parameters
----------
name : str
Genome name (not used in the visible body)
**kwargs
``to_annotation`` : link to the annotation file, optional

Returns
-------
The link, if check_url accepts it.

Raises
------
TypeError
If the extension reported by get_file_info is not one of the accepted ones.
"""
# NOTE(review): the indentation of this block was lost, so it is unclear
# whether the final check_url test is nested under `if link:` - confirm.
link = kwargs.get("to_annotation")
if link:
# presumably the first element of get_file_info is the file extension
# (without a compression suffix) - TODO confirm against get_file_info
ext = get_file_info(link)[0]
if ext not in [".gtf", ".gff", ".gff3", ".bed"]:
raise TypeError(
"Only (gzipped) gtf, gff and bed files are supported.\n"
)
# only hand back the link when check_url accepts it
if check_url(link):
return link
mask : str , optional
Masking level. Options: soft, hard or none. Default is soft.
Returns
------
str with the http/ftp download link.
"""
# soft masked genomes. can be unmasked in _post_process_download
urls = [self.ucsc_url, self.alt_ucsc_url]
if mask == "hard":
urls = [self.ucsc_url_masked, self.alt_ucsc_url_masked]
for genome_url in urls:
link = genome_url.format(name)
if check_url(link):
return link
raise GenomeDownloadError(
f"Could not download genome {name} from {self.name}.\n"
"URLs are broken. Select another genome or provider.\n"
asm_url = "{}/{}.{}.{}.fa.gz".format(
url,
genome["url_name"].capitalize(),
re.sub(r"\.p\d+$", "", safe(genome["assembly_name"])),
pattern,
)
return asm_url
# try to get the (much smaller) primary assembly,
# unless specified otherwise
link = get_url("primary_assembly")
if kwargs.get("toplevel") or not check_url(link):
link = get_url()
if check_url(link):
return link
raise GenomeDownloadError(
f"Could not download genome {name} from {self.name}.\n"
"URL is broken. Select another genome or provider.\n"
if division != "vertebrates":
ftp_site += f"/{division}"
# Get the GTF URL
base_url = ftp_site + "/release-{}/gtf/{}/{}.{}.{}.gtf.gz"
safe_name = re.sub(r"\.p\d+$", "", name)
link = base_url.format(
version,
genome["url_name"].lower(),
genome["url_name"].capitalize(),
safe_name,
version,
)
if check_url(link):
return link