How to use the genomepy.utils.safe function in genomepy

To help you get started, we’ve selected a few genomepy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github vanheeringen-lab / genomepy / genomepy / provider.py View on Github external
Parameters
        ----------
        name : str
            NCBI genome name

        localname : str
            Custom name for your genome

        out_dir : str
            Output directory

        mask : str , optional
            masking level: soft/hard/none, default=soft
        """
        # Create mapping of accessions to names
        genome = self.genomes[safe(name)]
        url = genome["ftp_path"]
        url += f"/{url.split('/')[-1]}_assembly_report.txt"
        url = url.replace("ftp://", "https://")

        tr = {}
        urlcleanup()
        with urlopen(url) as response:
            for line in response.read().decode("utf-8").splitlines():
                if line.startswith("#"):
                    continue
                vals = line.strip().split("\t")
                tr[vals[6]] = vals[0]

        # mask sequence if required
        if mask == "soft":
github vanheeringen-lab / genomepy / genomepy / provider.py View on Github external
def get_annotation_download_link(self, name, **kwargs):
        """
        Build the link to the NCBI annotation (GFF) file and test it.

        The link is derived from the genome's "ftp_path" entry: the ftp://
        scheme is swapped for https:// and
        "<last path component>_genomic.gff.gz" is appended.

        Parameters
        ----------
        name : str
            Genome name
        **kwargs
            Accepted but not used in this method.

        Returns
        -------
        str or None
            The link if check_url() reports it as valid, otherwise None.
        """
        base = self.genomes[safe(name)]["ftp_path"].replace("ftp://", "https://")
        # the file name is prefixed with the last component of the directory path
        prefix = base.split("/")[-1]
        candidate = f"{base}/{prefix}_genomic.gff.gz"

        return candidate if check_url(candidate) else None
github vanheeringen-lab / genomepy / genomepy / provider.py View on Github external
def _get_genomes(self):
        """
        Collect the genomes of every Ensembl division except EnsemblBacteria.

        Divisions are requested from "info/divisions?" and the genomes of each
        division from "info/genomes/division/<division>?", both through
        self._request_json(). A progress message is written to stderr first.

        Returns
        -------
        dict
            Genome entries keyed by safe(entry["assembly_name"]). If two
            entries map to the same key, the one seen last is kept.
        """
        sys.stderr.write("Downloading assembly summaries from Ensembl\n")

        endpoint = "info/genomes/division/{}?"
        by_assembly = {}
        for division in self._request_json("info/divisions?"):
            # the bacterial division is deliberately left out
            if division != "EnsemblBacteria":
                entries = self._request_json(endpoint.format(division))
                for entry in entries:
                    by_assembly[safe(entry["assembly_name"])] = entry
        return by_assembly
github vanheeringen-lab / genomepy / genomepy / provider.py View on Github external
regex : str , optional
            Regular expression to select specific chromosome / scaffold names.

        invert_match : bool , optional
            Set to True to select all chromosomes that don't match the regex.

        bgzip : bool , optional
            If set to True the genome FASTA file will be compressed using bgzip.
            If not specified, the setting from the configuration file will be used.
        """
        self.check_name(name)

        link = self.get_genome_download_link(name, mask=mask, **kwargs)

        original_name = name
        name = safe(name)
        localname = get_localname(name, localname)

        genomes_dir = os.path.expanduser(genomes_dir)
        out_dir = os.path.join(genomes_dir, localname)
        if not os.path.exists(out_dir):
            mkdir_p(out_dir)

        sys.stderr.write(f"Downloading genome from {link}...\n")

        # download to tmp dir. Move genome on completion.
        # tmp dir is in genome_dir to prevent moving the genome between disks
        with TemporaryDirectory(dir=out_dir) as tmp_dir:
            fname = os.path.join(tmp_dir, f"{localname}.fa")

            # actual download
            urlcleanup()
github vanheeringen-lab / genomepy / genomepy / provider.py View on Github external
def check_name(self, name):
        """check if genome name can be found for provider"""
        if self.name == "URL":
            return

        if not safe(name) in self.genomes:
            raise GenomeDownloadError(
                f"Could not download genome {name} from {self.name}.\n\n"
                "Check for typos or try\n"
github vanheeringen-lab / genomepy / genomepy / provider.py View on Github external
def get_url(level="toplevel"):
            """
            Assemble the URL of the gzipped genome FASTA for an assembly level.

            `mask`, `url` and `genome` are not parameters: they are free
            variables resolved outside this function (the defining scope is
            not shown here).

            Parameters
            ----------
            level : str , optional
                Inserted into the file-name pattern, default=toplevel

            Returns
            -------
            str
                "<url>/<Url_name>.<assembly>.<pattern>.fa.gz"

            Raises
            ------
            KeyError
                If `mask` is not one of soft/hard/none.
            """
            templates = {"soft": "dna_sm.{}", "hard": "dna_rm.{}", "none": "dna.{}"}
            suffix = templates[mask].format(level)

            species = genome["url_name"].capitalize()
            # drop a trailing patch suffix (".p<digits>") from the assembly name
            assembly = re.sub(r"\.p\d+$", "", safe(genome["assembly_name"]))

            return "{}/{}.{}.{}.fa.gz".format(url, species, assembly, suffix)
github vanheeringen-lab / genomepy / genomepy / provider.py View on Github external
def _search_descriptions(self, genome, term):
        """
        Check if the search term occurs in any of the provider's description fields.

        Each field named in self.description_fields is read from `genome`,
        lowercased and passed through safe() before the substring test.

        Returns
        -------
        True on the first matching field; None (implicitly) when no field
        matches.
        """
        matched = any(
            term in safe(genome[field].lower()) for field in self.description_fields
        )
        if matched:
            return True
github vanheeringen-lab / genomepy / genomepy / genome.py View on Github external
def _update_metadata(self, metadata):
        """
        Fill in metadata entries that are missing and can be looked up.

        Steps, in order:
        1. announce the update on stderr;
        2. if "provider" is absent or "na", call self._update_provider();
        3. if the provider is Ensembl, UCSC or NCBI, an "original name" is
           available, and "tax_id" or "assembly_accession" is missing, create
           the provider and look up the genome entry by its safe() name;
        4. call self._update_tax_id() / self._update_assembly_accession() for
           whichever of those keys is still absent, passing the provider and
           genome entry (both None when step 3 did not apply).

        Parameters
        ----------
        metadata : dict
            Metadata to inspect; passed on to the update helpers.
        """
        print("Updating metadata in README.txt", file=sys.stderr)
        if metadata.get("provider", "na") == "na":
            self._update_provider(metadata)

        provider_is_known = metadata["provider"] in ("Ensembl", "UCSC", "NCBI")
        original_name = safe(metadata.get("original name", ""))
        info_is_missing = (
            "tax_id" not in metadata or "assembly_accession" not in metadata
        )

        provider = None
        entry = None
        if provider_is_known and original_name and info_is_missing:
            provider = ProviderBase.create(metadata["provider"])
            entry = provider.genomes.get(original_name)

        # each key is re-checked right before its helper runs
        if "tax_id" not in metadata:
            self._update_tax_id(metadata, provider, entry)
        if "assembly_accession" not in metadata:
            self._update_assembly_accession(metadata, provider, entry)
github vanheeringen-lab / genomepy / genomepy / provider.py View on Github external
Yields
        ------
        tuples with name and metadata
        """
        term = str(term)
        genomes = self.genomes
        if safe(term) in genomes:
            yield self._genome_info_tuple(term)

        elif is_number(term):
            for name in genomes:
                if self._search_taxids(genome=genomes[name], term=term):
                    yield self._genome_info_tuple(name)

        else:
            term = safe(term).lower()
            for name in genomes:
                if self._search_descriptions(genome=genomes[name], term=term):
                    yield self._genome_info_tuple(name)