Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_no_fasta_files():
    """Constructing the "empty" test genome must raise FileNotFoundError."""
    name, genome_dir = "empty", "tests/data/genome"
    with pytest.raises(FileNotFoundError):
        genomepy.Genome(name, genome_dir)
def test_url_genome():
    """Test URL.

    Download the ce11 (C. elegans) genome directly from a UCSC URL and
    retrieve a specific sequence.
    """
    tmp = mkdtemp()
    try:
        genomepy.install_genome(
            "http://hgdownload.soe.ucsc.edu/goldenPath/ce11/bigZips/chromFa.tar.gz",
            "url",
            genome_dir=tmp,
            localname="url_test",
        )
        g = genomepy.Genome("url_test", genome_dir=tmp)
        assert str(g["chrI"][:12]).lower() == "gcctaagcctaa"
    finally:
        # Always remove the download directory, even when the install or the
        # assertion fails, so a failing run does not leave the genome behind.
        shutil.rmtree(tmp)
def test_no_fasta_files():
    """Check that Genome() fails with FileNotFoundError for the "empty" genome."""
    genome_args = ("empty", "tests/data/genome")
    with pytest.raises(FileNotFoundError):
        genomepy.Genome(*genome_args)
Returns
-------
pandas.DataFrame or dict
Chromosome mapping.
"""
if fmt.lower() not in ["dataframe", "dict"]:
raise ValueError("Invalid format, should be 'dataframe' or 'dict'")
logger.info("Loading chromosome mapping.")
if to.startswith("GCA"):
if provider is None:
raise ValueError("Need a provider: NCBI, UCSC or Ensembl")
asm_acc = to
else:
try:
genome = Genome(to)
logger.info("Using local genome information")
asm_acc = genome.assembly_accession
if provider is None:
provider = genome.provider
except Exception:
logger.info("Searching remote genome information")
result = [row for row in search(to, provider=provider)]
if len(result) > 1:
p = [row[1].decode() for row in result]
raise ValueError(
f"More than one result, need one of these providers: {', '.join(p)}"
)
if provider is None:
provider = result[0][1].decode()
asm_acc = result[0][2].decode()
motif_digest = self.checksum.get(motif_file, None)
# determine which regions are not in the cache
scan_regions = regions
if self.use_cache:
scan_regions = []
for region in regions:
key = str((region, genome, motif_digest, nreport, scan_rc))
ret = self.cache.get(key)
if ret == NO_VALUE:
scan_regions.append(region)
# scan the regions that are not in the cache
if len(scan_regions) > 0:
g = Genome(genome)
motifs = [(m, self.threshold[m.id]) for m in read_motifs(self.motifs)]
scan_func = partial(
scan_region_mult,
genome=g,
motifs=motifs,
nreport=nreport,
scan_rc=scan_rc,
)
for region, ret in self._scan_jobs(scan_func, scan_regions):
# return values or store values in cache
if self.use_cache:
# store values in cache
key = str(
(
are: ensembl.gene, entrezgene, symbol, name, refseq. Note that
refseq will return the protein refseq_id by default, use `product="rna"` to
return the RNA refseq_id. Currently, mapping to Ensembl transcript ids is
not supported.
product : str, optional
Either "protein" or "rna". Only used when `gene_field="refseq"`
Returns
-------
pandas.DataFrame with gene annotation.
"""
if product not in ["rna", "protein"]:
raise ValueError(f"Argument product should be either 'rna' or 'protein'")
g = Genome(genome)
for anno_file in [f"{genome}.annotation.bed.gz", f"{genome}.annotation.bed"]:
bed = os.path.join(os.path.dirname(g.filename), anno_file)
if os.path.exists(bed):
break
else:
bed = None
if bed is None:
logger.info(f"No annotation file found for genome {genome}")
return
bed12_fields = [
"chrom",
"start",
"end",
"name",
def __init__(self, matchfile, genome="hg19", number=None, size=None):
    """Initialize this Fasta from sequences selected by `matched_gc_bedfile`.

    A temporary BED file is written by `matched_gc_bedfile`, converted to a
    temporary FASTA file with `Genome.track2fasta`, and that FASTA file is
    then used to initialize the `Fasta` base class.

    Parameters
    ----------
    matchfile
        Passed through to `matched_gc_bedfile` as the file to match against.
    genome : str, optional
        Genome name given to `matched_gc_bedfile` and `Genome`.
    number : optional
        Passed through to `matched_gc_bedfile`.
    size : optional
        Passed through to `matched_gc_bedfile` as `size`.
    """
    # Only the generated path names are kept; the NamedTemporaryFile objects
    # themselves are discarded right away.
    tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
    tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

    try:
        # Create bed-file with coordinates of random sequences
        matched_gc_bedfile(tmpbed, matchfile, genome, number, size=size)

        # Convert track to fasta
        Genome(genome).track2fasta(tmpbed, fastafile=tmpfasta)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Delete the temporary files even when one of the steps above raised;
        # a file that was never created is simply skipped so the original
        # error is not masked by a FileNotFoundError.
        for path in (tmpbed, tmpfasta):
            if os.path.exists(path):
                os.remove(path)
def create_random_genomic_bedfile(out, genome, size, n):
    """Write the regions from `Genome.get_random_sequences` to a BED file.

    Parameters
    ----------
    out : str
        Path of the file to write; it is opened in "w" mode, so existing
        content is overwritten.
    genome : str
        Genome name passed to `Genome`.
    size
        Passed to `get_random_sequences` as its second argument.
    n
        Passed to `get_random_sequences` as its first argument.

    Each (chrom, start, end) item is written as one tab-separated line.
    """
    features = Genome(genome).get_random_sequences(n, size)

    # Write result to bedfile; the context manager flushes and closes the
    # handle, also when a write fails part-way through.
    with open(out, "w") as bed:
        for chrom, start, end in features:
            bed.write("%s\t%d\t%d\n" % (chrom, start, end))
def set_genome(self, genome):
    """
    Store the genome that will be used for:
        - converting regions to sequences
        - background for MOODS

    A falsy `genome` leaves the object untouched.
    """
    if genome:
        # Constructing a Genome acts as validation: it raises an error if
        # its checks fail, in which case self.genome is not updated.
        Genome(genome)
        self.genome = genome