Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
rf.get_file(output_file)
class BedToolLinecache(BedTool):
    """Faster BedTool accessor by Ziga Avsec.

    A normal BedTool loops through the whole file to get the line of
    interest, hence the access is O(n). This subclass fetches the requested
    line through ``linecache`` instead.

    Note: this might load the whole bed file into memory.
    """

    def __getitem__(self, idx):
        """Return the interval stored on line ``idx`` (0-based) of ``self.fn``.

        Args:
            idx: integer line index, or a slice. Slices are handed to
                ``BedTool.__getitem__`` unchanged.

        Returns:
            The interval built from the tab-separated fields of the line.

        Raises:
            IndexError: if the file has no line at position ``idx``.
        """
        if isinstance(idx, slice):
            # `idx + 1` below is meaningless for slices; let the parent handle them.
            return super(BedToolLinecache, self).__getitem__(idx)

        # linecache counts lines from 1 and returns '' when the line is missing
        # (existing lines always keep their trailing newline).
        line = linecache.getline(self.fn, idx + 1)
        if not line:
            raise IndexError(
                "line index {0} is out of range for {1}".format(idx, self.fn))
        return pybedtools.create_interval_from_list(line.strip().split("\t"))
class SeqDataset(Dataset):
"""
Args:
intervals_file: bed3 file containing intervals
fasta_file: file path; Genome sequence
target_file: file path; path to the targets in the csv format
"""
SEQ_WIDTH = 1002
def __init__(self,
intervals_file,
fasta_file,
dnase_file,
use_linecache=True):
# intervals
import pybedtools
from pybedtools import BedTool
from genomelake.extractors import FastaExtractor
from kipoi.data import Dataset
from kipoi.metadata import GenomicRanges
import linecache
class BedToolLinecache(BedTool):
    """BedTool whose integer indexing reads a single line via ``linecache``."""

    def __getitem__(self, idx):
        """Return the interval found on the 0-based line ``idx`` of ``self.fn``.

        Slices are delegated to ``BedTool.__getitem__``.

        Raises:
            IndexError: if ``self.fn`` has no such line.
        """
        if isinstance(idx, slice):
            # Adding 1 to a slice would raise a TypeError; defer to the parent.
            return super(BedToolLinecache, self).__getitem__(idx)

        # linecache is 1-based and signals a missing line with an empty string.
        line = linecache.getline(self.fn, idx + 1)
        if not line:
            raise IndexError(
                "line index {0} is out of range for {1}".format(idx, self.fn))
        return pybedtools.create_interval_from_list(line.strip().split("\t"))
class SeqDataset(Dataset):
"""
Args:
intervals_file: bed3+1 file containing intervals+labels
fasta_file: file path; Genome sequence
"""
def __init__(self, intervals_file, fasta_file):
    """Store the FASTA path and open the interval file.

    Args:
        intervals_file: path handed to ``BedToolLinecache``.
        fasta_file: path to the genome sequence; only stored here.
    """
    # No extractor yet: __getitem__ checks for None before using it.
    self.fasta_extractor = None
    self.fasta_file = fasta_file
    self.bt = BedToolLinecache(intervals_file)
def __len__(self):
    """Return the number of entries reported by the interval accessor ``self.bt``."""
    n_intervals = len(self.bt)
    return n_intervals
def __getitem__(self, idx):
if self.fasta_extractor is None:
# --------------------------------------------
class BedToolLinecache(BedTool):
    """Fast BedTool accessor by Ziga Avsec.

    A normal BedTool loops through the whole file to get the line of
    interest, hence the access is O(n). Here the line is looked up with
    ``linecache`` instead.
    """

    def __getitem__(self, idx):
        """Return the interval on line ``idx`` (0-based) of ``self.fn``.

        Args:
            idx: integer line index, or a slice (forwarded to
                ``BedTool.__getitem__``).

        Raises:
            IndexError: if no line exists at position ``idx``.
        """
        if isinstance(idx, slice):
            # Slices cannot be mapped onto a single linecache lookup.
            return super(BedToolLinecache, self).__getitem__(idx)

        # linecache line numbers start at 1; '' means the line does not exist.
        line = linecache.getline(self.fn, idx + 1)
        if not line:
            raise IndexError(
                "line index {0} is out of range for {1}".format(idx, self.fn))
        return pybedtools.create_interval_from_list(line.strip().split("\t"))
class SeqDataset(Dataset):
    """
    Args:
        intervals_file: bed3 file containing intervals
        fasta_file: file path; Genome sequence
        target_file: file path; path to the targets in the csv format
        use_linecache: if True, read intervals through `BedToolLinecache`,
            otherwise through a plain `BedTool`
    """

    def __init__(self, intervals_file, fasta_file, target_file=None, use_linecache=False):
        # sequence source: only the path is kept, the extractor is
        # to be initialized later
        self.fasta_extractor = None
        self.fasta_file = fasta_file

        # intervals: choose the accessor class once, then instantiate it
        bed_reader = BedToolLinecache if use_linecache else BedTool
        self.bt = bed_reader(intervals_file)
import numpy as np
# --------------------------------------------
class BedToolLinecache(BedTool):
    """Fast BedTool accessor by Ziga Avsec.

    A normal BedTool loops through the whole file to get the line of
    interest, hence the access is O(n). This accessor asks ``linecache``
    for the line directly.
    """

    def __getitem__(self, idx):
        """Return the interval parsed from the 0-based line ``idx`` of ``self.fn``.

        Integer indices are served through ``linecache``; slices fall back to
        ``BedTool.__getitem__``.

        Raises:
            IndexError: if the requested line does not exist.
        """
        if isinstance(idx, slice):
            # The linecache shortcut only works for a single line number.
            return super(BedToolLinecache, self).__getitem__(idx)

        # getline() expects a 1-based line number and yields '' when it is absent.
        line = linecache.getline(self.fn, idx + 1)
        if not line:
            raise IndexError(
                "line index {0} is out of range for {1}".format(idx, self.fn))
        return pybedtools.create_interval_from_list(line.strip().split("\t"))
class SeqDataset(Dataset):
    """
    Args:
        intervals_file: bed3 file containing intervals
        fasta_file: file path; Genome sequence
        target_file: file path; path to the targets in the csv format
        use_linecache: if True, read intervals through `BedToolLinecache`,
            otherwise through a plain `BedTool`
    """
    SEQ_WIDTH = 1001

    def __init__(self, intervals_file, fasta_file, target_file=None, use_linecache=False):
        # intervals: choose the accessor class once, then instantiate it
        bed_reader = BedToolLinecache if use_linecache else BedTool
        self.bt = bed_reader(intervals_file)
@property
def seq(self):
    """Sequence stored on this object (backed by ``self._seq``)."""
    # The getter must be a property: `@seq.setter` below only exists on
    # property objects, a plain function would raise AttributeError there.
    return self._seq

@seq.setter
def seq(self, value):
    """Store ``value`` as the object's sequence."""
    self._seq = value
def get_seq(self, fasta):
    """Ask ``fasta`` for the sequence of this object's region.

    Args:
        fasta: object exposing ``get_seq(chrom, grange, strand)``.

    Returns:
        Whatever ``fasta.get_seq`` returns for ``self.chrom``,
        ``self.grange`` and ``self.strand``.
    """
    return fasta.get_seq(self.chrom, self.grange, self.strand)
@kipoi_dataloader()
class SplicingMaxEntDataset(Dataset):
"""
args:
MISO_AS:
doc: Whether the given annotation file is MISO alternative splicing annotation. default False.
fasta_file:
doc: Reference Genome sequence in fasta format
example:
md5: 936544855b253835442a0f253dd4b083
url: https://zenodo.org/record/1466099/files/3prime-example_files-hg19.chr22.fa?download=1
type: str
gtf_file:
doc: file path; Genome annotation GTF file
example:
md5: 174fd11303ae2c2369094bfcbe303c07
url: https://zenodo.org/record/1466099/files/3prime-example_files-hg19.chr22.gtf?download=1
label_col:
'fasta_file' : 'example_files/chr21.fa',
'num_chr_fasta' : True,
}
def parse_dtype(dtype):
    """Translate a dtype specification into a Python builtin type.

    Args:
        dtype: ``None``, one of the names ``'int'``, ``'string'``, ``'float'``
            or ``'bool'``, or one of the types ``int``, ``str``, ``float``
            or ``bool`` themselves.

    Returns:
        ``None`` when ``dtype`` is ``None``; ``dtype`` unchanged when it
        already is one of the supported types; otherwise the type registered
        under the given name.

    Raises:
        ValueError: if ``dtype`` is neither a supported type nor a known name.
            (``ValueError`` is an ``Exception`` subclass, so callers catching
            ``Exception`` keep working.)
    """
    dtypes = {'int': int, 'string': str, 'float': float, 'bool': bool}
    if dtype is None:
        return None
    if dtype in dtypes.values():
        return dtype
    try:
        known = dtype in dtypes
    except TypeError:
        # Unhashable values (e.g. a list) cannot be dict keys; report them
        # with the same informative error as any other unknown dtype.
        known = False
    if not known:
        raise ValueError("Datatype '{0}' not recognized. Allowed are: {1}".format(dtype, str(list(dtypes.keys()))))
    return dtypes[dtype]
class FastaBasedDataset(Dataset):
"""
Args:
intervals_file: bed3+ file containing intervals+labels
fasta_file: file path; Genome sequence
num_chr_fasta: if True, the tsv-loader will make sure that the chromosomes
don't start with chr
label_dtype: label data type
seq_len: required sequence length
use_strand: reverse-complement fasta sequence if bed file defines negative strand
force_upper: Force uppercase output of sequences
"""
output_schema = None
type = 'Dataset'
defined_as = 'kipoi_dataloaders.FastaBasedDataset'
info = None
args = OrderedDict()
return float(self.lines[idx].strip())
# File paths
# Relative paths to example inputs. The variable names match the constructor
# arguments of SeqDistDataset defined below.
intervals_file = "test_files/intervals.tsv"
target_file = "test_files/targets.tsv"
# SeqDistDataset loads this file with pd.read_pickle
gtf_file = "test_files/gencode_v25_chr22.gtf.pkl.gz"
fasta_file = "test_files/hg38_chr22.fa"
preproc_transformer = "extractor_files/encodeSplines.pkl"
# bt = pybedtools.BedTool(intervals_file)
# intervals = [i for i in bt[:10]]
# --------------------------------------------
class SeqDistDataset(Dataset):
"""
Args:
intervals_file: file path; tsv file
Assumes bed-like `chrom start end id score strand` format.
fasta_file: file path; Genome sequence
gtf_file: file path; Genome annotation GTF file pickled using pandas.
preproc_transformer: file path; transformer used for pre-processing.
target_file: file path; path to the targets
batch_size: int
"""
def __init__(self, intervals_file, fasta_file, gtf_file, preproc_transformer, target_file=None):
gtf = pd.read_pickle(gtf_file)
self.gtf = gtf[gtf["info"].str.contains('gene_type "protein_coding"')]
# distance transformer
if use_strand and interval.strand == "-":
dist = - dist
return dist[np.argmin(np.abs(dist))]
out[:] = np.array([[find_closest(self.landmarks[ldm_name], interval, self.use_strand)
for ldm_name in self.columns]
for interval in intervals], dtype=float)
return out
def _get_output_shape(self, num_intervals, width):
    """Return the output shape as ``(num_intervals, number of columns)``.

    The column count is ``len(self.columns)``; ``width`` is accepted but
    does not influence the result.
    """
    n_columns = len(self.columns)
    return (num_intervals, n_columns)
class TxtDataset(Dataset):
    """Dataset that exposes every line of a text file as one integer."""

    def __init__(self, path):
        """Read all lines of the file at ``path`` into ``self.lines``."""
        with open(path, "r") as handle:
            self.lines = handle.readlines()

    def __len__(self):
        """Return the number of lines that were read."""
        n_lines = len(self.lines)
        return n_lines

    def __getitem__(self, idx):
        """Return line ``idx`` converted to ``int`` after stripping whitespace."""
        raw_line = self.lines[idx]
        return int(raw_line.strip())
# --------------------------------------------
class SeqDistDataset(Dataset):
"""
Args:
if self.ignore_targets or self.n_tasks == 0:
labels = {}
else:
labels = row.iloc[self.bed_columns:].values.astype(
self.label_dtype)
return interval, labels
def __len__(self):
    """Return the length of ``self.df``."""
    n_rows = len(self.df)
    return n_rows
def get_targets(self):
    """Return the label columns of ``self.df`` as an array.

    Takes every column from position ``self.bed_columns`` onwards and casts
    the values to ``self.label_dtype``.
    """
    label_frame = self.df.iloc[:, self.bed_columns:]
    return label_frame.values.astype(self.label_dtype)
@kipoi_dataloader(override={"dependencies": deps, 'info.authors': package_authors})
class StringSeqIntervalDl(Dataset):
"""
info:
doc: >
Dataloader for a combination of fasta and tab-delimited input files such as bed files. The dataloader extracts
regions from the fasta file as defined in the tab-delimited `intervals_file`. Returned sequences are of the type
np.array([str]).
args:
intervals_file:
doc: bed3+ file path containing intervals + (optionally) labels
example:
url: https://raw.githubusercontent.com/kipoi/kipoiseq/master/tests/data/intervals_51bp.tsv
md5: a76e47b3df87fd514860cf27fdc10eb4
fasta_file:
doc: Reference genome FASTA file path.
example:
url: https://raw.githubusercontent.com/kipoi/kipoiseq/master/tests/data/hg38_chr22_32000000_32300000.fa
@property
def seq(self):
    """Sequence held by this object; reads the private ``_seq`` attribute."""
    return self._seq

@seq.setter
def seq(self, value):
    """Replace the stored sequence by writing ``value`` to ``_seq``."""
    self._seq = value
def get_seq(self, fasta):
    """Fetch this object's sequence from ``fasta``.

    Args:
        fasta: object exposing ``get_seq(chrom, grange, strand)``.

    Returns:
        The result of ``fasta.get_seq`` called with ``self.chrom``,
        ``self.grange`` and ``self.strand``.
    """
    region = (self.chrom, self.grange, self.strand)
    return fasta.get_seq(*region)
class SplicingKmerDataset(Dataset):
"""
Args:
gtf_file: gtf file. Can be downloaded from MISO or ensembl.
fasta_file: file path; Genome sequence
target_file: file path; path to the targets in MISO summary format.
overhang: length of overhang.
MISO_AS: whether the used annotation file is from MISO alternative splicing annotation.
"""
def __init__(self,
gtf_file,
fasta_file,
overhang=80,
MISO_AS=False): # intron + ~ bp exon from both side
self.genes = loadgene(gtf_file)
self.fasta_file = fasta_file