Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
"""Dataloader
"""
import pickle
import warnings
import gffutils
from pyfaidx import Fasta
from concise.preprocessing import encodeDNA
from kipoi.data import SampleIterator
from kipoi.metadata import GenomicRanges
from mmsplice.vcf_dataloader import ExonInterval
class ExonDataLoader5(SampleIterator):
"""
Load genome annotation (gtf) file along with a vcf file, return wt sequence and mut sequence.
Args:
gtf_file: gtf file or pickled gtf IntervalTree.
fasta_file: file path; Genome sequence
vcf_file: file path; vcf file with variants to score
"""
def __init__(self,
gtf_file,
fasta_file,
split_seq=True,
encode=True,
exon_cut_l=0,
exon_cut_r=0,
acceptor_intron_cut=6,
"""Dataloader
"""
import pickle
import warnings
import gffutils
from pyfaidx import Fasta
from concise.preprocessing import encodeDNA
from kipoi.data import SampleIterator
from kipoi.metadata import GenomicRanges
from mmsplice.vcf_dataloader import ExonInterval
class IntronDataLoader(SampleIterator):
"""
Load genome annotation (gtf) file along with a vcf file, return wt sequence and mut sequence.
Args:
gtf_file: gtf file or pickled gtf IntervalTree.
fasta_file: file path; Genome sequence
vcf_file: file path; vcf file with variants to score
"""
def __init__(self,
gtf_file,
fasta_file,
split_seq=True,
encode=True,
exon_cut_l=0,
exon_cut_r=0,
acceptor_intron_cut=6,
from kipoiseq.extractors import SingleVariantProteinVCFSeqExtractor, \
TranscriptSeqExtractor
from kipoi.data import SampleIterator
__all__ = [
'SingleVariantProteinDataLoader'
]
class SingleVariantProteinDataLoader(SampleIterator):
    """Sample iterator built on the kipoiseq protein/transcript extractors.

    On construction it creates a ``SingleVariantProteinVCFSeqExtractor``
    (from the GTF, FASTA and VCF files) and a ``TranscriptSeqExtractor``
    (from the GTF and FASTA files), then derives a metadata table from the
    transcript extractor's CDS table.

    Args:
        gtf_file: forwarded to both extractors.
        fasta_file: forwarded to both extractors.
        vcf_file: forwarded to the protein VCF extractor only.
    """

    def __init__(self, gtf_file, fasta_file, vcf_file):
        self.protein_vcf_extractor = SingleVariantProteinVCFSeqExtractor(
            gtf_file, fasta_file, vcf_file)
        self.transcript_extractor = TranscriptSeqExtractor(
            gtf_file, fasta_file)

        cds_table = self.transcript_extractor.cds_fetcher.cds

        # Metadata: keep the first row per index label and drop the
        # 'Start' / 'End' columns, which are not needed downstream.
        is_first_occurrence = ~cds_table.index.duplicated(keep='first')
        deduplicated = cds_table.loc[is_first_occurrence]
        self.metadatas = deduplicated.drop(columns=['Start', 'End'])

        # Generator for all sequences with variants.
        # NOTE(review): `_extractor` is not part of the visible excerpt;
        # confirm it is defined on this class.
        self.sequences = self._extractor()

    def __iter__(self):
        """Return the loader itself as the iterator object."""
        return self
"""
import numpy as np
from kipoi.data import SampleIterator
import pickle
from pyfaidx import Fasta
import gffutils
from concise.preprocessing import encodeDNA
import warnings
from kipoi.metadata import GenomicRanges
from mmsplice import MMSplice
from mmsplice.vcf_dataloader import ExonInterval
class IntronDataLoader(SampleIterator):
"""
Load genome annotation (gtf) file along with a vcf file, return wt sequence and mut sequence.
Args:
gtf_file: gtf file or pickled gtf IntervalTree.
fasta_file: file path; Genome sequence
vcf_file: file path; vcf file with variants to score
"""
def __init__(self,
gtf_file,
fasta_file,
split_seq=True,
encode=True,
exon_cut_l=0,
exon_cut_r=0,
acceptor_intron_cut=6,
for exon in gtf_db.children(gene, featuretype='exon'):
isLast = False # track whether is last exon
if firstLastNoExtend:
if (gene.strand == "+" and exon.end == gene.end) or (gene.strand == "-" and exon.start == gene.start):
overhang = (overhang[0], 0)
isLast = True
elif (gene.strand == "+" and exon.start == gene.start) or (gene.strand == "-" and exon.end == gene.end):
overhang = (0, overhang[1])
exon = ExonInterval.from_feature(exon, overhang)
exon.isLast = isLast
overhang = default_overhang
yield exon
@kipoi_dataloader(override={"dependencies": deps, 'info.authors': package_authors})
class MMSpliceDl(SampleIterator):
"""
info:
doc: >
Dataloader for splicing models. With inputs as gtf annotation file and fasta file,
each output is an exon sequence with flanking intronic sequences. Intronic sequence
lengths are specified by the user. Returned sequences are of the type np.array([str])
args:
gtf_file:
doc: file path; Genome annotation GTF file
example:
url: https://raw.githubusercontent.com/kipoi/models/master/MMSplice/tests/data/test.gtf
md5: b20607afe91ec20d6ee79ed95ab0e85b
fasta_file:
doc: Reference Genome sequence in fasta format
example:
url: https://raw.githubusercontent.com/kipoi/models/master/MMSplice/tests/data/hg19.nochr.chr17.fa
"""Dataloader
"""
import pickle
import warnings
import gffutils
from pyfaidx import Fasta
from concise.preprocessing import encodeDNA
from kipoi.metadata import GenomicRanges
from kipoi.data import SampleIterator
from mmsplice.vcf_dataloader import ExonInterval
class ExonDataLoader(SampleIterator):
"""
Load genome annotation (gtf) file along with a vcf file, return wt sequence and mut sequence.
Args:
gtf_file: gtf file or pickled gtf IntervalTree.
fasta_file: file path; Genome sequence
vcf_file: file path; vcf file with variants to score
"""
def __init__(self,
gtf_file,
fasta_file,
split_seq=True,
encode=True,
exon_cut_l=0,
exon_cut_r=0,
acceptor_intron_cut=6,
"""Dataloader
"""
import pickle
import warnings
import gffutils
from pyfaidx import Fasta
from concise.preprocessing import encodeDNA
from kipoi.metadata import GenomicRanges
from kipoi.data import SampleIterator
from mmsplice.vcf_dataloader import ExonInterval
class IntronDataLoader(SampleIterator):
"""
Load genome annotation (gtf) file along with a vcf file, return wt sequence and mut sequence.
Args:
gtf_file: gtf file or pickled gtf IntervalTree.
fasta_file: file path; Genome sequence
vcf_file: file path; vcf file with variants to score
"""
def __init__(self,
gtf_file,
fasta_file,
split_seq=True,
encode=True,
exon_cut_l=0,
exon_cut_r=0,
acceptor_intron_cut=6,
"""Dataloader
"""
import pickle
import warnings
import gffutils
from pyfaidx import Fasta
from concise.preprocessing import encodeDNA
from kipoi.data import SampleIterator
from kipoi.metadata import GenomicRanges
from mmsplice.vcf_dataloader import ExonInterval
class IntronDataLoader(SampleIterator):
"""
Load genome annotation (gtf) file along with a vcf file, return wt sequence and mut sequence.
Args:
gtf_file: gtf file or pickled gtf IntervalTree.
fasta_file: file path; Genome sequence
vcf_file: file path; vcf file with variants to score
"""
def __init__(self,
gtf_file,
fasta_file,
split_seq=True,
encode=True,
exon_cut_l=0,
exon_cut_r=0,
acceptor_intron_cut=6,