How to use the kipoiseq.extractors.protein.CDSFetcher class in kipoiseq

To help you get started, we’ve selected a few kipoiseq examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github kipoi / kipoiseq / tests / extractors / test_protein.py View on Github external
def test_CDSFetcher__read_cds():
    """Check the row count and selected coordinates from ``CDSFetcher._read_cds``."""
    cds = CDSFetcher._read_cds(gtf_file, duplicate_attr=True)
    row_count = cds.shape[0]
    assert row_count == 7

    # First row of the filtered CDS table.
    first_row = cds.iloc[0]
    assert first_row.Chromosome == '22'
    assert first_row.Start == 598
    assert first_row.End == 3050

    # Fourth row of the filtered CDS table.
    fourth_row = cds.iloc[3]
    assert fourth_row.Start == 3
    assert fourth_row.End == 300
github kipoi / kipoiseq / kipoiseq / extractors / protein.py View on Github external
def _get_cds_from_gtf(df):
        """
        Select the valid cds rows of a gtf DataFrame.

        A row is kept when its biotype column (the column name is looked up
        with ``CDSFetcher._get_biotype_str``) equals ``protein_coding``, its
        ``Feature`` is ``CDS`` or ``CCDS``, and its ``tag`` is present and
        matches ``basic`` or ``CCDS``.

        :param df: DataFrame of gtf records; must provide the biotype column,
            ``Feature``, ``tag`` and ``transcript_id``.
        :return: the selected rows, indexed by ``transcript_id``.
        """
        biotype_column = CDSFetcher._get_biotype_str(df)

        protein_coding = df.query(
            "{} == 'protein_coding'".format(biotype_column))
        coding_rows = protein_coding.query(
            "(Feature == 'CDS') | (Feature == 'CCDS')")

        # grch37 have ccds without tags; drop them before the string match
        tagged = coding_rows[coding_rows['tag'].notna()]
        is_basic_or_ccds = tagged["tag"].str.contains("basic|CCDS")
        return tagged[is_basic_or_ccds].set_index('transcript_id')
github kipoi / kipoiseq / kipoiseq / extractors / protein.py View on Github external
def _read_cds(gtf_file, duplicate_attr=False):
        """
        Load a gtf file and return its valid, filtered cds records.

        :param gtf_file: gtf file handed to ``pyranges.read_gtf``.
        :param duplicate_attr: forwarded unchanged to ``pyranges.read_gtf``.
        :return: the cds rows picked by ``CDSFetcher._get_cds_from_gtf`` after
            ``CDSFetcher._filter_valid_transcripts`` has been applied.
        """
        gtf_records = pyranges.read_gtf(
            gtf_file, as_df=True, duplicate_attr=duplicate_attr)
        return CDSFetcher._filter_valid_transcripts(
            CDSFetcher._get_cds_from_gtf(gtf_records))
github kipoi / kipoiseq / kipoiseq / extractors / protein.py View on Github external
def _read_cds(gtf_file, duplicate_attr=False):
        """
        Read a gtf file, extract its cds rows and keep the valid transcripts.

        :param gtf_file: gtf file passed to ``pyranges.read_gtf``.
        :param duplicate_attr: passed through to ``pyranges.read_gtf``.
        :return: output of ``CDSFetcher._filter_valid_transcripts`` for the
            cds rows extracted from the file.
        """
        # Step 1: parse the gtf into a plain DataFrame (as_df=True).
        parsed = pyranges.read_gtf(gtf_file,
                                   as_df=True,
                                   duplicate_attr=duplicate_attr)
        # Step 2: keep the cds rows only.
        cds_rows = CDSFetcher._get_cds_from_gtf(parsed)
        # Step 3: keep the valid transcripts only.
        return CDSFetcher._filter_valid_transcripts(cds_rows)
github kipoi / kipoiseq / kipoiseq / extractors / protein.py View on Github external
def __init__(self, gtf_file, fasta_file):
        """
        Set up the fasta extractor and the cds fetcher for the given files.

        :param gtf_file: gtf file; stored as a string and given to ``CDSFetcher``.
        :param fasta_file: fasta file; stored as a string and given to
            ``FastaStringExtractor`` (with ``use_strand=False``).
        """
        fasta_path = str(fasta_file)
        self.fasta_file = fasta_path
        gtf_path = str(gtf_file)
        self.gtf_file = gtf_path

        self.fasta = FastaStringExtractor(fasta_path, use_strand=False)

        fetcher = CDSFetcher(gtf_path)
        self.cds_fetcher = fetcher
        # Expose the fetcher's transcripts directly on this object.
        self.transcripts = fetcher.transcripts
github kipoi / kipoiseq / kipoiseq / extractors / protein.py View on Github external
def __init__(self, gtf_file, fasta_file, vcf_file):
        """
        Set up the cds fetcher, variant matcher and sequence extractors.

        :param gtf_file: gtf file; stored as a string and given to ``CDSFetcher``.
        :param fasta_file: fasta file; stored as a string and given to
            ``FastaStringExtractor`` and ``VariantSeqExtractor``.
        :param vcf_file: vcf file; stored as a string and given to
            ``SingleVariantMatcher`` and ``MultiSampleVCF``.
        """
        gtf_path = str(gtf_file)
        self.gtf_file = gtf_path
        fasta_path = str(fasta_file)
        self.fasta_file = fasta_path
        vcf_path = str(vcf_file)
        self.vcf_file = vcf_path

        fetcher = CDSFetcher(gtf_path)
        self.cds_fetcher = fetcher

        # The matcher takes a PyRanges object, so convert the cds DataFrame
        # (index reset back to a column) before handing it over.
        cds_ranges = pyranges.PyRanges(fetcher.cds.reset_index())
        # match variant with transcript_id
        self.single_variant_matcher = SingleVariantMatcher(
            vcf_path, pranges=cds_ranges)

        self.fasta = FastaStringExtractor(fasta_path)
        self.multi_sample_VCF = MultiSampleVCF(vcf_path)
        self.variant_seq_extractor = VariantSeqExtractor(fasta_path)