Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_CDSFetcher__read_cds():
    """Check the row count and selected coordinates of the CDS table
    returned by ``CDSFetcher._read_cds`` for ``gtf_file``."""
    cds_table = CDSFetcher._read_cds(gtf_file, duplicate_attr=True)
    assert len(cds_table) == 7

    # First record: chromosome and coordinates.
    first_row = cds_table.iloc[0]
    assert first_row.Chromosome == '22'
    assert first_row.Start == 598
    assert first_row.End == 3050

    # Fourth record: coordinates only.
    fourth_row = cds_table.iloc[3]
    assert fourth_row.Start == 3
    assert fourth_row.End == 300
def cds_fetcher():
    """Build a ``CDSFetcher`` from the ``gtf_file`` defined outside this
    function and return it."""
    fetcher = CDSFetcher(gtf_file)
    return fetcher
def _get_cds_from_gtf(df):
    """
    Select the protein-coding CDS/CCDS rows whose tag mentions
    "basic" or "CCDS".

    :param df: annotation table providing the `Feature`, `tag` and
        `transcript_id` columns, plus whatever field
        `CDSFetcher._get_biotype_str` returns for it.
    :return: the selected rows, indexed by `transcript_id`.
    """
    biotype_field = CDSFetcher._get_biotype_str(df)

    # Narrow step by step: biotype first, then feature type.
    coding_rows = df.query("{} == 'protein_coding'".format(biotype_field))
    cds_rows = coding_rows.query("(Feature == 'CDS') | (Feature == 'CCDS')")

    # grch37 has CCDS entries without tags; drop them before matching so
    # the pattern search below only sees non-null tags.
    tagged_rows = cds_rows[cds_rows['tag'].notna()]
    tag_matches = tagged_rows["tag"].str.contains("basic|CCDS")
    return tagged_rows[tag_matches].set_index('transcript_id')
def _read_cds(gtf_file, duplicate_attr=False):
    """
    Load a GTF file and reduce it to its valid CDS records.

    :param gtf_file: GTF input handed to `pyranges.read_gtf`.
    :param duplicate_attr: forwarded unchanged to `pyranges.read_gtf`.
    :return: the output of `CDSFetcher._filter_valid_transcripts` applied
        to the rows selected by `CDSFetcher._get_cds_from_gtf`.
    """
    annotation = pyranges.read_gtf(
        gtf_file, as_df=True, duplicate_attr=duplicate_attr)
    candidate_cds = CDSFetcher._get_cds_from_gtf(annotation)
    return CDSFetcher._filter_valid_transcripts(candidate_cds)
def _read_cds(gtf_file, duplicate_attr=False):
    """
    Load a GTF file and reduce it to its valid CDS records.

    :param gtf_file: GTF input handed to `pyranges.read_gtf`.
    :param duplicate_attr: forwarded unchanged to `pyranges.read_gtf`.
    :return: the output of `CDSFetcher._filter_valid_transcripts` applied
        to the rows selected by `CDSFetcher._get_cds_from_gtf`.
    """
    annotation = pyranges.read_gtf(
        gtf_file, as_df=True, duplicate_attr=duplicate_attr)
    candidate_cds = CDSFetcher._get_cds_from_gtf(annotation)
    return CDSFetcher._filter_valid_transcripts(candidate_cds)
def __init__(self, gtf_file, fasta_file):
    """
    Store both input paths as strings and build the sequence and CDS
    helpers from them.

    :param gtf_file: GTF input; stored as `str(gtf_file)` and passed to
        `CDSFetcher`.
    :param fasta_file: FASTA input; stored as `str(fasta_file)` and passed
        to `FastaStringExtractor` with `use_strand=False`.
    """
    fasta_path = str(fasta_file)
    gtf_path = str(gtf_file)
    self.fasta_file = fasta_path
    self.gtf_file = gtf_path

    # The extractor is created before the fetcher, as in the original order.
    self.fasta = FastaStringExtractor(fasta_path, use_strand=False)
    fetcher = CDSFetcher(gtf_path)
    self.cds_fetcher = fetcher
    # Expose the fetcher's transcripts directly on this object.
    self.transcripts = fetcher.transcripts
def __init__(self, gtf_file, fasta_file, vcf_file):
    """
    Store the three input paths as strings and build the CDS, variant
    and sequence helpers from them.

    :param gtf_file: GTF input; stored as `str(gtf_file)` and passed to
        `CDSFetcher`.
    :param fasta_file: FASTA input; stored as `str(fasta_file)` and passed
        to `FastaStringExtractor` and `VariantSeqExtractor`.
    :param vcf_file: VCF input; stored as `str(vcf_file)` and passed to
        `SingleVariantMatcher` and `MultiSampleVCF`.
    """
    gtf_path = str(gtf_file)
    fasta_path = str(fasta_file)
    vcf_path = str(vcf_file)
    self.gtf_file = gtf_path
    self.fasta_file = fasta_path
    self.vcf_file = vcf_path

    fetcher = CDSFetcher(gtf_path)
    self.cds_fetcher = fetcher

    # Convert the CDS dataframe (index moved back into columns) to PyRanges
    # so the matcher can pair variants with transcript ids.
    cds_ranges = pyranges.PyRanges(fetcher.cds.reset_index())
    self.single_variant_matcher = SingleVariantMatcher(
        vcf_path, pranges=cds_ranges)

    self.fasta = FastaStringExtractor(fasta_path)
    self.multi_sample_VCF = MultiSampleVCF(vcf_path)
    self.variant_seq_extractor = VariantSeqExtractor(fasta_path)