Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
#assert gps.transcripts.isin(dfp.index).all()
transcript_id = 'ENST00000485079'
div3_error = 0
seq_mismatch_err = 0
err_transcripts = []
for transcript_id in tqdm(gps.transcripts):
# make sure all ids can be found in the proteome
dna_seq = gps.get_seq(transcript_id)
# dna_seq = dna_seq[:(len(dna_seq) // 3) * 3]
if len(dna_seq) % 3 != 0:
div3_error += 1
print("len(dna_seq) % 3 != 0: {}".format(transcript_id))
err_transcripts.append({"transcript_id": transcript_id, "div3_err": True})
continue
prot_seq = translate(dna_seq)
if dfp.loc[transcript_id].seq != prot_seq:
seq_mismatch_err += 1
print("seq.mismatch: {}".format(transcript_id))
n_mismatch = 0
for i in range(len(prot_seq)):
a = dfp.loc[transcript_id].seq[i]
b = prot_seq[i]
if a != b:
n_mismatch += 1
print("{} {} {}/{}".format(a,b,i,len(prot_seq)))
err_transcripts.append({"transcript_id": transcript_id, "div3_err": False,
"n-seq-mismatch": n_mismatch})
# print("prot:", dfp.loc[transcript_id].seq)
# print("seq: ", prot_seq)
err_transcripts = pd.DataFrame(err_transcripts)
# err_cds.to_csv("data/protein/err_cds.csv")
vcf_file = 'tests/data/singleSeq_vcf_ENST000000381176.vcf.gz'
protein_file = 'tests/data/demo_proteins.pep.all.fa'
txt_file = 'tests/data/dna_seq_ENST00000381176.txt'
'Interval.strand = "-"'
transcript_id = 'ENST00000381176'
f = open(txt_file)
control_seq_list = []
for seq in f:
control_seq_list.append(seq.replace('\n', ''))
vs = SingleSeqAminoAcidVCFSeqExtractor(fasta_file, vcf_file, gtf_file)
test_dna_seq = vs.extract(transcript_id)
ref_dna_seq = translate(cut_seq(rc_dna("".join(control_seq_list))))
assert test_dna_seq == ref_dna_seq, "Seq mismatch for Interval.strand = -"
def extract(self, transcript_id, sample_id=None):
intervals, strand = self.genome_cds_fetcher.get_cds_exons(transcript_id)
intervals = self.strand_default(intervals)
variant_interval_queryable = self.multi_sample_VCF.query_variants(intervals, sample_id=sample_id)
for seq in self.extract_sinlge(variant_interval_queryable, intervals=intervals):
seq = self.genome_cds_fetcher.prepare_seq(seq, strand)
yield translate(seq)
vcf_file = 'tests/data/singleVar_vcf_ENST000000381176.vcf.gz'
protein_file = 'tests/data/demo_proteins.pep.all.fa'
txt_file = 'tests/data/dna_seq_ENST00000319363.txt'
'Interval.strand = "+"'
transcript_id = 'ENST00000319363'
f = open(txt_file)
control_seq_list = []
for seq in f:
control_seq_list.append(seq.replace('\n', ''))
vs = SingleSeqAminoAcidVCFSeqExtractor(fasta_file, vcf_file, gtf_file)
test_dna_seq = vs.extract(transcript_id)
assert test_dna_seq == translate(cut_seq("".join(control_seq_list))), "Seq mismatch for Interval.strand = +"
def extract(self, transcript_id, sample_id=None):
intervals, strand = self.genome_cds_fetcher.get_cds_exons(transcript_id)
intervals = self.strand_default(intervals)
variant_interval_queryable = self.multi_sample_VCF.query_variants(intervals, sample_id=sample_id)
seq = self.extract_multiple(variant_interval_queryable)
seq = self.genome_cds_fetcher.prepare_seq(seq, strand)
return translate(seq)
def get_protein_seq(self, transcript_id: str):
"""
Extract amino acid sequence for given transcript_id
:param transcript_id:
:return: amino acid sequence
"""
return translate(self.get_seq(transcript_id), hg38=True)
def _prepare_seq(seqs: List[str], strand: str, tag: str):
"""
Prepare the dna sequence and translate it into amino acid sequence
:param seqs: current dna sequence
:param strand: dna strand, where the gene is located
:param tag: tags, which contain information about ambiguous start/end
:return: amino acid sequence
"""
return translate(TranscriptSeqExtractor._prepare_seq(seqs, strand, tag), True)