Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
'title': 'Max',
'description': 'The highest representation for any index.'
}
table_config = {
'namespace': 'interop',
'id': 'interop-indexmetrics-summary-table',
'table_title': 'Index Read Statistics Summary',
'col1_header': 'Run - Lane',
}
tdata = {}
for s_name in data:
for key in data[s_name]['summary']:
tdata["{} - {}".format(s_name,key)]=data[s_name]['summary'][key]
return table.plot(tdata, headers, table_config)
headers['percent_perfectIndex'] = {
'title': '% Perfect Index',
'description': 'Percent of reads with perfect index (0 mismatches)',
'max': 100,
'min': 0,
'scale': 'RdYlGn',
'suffix': '%'
}
table_config = {
'namespace': 'bcl2fastq',
'id': 'bcl2fastq-lane-stats-table',
'table_title': 'bcl2fastq Lane Statistics',
'col1_header': 'Run ID - Lane',
'no_beeswarm': True
}
return table.plot(self.bcl2fastq_bylane, headers, table_config)
self.write_data_file(self.sumstats_data, 'multiqc_stacks_sumstats')
### Write the sample table
config_table = {
'id': 'gstacks_table',
'namespace': 'stacks'
}
self.add_section (
name = 'Sample statistics',
anchor = 'stacks-gstacks',
description = 'The sample specific statistics for Stacks',
helptext = '''**Note!** The sample names have the following scheme ` | <input>`.
This data is obtained from the gstacks program run after building sample and catalog loci merge
paired-ends and call variants.
These numbers are obtained from the `gstacks.log.distribs` file''',
plot = table.plot(self.cov_data, self.gsheaders, config_table)
)
# Write population sumstats table
config_table = {
'id': 'sumstats_table',
'namespace': 'stacks'
}
self.add_section (
name = 'Population summary statistics',
anchor = 'stacks-sumstats',
description = 'Population statistics as calculated from variant sites found in this run',
helptext = '''**Note!** The sample names have the following scheme ` | `,
where the population ID is defined in the input population map file.
This information is obtained from the Stacks program `population` and the file populations.sumstats_summary.tsv
''',
plot = table.plot(self.sumstats_data, self.sheaders, config_table)
)
'suffix': '%',
'scale': 'RdYlGn'
}
table_config = {
'namespace': 'interop',
'id': 'interop-runmetrics-summary-table',
'table_title': 'Read metrics summary',
'col1_header': 'Run - Read',
}
tdata = {}
for s_name in data:
for key in data[s_name]['summary']:
tdata["{} - {}".format(s_name,key)]=data[s_name]['summary'][key]
return table.plot(tdata, headers, table_config)
'pct_Blacklisted': 100. * v['blacklisted'] / float(v['total']),
'pct_Below_MAPQ': 100. * v['mapq'] / float(v['total']),
'pct_Missing_Flags': 100. * v['required flags'] / float(v['total']),
'pct_Forbidden_Flags': 100. * v['excluded flags'] / float(v['total']),
'pct_deepTools_Dupes': 100. * v['internal dupes'] / float(v['total']),
'pct_Duplication': 100. * v['dupes'] / float(v['total']),
'pct_Singletons': 100. * v['singletons'] / float(v['total']),
'pct_Strand_Filtered': 100. * v['strand'] / float(v['total'])
}
config = {'namespace': 'deepTools bamPEFragmentSize'}
self.add_section(
name = "Filtering metrics",
anchor = "estimateReadFiltering",
description = "Estimated percentages of alignments filtered independently for each setting in `estimateReadFiltering`",
plot = table.plot(tdata, header, config)
)
return len(self.deeptools_estimateReadFiltering)
the data more carefully for the possibility of contamination.
* We recommend to check each lane for the possibility of sample swaps.
When `[CHIPMIX] ~ 1` AND `[FREEMIX] ~ 0`, then it is possible that the sample
is swapped with another sample. When `[CHIPMIX] ~ 0` in `.bestSM` file,
`[CHIP_ID]` might be actually the swapped sample. Otherwise, the swapped
sample may not exist in the genotype data you have compared.
* When genotype data is not available but allele-frequency-based estimates of
`[FREEMIX] >= 0.03` and `[FREELK1]-[FREELK0]` is large, then it is possible
that the sample is contaminated with other sample. We recommend to use
per-sample data rather than per-lane data for checking this for low coverage
data, because the inference will be more confident when there are large number
of bases with depth 2 or higher.
_Copied from the [VerifyBAMID documentation](https://genome.sph.umich.edu/wiki/VerifyBamID) - see the link for more details._
''',
plot = table.plot(self.verifybamid_data, headers, tconfig)
)
basecov[sid]['cpg_all'] = float(dd[1])/float(dd[0])*100.0
for sid, dd in mdata[5].items():
if sid not in basecov:
basecov[sid] = {}
basecov[sid]['cpg_uniq'] = float(dd[1])/float(dd[0])*100.0
# base coverage >=1x table
if len(basecov) == 0:
return
self.add_section(
name = 'Coverage by At Least One Read',
anchor = 'biscuit-coverage-base-table',
description = 'The fraction of genome/genomic CpGs covered by at least one read.',
plot = table.plot(basecov, {
'all':{'title':'Genome (All)','max':100,'min':0,'suffix':'%'},
'uniq':{'title':'Genome (Unique)','max':100,'min':0,'suffix':'%'},
'cpg_all':{'title':'CpG (All)','max':100,'min':0,'suffix':'%'},
'cpg_uniq':{'title':'CpG (Unique)','max':100,'min':0,'suffix':'%'},
})
plot = table.plot(self.cov_data, self.gsheaders, config_table)
)
# Write population sumstats table
config_table = {
'id': 'sumstats_table',
'namespace': 'stacks'
}
self.add_section (
name = 'Population summary statistics',
anchor = 'stacks-sumstats',
description = 'Population statistics as calculated from variant sites found in this run',
helptext = '''**Note!** The sample names have the following scheme ` | `,
where the population ID is defined in the input population map file.
This information is obtained from the Stacks program `population` and the file populations.sumstats_summary.tsv
''',
plot = table.plot(self.sumstats_data, self.sheaders, config_table)
)
config_distribs = {
'id': 'distribs_plot',
'namespace': 'stacks',
'tt_label': '{point.y} loci, {point.x} samples/SNPs',
'data_labels': [
{'name': 'Samples per loci', 'ylab': '# loci', 'xlab': '# samples'},
{'name': 'SNPs per loci', 'ylab': '# loci', 'xlab': '# SNPs'}
]
}
self.add_section (
name = 'Population plots',
anchor = 'stacks-distribs',
description = 'Plots showing, 1) the number of loci shared by number of samples and 2) the number of SNPs per sample',
helptext = '''The distributions are obtained from the Stacks program `populations` and its output file `populations.log.distribs`.
These numbers are Stacks' post-filtering.''',
'min': 0,
'format': '{:,.2f}',
'suffix': config.read_count_prefix,
'scale': 'OrRd',
'modify': lambda x: float(x) * config.read_count_multiplier,
}
pconfig = {
'id': 'slamdunk_filtering_table',
'min': 0,
}
self.add_section (
name = 'Filter statistics',
anchor = 'slamdunk_filtering',
description = 'This table shows the number of reads filtered with each filter criterion during filtering phase of slamdunk.',
plot = table.plot(self.slamdunk_data, headers, pconfig)
)
### Write the table
config_table = {
'id': 'longranger_table',
'namespace': 'longranger'
}
self.add_section (
name = 'Run stats',
anchor = 'longranger-run-stats',
description = 'Statistics gathered from Longranger reports. ' \
'There are more columns available but they are hidden by default.' + version_str,
helptext = '''Parses the files `summary.csv` and `_invocation` found in the
output directory of Longranger. If `_invocation` is not found
the sample IDs will be missing and they will be given a running
number. E.g., `longranger#1` and `longranger#2`.''',
plot = table.plot(self.longranger_data, self.headers, config_table)
)
### Bar plot of phasing stats
phase_pdata = {}
snps_phased_pct = {}
genes_phased_pct = {}
for s_name in self.longranger_data:
try:
phase_pdata[s_name] = {
'longest_phase_block': float(self.longranger_data[s_name]['longest_phase_block']),
'n50_phase_block': float(self.longranger_data[s_name]['n50_phase_block'])
}
except:
pass
try:
snps_phased_pct[s_name] = { 'snps_phased_pct': float(self.longranger_data[s_name]['snps_phased']) * 100.0 }