How to use the pyensembl.normalization.normalize_strand function in pyensembl

To help you get started, we’ve selected a few pyensembl examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github openvax / pyensembl / pyensembl / gtf.py View on Github external
def dataframe(
            self,
            contig=None,
            feature=None,
            strand=None,
            save_to_disk=False):
        """
        Load genome entries as a DataFrame, optionally restricted to
        particular contig or feature type.
        """
        if contig:
            contig = normalize_chromosome(contig)

        if strand:
            strand = normalize_strand(strand)

        if feature is not None:
            require_string(feature, "feature")

        key = (contig, feature, strand)

        if key not in self._dataframes:
            def _construct_df():
                full_df = self._load_full_dataframe_cached()

                assert len(full_df) > 0, \
                    "Dataframe representation of genomic database empty!"

                # rename since we're going to be filtering the entries but
                # may still want to access the full dataset
                df = full_df
github openvax / pyensembl / pyensembl / gtf.py View on Github external
def _load_full_dataframe_from_gtf(self):
        """
        Parse this genome source's GTF file and load it as a Pandas DataFrame
        """
        print("Reading GTF from %s" % self.gtf_path)
        df = read_gtf_as_dataframe(
            self.gtf_path,
            column_converters={
                "seqname": normalize_chromosome,
                "strand": normalize_strand,
            },
            infer_biotype_column=True)

        features = set(df["feature"])
        column_names = set(df.keys())

        # older Ensembl releases don't have "gene" or "transcript"
        # features, so fill in those rows if they're missing
        if "gene" not in features:
            # if we have to reconstruct gene feature rows then
            # fill in values for 'gene_name' and 'gene_biotype'
            # but only if they're actually present in the GTF
            df = create_missing_features(
                dataframe=df,
                unique_keys={"gene": "gene_id"},
                extra_columns={
github openvax / pyensembl / pyensembl / locus.py View on Github external
"""
        contig : str
            Chromosome or other sequence name in the reference assembly

        start : int
            Start position of locus on the contig

        end : int
            Inclusive end position on the contig

        strand : str
            Should we read the locus forwards ('+') or backwards ('-')?
        """

        self.contig = normalize_chromosome(contig)
        self.strand = normalize_strand(strand)

        start = int(start)
        end = int(end)

        if start == 0:
            raise ValueError("Expected start > 0 (using base 1 coordinates)")
        elif end == 0:
            raise ValueError("Expected end > 0 (using base 1 coordinates)")

        if end < start:
            raise ValueError(
                "Expected start <= end, got start = %d, end = %d" % (
                    start, end))
        self.start = start
        self.end = end
github openvax / pyensembl / pyensembl / locus.py View on Github external
def on_strand(self, strand):
        """
        Check whether this object's strand matches the given strand.

        The argument is passed through `normalize_strand` before being
        compared against `self.strand`, so any representation that
        `normalize_strand` accepts may be supplied.

        Returns the result of the equality comparison.
        """
        requested_strand = normalize_strand(strand)
        return requested_strand == self.strand
github openvax / pyensembl / pyensembl / database.py View on Github external
def _load_gtf_as_dataframe(self, usecols=None, features=None):
        """
        Parse this genome source's GTF file and load it as a Pandas DataFrame
        """
        logger.info("Reading GTF from %s", self.gtf_path)
        df = read_gtf(
            self.gtf_path,
            column_converters={
                "seqname": normalize_chromosome,
                "strand": normalize_strand,
            },
            infer_biotype_column=True,
            usecols=usecols,
            features=features)

        column_names = set(df.keys())
        expect_gene_feature = features is None or "gene" in features
        expect_transcript_feature = features is None or "transcript" in features
        observed_features = set(df["feature"])

        # older Ensembl releases don't have "gene" or "transcript"
        # features, so fill in those rows if they're missing
        if expect_gene_feature and "gene" not in observed_features:
            # if we have to reconstruct gene feature rows then
            # fill in values for 'gene_name' and 'gene_biotype'
            # but only if they're actually present in the GTF
github openvax / pyensembl / pyensembl / database.py View on Github external
only on the feature type.
        """
        query = """
            SELECT %s%s
            FROM %s
            WHERE 1=1
        """ % ("DISTINCT " if distinct else "", column, feature)
        query_params = []

        if contig:
            contig = normalize_chromosome(contig)
            query += " AND seqname = ?"
            query_params.append(contig)

        if strand:
            strand = normalize_strand(strand)
            query += " AND strand = ?"
            query_params.append(strand)

        rows = self.run_sql_query(query, query_params=query_params)
        return [row[0] for row in rows if row is not None]