Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def dataframe(
self,
contig=None,
feature=None,
strand=None,
save_to_disk=False):
"""
Load genome entries as a DataFrame, optionally restricted to
particular contig or feature type.
"""
if contig:
contig = normalize_chromosome(contig)
if strand:
strand = normalize_strand(strand)
if feature is not None:
require_string(feature, "feature")
key = (contig, feature, strand)
if key not in self._dataframes:
def _construct_df():
full_df = self._load_full_dataframe_cached()
assert len(full_df) > 0, \
"Dataframe representation of genomic database empty!"
# rename since we're going to be filtering the entries but
# may still want to access the full dataset
df = full_df
def _load_full_dataframe_from_gtf(self):
"""
Parse this genome source's GTF file and load it as a Pandas DataFrame
"""
print("Reading GTF from %s" % self.gtf_path)
df = read_gtf_as_dataframe(
self.gtf_path,
column_converters={
"seqname": normalize_chromosome,
"strand": normalize_strand,
},
infer_biotype_column=True)
features = set(df["feature"])
column_names = set(df.keys())
# older Ensembl releases don't have "gene" or "transcript"
# features, so fill in those rows if they're missing
if "gene" not in features:
# if we have to reconstruct gene feature rows then
# fill in values for 'gene_name' and 'gene_biotype'
# but only if they're actually present in the GTF
df = create_missing_features(
dataframe=df,
unique_keys={"gene": "gene_id"},
extra_columns={
"""
contig : str
Chromosome or other sequence name in the reference assembly
start : int
Start position of locus on the contig
end : int
Inclusive end position on the contig
strand : str
Should we read the locus forwards ('+') or backwards ('-')?
"""
self.contig = normalize_chromosome(contig)
self.strand = normalize_strand(strand)
start = int(start)
end = int(end)
if start == 0:
raise ValueError("Expected start > 0 (using base 1 coordinates)")
elif end == 0:
raise ValueError("Expected end > 0 (using base 1 coordinates)")
if end < start:
raise ValueError(
"Expected start <= end, got start = %d, end = %d" % (
start, end))
self.start = start
self.end = end
def on_strand(self, strand):
    """
    Return the result of comparing the normalized form of `strand`
    against this object's `strand` attribute.
    """
    # Normalize the caller's value first so the comparison is made
    # against whatever representation normalize_strand produces.
    normalized = normalize_strand(strand)
    return normalized == self.strand
def _load_gtf_as_dataframe(self, usecols=None, features=None):
"""
Parse this genome source's GTF file and load it as a Pandas DataFrame
"""
logger.info("Reading GTF from %s", self.gtf_path)
df = read_gtf(
self.gtf_path,
column_converters={
"seqname": normalize_chromosome,
"strand": normalize_strand,
},
infer_biotype_column=True,
usecols=usecols,
features=features)
column_names = set(df.keys())
expect_gene_feature = features is None or "gene" in features
expect_transcript_feature = features is None or "transcript" in features
observed_features = set(df["feature"])
# older Ensembl releases don't have "gene" or "transcript"
# features, so fill in those rows if they're missing
if expect_gene_feature and "gene" not in observed_features:
# if we have to reconstruct gene feature rows then
# fill in values for 'gene_name' and 'gene_biotype'
# but only if they're actually present in the GTF
only on the feature type.
"""
query = """
SELECT %s%s
FROM %s
WHERE 1=1
""" % ("DISTINCT " if distinct else "", column, feature)
query_params = []
if contig:
contig = normalize_chromosome(contig)
query += " AND seqname = ?"
query_params.append(contig)
if strand:
strand = normalize_strand(strand)
query += " AND strand = ?"
query_params.append(strand)
rows = self.run_sql_query(query, query_params=query_params)
return [row[0] for row in rows if row is not None]