Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
resources_dir : str
name / path of resources directory
deduplicate : bool
deduplicate RSIDs and make SNPs available as `duplicate_snps`
deduplicate_XY_chrom : bool
deduplicate alleles in the non-PAR regions of X and Y for males; see `discrepant_XY_snps`
parallelize : bool
utilize multiprocessing to speed up calculations
processes : int
number of processes to launch if multiprocessing
rsids : tuple, optional
rsids to extract if loading a VCF file
"""
self._file = file
self._only_detect_source = only_detect_source
self._snps = get_empty_snps_dataframe()
self._duplicate_snps = pd.DataFrame()
self._discrepant_XY_snps = pd.DataFrame()
self._source = ""
self._phased = False
self._build = 0
self._build_detected = False
self._output_dir = output_dir
self._resources = Resources(resources_dir=resources_dir)
self._parallelizer = Parallelizer(parallelize=parallelize, processes=processes)
if file:
d = self._read_raw_data(file, only_detect_source, rsids)
# Replace multiple rsids separated by commas in index with the first rsid. E.g. rs1,rs2 -> rs1
multi_rsids = {
snps (pandas.DataFrame)
dataframe of parsed SNPs
source (str)
detected source of SNPs
phased (bool)
flag indicating if SNPs are phased
References
----------
1. Fluent Python by Luciano Ramalho (O'Reilly). Copyright 2015 Luciano Ramalho,
978-1-491-94600-8.
"""
phased = False
if self._only_detect_source:
df = get_empty_snps_dataframe()
else:
df, *extra = parser()
if len(extra) == 1:
phased = extra[0]
return {"snps": df, "source": source, "phased": phased}
Returns
-------
dict
dict with the following items:
snps (pandas.DataFrame)
dataframe of parsed SNPs
source (str)
detected source of SNPs
phased (bool)
flag indicating if SNPs are phased
"""
file = self._file
compression = "infer"
d = {
"snps": get_empty_snps_dataframe(),
"source": "",
"phased": False,
"build": 0,
}
# peek into files to determine the data format
if isinstance(file, str) and os.path.exists(file):
if ".zip" in file:
with zipfile.ZipFile(file) as z:
with z.open(z.namelist()[0], "r") as f:
first_line, comments, data = self._extract_comments(
f, decode=True
)
elif ".gz" in file:
with gzip.open(file, "rt") as f:
resources_dir : str
name / path of resources directory
deduplicate : bool
deduplicate RSIDs and make SNPs available as `duplicate_snps`
deduplicate_XY_chrom : bool
deduplicate alleles in the non-PAR regions of X and Y for males; see `discrepant_XY_snps`
parallelize : bool
utilize multiprocessing to speed up calculations
processes : int
number of processes to launch if multiprocessing
rsids : tuple, optional
rsids to extract if loading a VCF file
"""
self._file = file
self._only_detect_source = only_detect_source
self._snps = get_empty_snps_dataframe()
self._duplicate_snps = pd.DataFrame()
self._discrepant_XY_snps = pd.DataFrame()
self._source = ""
self._phased = False
self._build = 0
self._build_detected = False
self._output_dir = output_dir
self._resources = Resources(resources_dir=resources_dir)
self._parallelizer = Parallelizer(parallelize=parallelize, processes=processes)
if file:
d = self._read_raw_data(file, only_detect_source, rsids)
self._snps = d["snps"]
self._source = d["source"]