def get_data_shape(self):
  """
  Gets array shape of datapoints in this dataset.
  """
  if not len(self.metadata_df):
    raise ValueError("No data in dataset.")
  sample_X = load_from_disk(
      os.path.join(self.data_dir, next(self.metadata_df.iterrows())[1]['X']))
  return np.shape(sample_X)[1:]
def get_shard_size(self):
  """Gets the number of datapoints in each shard on disk."""
  if not len(self.metadata_df):
    raise ValueError("No data in dataset.")
  sample_y = load_from_disk(
      os.path.join(
          self.data_dir,
          next(self.metadata_df.iterrows())[1]['y-transformed']))
  return len(sample_y)
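A quick sanity check of the two getters above, assuming a DeepChem install where DiskDataset.from_numpy is available (these snippets come from an older DeepChem, so exact names may differ across versions):

import numpy as np
import deepchem as dc

# Build a tiny on-disk dataset and query its geometry.
X = np.random.rand(10, 1024)          # 10 datapoints, 1024 features each
y = np.random.randint(0, 2, (10, 1))  # one binary task
dataset = dc.data.DiskDataset.from_numpy(X, y)

print(dataset.get_data_shape())  # (1024,) -- per-datapoint shape, leading axis dropped
print(dataset.get_shard_size())  # 10 -- from_numpy writes everything to one shard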
def get_shard(self, i):
  """Retrieves data for the i-th shard from disk."""
  row = self.metadata_df.iloc[i]
  X = np.array(load_from_disk(
      os.path.join(self.data_dir, row['X-transformed'])))
  y = np.array(load_from_disk(
      os.path.join(self.data_dir, row['y-transformed'])))
  w = np.array(load_from_disk(
      os.path.join(self.data_dir, row['w-transformed'])))
  ids = np.array(load_from_disk(
      os.path.join(self.data_dir, row['ids'])), dtype=object)
  return (X, y, w, ids)
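get_shard yields one (X, y, w, ids) tuple at a time, so a caller that wants the full dataset walks the shard index. A minimal sketch, assuming get_num_shards() reports the shard count as it does in current DeepChem:

import numpy as np
import deepchem as dc

dataset = dc.data.DiskDataset.from_numpy(np.random.rand(10, 8), np.zeros((10, 1)))
total = 0
for i in range(dataset.get_num_shards()):
  X, y, w, ids = dataset.get_shard(i)
  total += len(ids)
print("datapoints:", total)  # 10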
def get_shard(self, i):
  """Retrieves data for the i-th shard from disk."""
  row = self.metadata_df.iloc[i]
  X = np.array(load_from_disk(os.path.join(self.data_dir, row['X'])))
  if row['y'] is not None:
    y = np.array(load_from_disk(os.path.join(self.data_dir, row['y'])))
  else:
    y = None
  if row['w'] is not None:
    # TODO (ytz): Under what condition does this exist but the file itself doesn't?
    w_filename = os.path.join(self.data_dir, row['w'])
    if os.path.exists(w_filename):
      w = np.array(load_from_disk(w_filename))
    else:
      # Weights file is missing: default to all-ones weights shaped to match y.
      if len(y.shape) == 1:
        w = np.ones(y.shape[0], np.float32)
      else:
        w = np.ones((y.shape[0], 1), np.float32)
  else:
    w = None
  ids = np.array(
      load_from_disk(os.path.join(self.data_dir, row['ids'])), dtype=object)
  return (X, y, w, ids)
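The fallback branch above synthesizes all-ones weights whose shape tracks the label array. A minimal illustration of the two shapes involved (plain NumPy, nothing DeepChem-specific):

import numpy as np

y_1d = np.zeros(5)
y_2d = np.zeros((5, 3))

w_1d = np.ones(y_1d.shape[0], np.float32)       # shape (5,)
w_2d = np.ones((y_2d.shape[0], 1), np.float32)  # shape (5, 1); broadcasts across tasks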
def _update_mean_and_std(self, df, X_stats, y_stats):
  """
  Computes and saves per-shard sums and sums of squares of X/y,
  from which means and standard deviations can be derived.
  """
  if X_stats:
    for _, row in df.iterrows():
      Xt = load_from_disk(os.path.join(self.data_dir, row['X-transformed']))
      Xs = np.sum(Xt, axis=0)
      Xss = np.sum(np.square(Xt), axis=0)
      save_to_disk(Xs, os.path.join(self.data_dir, row['X_sums']))
      save_to_disk(Xss, os.path.join(self.data_dir, row['X_sum_squares']))
  if y_stats:
    for _, row in df.iterrows():
      yt = load_from_disk(os.path.join(self.data_dir, row['y-transformed']))
      ys = np.sum(yt, axis=0)
      yss = np.sum(np.square(yt), axis=0)
      save_to_disk(ys, os.path.join(self.data_dir, row['y_sums']))
      save_to_disk(yss, os.path.join(self.data_dir, row['y_sum_squares']))
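_update_mean_and_std only persists the per-shard sums and sums of squares; the statistics themselves follow from mean = Σx/n and var = Σx²/n − mean². A sketch of that final step with a hypothetical helper (not part of the original class):

import numpy as np

def mean_std_from_sums(sums, sum_squares, n):
  """Hypothetical helper: recover mean/std from the accumulated
  sums and sums of squares that _update_mean_and_std saves."""
  mean = sums / n
  # E[x^2] - E[x]^2; the clip guards against tiny negative rounding error.
  var = np.clip(sum_squares / n - mean**2, 0.0, None)
  return mean, np.sqrt(var)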
if not reload:
  if os.path.exists(base_dir):
    shutil.rmtree(base_dir)
if not os.path.exists(base_dir):
  os.makedirs(base_dir)
current_dir = os.path.dirname(os.path.realpath(__file__))
# Make directories to store the raw and featurized datasets.
data_dir = os.path.join(base_dir, "dataset")
train_dir = os.path.join(base_dir, "train_dataset")
valid_dir = os.path.join(base_dir, "valid_dataset")
# Load SWEET dataset
print("About to load SWEET dataset.")
dataset_file = os.path.join(current_dir, "sweet.csv.gz")
dataset = load_from_disk(dataset_file)
print("Columns of dataset: %s" % str(dataset.columns.values))
print("Number of examples in dataset: %s" % str(dataset.shape[0]))
# Featurize SWEET dataset
print("About to featurize SWEET dataset.")
featurizer = CircularFingerprint(size=1024)
SWEET_tasks = dataset.columns.values[1:].tolist()
loader = DataLoader(tasks=SWEET_tasks,
                    smiles_field="smiles",
                    featurizer=featurizer,
                    verbosity=verbosity)
if not reload or not os.path.exists(data_dir):
  dataset = loader.featurize(dataset_file, data_dir)
  regen = True
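The guard at the end implements a simple cache-or-featurize pattern: refeaturize only when reload is off or no featurized shards exist on disk. A compact sketch of the same pattern, reusing the loader calls that appear elsewhere in these snippets (treat the exact signatures as assumptions; they vary across DeepChem versions):

import os
import deepchem as dc

def featurize_cached(dataset_file, data_dir, tasks, reload=True):
  if reload and os.path.exists(data_dir):
    return dc.data.DiskDataset(data_dir)  # reuse featurized shards on disk
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles",
      featurizer=dc.feat.CircularFingerprint(size=1024))
  return loader.featurize(dataset_file, data_dir)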
def load_sider_convmol():
  """Load SIDER datasets. Does not do train/test split."""
  # Featurize SIDER dataset
  print("About to featurize SIDER dataset.")
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(current_dir, "../sider/sider.csv.gz")
  featurizer = dc.feat.ConvMolFeaturizer()
  dataset = dc.utils.save.load_from_disk(dataset_file)
  SIDER_tasks = dataset.columns.values[1:].tolist()
  print("SIDER tasks: %s" % str(SIDER_tasks))
  print("%d tasks in total" % len(SIDER_tasks))
  loader = dc.data.CSVLoader(
      tasks=SIDER_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file)
  print("%d datapoints in SIDER dataset" % len(dataset))
  # Balance class weights, then apply each transformer in turn.
  transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  ]
  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)
  return SIDER_tasks, dataset, transformers
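Since load_sider_convmol deliberately skips splitting, the caller handles it. A hedged usage sketch (the splitter choice here is illustrative; IndexSplitter and train_valid_test_split do exist in DeepChem):

import deepchem as dc

sider_tasks, dataset, transformers = load_sider_convmol()
splitter = dc.splits.IndexSplitter()
train, valid, test = splitter.train_valid_test_split(dataset)
print(len(train), len(valid), len(test))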
def __len__(self):
  """
  Finds number of elements in dataset.
  """
  total = 0
  for _, row in self.metadata_df.iterrows():
    # Count datapoints per shard via the ids arrays, which are small, unlike X.
    ids = load_from_disk(os.path.join(self.data_dir, row['ids']))
    total += len(ids)
  return total
def load_sider(featurizer='ECFP', split='index'):
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Load SIDER dataset
  print("About to load SIDER dataset.")
  dataset_file = os.path.join(current_dir, "sider.csv.gz")
  dataset = dc.utils.save.load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))
  # Featurize SIDER dataset
  print("About to featurize SIDER dataset.")
  if featurizer == 'ECFP':
    featurizer_func = dc.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer_func = dc.feat.ConvMolFeaturizer()
  SIDER_tasks = dataset.columns.values[1:].tolist()
  print("SIDER tasks: %s" % str(SIDER_tasks))
  print("%d tasks in total" % len(SIDER_tasks))
  loader = dc.data.CSVLoader(
      tasks=SIDER_tasks, smiles_field="smiles", featurizer=featurizer_func)