# Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.
def _all_filenames(directory):
"""Return list of filenames in a directory, excluding __pycache__ files."""
filenames = []
for root, _, files in pathlib.os.walk(str(directory)):
for f in files:
full_filename = root+pathlib.os.sep+f
if "__pycache__" not in full_filename:
filenames.append(full_filename)
filenames.sort()
return filenames
"""
try:
global DATA_DIRECTORY
global GLOBAL_CONFIG
metadata_path = os.path.join(
os.getcwd(), GLOBAL_CONFIG["PROJECT_NAME"], "metadata.json"
) # read metadata
with open(metadata_path, "r") as f:
metadata = json.load(f)
f.close()
all_entities = []
entity_files = metadata[
"entity_files"
] # get a list of all json files having entities' ids
for entity_file in entity_files:
partition_number = int(os.path.splitext(entity_file)[0].split("_")[-1])
entity_type = "_".join(
os.path.splitext(entity_file)[0]
.replace("entity_names_", "")
.split("_")[:-1]
).strip(
"_"
) # find the entity type
entity_file_path = os.path.join(DATA_DIRECTORY, entity_file)
entity_data = json.load(open(entity_file_path, "r"))
entity_dict = dict(
entity_ids=entity_data,
entity_type=entity_type,
partition_number=partition_number,
entity_file=entity_file,
) # creates a dict object for one partition
all_entities.append(entity_dict)
def load_binary_mnist(cfg, **kwcfg):
    """Build train/validation/test DataLoaders for binarized MNIST.

    Downloads the HDF5 dataset to ``cfg.data_dir / 'binary_mnist.h5'`` when it
    is missing, then reads the 'train'/'valid'/'test' splits from that file.

    Parameters
    ----------
    cfg : object
        Must expose ``data_dir`` (a pathlib.Path), ``batch_size`` and
        ``test_batch_size``.
    **kwcfg
        Extra keyword arguments forwarded to the *training* DataLoader only
        (e.g. ``num_workers``, ``pin_memory``).

    Returns
    -------
    tuple
        ``(train_loader, val_loader, test_loader)``.
    """
    fname = cfg.data_dir / 'binary_mnist.h5'
    if not fname.exists():
        print('Downloading binary MNIST data...')
        data.download_binary_mnist(fname)
    # BUG FIX: the original opened an unrelated path built from the 'DAT'
    # environment variable ('binarized_mnist.hdf5'), ignoring the file it had
    # just downloaded, and never closed the handle. Read the downloaded file
    # and close it deterministically via a context manager.
    with h5py.File(str(fname), 'r') as f:
        x_train = f['train'][::]
        x_val = f['valid'][::]
        x_test = f['test'][::]
    train = torch.utils.data.TensorDataset(torch.from_numpy(x_train))
    train_loader = torch.utils.data.DataLoader(
        train, batch_size=cfg.batch_size, shuffle=True, **kwcfg
    )
    validation = torch.utils.data.TensorDataset(torch.from_numpy(x_val))
    val_loader = torch.utils.data.DataLoader(
        validation, batch_size=cfg.test_batch_size, shuffle=False
    )
    test = torch.utils.data.TensorDataset(torch.from_numpy(x_test))
    test_loader = torch.utils.data.DataLoader(
        test, batch_size=cfg.test_batch_size, shuffle=False
    )
    return train_loader, val_loader, test_loader
# NOTE(review): fragment — the `global` declarations imply this is the body of
# an initialization function whose `def` line is outside the visible chunk.
# Tokens kept as-is; comments only added.
global SIMILARITY_SEARCH_CONFIG
global GLOBAL_CONFIG
global DATA_DIRECTORY
global CHECKPOINT_DIRECTORY
global FAISS_INDEX_NAME
global EMBEDDING_DIMENSIONS
global NUM_CLUSTER
global neighbors
# load_config is defined elsewhere in the project — presumably it loads the
# named configuration section; TODO confirm.
SIMILARITY_SEARCH_CONFIG = load_config("SIMILARITY_SEARCH_CONFIG")
GLOBAL_CONFIG = load_config("GLOBAL_CONFIG")
# Data and checkpoint directories are resolved relative to the current
# working directory: <cwd>/<PROJECT_NAME>/<...>.
DATA_DIRECTORY = os.path.join(
    os.getcwd(), GLOBAL_CONFIG["PROJECT_NAME"], GLOBAL_CONFIG["DATA_DIRECTORY"]
)
CHECKPOINT_DIRECTORY = os.path.join(
    os.getcwd(),
    GLOBAL_CONFIG["PROJECT_NAME"],
    GLOBAL_CONFIG["CHECKPOINT_DIRECTORY"],
)
FAISS_INDEX_NAME = SIMILARITY_SEARCH_CONFIG["FAISS_INDEX_NAME"]
EMBEDDING_DIMENSIONS = GLOBAL_CONFIG["EMBEDDING_DIMENSIONS"]
NUM_CLUSTER = SIMILARITY_SEARCH_CONFIG["NUM_CLUSTER"]
# +1 on the configured neighbor count — presumably because a similarity query
# returns the query item itself as its own nearest match; verify at call site.
neighbors = SIMILARITY_SEARCH_CONFIG["NEAREST_NEIGHBORS"] + 1
def create_index_directory():
    """Ensure the ``index`` subdirectory exists under CHECKPOINT_DIRECTORY.

    Best-effort: any failure (including a missing/unset CHECKPOINT_DIRECTORY)
    is logged with a traceback rather than raised. Returns None.
    """
    try:
        target = os.path.join(CHECKPOINT_DIRECTORY, "index")
        # exist_ok=True makes repeated calls idempotent.
        os.makedirs(target, exist_ok=True)
    except Exception as e:
        logging.error(f"Could not create index: {e}", exc_info=True)