img_data_str = ['\x08\x03\x10\x04\x18\x02"\x18\x01\x04\x07\n\r\x10\x13\x16\x02\x05\x08\x0b\x0e\x11\x14\x17\x03\x06\t\x0c\x0f\x12\x15\x18(\x01',
                '\x08\x03\x10\x02\x18\x01"\x06\x10\x16\x11\x17\x12\x18(\x00']

# write fake data to lmdb under three key schemes

# 1) keys in ascending numeric order, zero-padded to a fixed width
self.path_lmdb_num_ord = os.path.join(self.dir_tmp, 'imgs_num_ord_lmdb')
db = lmdb.open(self.path_lmdb_num_ord, map_size=int(1e12))
with db.begin(write=True) as in_txn:
    for idx, data_str in enumerate(img_data_str):
        in_txn.put('{:0>10d}'.format(idx), data_str)
db.close()

# 2) random numeric keys
self.path_lmdb_rand_ord = os.path.join(self.dir_tmp, 'imgs_rand_ord_lmdb')
db = lmdb.open(self.path_lmdb_rand_ord, map_size=int(1e12))
with db.begin(write=True) as in_txn:
    for data_str in img_data_str:
        in_txn.put('{:0>10d}'.format(np.random.randint(10, 1000)), data_str)
db.close()

# 3) non-numeric keys
self.path_lmdb_non_num = os.path.join(self.dir_tmp, 'imgs_non_num_lmdb')
db = lmdb.open(self.path_lmdb_non_num, map_size=int(1e12))
with db.begin(write=True) as in_txn:
    for data_str in img_data_str:
        in_txn.put('key' + data_str, data_str)
db.close()

assert_not_equal(self.path_lmdb_num_ord, self.path_lmdb_rand_ord)
assert_not_equal(self.path_lmdb_num_ord, self.path_lmdb_non_num)
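A minimal read-back sketch (hypothetical, written for Python 3, not part of the test above) of why the zero-padded keys matter: an LMDB cursor iterates in lexicographic byte order, so fixed-width decimal keys come back in numeric order.

import lmdb

env = lmdb.open('imgs_num_ord_lmdb', readonly=True)
with env.begin() as txn:
    for key, _ in txn.cursor():
        print(key)  # b'0000000000', b'0000000001', ... in sorted order
env.close()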
def createDataset(inputPath, gtFile, outputPath, checkValid=True):
    """
    Create LMDB dataset for training and evaluation.
    ARGS:
        inputPath  : root folder that the image paths in gtFile are relative to
        gtFile     : text file of tab-separated image-path / label pairs
        outputPath : LMDB output path
        checkValid : if true, check the validity of every image
    """
    os.makedirs(outputPath, exist_ok=True)
    env = lmdb.open(outputPath, map_size=1099511627776)
    cache = {}
    cnt = 1
    with open(gtFile, 'r', encoding='utf-8') as data:
        datalist = data.readlines()
    nSamples = len(datalist)
    for i in range(nSamples):
        imagePath, label = datalist[i].strip('\n').split('\t')
        imagePath = os.path.join(inputPath, imagePath)
        # # only use alphanumeric data
        # if re.search('[^a-zA-Z0-9]', label):
        #     continue
        if not os.path.exists(imagePath):
            print('%s does not exist' % imagePath)
            continue
def __init__(self, parPathDB=None, parSizeBatch=-1, scaleFactor=-1.):
    if parPathDB is None:
        # FIXME: check this point, LMDBBatcher is not initialized correctly
        return
    try:
        self.cfg = DatasetImage2dInfo(parPathDB)
        self.cfg.loadDBInfo(isBuildSearchIndex=False)
        tpathTrainDB = self.cfg.pathDbTrain
        tpathValDB = self.cfg.pathDbVal
        self.dbTrain = lmdb.open(tpathTrainDB, readonly=True)
        self.dbVal = lmdb.open(tpathValDB, readonly=True)
        with self.dbTrain.begin() as txnTrain, self.dbVal.begin() as txnVal:
            self.lbl = self.cfg.labels
            self.numLbl = len(self.lbl)
            self.numTrain = self.dbTrain.stat()['entries']
            self.numVal = self.dbVal.stat()['entries']
            with txnTrain.cursor() as cursTrain, txnVal.cursor() as cursVal:
                self.keysTrain = np.array([key for key, _ in cursTrain])
                self.keysVal = np.array([key for key, _ in cursVal])
            timg, _ = ImageTransformer2D.decodeLmdbItem2NNSampple(txnTrain.get(self.keysTrain[0]))
            self.shapeImg = timg.shape
        if parSizeBatch > 1:
            self.sizeBatch = parSizeBatch
        if scaleFactor > 0:
            self.scaleFactor = scaleFactor
        self.loadMeanProto()
    except lmdb.Error as err:
def main(args):
    net_type = args.net
    if net_type == 'p':
        net = 'pnet'
    elif net_type == 'r':
        net = 'rnet'
    else:
        assert net_type == 'o'
        net = 'onet'
    logger = get_logger()
    db = lmdb.open('data/%s_nonface_train' % net)
    with db.begin() as txn:
        size = int(txn.get('size'))
        logger.info('random read')
        for i in np.random.permutation(size):
            face_key = '%08d_data' % i
            offset_key = '%08d_offset' % i
            txn.get(face_key)
            txn.get(offset_key)
        logger.info('done')
        logger.info('sequential read')
        for i in range(size):
            face_key = '%08d_data' % i
            offset_key = '%08d_offset' % i
            txn.get(face_key)
            txn.get(offset_key)
        logger.info('done')
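A hypothetical driver for main(); the snippet does not show its argument parsing, so the --net flag and its choices are inferred from how args.net is branched on above.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # 'p', 'r', 'o' select pnet / rnet / onet in main()
    parser.add_argument('--net', choices=['p', 'r', 'o'], default='p')
    main(parser.parse_args())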
if not all_exist:
    print("Creating datasets")
    tr_sentences = [txt for txt, lab in tqdm(dataset.load_train_data(), desc="counting train samples")]
    te_sentences = [txt for txt, lab in tqdm(dataset.load_test_data(), desc="counting test samples")]
    n_tr_samples = len(tr_sentences)
    n_te_samples = len(te_sentences)
    del tr_sentences
    del te_sentences
    print("[{}/{}] train/test samples".format(n_tr_samples, n_te_samples))

    ###################
    # transform train #
    ###################
    with lmdb.open(tr_path, map_size=1099511627776) as env:
        with env.begin(write=True) as txn:
            for i, (sentence, label) in enumerate(tqdm(dataset.load_train_data(), desc="transform train...", total=n_tr_samples)):
                xtxt = vectorizer.transform([sentence])[0]
                lab = label
                txt_key = 'txt-%09d' % i
                lab_key = 'lab-%09d' % i
                txn.put(lab_key.encode(), np.array([lab]).tobytes())
                txn.put(txt_key.encode(), np.array(xtxt).tobytes())
            txn.put('nsamples'.encode(), np.array([i+1]).tobytes())

    ##################
    # transform test #
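A sketch of reading such a database back under the same key scheme ('txt-%09d', 'lab-%09d', 'nsamples'); the dtype passed to np.frombuffer is an assumption, since tobytes() does not record it.

import lmdb
import numpy as np

with lmdb.open(tr_path, readonly=True) as env:
    with env.begin() as txn:
        n = int(np.frombuffer(txn.get(b'nsamples'), dtype=np.int64)[0])
        for i in range(n):
            lab = np.frombuffer(txn.get(b'lab-%09d' % i), dtype=np.int64)[0]
            xtxt = np.frombuffer(txn.get(b'txt-%09d' % i), dtype=np.int64)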
def getCorrespLmdbData(lmdbs_root, N):
    # Open the four parallel LMDBs read-only
    image_lmdb = lmdb.open(os.path.join(lmdbs_root, 'image_lmdb'), readonly=True)
    keypoint_loc_lmdb = lmdb.open(os.path.join(lmdbs_root, 'keypoint_loc_lmdb'), readonly=True)
    keypoint_class_lmdb = lmdb.open(os.path.join(lmdbs_root, 'keypoint_class_lmdb'), readonly=True)
    viewpoint_label_lmdb = lmdb.open(os.path.join(lmdbs_root, 'viewpoint_label_lmdb'), readonly=True)
    # Read the first N entries from each
    images_dict = utils.getFirstNLmdbImgs(image_lmdb, N)
    keypoint_loc_dict = utils.getFirstNLmdbImgs(keypoint_loc_lmdb, N)
    keypoint_class_dict = utils.getFirstNLmdbVecs(keypoint_class_lmdb, N)
    viewpoint_label_dict = utils.getFirstNLmdbVecs(viewpoint_label_lmdb, N)
    return images_dict.keys(), images_dict, keypoint_loc_dict, keypoint_class_dict, viewpoint_label_dict
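A hypothetical call, assuming lmdbs_root holds the four databases opened above; the returned keys index every dict.

keys, images, kp_locs, kp_classes, vp_labels = getCorrespLmdbData('path/to/lmdbs', N=100)
for key in keys:
    img = images[key]      # image for this key
    vp = vp_labels[key]    # viewpoint label for the same key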
def list_zones(dirname):
    print "dirname:", dirname
    env = lmdb.open(dirname, max_dbs=2, map_size=500*1024*1024)
    db_zones = env.open_db("zones_db", dupsort=True)
    zonedict = dict()
    with lmdb.Transaction(env, db_zones, write=False) as txn_zones:
        for k, v in txn_zones.cursor():
            dn = dname2str(k)
            ki = v.rstrip("\x00")
            try:
                zonedict[dn].insert(0, ki)
            except KeyError:
                zonedict[dn] = [ki]
    for zone in zonedict.keys():
        print zone, zonedict[zone]
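The dupsort=True flag above is what lets one zone name map to several keys; a minimal standalone sketch (hypothetical, written for Python 3) of how duplicate values behave under a cursor:

import lmdb

env = lmdb.open('/tmp/dupsort_demo', max_dbs=2, map_size=10 * 1024 * 1024)
db = env.open_db(b'zones_db', dupsort=True)
with env.begin(db=db, write=True) as txn:
    txn.put(b'example.com.', b'key1')
    txn.put(b'example.com.', b'key2')  # second value under the same key
with env.begin(db=db) as txn:
    for k, v in txn.cursor():
        print(k, v)  # the key appears once per stored value
env.close()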
def __init__(self, options):
    with LmdbAdapter.env_lock:
        LOGGER.info("lmdb adapter init")
        super().__init__(options)
        self.path = options["path"]
        # note: bool() of any non-empty string is True, so parse the flag as text
        create_if_missing = str(options.get("create_if_missing", "True")).lower() == "true"
        if self.path not in LmdbAdapter.env_dict:
            if create_if_missing:
                os.makedirs(self.path, exist_ok=True)
            LOGGER.info("path not in dict db path:{}".format(self.path))
            self.env = lmdb.open(self.path, create=create_if_missing, max_dbs=128,
                                 sync=False, map_size=LMDB_MAP_SIZE, writemap=True)
            self.sub_db = self.env.open_db(DEFAULT_DB)
            self.txn = self.env.begin(db=self.sub_db, write=True)
            LmdbAdapter.count_dict[self.path] = 0
            LmdbAdapter.env_dict[self.path] = self.env
            LmdbAdapter.sub_env_dict[self.path] = self.sub_db
            LmdbAdapter.txn_dict[self.path] = self.txn
        else:
            LOGGER.info("path in dict:{}".format(self.path))
            self.env = LmdbAdapter.env_dict[self.path]
            self.sub_db = LmdbAdapter.sub_env_dict[self.path]
            self.txn = LmdbAdapter.txn_dict[self.path]
        self.cursor = self.txn.cursor()
        LmdbAdapter.count_dict[self.path] += 1
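A hypothetical usage sketch for the adapter above; DEFAULT_DB, LMDB_MAP_SIZE, LOGGER, env_lock, and the base class live in the surrounding module and are not shown in the snippet. The class-level dicts make adapters for the same path share one environment and one write transaction.

a = LmdbAdapter({"path": "/tmp/demo_db"})
b = LmdbAdapter({"path": "/tmp/demo_db"})
assert a.env is b.env and a.txn is b.txn  # shared per-path state
a.txn.put(b'k', b'v')
print(b.txn.get(b'k'))  # b'v', visible through the shared transaction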
def createDataset(outputPath, imagePathList, labelList, lexiconList=None, checkValid=True):
    """
    Create LMDB dataset for CRNN training.
    ARGS:
        outputPath    : LMDB output path
        imagePathList : list of image paths
        labelList     : list of corresponding groundtruth texts
        lexiconList   : (optional) list of lexicon lists
        checkValid    : if true, check the validity of every image
    """
    assert len(imagePathList) == len(labelList)
    nSamples = len(imagePathList)
    env = lmdb.open(outputPath, map_size=8589934592)  # map_size: maximum size the database may grow to, in bytes (8 GiB)
    cache = {}
    cnt = 1
    for i in range(nSamples):
        imagePath = imagePathList[i]
        label = labelList[i]
        if not os.path.exists(imagePath):
            print('%s does not exist' % imagePath)
            continue
        with open(imagePath, 'rb') as f:
            imageBin = f.read()
        if checkValid:
            if not checkImageIsValid(imageBin):
                print('%s is not a valid image' % imagePath)
                continue
        imageKey = 'image-%09d' % cnt