Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_too_small_max_file_size_init(self):
    # Fail immediately if the max_size is so small we can't even create a
    # file: constructing SampleData with max_file_size=1 is expected to
    # raise lmdb.MapFullError.
    with tempfile.TemporaryDirectory(prefix="tsinf_format_test") as tempdir:
        sample_path = os.path.join(tempdir, "samples.tmp")
        with self.assertRaises(lmdb.MapFullError):
            formats.SampleData(
                path=sample_path,
                sequence_length=1,
                max_file_size=1,
            )
def _write_to_lmdb(db, key, value):
    """
    Write (key,value) to db, doubling the map size and retrying when it is full

    Arguments:
    db -- open LMDB environment; must provide begin(), info() and set_mapsize()
    key -- key handed to txn.put() unchanged
    value -- value handed to txn.put() unchanged

    Any exception other than lmdb.MapFullError propagates to the caller
    after the write transaction has been aborted.
    """
    while True:
        try:
            # The transaction context manager commits on a clean exit and
            # aborts on any exception, so a failed write can never leave a
            # write transaction open behind it.
            with db.begin(write=True) as txn:
                txn.put(key, value)
        except lmdb.MapFullError:
            # The failed transaction is already aborted at this point, so the
            # map can be resized safely: double it and try the write again.
            db.set_mapsize(db.info()['map_size'] * 2)
        else:
            return
])
img_data_str = ['\x08\x03\x10\x04\x18\x02"\x18\x01\x04\x07\n\r\x10\x13\x16\x02\x05\x08\x0b\x0e\x11\x14\x17\x03\x06\t\x0c\x0f\x12\x15\x18(\x01',
'\x08\x03\x10\x02\x18\x01"\x06\x10\x16\x11\x17\x12\x18(\x00']
# write fake data to lmdb
self.path_lmdb_num_ord = os.path.join(self.dir_tmp, 'imgs_num_ord_lmdb')
db = lmdb.open(self.path_lmdb_num_ord, map_size=int(1e12))
with db.begin(write=True) as in_txn:
for idx, data_str in enumerate(img_data_str):
in_txn.put('{:0>10d}'.format(idx), data_str)
db.close()
self.path_lmdb_rand_ord = os.path.join(self.dir_tmp, 'imgs_rand_ord_lmdb')
db = lmdb.open(self.path_lmdb_rand_ord, map_size=int(1e12))
with db.begin(write=True) as in_txn:
for data_str in img_data_str:
in_txn.put('{:0>10d}'.format(np.random.randint(10, 1000)), data_str)
db.close()
self.path_lmdb_non_num = os.path.join(self.dir_tmp, 'imgs_non_num_lmdb')
db = lmdb.open(self.path_lmdb_non_num, map_size=int(1e12))
with db.begin(write=True) as in_txn:
for data_str in img_data_str:
in_txn.put('key' + data_str, data_str)
db.close()
assert_not_equal(self.path_lmdb_num_ord, self.path_lmdb_rand_ord)
assert_not_equal(self.path_lmdb_num_ord, self.path_lmdb_non_num)
def test_num_entries_does_not_exist(self):
    # Counting the entries of an LMDB path that was never created is
    # expected to raise lmdb.Error.
    missing_db = os.path.join(self.dir_tmp, 'test_num_entries_does_not_exist_lmdb')
    assert_false(os.path.exists(missing_db))
    with assert_raises(lmdb.Error):
        r.num_entries(missing_db)
def test_too_small_max_file_size_add(self):
    max_size = 2 ** 16  # Big enough to allow the initial file to be created
    with tempfile.TemporaryDirectory(prefix="tsinf_format_test") as tempdir:
        sample_path = os.path.join(tempdir, "samples.tmp")
        # Fail during adding a large amount of data
        with self.assertRaises(lmdb.MapFullError):
            with formats.SampleData(
                path=sample_path, sequence_length=1, max_file_size=max_size
            ) as small_sample_file:
                oversized_genotypes = np.zeros(max_size, dtype=np.int8)
                small_sample_file.add_site(
                    0,
                    alleles=["0", "1"],
                    genotypes=oversized_genotypes,
                )
        # Work around https://github.com/tskit-dev/tsinfer/issues/201
        # NOTE(review): assumed to run after the assertRaises block, since the
        # failing `with` above presumably leaves the store open -- confirm.
        small_sample_file.data.store.close()
def createDataset(inputPath, gtFile, outputPath, checkValid=True):
"""
Create LMDB dataset for training and evaluation.
ARGS:
inputPath : input folder path where starts imagePath
outputPath : LMDB output path
gtFile : list of image path and label
checkValid : if true, check the validity of every image
"""
os.makedirs(outputPath, exist_ok=True)
env = lmdb.open(outputPath, map_size=1099511627776)
cache = {}
cnt = 1
with open(gtFile, 'r', encoding='utf-8') as data:
datalist = data.readlines()
nSamples = len(datalist)
for i in range(nSamples):
imagePath, label = datalist[i].strip('\n').split('\t')
imagePath = os.path.join(inputPath, imagePath)
# # only use alphanumeric data
# if re.search('[^a-zA-Z0-9]', label):
# continue
if not os.path.exists(imagePath):
parser = OptionParser()
parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode")
parser.add_option("-d", "--db", dest="dbpath",help="db path", metavar="DB")
(options, args) = parser.parse_args()
if options.verbose == 1 : VERBOSE = 1
db_path = options.dbpath
if db_path == None :
parser.print_help()
sys.exit(1)
startTime = time.time()
# the environment itself is the database: max_dbs=0 means no named sub-databases, hence db=None below
env = lmdb.Environment(db_path,map_size=24*(1023**3),subdir=False,readonly=False,create=False,max_dbs=0,lock=False)
txn = lmdb.Transaction(env,db=None,write=True)
linecount = 0
while 1 :
try : line = sys.stdin.readline()
except KeyboardInterrupt : break
if not line : break
try : line = line.strip()
except : continue
if not line : continue
linecount += 1
if linecount % 1000 == 0 :
sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n")
key,value = line.split('\t',1)
if not key or not value : continue
parser = OptionParser()
parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode")
parser.add_option("-d", "--db", dest="dbpath",help="db path", metavar="DB")
(options, args) = parser.parse_args()
if options.verbose == 1 : VERBOSE = 1
db_path = options.dbpath
if db_path == None :
parser.print_help()
sys.exit(1)
# the environment itself is the database: max_dbs=0 means no named sub-databases, hence db=None below
env = lmdb.Environment(db_path,map_size=24*(1023**3),subdir=False,readonly=True,create=False,max_dbs=0,lock=False)
txn = lmdb.Transaction(env,db=None,write=False)
startTime = time.time()
linecount = 0
while 1 :
try : line = sys.stdin.readline()
except KeyboardInterrupt : break
if not line : break
try : line = line.strip()
except : continue
if not line : continue
linecount += 1
if linecount % 1000 == 0 :
sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n")
key,value = line.split('\t',1)
parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode")
parser.add_option("-d", "--db", dest="dbpath",help="db path", metavar="DB")
(options, args) = parser.parse_args()
if options.verbose == 1 : VERBOSE = 1
db_path = options.dbpath
if db_path == None :
parser.print_help()
sys.exit(1)
startTime = time.time()
# the environment itself is the database: max_dbs=0 means no named sub-databases, hence db=None below
env = lmdb.Environment(db_path,map_size=24*(1023**3),subdir=False,readonly=False,create=False,max_dbs=0,lock=False)
txn = lmdb.Transaction(env,db=None,write=True)
linecount = 0
while 1 :
try : line = sys.stdin.readline()
except KeyboardInterrupt : break
if not line : break
try : line = line.strip()
except : continue
if not line : continue
linecount += 1
if linecount % 1000 == 0 :
sys.stderr.write("[linecount]" + "\t" + str(linecount) + "\n")
key,value = line.split('\t',1)
if not key or not value : continue
def _write_batch_to_lmdb(db, batch):
    """
    Store every (key, datum) pair of batch in db within one write transaction

    Each datum is serialized with its SerializeToString() method before being
    stored. When LMDB reports that the map is full, the map size is doubled
    and the whole batch is written again.

    Raises ImportError if the map cannot be resized because the installed
    py-lmdb is older than 0.87 and lacks set_mapsize(); any other
    AttributeError from the resize is re-raised unchanged.
    """
    while True:
        try:
            with db.begin(write=True) as write_txn:
                for entry_key, entry in batch:
                    write_txn.put(entry_key, entry.SerializeToString())
        except lmdb.MapFullError:
            # Map exhausted: grow it to twice the current size, then loop to
            # attempt the batch again.
            doubled_size = db.info()['map_size'] * 2
            try:
                db.set_mapsize(doubled_size)
            except AttributeError as err:
                installed = tuple(int(part) for part in lmdb.__version__.split('.'))
                if installed < (0, 87):
                    raise ImportError('py-lmdb is out of date (%s vs 0.87)' % lmdb.__version__)
                raise err
        else:
            return