logo
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Readme
Files and versions

53 lines
1.5 KiB

"""
compress processed LMDB
"""
import argparse
import io
import multiprocessing as mp
import numpy as np
import lmdb
from tqdm import tqdm
import msgpack
import msgpack_numpy
msgpack_numpy.patch()
def compress_dump(item):
key, dump = item
img_dump = {k.decode('utf-8'): v for k, v in msgpack.loads(dump).items()}
with io.BytesIO() as writer:
np.savez_compressed(writer, **img_dump, allow_pickle=True)
return key, writer.getvalue()
def main(opts):
if opts.db[-1] == '/':
opts.db = opts.db[:-1]
out_name = f'{opts.db}_compressed'
env = lmdb.open(opts.db, readonly=True)
txn = env.begin()
out_env = lmdb.open(out_name, map_size=1024**4)
out_txn = out_env.begin(write=True)
with mp.Pool(opts.nproc) as pool, tqdm(total=txn.stat()['entries']) as pbar:
for i, (key, value) in enumerate(
pool.imap_unordered(compress_dump, txn.cursor(),
chunksize=128)):
out_txn.put(key=key, value=value)
if i % 1000 == 0:
out_txn.commit()
out_txn = out_env.begin(write=True)
pbar.update(1)
out_txn.commit()
out_env.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--db", default=None, type=str,
help="processed LMDB")
parser.add_argument('--nproc', type=int,
help='number of cores used')
args = parser.parse_args()
main(args)