Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Order the report rows by the requested column(s), largest values first.
if sortby:
    df.sort_values(sortby, inplace=True, ascending=False)

# Optionally narrow the report to the rows matching the selection string.
# NOTE(review): the parsed expression presumably refers to the frame by the
# name 'df' (see df_name), so the local must keep that name -- confirm.
if selection:
    selection_str = parse_selection_string(selection, df_name='df')
    mask = pd.eval(selection_str)
    df = df[mask]

# (Name, ShapeQuery) pairs in report order, plus the distinct query names.
shape_queries = df['ShapeQuery'].values
dbase_query_pairs = list(zip(df['Name'].values, shape_queries))
query_names = set(shape_queries)

# Filled later with the query structures, keyed by query name.
query_mol2s = {}

# The query file counts as multi-entry as soon as a second entry is seen;
# the scan stops right there.
multiconf_query = False
for n_seen, _entry in enumerate(split_multimol2(query_path), start=1):
    if n_seen > 1:
        multiconf_query = True
        break
# Scan a gzipped query file and compute, for every entry, the key that is
# looked up in `query_names`.  This excerpt stops at the `if id_suffix:`
# header below; its body is not part of this excerpt.
cnt = -1
if query_path.endswith('.gz'):
for id_, cont in split_multimol2(query_path):
cnt += 1
# Join the byte chunks, decode them as UTF-8 and re-split the text on '\n'
# (the '\n' separators themselves are dropped by split).
cont = b''.join(cont).decode('utf-8').split('\n')
if multiconf_query:
# Multi-entry queries are keyed as "<decoded id>_<running index>".
mol_idx = '%s_%d' % (id_.decode('utf-8'), cnt)
else:
# NOTE(review): id_ is decoded in the branch above but used as-is here;
# confirm that the keys in query_names have the same type as id_.
mol_idx = id_
if mol_idx in query_names:
if id_suffix:
# Collect the query structures whose key appears in `query_names` into
# `query_mol2s` (key -> MOL2 record as one text string).
#
# Gzipped input yields bytes, so the ID and every line of the record are
# decoded once up front and both input kinds share the code below.  Decoding
# line by line (instead of joining, decoding and splitting on '\n') keeps
# each line's terminator, so ''.join(cont) reproduces the record layout; it
# also keeps the key a text string for single-entry queries, so it can
# match the names in `query_names` and be concatenated with '\n'.
is_gz = query_path.endswith('.gz')
for id_, cont in split_multimol2(query_path):
    cnt += 1
    if is_gz:
        id_ = id_.decode('utf-8')
        cont = [line.decode('utf-8') for line in cont]

    # Multi-entry queries are keyed as "<id>_<running index>".
    if multiconf_query:
        mol_idx = '%s_%d' % (id_, cnt)
    else:
        mol_idx = id_

    if mol_idx in query_names:
        if id_suffix:
            # The second line of the record is replaced by the key.
            cont[1] = mol_idx + '\n'
        query_mol2s[mol_idx] = ''.join(cont)

# Output files are named after the input file's basename, cut at the first
# '.mol2', and placed in output_dir.
out_path_base = os.path.join(
    output_dir, os.path.basename(inp_mol2_path).split('.mol2')[0])
out_path_q = '%s_%s' % (out_path_base, 'query.mol2')
out_path_d = '%s_%s' % (out_path_base, 'dbase.mol2')
# Opens a temporary directory as `tmpdirname`; the body of this `with`
# statement is not part of this excerpt.
with tempfile.TemporaryDirectory() as tmpdirname:
# Separate excerpt: scan of a gzipped query file that computes each entry's
# key (`mol_idx`).  It is cut off right after the key is computed.
if query_path.endswith('.gz'):
for id_, cont in split_multimol2(query_path):
cnt += 1
# Join the byte chunks, decode them as UTF-8 and re-split the text on '\n'
# (the '\n' separators themselves are dropped by split).
cont = b''.join(cont).decode('utf-8').split('\n')
if multiconf_query:
# Multi-entry queries are keyed as "<decoded id>_<running index>".
mol_idx = '%s_%d' % (id_.decode('utf-8'), cnt)
else:
# NOTE(review): id_ is decoded in the branch above but used as-is here.
mol_idx = id_
"""
# Copy every entry of inp_mol2_path to out_mol2_path, replacing the second
# line of each record with "<id>_<n>".  This excerpt is cut off inside the
# non-gz loop, at the `else:` on its last line.
with open_file(out_mol2_path, write_mode) as outfile:
prev_molecule = ''
if inp_mol2_path.endswith('.gz'):
for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
# The counter restarts at 0 whenever the ID differs from the previous
# entry's ID; consecutive entries with the same ID count upwards.
if prev_molecule != id_:
cnt = 0
else:
cnt += 1
# Bytes formatting and bytes join: records on this path are handled as bytes.
mol_idx = b'%s_%d' % (id_, cnt)
cont[1] = mol_idx + b'\n'
outfile.write(b''.join(cont))
prev_molecule = id_
else:
# Same loop for the non-gz path.
for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
if prev_molecule != id_:
cnt = 0
else:
# Copy every entry of inp_mol2_path to `outfile`, replacing the second line
# of each record with "<id>_<n>".  n restarts at 0 whenever the ID differs
# from the previous entry's ID and counts upwards for consecutive entries
# that share an ID.  `prev_molecule` is expected to be initialised by the
# surrounding code before this point.
is_gz = inp_mol2_path.endswith('.gz')

# Pre-set the index so the summary below reports 0 molecules for an empty
# input instead of failing on an unbound loop variable.
i = -1
for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
    if prev_molecule != id_:
        cnt = 0
    else:
        cnt += 1

    if is_gz:
        # Records from a .gz path are handled as bytes.
        mol_idx = b'%s_%d' % (id_, cnt)
        cont[1] = mol_idx + b'\n'
        outfile.write(b''.join(cont))
    else:
        mol_idx = '%s_%d' % (id_, cnt)
        cont[1] = mol_idx + '\n'
        outfile.write(''.join(cont))
    prev_molecule = id_

if verbose:
    # `start` is the timestamp taken by the surrounding code.
    elapsed = time.time() - start
    n_molecules = i + 1
    sys.stdout.write(' | scanned %d molecules | %d mol/sec\n' %
                     (n_molecules, n_molecules / elapsed))
# For each input file, run its entries through the matching processor and
# write every non-empty ID it returns to `f`, one per line.
for mol2_file in mol2_files:
    if verbose:
        start = time.time()
        sys.stdout.write('Processing %s' % os.path.basename(mol2_file))
        sys.stdout.flush()

    cnt = 0
    # Gzipped files get the *_gz processor.
    data_processor_fn = (data_processor_gz if mol2_file.endswith('.gz')
                         else data_processor)

    processed_chunks = lazy_imap(data_processor=data_processor_fn,
                                 data_generator=split_multimol2(mol2_file),
                                 n_cpus=n_cpus)
    for chunk in processed_chunks:
        for mol2_id in chunk:
            if mol2_id:
                f.write('%s\n' % mol2_id)
        # Every processed entry counts towards the rate, written or not.
        cnt += len(chunk)

    if verbose:
        elapsed = time.time() - start
        sys.stdout.write(' | %d mol/sec\n' % (cnt / elapsed))
        sys.stdout.flush()
# Tail of a `with` header that binds the database-side output handle as
# `opd`.  The start of the statement is not part of this excerpt; the
# query-side handle `opq` used below is presumably bound there -- TODO confirm.
dbase_open_file(output_mol2_path_dbase, dbase_write_mode) as opd:
# For every selected index, write the i-th entry of the query file to `opq`
# and the i-th entry of the database file to `opd`.
for i in selection_indices:
# Fallback text used when the i-th entry is not found in a file.
# NOTE(review): `.ix` was removed in pandas 1.0; `.loc`/`.iloc` replace it.
mol2_q_cont = ('DID NOT FIND %s\n'
% (df_atom.ix[i]['query']))
mol2_d_cont = ('DID NOT FIND %s\n'
% (df_atom.ix[i]['dbase']))
# Each lookup re-reads its file from the first entry until position i.
for idx, mol2 in enumerate(split_multimol2(
input_mol2_path_query)):
if idx == i:
mol2_q_cont = mol2[1]
break
for idx, mol2 in enumerate(split_multimol2(
input_mol2_path_dbase)):
if idx == i:
mol2_d_cont = mol2[1]
break
# Join with the separator type that matches the handle's write mode.
# NOTE(review): the fallback above is a str, so b''.join() on it would
# raise TypeError if an entry is missing while the mode is 'wb'.
if query_write_mode == 'wb':
opq.write(b''.join(mol2_q_cont))
else:
opq.write(''.join(mol2_q_cont))
if dbase_write_mode == 'wb':
opd.write(b''.join(mol2_d_cont))
else:
opd.write(''.join(mol2_d_cont))
# The body of this `if` is not part of this excerpt.
if verbose:
# Process the database and query files in lockstep and append each result
# row to the matching column lists of `dct_results`.
if verbose:
    start = time.time()
    sys.stdout.write('Processing %s/%s' % (d_base, q_base))
    sys.stdout.flush()

cnt = 0
# The processor is chosen from the query path's extension only.
data_processor_fn = (data_processor_gz if q_path.endswith('.gz')
                     else data_processor)

for chunk in lazy_imap(
        data_processor=data_processor_fn,
        data_generator=zip(split_multimol2(d_path),
                           split_multimol2(q_path)),
        n_cpus=n_cpus):
    for d_id, q_id, atom_hits, charge_hits in chunk:
        dct_results['dbase'].append(d_id)
        dct_results['query'].append(q_id)
        dct_results['atoms'].append(atom_hits)
        dct_results['charges'].append(charge_hits)
    cnt += len(chunk)
"""
# Two PandasMol2 instances, presumably one for the query side and one for
# the database side (going by the names) -- TODO confirm.
q_pdmol = PandasMol2()
d_pdmol = PandasMol2()
# Iterate the entries of the query and database files in lockstep; the loop
# body is not part of this excerpt.
for q_mol2, d_mol2 in zip(split_multimol2(q_path),
split_multimol2(d_path)):
# For each input file, run its entries through the matching processor and
# write every non-empty ID it returns to `f`, one per line.
for mol2_file in mol2_files:
    if verbose:
        start = time.time()
        sys.stdout.write('Processing %s' % os.path.basename(mol2_file))
        sys.stdout.flush()

    cnt = 0
    # Gzipped files get the *_gz processor.
    data_processor_fn = (data_processor_gz if mol2_file.endswith('.gz')
                         else data_processor)

    for chunk in lazy_imap(data_processor=data_processor_fn,
                           data_generator=split_multimol2(mol2_file),
                           n_cpus=n_cpus):
        for mol2_id in chunk:
            if not mol2_id:
                continue
            f.write('%s\n' % mol2_id)
        # Every processed entry counts towards the rate, written or not.
        cnt += len(chunk)

    if verbose:
        elapsed = time.time() - start
        sys.stdout.write(' | %d mol/sec\n' % (cnt / elapsed))
        sys.stdout.flush()