How to use the biopandas.mol2.split_multimol2 function in biopandas

To help you get started, we’ve selected a few biopandas examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

GitHub: rasbt / screenlamp / tools / sort_rocs_mol2.py (View on GitHub)
if sortby:
        df.sort_values(sortby, inplace=True, ascending=False)

    if selection:
        selection_str = parse_selection_string(selection, df_name='df')
        mask = pd.eval(selection_str)
        df = df[mask]

    dbase_query_pairs = [(d, q) for d, q in
                         zip(df['Name'].values, df['ShapeQuery'].values)]
    query_names = {q for q in df['ShapeQuery'].values}

    query_mol2s = {}

    multiconf_query = False
    for idx, cont in enumerate(split_multimol2(query_path)):
        if idx >= 1:
            multiconf_query = True
            break

    cnt = -1

    if query_path.endswith('.gz'):
        for id_, cont in split_multimol2(query_path):
            cnt += 1
            cont = b''.join(cont).decode('utf-8').split('\n')
            if multiconf_query:
                mol_idx = '%s_%d' % (id_.decode('utf-8'), cnt)
            else:
                mol_idx = id_
            if mol_idx in query_names:
                if id_suffix:
GitHub: rasbt / screenlamp / tools / sort_rocs_mol2.py (View on GitHub)
if query_path.endswith('.gz'):
        for id_, cont in split_multimol2(query_path):
            cnt += 1
            cont = b''.join(cont).decode('utf-8').split('\n')
            if multiconf_query:
                mol_idx = '%s_%d' % (id_.decode('utf-8'), cnt)
            else:
                mol_idx = id_
            if mol_idx in query_names:
                if id_suffix:
                    cont[1] = mol_idx + '\n'
                query_mol2s[mol_idx] = ''.join(cont)

    else:
        for id_, cont in split_multimol2(query_path):
            cnt += 1
            if multiconf_query:
                mol_idx = '%s_%d' % (id_, cnt)
            else:
                mol_idx = id_
            if mol_idx in query_names:
                if id_suffix:
                    cont[1] = mol_idx + '\n'
                query_mol2s[mol_idx] = ''.join(cont)

    out_path_base = os.path.join(output_dir, os.path.basename(inp_mol2_path)
                                 .split('.mol2')[0])
    out_path_q = '%s_%s' % (out_path_base, 'query.mol2')
    out_path_d = '%s_%s' % (out_path_base, 'dbase.mol2')

    with tempfile.TemporaryDirectory() as tmpdirname:
GitHub: rasbt / screenlamp / tools / enumerate_conformers.py (View on GitHub)
if query_path.endswith('.gz'):
        for id_, cont in split_multimol2(query_path):
            cnt += 1
            cont = b''.join(cont).decode('utf-8').split('\n')
            if multiconf_query:
                mol_idx = '%s_%d' % (id_.decode('utf-8'), cnt)
            else:
                mol_idx = id_
    """

    with open_file(out_mol2_path, write_mode) as outfile:

        prev_molecule = ''

        if inp_mol2_path.endswith('.gz'):
            for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
                if prev_molecule != id_:
                    cnt = 0
                else:
                    cnt += 1

                mol_idx = b'%s_%d' % (id_, cnt)

                cont[1] = mol_idx + b'\n'
                outfile.write(b''.join(cont))
                prev_molecule = id_

        else:
            for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
                if prev_molecule != id_:
                    cnt = 0
                else:
GitHub: rasbt / screenlamp / tools / enumerate_conformers.py (View on GitHub)
if inp_mol2_path.endswith('.gz'):
            for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
                if prev_molecule != id_:
                    cnt = 0
                else:
                    cnt += 1

                mol_idx = b'%s_%d' % (id_, cnt)

                cont[1] = mol_idx + b'\n'
                outfile.write(b''.join(cont))
                prev_molecule = id_

        else:
            for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
                if prev_molecule != id_:
                    cnt = 0
                else:
                    cnt += 1

                mol_idx = '%s_%d' % (id_, cnt)

                cont[1] = mol_idx + '\n'
                outfile.write(''.join(cont))
                prev_molecule = id_

    if verbose:
        elapsed = time.time() - start
        n_molecules = i + 1
        sys.stdout.write(' | scanned %d molecules | %d mol/sec\n' %
                         (n_molecules, n_molecules / elapsed))
GitHub: rasbt / screenlamp / tools / funcgroup_distance_to_id.py (View on GitHub)
for mol2_file in mol2_files:
            if verbose:
                start = time.time()
                sys.stdout.write('Processing %s' % os.path.basename(mol2_file))
                sys.stdout.flush()

            cnt = 0

            if mol2_file.endswith('.gz'):
                data_processor_fn = data_processor_gz
            else:
                data_processor_fn = data_processor

            for chunk in lazy_imap(data_processor=data_processor_fn,
                                   data_generator=split_multimol2(mol2_file),
                                   n_cpus=n_cpus):
                _ = [f.write('%s\n' % mol2_id)for mol2_id in chunk if mol2_id]
                cnt += len(chunk)

            if verbose:
                elapsed = time.time() - start
                sys.stdout.write(' | %d mol/sec\n' % (cnt / elapsed))
                sys.stdout.flush()
GitHub: rasbt / screenlamp / tools / funcgroup_matching_selection.py (View on GitHub)
dbase_open_file(output_mol2_path_dbase, dbase_write_mode) as opd:
                for i in selection_indices:

                    mol2_q_cont = ('DID NOT FIND %s\n'
                                   % (df_atom.ix[i]['query']))

                    mol2_d_cont = ('DID NOT FIND %s\n'
                                   % (df_atom.ix[i]['dbase']))

                    for idx, mol2 in enumerate(split_multimol2(
                            input_mol2_path_query)):
                        if idx == i:
                            mol2_q_cont = mol2[1]
                            break

                    for idx, mol2 in enumerate(split_multimol2(
                            input_mol2_path_dbase)):
                        if idx == i:
                            mol2_d_cont = mol2[1]
                            break

                    if query_write_mode == 'wb':
                        opq.write(b''.join(mol2_q_cont))
                    else:
                        opq.write(''.join(mol2_q_cont))

                    if dbase_write_mode == 'wb':
                        opd.write(b''.join(mol2_d_cont))
                    else:
                        opd.write(''.join(mol2_d_cont))

        if verbose:
GitHub: rasbt / screenlamp / tools / funcgroup_matching.py (View on GitHub)
if verbose:
        start = time.time()
        sys.stdout.write('Processing %s/%s' % (d_base, q_base))
        sys.stdout.flush()

    cnt = 0

    if q_path.endswith('.gz'):
        data_processor_fn = data_processor_gz
    else:
        data_processor_fn = data_processor

    for chunk in lazy_imap(data_processor=data_processor_fn,
                           data_generator=zip(split_multimol2(d_path),
                                              split_multimol2(q_path)),
                           n_cpus=n_cpus):

        for dbase_id, query_id, atoms, charges in chunk:
            dct_results['dbase'].append(dbase_id)
            dct_results['query'].append(query_id)
            dct_results['atoms'].append(atoms)
            dct_results['charges'].append(charges)

        cnt += len(chunk)
    """

    q_pdmol = PandasMol2()
    d_pdmol = PandasMol2()

    for q_mol2, d_mol2 in zip(split_multimol2(q_path),
                              split_multimol2(d_path)):
GitHub: rasbt / screenlamp / tools / funcgroup_presence_to_id.py (View on GitHub)
for mol2_file in mol2_files:
            if verbose:
                start = time.time()
                sys.stdout.write('Processing %s' % os.path.basename(mol2_file))
                sys.stdout.flush()

            cnt = 0

            if mol2_file.endswith('.gz'):
                data_processor_fn = data_processor_gz
            else:
                data_processor_fn = data_processor

            for chunk in lazy_imap(data_processor=data_processor_fn,
                                   data_generator=split_multimol2(
                                      mol2_file),
                                   n_cpus=n_cpus):

                _ = [f.write('%s\n' % mol2_id) for mol2_id
                     in chunk if mol2_id]
                cnt += len(chunk)

            if verbose:
                elapsed = time.time() - start
                sys.stdout.write(' | %d mol/sec\n' % (cnt / elapsed))
                sys.stdout.flush()

biopandas

Machine Learning Library Extensions

BSD-3-Clause
Latest version published 5 months ago

Package Health Score

74 / 100
Full package analysis

Similar packages