How to use the selfies.encoder function in selfies

To help you get started, we’ve selected a few selfies examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github aspuru-guzik-group / selfies / VariationalAutoEncoder_with_SELFIES / chemistryVAE.py View on Github external
- selfies alphabet
        - longest selfies string
        - smiles encoding (equivalent to file content)
        - smiles alphabet (character based)
        - longest smiles string
    """

    df = pd.read_csv(filename_data_set_file_smiles)
    smiles_list = np.asanyarray(df.smiles)
    smiles_alphabet=list(set(''.join(smiles_list)))
    largest_smiles_len=len(max(smiles_list, key=len))
    selfies_list=[]    
    selfies_len=[]
    print('--> Translating SMILES to SELFIES...')
    for individual_smile in smiles_list:
        individual_selfie=selfies.encoder(individual_smile)
        selfies_list.append(individual_selfie)
        selfies_len.append(len(individual_selfie)-len(individual_selfie.replace('[',''))) # len of SELFIES
    selfies_alphabet_pre=list(set(''.join(selfies_list)[1:-1].split('][')))
    selfies_alphabet=[]
    for selfies_element in selfies_alphabet_pre:
        selfies_alphabet.append('['+selfies_element+']')        
    largest_selfies_len=max(selfies_len)
    print('Finished translating SMILES to SELFIES.')
    return(selfies_list, selfies_alphabet, largest_selfies_len, smiles_list, smiles_alphabet, largest_smiles_len)
github jensengroup / GB-GA / string_crossover.py View on Github external
def mol2string(mol):
    Chem.Kekulize(mol, clearAromaticFlags=True)
    smiles = Chem.MolToSmiles(mol, canonical=False)

    if string_type == 'selfies':
        return encoder(smiles).split('][')

    if string_type == 'deepsmiles':
        string = converter.encode(smiles)
        return list(string)
    
    return list(smiles)
github aspuru-guzik-group / selfies / examples / selfies_example.py View on Github external
print('test_molecule1: '+test_molecule1+'\n')
print('selfies1: '+selfies1+'\n')
print('smiles1: '+smiles1+'\n')
print('equal: '+str(test_molecule1==smiles1)+'\n\n\n')

test_molecule2='CC(C)c1noc(-c2cc[nH+]c(N3CCN(C(=O)[C@H]4C[C@H]4C)CC3)c2)n1' # from ZINC database
selfies2=encoder(test_molecule2)
smiles2=decoder(selfies2)
print('test_molecule2: '+test_molecule2+'\n')
print('selfies2: '+selfies2+'\n')
print('smiles2: '+smiles2+'\n')
print('equal: '+str(test_molecule2==smiles2)+'\n\n\n')


test_molecule3='CCOC(=O)C1(C(=O)OCC)C23c4c5c6c7c8c4-c4c2c2c9c%10c4C4%11c%12c-%10c%10c%13c%14c%15c%16c%17c%18c%19c%20c%21c%22c%23c%24c(c-7c(c7c%12c%13c(c7%24)c(c%19%23)c%18%14)C84C%11(C(=O)OCC)C(=O)OCC)C%224C(C(=O)OCC)(C(=O)OCC)C64c4c-5c5c6c(c4-%21)C%204C(C(=O)OCC)(C(=O)OCC)C%174c4c-6c(c-2c(c4-%16)C92C(C(=O)OCC)(C(=O)OCC)C%10%152)C513' # from PubChem
selfies3=encoder(test_molecule3)
smiles3=decoder(selfies3)
print('test_molecule1: '+test_molecule3+'\n')
print('selfies1: '+selfies3+'\n')
print('smiles1: '+smiles3+'\n')
print('equal: '+str(test_molecule3==smiles3)+'\n\n\n')


#Create a random Molecule
my_alphabet=['[Branch1_1]','[Branch1_2]','[Branch1_3]','[Ring1]','[O]','[=O]','[N]','[=N]','[C]','[=C]','[#C]','[S]','[=S]','[P]','[F]']; # this is a very small alphabet from which the random selfies are generated
                                                                                                                                          # This alphabet can be extended with additional elements. For example, see the list start_alphabet in the function smiles_to_selfies.
                                                                                                                                          # Also when you run the three test-molecules above, you see the brackets that are used, and can use some of them.
                                                                                                                                          
len_of_molecule=30 # Number of selfies symbols of the random string. The final SMILES string will not necessarily be of the same size, because some elements of this alphabet stop the derivation (such as Flour, as it can form only a single bond)
                  
rnd_selfies=''
for ii in range(len_of_molecule):
github aspuru-guzik-group / selfies / examples / selfies_examples.py View on Github external
print('test_molecule1: '+test_molecule1+'\n')
print('selfies1: '+selfies1+'\n')
print('smiles1: '+smiles1+'\n')
print('equal: '+str(test_molecule1==smiles1)+'\n\n\n')

test_molecule2='CC(C)c1noc(-c2cc[nH+]c(N3CCN(C(=O)[C@H]4C[C@H]4C)CC3)c2)n1' # from ZINC database
selfies2=encoder(test_molecule2)
smiles2=decoder(selfies2)
print('test_molecule2: '+test_molecule2+'\n')
print('selfies2: '+selfies2+'\n')
print('smiles2: '+smiles2+'\n')
print('equal: '+str(test_molecule2==smiles2)+'\n\n\n')


test_molecule3='CCOC(=O)C1(C(=O)OCC)C23c4c5c6c7c8c4-c4c2c2c9c%10c4C4%11c%12c-%10c%10c%13c%14c%15c%16c%17c%18c%19c%20c%21c%22c%23c%24c(c-7c(c7c%12c%13c(c7%24)c(c%19%23)c%18%14)C84C%11(C(=O)OCC)C(=O)OCC)C%224C(C(=O)OCC)(C(=O)OCC)C64c4c-5c5c6c(c4-%21)C%204C(C(=O)OCC)(C(=O)OCC)C%174c4c-6c(c-2c(c4-%16)C92C(C(=O)OCC)(C(=O)OCC)C%10%152)C513' # from PubChem
selfies3=encoder(test_molecule3)
smiles3=decoder(selfies3)
print('test_molecule3: '+test_molecule3+'\n')
print('selfies3: '+selfies3+'\n')
print('smiles3: '+smiles3+'\n')
print('equal: '+str(test_molecule3==smiles3)+'\n\n\n')

test_molecule4='Cc1c(C)c(S(=O)(=O)NC(=N)NCCC[C@H](NC(=O)[C@@H]2CCCN2C(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](C)NC(=O)[C@@H]2CCCN2C(=O)[C@@H]2CCCN2C(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](COC(C)(C)C)NC(=O)[C@H](CCC(=O)OC(C)(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](CCCNC(=N)NS(=O)(=O)c2c(C)c(C)c3c(c2C)CCC(C)(C)O3)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)NS(=O)(=O)c2c(C)c(C)c3c(c2C)CCC(C)(C)O3)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](Cc2cn(C(=O)OC(C)(C)C)cn2)NC(=O)[C@H](CCC(=O)OC(C)(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C@H](COC(C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](COC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](COC(C)(C)C)NC(=O)CNC(=O)OC(C)(C)C)C(C)C)C(=O)O)c(C)c2c1OC(C)(C)CC2'
selfies4=encoder(test_molecule4)
smiles4=decoder(selfies4)
print('test_molecule4: '+test_molecule4+'\n')
print('selfies4: '+selfies4+'\n')
print('smiles4: '+smiles4+'\n')
print('equal: '+str(test_molecule4==smiles4)+'\n\n\n')
github aspuru-guzik-group / selfies / examples / selfies_examples.py View on Github external
print('test_molecule2: '+test_molecule2+'\n')
print('selfies2: '+selfies2+'\n')
print('smiles2: '+smiles2+'\n')
print('equal: '+str(test_molecule2==smiles2)+'\n\n\n')


test_molecule3='CCOC(=O)C1(C(=O)OCC)C23c4c5c6c7c8c4-c4c2c2c9c%10c4C4%11c%12c-%10c%10c%13c%14c%15c%16c%17c%18c%19c%20c%21c%22c%23c%24c(c-7c(c7c%12c%13c(c7%24)c(c%19%23)c%18%14)C84C%11(C(=O)OCC)C(=O)OCC)C%224C(C(=O)OCC)(C(=O)OCC)C64c4c-5c5c6c(c4-%21)C%204C(C(=O)OCC)(C(=O)OCC)C%174c4c-6c(c-2c(c4-%16)C92C(C(=O)OCC)(C(=O)OCC)C%10%152)C513' # from PubChem
selfies3=encoder(test_molecule3)
smiles3=decoder(selfies3)
print('test_molecule3: '+test_molecule3+'\n')
print('selfies3: '+selfies3+'\n')
print('smiles3: '+smiles3+'\n')
print('equal: '+str(test_molecule3==smiles3)+'\n\n\n')

test_molecule4='Cc1c(C)c(S(=O)(=O)NC(=N)NCCC[C@H](NC(=O)[C@@H]2CCCN2C(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](C)NC(=O)[C@@H]2CCCN2C(=O)[C@@H]2CCCN2C(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](COC(C)(C)C)NC(=O)[C@H](CCC(=O)OC(C)(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](CCCNC(=N)NS(=O)(=O)c2c(C)c(C)c3c(c2C)CCC(C)(C)O3)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)NS(=O)(=O)c2c(C)c(C)c3c(c2C)CCC(C)(C)O3)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](Cc2cn(C(=O)OC(C)(C)C)cn2)NC(=O)[C@H](CCC(=O)OC(C)(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C@H](COC(C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](COC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](COC(C)(C)C)NC(=O)CNC(=O)OC(C)(C)C)C(C)C)C(=O)O)c(C)c2c1OC(C)(C)CC2'
selfies4=encoder(test_molecule4)
smiles4=decoder(selfies4)
print('test_molecule4: '+test_molecule4+'\n')
print('selfies4: '+selfies4+'\n')
print('smiles4: '+smiles4+'\n')
print('equal: '+str(test_molecule4==smiles4)+'\n\n\n')



#Create a random Molecule, test robustness


my_alphabet=selfies_alphabet()                                                 # this is a very small alphabet from which the random selfies are generated
                                                                               # This alphabet can be extended with additional elements. For example, see the list start_alphabet in the function smiles_to_selfies.
                                                                               # Also when you run the three test-molecules above, you see the brackets that are used, and can use some of them.
github aspuru-guzik-group / selfies / examples / selfies_examples.py View on Github external
#
#
# For comments, bug reports or feature ideas, please send an email to
# mario.krenn@utoronto.ca and alan@aspuru.com
# =============================================================================

from random import randint
from selfies import encoder, decoder, selfies_alphabet





# Now we encode three molecules from SMILES -> SELFIES, and decode them from SELFIES -> SMILES
test_molecule1='CN1C(=O)C2=C(c3cc4c(s3)-c3sc(-c5ncc(C#N)s5)cc3C43OCCO3)N(C)C(=O)C2=C1c1cc2c(s1)-c1sc(-c3ncc(C#N)s3)cc1C21OCCO1' # non-fullerene acceptors for organic solar cells
selfies1=encoder(test_molecule1)
smiles1=decoder(selfies1)
print('test_molecule1: '+test_molecule1+'\n')
print('selfies1: '+selfies1+'\n')
print('smiles1: '+smiles1+'\n')
print('equal: '+str(test_molecule1==smiles1)+'\n\n\n')

test_molecule2='CC(C)c1noc(-c2cc[nH+]c(N3CCN(C(=O)[C@H]4C[C@H]4C)CC3)c2)n1' # from ZINC database
selfies2=encoder(test_molecule2)
smiles2=decoder(selfies2)
print('test_molecule2: '+test_molecule2+'\n')
print('selfies2: '+selfies2+'\n')
print('smiles2: '+smiles2+'\n')
print('equal: '+str(test_molecule2==smiles2)+'\n\n\n')


test_molecule3='CCOC(=O)C1(C(=O)OCC)C23c4c5c6c7c8c4-c4c2c2c9c%10c4C4%11c%12c-%10c%10c%13c%14c%15c%16c%17c%18c%19c%20c%21c%22c%23c%24c(c-7c(c7c%12c%13c(c7%24)c(c%19%23)c%18%14)C84C%11(C(=O)OCC)C(=O)OCC)C%224C(C(=O)OCC)(C(=O)OCC)C64c4c-5c5c6c(c4-%21)C%204C(C(=O)OCC)(C(=O)OCC)C%174c4c-6c(c-2c(c4-%16)C92C(C(=O)OCC)(C(=O)OCC)C%10%152)C513' # from PubChem
github aspuru-guzik-group / selfies / examples / selfies_example.py View on Github external
# 0.1.1 (04.06.2019): 
#       - initial release    
#
#
# For comments, bug reports or feature ideas, please send an email to
# mario.krenn@utoronto.ca and alan@aspuru.com
# =============================================================================

from random import randint
from selfies import encoder, decoder

print('SELFIES 0.2.0 - example file')

# Now we encode three molecules from SMILES -> SELFIES, and decode them from SELFIES -> SMILES
test_molecule1='CN1C(=O)C2=C(c3cc4c(s3)-c3sc(-c5ncc(C#N)s5)cc3C43OCCO3)N(C)C(=O)C2=C1c1cc2c(s1)-c1sc(-c3ncc(C#N)s3)cc1C21OCCO1' # non-fullerene acceptors for organic solar cells
selfies1=encoder(test_molecule1)
smiles1=decoder(selfies1)
print('test_molecule1: '+test_molecule1+'\n')
print('selfies1: '+selfies1+'\n')
print('smiles1: '+smiles1+'\n')
print('equal: '+str(test_molecule1==smiles1)+'\n\n\n')

test_molecule2='CC(C)c1noc(-c2cc[nH+]c(N3CCN(C(=O)[C@H]4C[C@H]4C)CC3)c2)n1' # from ZINC database
selfies2=encoder(test_molecule2)
smiles2=decoder(selfies2)
print('test_molecule2: '+test_molecule2+'\n')
print('selfies2: '+selfies2+'\n')
print('smiles2: '+smiles2+'\n')
print('equal: '+str(test_molecule2==smiles2)+'\n\n\n')


test_molecule3='CCOC(=O)C1(C(=O)OCC)C23c4c5c6c7c8c4-c4c2c2c9c%10c4C4%11c%12c-%10c%10c%13c%14c%15c%16c%17c%18c%19c%20c%21c%22c%23c%24c(c-7c(c7c%12c%13c(c7%24)c(c%19%23)c%18%14)C84C%11(C(=O)OCC)C(=O)OCC)C%224C(C(=O)OCC)(C(=O)OCC)C64c4c-5c5c6c(c4-%21)C%204C(C(=O)OCC)(C(=O)OCC)C%174c4c-6c(c-2c(c4-%16)C92C(C(=O)OCC)(C(=O)OCC)C%10%152)C513' # from PubChem
github aspuru-guzik-group / selfies / examples / selfies_examples.py View on Github external
# Now we encode three molecules from SMILES -> SELFIES, and decode them from SELFIES -> SMILES
test_molecule1='CN1C(=O)C2=C(c3cc4c(s3)-c3sc(-c5ncc(C#N)s5)cc3C43OCCO3)N(C)C(=O)C2=C1c1cc2c(s1)-c1sc(-c3ncc(C#N)s3)cc1C21OCCO1' # non-fullerene acceptors for organic solar cells
selfies1=encoder(test_molecule1)
smiles1=decoder(selfies1)
print('test_molecule1: '+test_molecule1+'\n')
print('selfies1: '+selfies1+'\n')
print('smiles1: '+smiles1+'\n')
print('equal: '+str(test_molecule1==smiles1)+'\n\n\n')

test_molecule2='CC(C)c1noc(-c2cc[nH+]c(N3CCN(C(=O)[C@H]4C[C@H]4C)CC3)c2)n1' # from ZINC database
selfies2=encoder(test_molecule2)
smiles2=decoder(selfies2)
print('test_molecule2: '+test_molecule2+'\n')
print('selfies2: '+selfies2+'\n')
print('smiles2: '+smiles2+'\n')
print('equal: '+str(test_molecule2==smiles2)+'\n\n\n')


test_molecule3='CCOC(=O)C1(C(=O)OCC)C23c4c5c6c7c8c4-c4c2c2c9c%10c4C4%11c%12c-%10c%10c%13c%14c%15c%16c%17c%18c%19c%20c%21c%22c%23c%24c(c-7c(c7c%12c%13c(c7%24)c(c%19%23)c%18%14)C84C%11(C(=O)OCC)C(=O)OCC)C%224C(C(=O)OCC)(C(=O)OCC)C64c4c-5c5c6c(c4-%21)C%204C(C(=O)OCC)(C(=O)OCC)C%174c4c-6c(c-2c(c4-%16)C92C(C(=O)OCC)(C(=O)OCC)C%10%152)C513' # from PubChem
selfies3=encoder(test_molecule3)
smiles3=decoder(selfies3)
print('test_molecule3: '+test_molecule3+'\n')
print('selfies3: '+selfies3+'\n')
print('smiles3: '+smiles3+'\n')
print('equal: '+str(test_molecule3==smiles3)+'\n\n\n')

test_molecule4='Cc1c(C)c(S(=O)(=O)NC(=N)NCCC[C@H](NC(=O)[C@@H]2CCCN2C(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](C)NC(=O)[C@@H]2CCCN2C(=O)[C@@H]2CCCN2C(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](COC(C)(C)C)NC(=O)[C@H](CCC(=O)OC(C)(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](CCCNC(=N)NS(=O)(=O)c2c(C)c(C)c3c(c2C)CCC(C)(C)O3)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)NS(=O)(=O)c2c(C)c(C)c3c(c2C)CCC(C)(C)O3)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](Cc2cn(C(=O)OC(C)(C)C)cn2)NC(=O)[C@H](CCC(=O)OC(C)(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C@H](COC(C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](COC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](COC(C)(C)C)NC(=O)CNC(=O)OC(C)(C)C)C(C)C)C(=O)O)c(C)c2c1OC(C)(C)CC2'

selfies

SELFIES (SELF-referencIng Embedded Strings) is a general-purpose, sequence-based, robust representation of semantically constrained graphs.

Apache-2.0
Latest version published 6 months ago

Package Health Score

74 / 100
Full package analysis

Similar packages