Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Now we encode the test molecules from SMILES -> SELFIES, and decode them back from SELFIES -> SMILES
test_molecule1='CN1C(=O)C2=C(c3cc4c(s3)-c3sc(-c5ncc(C#N)s5)cc3C43OCCO3)N(C)C(=O)C2=C1c1cc2c(s1)-c1sc(-c3ncc(C#N)s3)cc1C21OCCO1' # non-fullerene acceptors for organic solar cells
# Round trip for molecule 1: SMILES -> SELFIES -> SMILES.
selfies1 = encoder(test_molecule1)
smiles1 = decoder(selfies1)

# Report the input, its encoding and the decoded result; each line is
# followed by one blank line.
print('test_molecule1: ' + test_molecule1, end='\n\n')
print('selfies1: ' + selfies1, end='\n\n')
print('smiles1: ' + smiles1, end='\n\n')
# Equality (==) of the input with the decoded result, then three blank lines.
print('equal:', test_molecule1 == smiles1, end='\n\n\n\n')
test_molecule2='CC(C)c1noc(-c2cc[nH+]c(N3CCN(C(=O)[C@H]4C[C@H]4C)CC3)c2)n1' # from ZINC database
# Round trip for molecule 2: SMILES -> SELFIES -> SMILES.
selfies2 = encoder(test_molecule2)
smiles2 = decoder(selfies2)

# Report the input, its encoding and the decoded result; each line is
# followed by one blank line.
print('test_molecule2: ' + test_molecule2, end='\n\n')
print('selfies2: ' + selfies2, end='\n\n')
print('smiles2: ' + smiles2, end='\n\n')
# Equality (==) of the input with the decoded result, then three blank lines.
print('equal:', test_molecule2 == smiles2, end='\n\n\n\n')
test_molecule3='CCOC(=O)C1(C(=O)OCC)C23c4c5c6c7c8c4-c4c2c2c9c%10c4C4%11c%12c-%10c%10c%13c%14c%15c%16c%17c%18c%19c%20c%21c%22c%23c%24c(c-7c(c7c%12c%13c(c7%24)c(c%19%23)c%18%14)C84C%11(C(=O)OCC)C(=O)OCC)C%224C(C(=O)OCC)(C(=O)OCC)C64c4c-5c5c6c(c4-%21)C%204C(C(=O)OCC)(C(=O)OCC)C%174c4c-6c(c-2c(c4-%16)C92C(C(=O)OCC)(C(=O)OCC)C%10%152)C513' # from PubChem
# Round trip for molecule 3: SMILES -> SELFIES -> SMILES.
selfies3 = encoder(test_molecule3)
smiles3 = decoder(selfies3)

# Report the input, its encoding and the decoded result; each line is
# followed by one blank line.
print('test_molecule3: ' + test_molecule3, end='\n\n')
print('selfies3: ' + selfies3, end='\n\n')
print('smiles3: ' + smiles3, end='\n\n')
# Equality (==) of the input with the decoded result, then three blank lines.
print('equal:', test_molecule3 == smiles3, end='\n\n\n\n')
test_molecule4='Cc1c(C)c(S(=O)(=O)NC(=N)NCCC[C@H](NC(=O)[C@@H]2CCCN2C(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](C)NC(=O)[C@@H]2CCCN2C(=O)[C@@H]2CCCN2C(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](COC(C)(C)C)NC(=O)[C@H](CCC(=O)OC(C)(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](CCCNC(=N)NS(=O)(=O)c2c(C)c(C)c3c(c2C)CCC(C)(C)O3)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)NS(=O)(=O)c2c(C)c(C)c3c(c2C)CCC(C)(C)O3)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](Cc2cn(C(=O)OC(C)(C)C)cn2)NC(=O)[C@H](CCC(=O)OC(C)(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C@H](COC(C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](COC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](COC(C)(C)C)NC(=O)CNC(=O)OC(C)(C)C)C(C)C)C(=O)O)c(C)c2c1OC(C)(C)CC2'
selfies4=encoder(test_molecule4)
# NOTE(review): indentation is missing in this fragment; the nesting described
# in the comments below is inferred from the statements themselves — confirm
# against the original file.
# Draw one candidate molecule string; the loop condition repeats the draw
# while the resulting string is empty.
molecule=''
while len(molecule)==0:
# Reset the error flag for this attempt. Nothing in the visible lines ever sets
# it to a non-zero value, so the `else` branch further down is not reached from
# the code shown here.
is_decoding_error=0
if type_of_encoding==0: # SMILES
# Map every sampled index to its alphabet symbol, then remove spaces
# (presumably padding symbols — TODO confirm).
molecule_pre=''
for ii in sample_latent_space(latent_dimension):
molecule_pre+=encoding_alphabet[ii]
molecule=molecule_pre.replace(' ','')
if type_of_encoding==1: # SELFIES
# Same construction as above, but the space-free string is passed through
# selfies.decoder and the decoder's result is used as the molecule.
molecule_pre=''
for ii in sample_latent_space(latent_dimension):
molecule_pre+=encoding_alphabet[ii]
molecule_pre2=molecule_pre.replace(' ','')
molecule=selfies.decoder(molecule_pre2)
# Count the sample, then check it with is_correct_smiles (the check is skipped
# and the sample treated as incorrect when the error flag is non-zero).
total_samples+=1
if is_decoding_error==0:
is_it_correct=is_correct_smiles(molecule)
else:
is_it_correct=0
# For a correct sample: count it and scan all_correct_molecules for an
# identical string, stopping at the first match.
if is_it_correct==1:
total_correct+=1
same_mol_identifier=0
for jj in range(len(all_correct_molecules)):
if molecule==all_correct_molecules[jj]:
same_mol_identifier=1
break
# Branch for a molecule not found in all_correct_molecules; its body lies
# outside the visible fragment.
if same_mol_identifier==0:
def string2mol(string):
if string_type == 'selfies':
string = ']['.join(string)
try:
smiles = decoder(string,PrintErrorMessage=False)
except:
return None
else:
string = ''.join(string)
if string_type == 'smiles':
smiles = string
if string_type == 'deepsmiles':
try:
smiles = converter.decode(string)
except deepsmiles.DecodeError as e:
return None
try:
mol = Chem.MolFromSmiles(smiles)
print('selfies1: '+selfies1+'\n')
print('smiles1: '+smiles1+'\n')
print('equal: '+str(test_molecule1==smiles1)+'\n\n\n')
test_molecule2='CC(C)c1noc(-c2cc[nH+]c(N3CCN(C(=O)[C@H]4C[C@H]4C)CC3)c2)n1' # from ZINC database
# Round trip for molecule 2: SMILES -> SELFIES -> SMILES.
selfies2 = encoder(test_molecule2)
smiles2 = decoder(selfies2)

# Report the input, its encoding and the decoded result; each line is
# followed by one blank line.
print('test_molecule2: ' + test_molecule2, end='\n\n')
print('selfies2: ' + selfies2, end='\n\n')
print('smiles2: ' + smiles2, end='\n\n')
# Equality (==) of the input with the decoded result, then three blank lines.
print('equal:', test_molecule2 == smiles2, end='\n\n\n\n')
test_molecule3='CCOC(=O)C1(C(=O)OCC)C23c4c5c6c7c8c4-c4c2c2c9c%10c4C4%11c%12c-%10c%10c%13c%14c%15c%16c%17c%18c%19c%20c%21c%22c%23c%24c(c-7c(c7c%12c%13c(c7%24)c(c%19%23)c%18%14)C84C%11(C(=O)OCC)C(=O)OCC)C%224C(C(=O)OCC)(C(=O)OCC)C64c4c-5c5c6c(c4-%21)C%204C(C(=O)OCC)(C(=O)OCC)C%174c4c-6c(c-2c(c4-%16)C92C(C(=O)OCC)(C(=O)OCC)C%10%152)C513' # from PubChem
# Round trip for molecule 3: SMILES -> SELFIES -> SMILES.
selfies3 = encoder(test_molecule3)
smiles3 = decoder(selfies3)

# Report the input, its encoding and the decoded result; each line is
# followed by one blank line.
print('test_molecule3: ' + test_molecule3, end='\n\n')
print('selfies3: ' + selfies3, end='\n\n')
print('smiles3: ' + smiles3, end='\n\n')
# Equality (==) of the input with the decoded result, then three blank lines.
print('equal:', test_molecule3 == smiles3, end='\n\n\n\n')
test_molecule4='Cc1c(C)c(S(=O)(=O)NC(=N)NCCC[C@H](NC(=O)[C@@H]2CCCN2C(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](C)NC(=O)[C@@H]2CCCN2C(=O)[C@@H]2CCCN2C(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](COC(C)(C)C)NC(=O)[C@H](CCC(=O)OC(C)(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](CCCNC(=N)NS(=O)(=O)c2c(C)c(C)c3c(c2C)CCC(C)(C)O3)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)NS(=O)(=O)c2c(C)c(C)c3c(c2C)CCC(C)(C)O3)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](Cc2cn(C(=O)OC(C)(C)C)cn2)NC(=O)[C@H](CCC(=O)OC(C)(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C@H](COC(C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](COC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](COC(C)(C)C)NC(=O)CNC(=O)OC(C)(C)C)C(C)C)C(=O)O)c(C)c2c1OC(C)(C)CC2'
# Round trip for molecule 4: SMILES -> SELFIES -> SMILES.
selfies4 = encoder(test_molecule4)
smiles4 = decoder(selfies4)

# Report the input, its encoding and the decoded result; each line is
# followed by one blank line.
print('test_molecule4: ' + test_molecule4, end='\n\n')
print('selfies4: ' + selfies4, end='\n\n')
print('smiles4: ' + smiles4, end='\n\n')
# Equality (==) of the input with the decoded result, then three blank lines.
print('equal:', test_molecule4 == smiles4, end='\n\n\n\n')
#Create a random Molecule, test robustness
# - initial release
#
#
# For comments, bug reports or feature ideas, please send an email to
# mario.krenn@utoronto.ca and alan@aspuru.com
# =============================================================================
from random import randint
from selfies import encoder, decoder
print('SELFIES 0.2.0 - example file')
# Now we encode the test molecules from SMILES -> SELFIES, and decode them back from SELFIES -> SMILES
test_molecule1='CN1C(=O)C2=C(c3cc4c(s3)-c3sc(-c5ncc(C#N)s5)cc3C43OCCO3)N(C)C(=O)C2=C1c1cc2c(s1)-c1sc(-c3ncc(C#N)s3)cc1C21OCCO1' # non-fullerene acceptors for organic solar cells
# Round trip for molecule 1: SMILES -> SELFIES -> SMILES.
selfies1 = encoder(test_molecule1)
smiles1 = decoder(selfies1)

# Report the input, its encoding and the decoded result; each line is
# followed by one blank line.
print('test_molecule1: ' + test_molecule1, end='\n\n')
print('selfies1: ' + selfies1, end='\n\n')
print('smiles1: ' + smiles1, end='\n\n')
# Equality (==) of the input with the decoded result, then three blank lines.
print('equal:', test_molecule1 == smiles1, end='\n\n\n\n')
test_molecule2='CC(C)c1noc(-c2cc[nH+]c(N3CCN(C(=O)[C@H]4C[C@H]4C)CC3)c2)n1' # from ZINC database
# Round trip for molecule 2: SMILES -> SELFIES -> SMILES.
selfies2 = encoder(test_molecule2)
smiles2 = decoder(selfies2)

# Report the input, its encoding and the decoded result; each line is
# followed by one blank line.
print('test_molecule2: ' + test_molecule2, end='\n\n')
print('selfies2: ' + selfies2, end='\n\n')
print('smiles2: ' + smiles2, end='\n\n')
# Equality (==) of the input with the decoded result, then three blank lines.
print('equal:', test_molecule2 == smiles2, end='\n\n\n\n')
test_molecule3='CCOC(=O)C1(C(=O)OCC)C23c4c5c6c7c8c4-c4c2c2c9c%10c4C4%11c%12c-%10c%10c%13c%14c%15c%16c%17c%18c%19c%20c%21c%22c%23c%24c(c-7c(c7c%12c%13c(c7%24)c(c%19%23)c%18%14)C84C%11(C(=O)OCC)C(=O)OCC)C%224C(C(=O)OCC)(C(=O)OCC)C64c4c-5c5c6c(c4-%21)C%204C(C(=O)OCC)(C(=O)OCC)C%174c4c-6c(c-2c(c4-%16)C92C(C(=O)OCC)(C(=O)OCC)C%10%152)C513' # from PubChem
selfies3=encoder(test_molecule3)
print('selfies2: '+selfies2+'\n')
print('smiles2: '+smiles2+'\n')
print('equal: '+str(test_molecule2==smiles2)+'\n\n\n')
test_molecule3='CCOC(=O)C1(C(=O)OCC)C23c4c5c6c7c8c4-c4c2c2c9c%10c4C4%11c%12c-%10c%10c%13c%14c%15c%16c%17c%18c%19c%20c%21c%22c%23c%24c(c-7c(c7c%12c%13c(c7%24)c(c%19%23)c%18%14)C84C%11(C(=O)OCC)C(=O)OCC)C%224C(C(=O)OCC)(C(=O)OCC)C64c4c-5c5c6c(c4-%21)C%204C(C(=O)OCC)(C(=O)OCC)C%174c4c-6c(c-2c(c4-%16)C92C(C(=O)OCC)(C(=O)OCC)C%10%152)C513' # from PubChem
# Round trip for molecule 3: SMILES -> SELFIES -> SMILES.
selfies3 = encoder(test_molecule3)
smiles3 = decoder(selfies3)

# Report the input, its encoding and the decoded result; each line is
# followed by one blank line.
print('test_molecule3: ' + test_molecule3, end='\n\n')
print('selfies3: ' + selfies3, end='\n\n')
print('smiles3: ' + smiles3, end='\n\n')
# Equality (==) of the input with the decoded result, then three blank lines.
print('equal:', test_molecule3 == smiles3, end='\n\n\n\n')
test_molecule4='Cc1c(C)c(S(=O)(=O)NC(=N)NCCC[C@H](NC(=O)[C@@H]2CCCN2C(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](C)NC(=O)[C@@H]2CCCN2C(=O)[C@@H]2CCCN2C(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](COC(C)(C)C)NC(=O)[C@H](CCC(=O)OC(C)(C)C)NC(=O)[C@H](CCCCNC(=O)OC(C)(C)C)NC(=O)[C@H](CCCNC(=N)NS(=O)(=O)c2c(C)c(C)c3c(c2C)CCC(C)(C)O3)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)NS(=O)(=O)c2c(C)c(C)c3c(c2C)CCC(C)(C)O3)NC(=O)[C@H](CCC(=O)NC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](Cc2cn(C(=O)OC(C)(C)C)cn2)NC(=O)[C@H](CCC(=O)OC(C)(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C@H](COC(C)(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](COC(c2ccccc2)(c2ccccc2)c2ccccc2)NC(=O)[C@H](COC(C)(C)C)NC(=O)CNC(=O)OC(C)(C)C)C(C)C)C(=O)O)c(C)c2c1OC(C)(C)CC2'
# Round trip for molecule 4: SMILES -> SELFIES -> SMILES.
selfies4 = encoder(test_molecule4)
smiles4 = decoder(selfies4)

# Report the input, its encoding and the decoded result; each line is
# followed by one blank line.
print('test_molecule4: ' + test_molecule4, end='\n\n')
print('selfies4: ' + selfies4, end='\n\n')
print('smiles4: ' + smiles4, end='\n\n')
# Equality (==) of the input with the decoded result, then three blank lines.
print('equal:', test_molecule4 == smiles4, end='\n\n\n\n')
# Create a random molecule to test robustness.
my_alphabet=selfies_alphabet() # a very small alphabet from which the random SELFIES string is generated
# This alphabet can be extended with additional elements. For example, see the list start_alphabet in the function smiles_to_selfies.
# Also, when you run the test molecules above, you see the bracket symbols that are used, and can add some of them.
len_of_molecule=50 # Number of SELFIES symbols in the random string. The final SMILES string will not necessarily be of the same size, because some elements of this alphabet stop the derivation (such as fluorine, which can form only a single bond).
# Create a random molecule to test robustness: build a random SELFIES string
# and decode it.
#
# selfies_alphabet() supplies a very small alphabet for the random string. It
# can be extended with additional elements; for example, see the list
# start_alphabet in the function smiles_to_selfies, or reuse bracket symbols
# that show up when running the test molecules above.
my_alphabet = selfies_alphabet()

# Number of SELFIES symbols in the random string. The final SMILES string will
# not necessarily be of the same size, because some elements of this alphabet
# stop the derivation (such as fluorine, which can form only a single bond).
len_of_molecule = 50

# One randomly indexed alphabet entry per position, concatenated in draw order.
rnd_selfies = ''.join(
    my_alphabet[randint(0, len(my_alphabet) - 1)]
    for _ in range(len_of_molecule)
)

# str() is applied to the decoder's result before printing.
smiles4 = decoder(rnd_selfies)
print('Random Molecule: ' + str(smiles4), end='\n\n')
from selfies import encoder, decoder
print('SELFIES 0.2.0 - example file')
# Now we encode three molecules from SMILES -> SELFIES, and decode them from SELFIES -> SMILES
test_molecule1='CN1C(=O)C2=C(c3cc4c(s3)-c3sc(-c5ncc(C#N)s5)cc3C43OCCO3)N(C)C(=O)C2=C1c1cc2c(s1)-c1sc(-c3ncc(C#N)s3)cc1C21OCCO1' # non-fullerene acceptors for organic solar cells
# Round trip for molecule 1: SMILES -> SELFIES -> SMILES.
selfies1 = encoder(test_molecule1)
smiles1 = decoder(selfies1)

# Report the input, its encoding and the decoded result; each line is
# followed by one blank line.
print('test_molecule1: ' + test_molecule1, end='\n\n')
print('selfies1: ' + selfies1, end='\n\n')
print('smiles1: ' + smiles1, end='\n\n')
# Equality (==) of the input with the decoded result, then three blank lines.
print('equal:', test_molecule1 == smiles1, end='\n\n\n\n')
test_molecule2='CC(C)c1noc(-c2cc[nH+]c(N3CCN(C(=O)[C@H]4C[C@H]4C)CC3)c2)n1' # from ZINC database
# Round trip for molecule 2: SMILES -> SELFIES -> SMILES.
selfies2 = encoder(test_molecule2)
smiles2 = decoder(selfies2)

# Report the input, its encoding and the decoded result; each line is
# followed by one blank line.
print('test_molecule2: ' + test_molecule2, end='\n\n')
print('selfies2: ' + selfies2, end='\n\n')
print('smiles2: ' + smiles2, end='\n\n')
# Equality (==) of the input with the decoded result, then three blank lines.
print('equal:', test_molecule2 == smiles2, end='\n\n\n\n')
test_molecule3='CCOC(=O)C1(C(=O)OCC)C23c4c5c6c7c8c4-c4c2c2c9c%10c4C4%11c%12c-%10c%10c%13c%14c%15c%16c%17c%18c%19c%20c%21c%22c%23c%24c(c-7c(c7c%12c%13c(c7%24)c(c%19%23)c%18%14)C84C%11(C(=O)OCC)C(=O)OCC)C%224C(C(=O)OCC)(C(=O)OCC)C64c4c-5c5c6c(c4-%21)C%204C(C(=O)OCC)(C(=O)OCC)C%174c4c-6c(c-2c(c4-%16)C92C(C(=O)OCC)(C(=O)OCC)C%10%152)C513' # from PubChem
# Round trip for molecule 3: SMILES -> SELFIES -> SMILES.
selfies3=encoder(test_molecule3)
smiles3=decoder(selfies3)
# The labels name molecule 3 so each output line is attributed to the values
# actually printed (test_molecule3, selfies3, smiles3).
print('test_molecule3: '+test_molecule3+'\n')
print('selfies3: '+selfies3+'\n')
print('smiles3: '+smiles3+'\n')
# Equality (==) of the input with the decoded result.
print('equal: '+str(test_molecule3==smiles3)+'\n\n\n')
#Create a random Molecule