Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
xtcs.append(these_trajs)
if struct is None:
struct = '%s-%u-protein.pdb'%(proj,ii)
struct_already = True
struct = os.path.join(subdir, struct)
struct = sorted(glob(struct))
elif isinstance(struct, str) and not struct_already:
struct = os.path.join(subdir,struct)
struct = sorted(glob(struct))
if isinstance(struct,list):
struct=struct[0]
ii += 1
src = _source(xtcs, top=struct)
return src, xtcs
def partial_fit(self, X):
""" incrementally update the estimates
Parameters
----------
X: array, list of arrays, PyEMMA reader
input data.
"""
from pyemma.coordinates import source
self._estimate(source(X), partial_fit=True)
self._estimated = True
return self
pos :
ndarray with the positions of the sample
geom_smpl :
sampled geometries. Can be of two types:
* default: :obj:`mdtraj.Trajectory` with :obj:`n_points`-frames
* if keep_all_samples = True: list of length :obj:`n_geom_samples`. Each element is an :obj:`mdtraj.Trajectory` object of :obj:`n_points`-frames.
"""
MD_trajectories = _bmutils.listify_if_not_list(MD_trajectories)
if isinstance(MD_trajectories[0], _md.Trajectory):
src = MD_trajectories
else:
src = _source(MD_trajectories, top=MD_top)
# Find out if we already have a clustering object
try:
projected_trajectories.dtrajs
cl = projected_trajectories
except:
idata = _bmutils.data_from_input(projected_trajectories)
cl = _bmutils.regspace_cluster_to_target([dd[:,proj_idxs] for dd in idata], n_points, n_try_max=10, verbose=verbose)
pos = cl.clustercenters
cat_smpl = cl.sample_indexes_by_cluster(_np.arange(cl.n_clusters), n_geom_samples)
geom_smpl = _bmutils.save_traj_wrapper(src, _np.vstack(cat_smpl), None, top=MD_top, stride=proj_stride)
atom_slice = _bmutils.parse_atom_sel(atom_selection, geom_smpl.top)
prot_lig = np.concatenate((prot_index, lig_coord), axis=0)
#print 'prot_lig', prot_lig
#feat.add_selection(lig_heavy)
#feat.add_backbone_torsions(selstr="(resid >= 105) and (resid <= 115)")
#feat.add_backbone_torsions(selstr="(resid >= 132) and (resid <= 146)")
#feat.add_angles(np.array([[1635, 1639, 2640]]), cossin=False)
#feat.add_angles(np.array([[1635, 1639, 2640]]), cossin=False)
feat.add_dihedrals(np.array([[1638, 1622, 2634, 2640]]), cossin=False)
#feat.add_dihedrals(np.array([[1638, 2634, 2638, 2640]]), cossin=False)
feat.add_distances(prot_lig)
print feat.dimension()
inp = coor.source(traj_list, feat)
#pca_obj = coor.pca(inp, dim=5)
pca_obj = coor.tica(inp, dim=5, lag=2)
Y = pca_obj.get_output()
print Y
print Y[0][:,0]
num_clusters = 4
input_data = coor.cluster_kmeans(data=Y, k=num_clusters, max_iter=1000)
mapped_data = [np.array(dtraj)[:,0] for dtraj in input_data.get_output()]
frame_dict = {}
for num in range(num_clusters):
frame_dict[num] = []
for index, entry in enumerate(mapped_data[0]):
frame_dict[entry].append(index)
* ``paths_dict[idxs][type_of_path]["proj"]`` : ndarray of shape (n_points, proj_dim) with the
coordinates of the projection along the path
* paths_dict[idxs][type_of_path]["proj"] : geometries along the path
idata :
list of ndarrays with the the data in :obj:`projected_trajectories`
"""
if not isinstance(MD_trajectories, list):
MD_trajectories = [MD_trajectories]
if isinstance(MD_trajectories[0], _md.Trajectory):
src = MD_trajectories
else:
src = _source(MD_trajectories, top=MD_top)
idata = _data_from_input(projected_trajectories)
assert len(MD_trajectories) == len(idata), "Mismatch between the number of " \
"MD_trajectories and projected_trajectories: %u vs %u"%(len(MD_trajectories), len(idata))
#TODO: assert total_n_frames (strided) coincies with the n_frames in data
# What's the hightest dimensionlatiy that the input data allows?
input_dim = idata[0].shape[1]
if proj_idxs is None:
proj_idxs = _np.arange(n_projs)
else:
if isinstance(proj_idxs, int):
proj_idxs = [proj_idxs]
proj_dim = _np.max((proj_dim, _np.max(proj_idxs)+1))
def main(fpath, molid, args):
prefix = os.path.join(fpath,molid,molid)
trajfiles = glob.glob(prefix+'*-centered.dcd')
topfiles = glob.glob(prefix+'*-centered.pdb')
jsonfile = os.path.join(prefix+'-acc_its.json')
#Pre-process trajectory files, check for unbound ligands
trajfiles = utils.check_bound_ligand(trajfiles,
topfiles[0])
#Select features to analyze in trajectory
feat = coor.featurizer(topfiles[0])
lig_atoms = feat.select("resname LIG and not type H")
feat.add_selection(lig_atoms)
inp = coor.source(trajfiles, feat)
#Define some input parameters
dt = 8 #Frames in MD taken every 8ps == 0.008 ns
lag_list = np.arange(1, 40,5) #Give a range of lag times to try
#Initailize object to assign trajectories to clusters
data = msm.ConstructMSM(inp)
#Select the apprioriate lagtime from the implied timescale plots
#data.plotImpliedTimescales(data.Y, dt, lag_list, outfname=prefix)
#Selecting a lagtime of 150ps, every BLUES iteration is 176ps
lagtime = 150
#Discretize the trajectories
#Estimate our Markov model from the discrete trajectories
keep_all_samples = boolean, default is False
In principle, once the closest-to-ref geometry has been kept, the other geometries are discarded, and the
output sample contains only n_point geometries. HOWEVER, there are special cases where the user might
want to keep all sampled geometries. Typical use-case is when the n_points is low and many representatives
per clustercenters will be much more informative than the other way around
(i know, this is confusing TODO: write this better)
projected data: nd.array or list of nd.arrays OR pyemma.clustering object
Although 2D is the most usual case, the dimensionality of the clustering and the one of the visualization (2D)
do not necessarily have to be the same
"""
src = _source(MDtrajectory_files, top=topology)
# Find out if we already have a clustering object
try:
projected_data.dtrajs
cl = projected_data
except:
idata = _data_from_input(projected_data)
cl = _cluster_to_target([dd[:,idxs] for dd in idata], n_points, n_try_max=10, verbose=verbose)
pos = cl.clustercenters
cat_smpl = cl.sample_indexes_by_cluster(_np.arange(cl.n_clusters), n_geom_samples)
geom_smpl = _save_traj(src, _np.vstack(cat_smpl), None, stride=proj_stride)
if n_geom_samples>1:
if not keep_all_samples:
geom_smpl = _re_warp(geom_smpl, [n_geom_samples] * cl.n_clusters)
# Of the most populated geom, get the most compact
-------
moldata: pyemma FeatureReader or list of md.Trajectories, depending on the input
"""
if isinstance(inp, (_FeatureReader, _FragmentedTrajectoryReader)):
moldata = inp
# Everything else gets listified
else:
inp = listify_if_not_list(inp)
# We have geometries so don't do anything
if isinstance(inp[0], _md.Trajectory):
moldata = inp
elif isinstance(inp[0], str):
moldata = _source(inp, top=MD_top)
# TODO consider letting pyemma fail here instead of catching this
else:
raise TypeError("Please revise tyour input, it should be a str ",type(inp[0]))
return moldata
prot_lig = np.concatenate((prot_index, lig_coord), axis=0)
#print 'prot_lig', prot_lig
#feat.add_selection(lig_heavy)
#feat.add_backbone_torsions(selstr="(resid >= 105) and (resid <= 115)")
#feat.add_backbone_torsions(selstr="(resid >= 132) and (resid <= 146)")
feat.add_angles(np.array([[132, 126, 141]]), cossin=False)
feat.add_angles(np.array([[134, 128, 141]]), cossin=False)
#feat.add_dihedrals(np.array([[1638, 1622, 2634, 2640]]), cossin=False)
#feat.add_dihedrals(np.array([[1638, 2634, 2638, 2640]]), cossin=False)
feat.add_distances(prot_lig)
print feat.dimension()
inp = coor.source(traj_list, feat)
pca_obj = coor.pca(inp, dim=4)
#pca_obj = coor.tica(inp, dim=4, lag=2)
Y = pca_obj.get_output()
print Y
print Y[0][:,0]
num_clusters = 4
input_data = coor.cluster_kmeans(data=Y, k=num_clusters, max_iter=1000)
mapped_data = [np.array(dtraj)[:,0] for dtraj in input_data.get_output()]
frame_dict = {}
for num in range(num_clusters):
frame_dict[num] = []
for index, entry in enumerate(mapped_data[0]):
frame_dict[entry].append(index)