# Bag-of-words fit (older pyts API).
if self.overlapping:  # assumed head of this truncated snippet
    n_windows = n_features - self.window_size + 1
    X_window = np.asarray([X[:, i: i + self.window_size]
                           for i in range(n_windows)])
    X_window = X_window.reshape(n_samples * n_windows, -1, order='F')
else:
    # Non-overlapping windows; the remainder is dropped when
    # window_size does not divide n_features.
    n_windows = n_features // self.window_size
    remainder = n_features % self.window_size
    if remainder == 0:
        window_idx = np.array_split(np.arange(0, n_features),
                                    n_windows)
    else:
        split_idx = np.arange(self.window_size,
                              (n_windows + 1) * self.window_size,
                              self.window_size)
        window_idx = np.split(np.arange(0, n_features), split_idx)[:-1]
    X_window = X[:, window_idx].reshape(n_samples * n_windows, -1)
# Turn each window into a word with SFA, then build the vocabulary.
sfa = SFA(self.n_coefs, False, self.norm_mean,
          self.norm_std, self.n_bins, self.quantiles,
          self.variance_selection, self.variance_threshold)
count = CountVectorizer(ngram_range=(1, 1))
X_sfa = sfa.fit_transform(X_window)
X_sfa = np.apply_along_axis(lambda x: ''.join(x),
                            1, X_sfa).reshape(n_samples, -1)
word_size = len(X_sfa[0, 0])
if word_size == 1:
    # The default token pattern ignores one-character tokens, so
    # single-letter words need a custom tokenizer.
    count.set_params(tokenizer=self._tok)
if self.numerosity_reduction:
    X_sfa = np.apply_along_axis(numerosity_reduction, 1, X_sfa)
else:
    X_sfa = np.apply_along_axis(lambda x: ' '.join(x), 1, X_sfa)
count.fit(X_sfa)
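
# `numerosity_reduction` is not defined in these fragments. A minimal
# sketch of what it is assumed to do (merge runs of identical
# consecutive words and return one space-separated string); the
# library's actual helper may differ:
def numerosity_reduction(words):
    kept = [words[0]]
    for word in words[1:]:
        if word != kept[-1]:
            kept.append(word)
    return ' '.join(kept)

# e.g. numerosity_reduction(np.array(['aab', 'aab', 'abb']))
# returns 'aab abb'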

# Supervised variant (WEASEL-style, older pyts API): entropy-based
# binning, unigrams and bigrams, chi-squared feature scoring.
if self.overlapping:  # assumed head of this truncated snippet
    n_windows = n_features - window_size + 1
    X_window = np.asarray([X[:, i: i + window_size]
                           for i in range(n_windows)])
    X_window = X_window.reshape(n_samples * n_windows, -1,
                                order='F')
else:
    n_windows = n_features // window_size
    remainder = n_features % window_size
    if remainder == 0:
        window_idx = np.array_split(np.arange(0, n_features),
                                    n_windows)
    else:
        split_idx = np.arange(window_size,
                              (n_windows + 1) * window_size,
                              window_size)
        window_idx = np.split(np.arange(0, n_features),
                              split_idx)[:-1]
    X_window = X[:, window_idx].reshape(n_samples * n_windows, -1)
sfa = SFA(self.n_coefs, True, self.norm_mean,
          self.norm_std, self.n_bins, 'entropy',
          self.variance_selection, self.variance_threshold)
count = CountVectorizer(ngram_range=(1, 2))
y_window = np.repeat(y_ind, n_windows)
X_sfa = sfa.fit_transform(X_window, y_window)
X_sfa = np.apply_along_axis(lambda x: ''.join(x),
                            1, X_sfa).reshape(n_samples, -1)
word_size = len(X_sfa[0, 0])
if word_size == 1:
    count.set_params(tokenizer=self._tok)
X_sfa = np.apply_along_axis(lambda x: ' '.join(x), 1, X_sfa)
tf = count.fit_transform(X_sfa)
# Score each word and bigram against the class labels.
_, pval = chi2(tf, y_ind)
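
# The fragment ends at the p-values. A hedged sketch of the selection
# step that presumably follows (`pvalue_threshold` is an assumed
# attribute name, not confirmed by this page):
relevant_features = np.where(pval <= self.pvalue_threshold)[0]
tf = tf[:, relevant_features]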

# fit_transform of the same bag-of-words model: identical windowing,
# then the document-term matrix is returned directly.
if self.overlapping:  # assumed head of this truncated snippet
    n_windows = n_features - self.window_size + 1
    X_window = np.asarray([X[:, i: i + self.window_size]
                           for i in range(n_windows)])
    X_window = X_window.reshape(n_samples * n_windows, -1, order='F')
else:
    n_windows = n_features // self.window_size
    remainder = n_features % self.window_size
    if remainder == 0:
        window_idx = np.array_split(np.arange(0, n_features),
                                    n_windows)
    else:
        split_idx = np.arange(self.window_size,
                              (n_windows + 1) * self.window_size,
                              self.window_size)
        window_idx = np.split(np.arange(0, n_features), split_idx)[:-1]
    X_window = X[:, window_idx].reshape(n_samples * n_windows, -1)
sfa = SFA(self.n_coefs, False, self.norm_mean,
          self.norm_std, self.n_bins, self.quantiles,
          self.variance_selection, self.variance_threshold)
count = CountVectorizer(ngram_range=(1, 1))
X_sfa = sfa.fit_transform(X_window)
X_sfa = np.apply_along_axis(lambda x: ''.join(x),
                            1, X_sfa).reshape(n_samples, -1)
word_size = len(X_sfa[0, 0])
if word_size == 1:
    count.set_params(tokenizer=self._tok)
if self.numerosity_reduction:
    X_sfa = np.apply_along_axis(numerosity_reduction, 1, X_sfa)
else:
    X_sfa = np.apply_along_axis(lambda x: ' '.join(x), 1, X_sfa)
tf = count.fit_transform(X_sfa)

# Tf-idf variant of the same pipeline (BOSSVS-style, older pyts API).
if self.overlapping:  # assumed head of this truncated snippet
    n_windows = n_features - self.window_size + 1
    X_window = np.asarray([X[:, i: i + self.window_size]
                           for i in range(n_windows)])
    X_window = X_window.reshape(n_samples * n_windows, -1, order='F')
else:
    n_windows = n_features // self.window_size
    remainder = n_features % self.window_size
    if remainder == 0:
        window_idx = np.array_split(np.arange(0, n_features),
                                    n_windows)
    else:
        split_idx = np.arange(self.window_size,
                              (n_windows + 1) * self.window_size,
                              self.window_size)
        window_idx = np.split(np.arange(0, n_features), split_idx)[:-1]
    X_window = X[:, window_idx].reshape(n_samples * n_windows, -1)
sfa = SFA(self.n_coefs, False, self.norm_mean,
          self.norm_std, self.n_bins, self.quantiles,
          self.variance_selection, self.variance_threshold)
tfidf = TfidfVectorizer(ngram_range=(1, 1), smooth_idf=self.smooth_idf,
                        sublinear_tf=self.sublinear_tf)
X_sfa = sfa.fit_transform(X_window)
X_sfa = np.apply_along_axis(lambda x: ''.join(x),
                            1, X_sfa).reshape(n_samples, -1)
word_size = len(X_sfa[0, 0])
if word_size == 1:
    tfidf.set_params(tokenizer=self._tok)
if self.numerosity_reduction:
    X_sfa = np.apply_along_axis(numerosity_reduction, 1, X_sfa)
else:
    X_sfa = np.apply_along_axis(lambda x: ' '.join(x), 1, X_sfa)

Returns
-------
X_new : sparse matrix, shape = (n_samples, n_words)
    Document-term matrix.
"""
X = check_array(X)
n_samples, n_timestamps = X.shape
if y is not None:
    check_classification_targets(y)
window_size, window_step = self._check_params(n_timestamps)
n_windows = (n_timestamps - window_size + window_step) // window_step
X_windowed = _windowed_view(
    X, n_samples, n_timestamps, window_size, window_step
)
X_windowed = X_windowed.reshape(n_samples * n_windows, window_size)
sfa = SymbolicFourierApproximation(
    n_coefs=self.word_size, drop_sum=self.drop_sum, anova=self.anova,
    norm_mean=self.norm_mean, norm_std=self.norm_std,
    n_bins=self.n_bins, strategy=self.strategy, alphabet=self.alphabet
)
if y is None:
    y_repeated = None
else:
    y_repeated = np.repeat(y, n_windows)
X_sfa = sfa.fit_transform(X_windowed, y_repeated)
X_word = np.asarray([''.join(X_sfa[i])
                     for i in range(n_samples * n_windows)])
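
# `_windowed_view` is a private pyts helper that is not shown here. A
# minimal sketch of an equivalent function, assuming it extracts the
# windows of length `window_size` starting every `window_step`
# timestamps (`n_samples` is kept only to mirror the calls above):
def _windowed_view_sketch(X, n_samples, n_timestamps, window_size,
                          window_step):
    n_windows = (n_timestamps - window_size + window_step) // window_step
    starts = np.arange(0, n_windows * window_step, window_step)
    indices = starts[:, None] + np.arange(window_size)[None, :]
    return X[:, indices]  # shape (n_samples, n_windows, window_size)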

Returns
-------
self : object
"""
X, y = check_X_y(X, y)
n_samples, n_timestamps = X.shape
check_classification_targets(y)
le = LabelEncoder()
y_ind = le.fit_transform(y)
self.classes_ = le.classes_
n_classes = self.classes_.size
window_size, window_step = self._check_params(n_timestamps)
n_windows = (n_timestamps - window_size + window_step) // window_step
X_windowed = _windowed_view(
    X, n_samples, n_timestamps, window_size, window_step
)
X_windowed = X_windowed.reshape(n_samples * n_windows, window_size)
sfa = SymbolicFourierApproximation(
    n_coefs=self.word_size, drop_sum=self.drop_sum, anova=self.anova,
    norm_mean=self.norm_mean, norm_std=self.norm_std,
    n_bins=self.n_bins, strategy=self.strategy, alphabet=self.alphabet
)
y_repeated = np.repeat(y, n_windows)
X_sfa = sfa.fit_transform(X_windowed, y_repeated)
X_word = np.asarray([''.join(X_sfa[i])
                     for i in range(n_samples * n_windows)])
X_word = X_word.reshape(n_samples, n_windows)

Parameters
----------
X : array-like, shape = (n_samples, n_timestamps)
    Input data.

Returns
-------
X_new : array, shape = (n_samples,)
    Transformed data. Each element is a string consisting of words
    separated by a whitespace.
"""
X = check_array(X, dtype=None)
n_samples, n_timestamps = X.shape
window_size, window_step = self._check_params(n_timestamps)
n_windows = (n_timestamps - window_size + window_step) // window_step
X_window = _windowed_view(X, n_samples, n_timestamps,
                          window_size, window_step)
X_word = np.asarray([[''.join(X_window[i, j])
                      for j in range(n_windows)]
                     for i in range(n_samples)])
if self.numerosity_reduction:
    # Drop a word when it is identical to the next one; the last word
    # of each sample is always kept.
    not_equal = np.c_[X_word[:, 1:] != X_word[:, :-1],
                      np.full(n_samples, True)]
    X_bow = np.asarray([' '.join(X_word[i, not_equal[i]])
                        for i in range(n_samples)])
else:
    X_bow = np.asarray([' '.join(X_word[i]) for i in range(n_samples)])
return X_bow
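
# A toy run of the numerosity-reduction mask above: a word is kept
# when it differs from the next word, and the last word always stays.
words = np.array([['ab', 'ab', 'ba', 'ba', 'ab']])
not_equal = np.c_[words[:, 1:] != words[:, :-1],
                  np.full(1, True)]
print(' '.join(words[0, not_equal[0]]))  # 'ab ba ab'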

check_is_fitted(self, ['_relevant_features_list', '_sfa_list',
                       '_vectorizer_list', 'vocabulary_'])
X = check_array(X, dtype='float64')
n_samples, n_timestamps = X.shape
X_features = coo_matrix((n_samples, 0), dtype=np.int64)
for (window_size, window_step, sfa,
     vectorizer, relevant_features) in zip(
         self._window_sizes, self._window_steps, self._sfa_list,
         self._vectorizer_list, self._relevant_features_list):
    n_windows = ((n_timestamps - window_size + window_step)
                 // window_step)
    X_windowed = _windowed_view(
        X, n_samples, n_timestamps, window_size, window_step
    )
    X_windowed = X_windowed.reshape(n_samples * n_windows, window_size)
    X_sfa = sfa.transform(X_windowed)
    X_word = np.asarray([''.join(X_sfa[i])
                         for i in range(n_samples * n_windows)])
    X_word = X_word.reshape(n_samples, n_windows)
    X_bow = np.asarray([' '.join(X_word[i]) for i in range(n_samples)])
    X_counts = vectorizer.transform(X_bow)[:, relevant_features]
    X_features = hstack([X_features, X_counts])
if not self.sparse:
    return X_features.A
return csr_matrix(X_features)
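
# Usage sketch for this transform (it matches WEASEL in
# pyts.transformation), assuming the current pyts API; the parameter
# values are illustrative only:
from pyts.datasets import load_gunpoint
from pyts.transformation import WEASEL

X_train, X_test, y_train, y_test = load_gunpoint(return_X_y=True)
weasel = WEASEL(word_size=4, n_bins=4, sparse=False)
X_train_new = weasel.fit_transform(X_train, y_train)
X_test_new = weasel.transform(X_test)  # document-term features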

y : None or array-like, shape = (n_samples,)
    Class labels for each data sample.

Returns
-------
self : object
"""
X = check_array(X)
n_samples, n_timestamps = X.shape
if y is not None:
    check_classification_targets(y)
window_size, window_step = self._check_params(n_timestamps)
n_windows = (n_timestamps - window_size + window_step) // window_step
X_windowed = _windowed_view(
    X, n_samples, n_timestamps, window_size, window_step
)
X_windowed = X_windowed.reshape(n_samples * n_windows, window_size)
sfa = SymbolicFourierApproximation(
    n_coefs=self.word_size, drop_sum=self.drop_sum, anova=self.anova,
    norm_mean=self.norm_mean, norm_std=self.norm_std,
    n_bins=self.n_bins, strategy=self.strategy, alphabet=self.alphabet
)
if y is None:
    y_repeated = None
else:
    y_repeated = np.repeat(y, n_windows)
X_sfa = sfa.fit_transform(X_windowed, y_repeated)
X_word = np.asarray([''.join(X_sfa[i])
                     for i in range(n_samples * n_windows)])
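
# Usage sketch for the single-window-size transformer above (it
# matches BOSS in pyts.transformation), assuming the current pyts API;
# the parameter values are illustrative only:
from pyts.datasets import load_gunpoint
from pyts.transformation import BOSS

X_train, _, _, _ = load_gunpoint(return_X_y=True)
boss = BOSS(word_size=2, n_bins=4, window_size=12, sparse=False)
X_boss = boss.fit_transform(X_train)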

In this example, the step of the sliding window is equal to the size of the
sliding window, making the subseries non-overlapping. It is common to use a
step of 1 for the sliding window, which is the default behavior. It is
implemented as :class:`pyts.bag_of_words.BagOfWords`.
"""
# Author: Johann Faouzi
# License: BSD-3-Clause
import matplotlib.pyplot as plt
import numpy as np
from pyts.bag_of_words import BagOfWords
from pyts.datasets import load_gunpoint
# Load the dataset and perform the transformation
X, _, _, _ = load_gunpoint(return_X_y=True)
window_size, word_size = 30, 5
bow = BagOfWords(window_size=window_size, word_size=word_size,
window_step=window_size, numerosity_reduction=False)
X_bow = bow.transform(X)
# Plot the considered subseries
plt.figure(figsize=(10, 4))
splits_series = np.linspace(0, X.shape[1], 1 + X.shape[1] // window_size,
dtype='int64')
for start, end in zip(splits_series[:-1],
np.clip(splits_series[1:] + 1, 0, X.shape[1])):
plt.plot(np.arange(start, end), X[0, start:end], 'o-', lw=1, ms=1)
# Plot the corresponding letters
splits_letters = np.linspace(0, X.shape[1],
1 + word_size * X.shape[1] // window_size)