# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Load one benchmark .mat dataset and prepare a train/test split.
# NOTE(review): relies on names defined earlier in the file
# (mat_file, mat_file_name) -- this chunk is not self-contained.
print("\n... Processing", mat_file_name, '...')

dataset_path = os.path.join('../datasets', mat_file)
mat = sp.io.loadmat(dataset_path)

X = mat['X']
y = mat['y'].ravel()

# Fraction of positive (outlier) labels, reused as the contamination
# level for every detector below.
outliers_fraction = np.sum(y) / len(y)

# Zero-mean / unit-variance scaling before fitting the detectors.
X = StandardScaler().fit_transform(X)

# Hold out 40% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
# Ten pyod detectors to compare, all driven by the same estimated
# contamination level.
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
        ABOD(n_neighbors=10, contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction, check_estimator=False),
    'Feature Bagging':
        FeatureBagging(LOF(), contamination=outliers_fraction),
    'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
    'Isolation Forest':
        IForest(contamination=outliers_fraction),
    'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
    'Average KNN':
        KNN(method='mean', contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
        LOF(contamination=outliers_fraction),
    'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction),
    'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
    'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction),
}
# Result matrix: one row per detector, 10 metric columns.
stat_mat_all = np.zeros([len(classifiers), 10])
# Column labels for the result matrix.
# NOTE(review): this list literal is truncated in this chunk -- the
# remaining labels and the closing bracket are not visible here.
report_list = ['train_roc_orig', 'train_p@n_orig', 'train_roc_psa',
'train_p@n_psa',
'test_time_orig', 'test_roc_orig', 'test_p@n_orig',
if __name__ == "__main__":
    # Synthetic-data demo of the FeatureBagging detector.
    contamination = 0.1  # percentage of outliers
    n_train = 200        # number of training points
    n_test = 100         # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train,
        n_test=n_test,
        n_features=2,
        contamination=contamination,
        random_state=42)

    # Train the Feature Bagging detector.
    clf_name = 'FeatureBagging'
    clf = FeatureBagging()
    clf.fit(X_train)

    # Labels and scores assigned to the training data during fit.
    y_train_pred = clf.labels_             # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # Predictions on the held-out test data.
    y_test_pred = clf.predict(X_test)              # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # Evaluate and print ROC / precision @ rank n.
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
if __name__ == "__main__":
    # Larger-sample demo of the FeatureBagging detector.
    # NOTE(review): a second __main__ guard in the same file -- this
    # chunk appears to be a separate example script spliced in.
    contamination = 0.1  # percentage of outliers
    n_train = 2000       # number of training points
    n_test = 100         # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train,
        n_test=n_test,
        n_features=2,
        contamination=contamination,
        random_state=42)

    # Train the FeatureBagging detector (the original comment said
    # "ABOD", but FeatureBagging is what is actually fitted).
    clf_name = 'FeatureBagging'
    clf = FeatureBagging()
    clf.fit(X_train)

    # Labels and scores assigned to the training data during fit.
    y_train_pred = clf.labels_             # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # Predictions on the held-out test data.
    y_test_pred = clf.predict(X_test)              # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # Evaluate and print ROC / precision @ rank n.
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
# 60% data for training and 40% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=random_state)

# standardizing data for processing
X_train_norm, X_test_norm = standardizer(X_train, X_test)

# Detectors under comparison; seeded where the estimator supports it.
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor':
        CBLOF(n_clusters=10,
              contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
    'Feature Bagging':
        FeatureBagging(contamination=outliers_fraction,
                       random_state=random_state),
    'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
    'Isolation Forest':
        IForest(contamination=outliers_fraction,
                random_state=random_state),
    'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
        LOF(contamination=outliers_fraction),
    'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
    'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
    'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state),
}
# Maps each detector's display name to its row index in the result
# matrix. NOTE(review): this dict literal is truncated in this chunk --
# only the first entry is visible; the remaining entries and closing
# brace are outside this view.
classifiers_indices = {
'Angle-based Outlier Detector (ABOD)': 0,
def __init__(self, base_estimator=None, n_estimators=10, contamination=0.1,
             max_features=1.0, bootstrap_features=False,
             check_detector=True, check_estimator=False, n_jobs=1,
             random_state=None, combination='average', verbose=0,
             estimator_params=None):
    """Store the feature-bagging ensemble configuration.

    Only bookkeeping happens here: ``contamination`` is handed to the
    parent initializer and every other argument is kept verbatim as an
    attribute of the same name. ``estimator_params`` falls back to an
    empty dict when not supplied.
    """
    super(FeatureBagging, self).__init__(contamination=contamination)
    self.base_estimator = base_estimator
    self.n_estimators = n_estimators
    self.max_features = max_features
    self.bootstrap_features = bootstrap_features
    self.check_detector = check_detector
    self.check_estimator = check_estimator
    self.combination = combination
    self.n_jobs = n_jobs
    self.random_state = random_state
    self.verbose = verbose
    # Default to an empty parameter dict so downstream code can always
    # unpack it safely.
    self.estimator_params = {} if estimator_params is None else estimator_params
# Report how the synthetic ground truth is laid out.
# NOTE(review): n_outliers and ground_truth are defined elsewhere in
# the file -- this chunk is not self-contained.
print('Number of outliers: %i' % n_outliers)
shape_msg = 'Ground truth shape is {shape}. Outlier are 1 and inliers are 0.\n'
print(shape_msg.format(shape=ground_truth.shape))
print(ground_truth, '\n')
# Deterministic seed shared by the seeded detectors below.
random_state = np.random.RandomState(42)
# Define nine outlier detection tools to be compared
# NOTE(review): this dict literal is truncated in this chunk -- the
# remaining entries and the closing brace are not visible here.
classifiers = {
'Angle-based Outlier Detector (ABOD)':
ABOD(contamination=outliers_fraction),
'Cluster-based Local Outlier Factor (CBLOF)':
CBLOF(contamination=outliers_fraction,
check_estimator=False, random_state=random_state),
'Feature Bagging':
FeatureBagging(LOF(n_neighbors=35),
contamination=outliers_fraction,
random_state=random_state),
'Histogram-base Outlier Detection (HBOS)': HBOS(
contamination=outliers_fraction),
'Isolation Forest': IForest(contamination=outliers_fraction,
random_state=random_state),
'K Nearest Neighbors (KNN)': KNN(
contamination=outliers_fraction),
'Average KNN': KNN(method='mean',
contamination=outliers_fraction),
# 'Median KNN': KNN(method='median',
# contamination=outliers_fraction),
'Local Outlier Factor (LOF)':
LOF(n_neighbors=35, contamination=outliers_fraction),
# 'Local Correlation Integral (LOCI)':
# LOCI(contamination=outliers_fraction),
# Total size of the ensemble built from the detectors below.
n_estimators_total = 500

# Load the cardio benchmark dataset.
mat_file = 'cardio.mat'
mat_file_name = mat_file.replace('.mat', '')
print("\n... Processing", mat_file_name, '...')
mat = sp.io.loadmat(os.path.join('../datasets', mat_file))

X = mat['X']
# Flatten labels to a 1-D vector, matching how the other dataset
# loader in this file treats mat['y'] (it was left as an (n, 1)
# column vector here, unlike the sibling loader that calls ravel()).
y = mat['y'].ravel()

# Standardize features before fitting the detectors.
X = StandardScaler().fit_transform(X)

# Candidate detectors, keyed by the index used in idx_clf_mapping.
classifiers = {
    1: ABOD(n_neighbors=10),
    2: CBLOF(check_estimator=False),
    3: FeatureBagging(LOF()),
    4: HBOS(),
    5: IForest(),
    6: KNN(),
    7: LOF(),
    8: MCD(),
    9: OCSVM(),
    10: PCA(),
}
# Human-readable names for the detector indices above.
# NOTE(review): this dict literal is truncated in this chunk -- the
# entries after 'KNN' and the closing brace are not visible here.
idx_clf_mapping = {
1: 'ABOD',
2: 'CBLOF',
3: 'FeatureBagging',
4: 'HBOS',
5: 'IForest',
6: 'KNN',