Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Standardize the features so that most detection algorithms can digest them.
X = StandardScaler().fit_transform(X)

# Split off 40% of the samples as the test portion; the fixed random_state
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Share of labelled outliers in the full data set.
# NOTE(review): assumes y holds 0/1 labels with 1 marking an outlier - confirm.
contamination = y.sum() / len(y)
base_estimators = [
LOF(n_neighbors=5, contamination=contamination),
LOF(n_neighbors=15, contamination=contamination),
LOF(n_neighbors=25, contamination=contamination),
LOF(n_neighbors=35, contamination=contamination),
LOF(n_neighbors=45, contamination=contamination),
HBOS(contamination=contamination),
PCA(contamination=contamination),
OCSVM(contamination=contamination),
KNN(n_neighbors=5, contamination=contamination),
KNN(n_neighbors=15, contamination=contamination),
KNN(n_neighbors=25, contamination=contamination),
KNN(n_neighbors=35, contamination=contamination),
KNN(n_neighbors=45, contamination=contamination),
IForest(n_estimators=50, contamination=contamination),
IForest(n_estimators=100, contamination=contamination),
LOF(n_neighbors=5, contamination=contamination),
LOF(n_neighbors=15, contamination=contamination),
LOF(n_neighbors=25, contamination=contamination),
LOF(n_neighbors=35, contamination=contamination),
LOF(n_neighbors=45, contamination=contamination),
HBOS(contamination=contamination),
PCA(contamination=contamination),
OCSVM(contamination=contamination),
if __name__ == "__main__":
    # Demo script: generate synthetic data, fit a PCA-based outlier detector,
    # and report how well its scores match the true labels.
    contamination = 0.1  # fraction of outliers (0.1 == 10%)
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data.
    # NOTE(review): this unpacking assumes generate_data returns
    # (X_train, y_train, X_test, y_test). Some PyOD releases return
    # (X_train, X_test, y_train, y_test) instead - confirm against the
    # installed version before relying on the results.
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # Train the PCA detector. The contamination rate is passed explicitly so
    # the detector follows the setting above instead of silently relying on
    # its own default, which only happens to be the same value.
    clf_name = 'PCA'
    clf = PCA(contamination=contamination)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results (scores, not hard labels, are evaluated)
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
PCA(contamination=contamination),
OCSVM(contamination=contamination),
PCA(contamination=contamination),
OCSVM(contamination=contamination),
PCA(contamination=contamination),
OCSVM(contamination=contamination),
PCA(contamination=contamination),
OCSVM(contamination=contamination),
PCA(contamination=contamination),
OCSVM(contamination=contamination),
KNN(n_neighbors=5, contamination=contamination),
KNN(n_neighbors=15, contamination=contamination),
KNN(n_neighbors=25, contamination=contamination),
KNN(n_neighbors=35, contamination=contamination),
KNN(n_neighbors=45, contamination=contamination),
KNN(n_neighbors=5, contamination=contamination),
KNN(n_neighbors=15, contamination=contamination),
KNN(n_neighbors=25, contamination=contamination),
KNN(n_neighbors=35, contamination=contamination),
KNN(n_neighbors=45, contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
HBOS(contamination=contamination),
PCA(contamination=contamination),
PCA(contamination=contamination),
PCA(contamination=contamination),
PCA(contamination=contamination),
PCA(contamination=contamination),
PCA(contamination=contamination),
PCA(contamination=contamination),
PCA(contamination=contamination),
KNN(n_neighbors=5, contamination=contamination),
KNN(n_neighbors=15, contamination=contamination),
KNN(n_neighbors=25, contamination=contamination),
KNN(n_neighbors=35, contamination=contamination),
KNN(n_neighbors=45, contamination=contamination),
KNN(n_neighbors=50, contamination=contamination),
KNN(n_neighbors=55, contamination=contamination),
KNN(n_neighbors=65, contamination=contamination),
KNN(n_neighbors=75, contamination=contamination),
# Detectors under comparison, keyed by display name. Every detector is built
# with the same contamination rate (``outliers_fraction``).
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
        ABOD(n_neighbors=10, contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction, check_estimator=False),
    'Feature Bagging':
        FeatureBagging(LOF(), contamination=outliers_fraction),
    'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
    'Isolation Forest':
        IForest(contamination=outliers_fraction),
    'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
    'Average KNN':
        KNN(method='mean', contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
        LOF(contamination=outliers_fraction),
    'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction),
    'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
    'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction),
}

# Result matrix: one row per detector and 10 columns.
# NOTE(review): the 10 columns presumably line up with the 10 names in
# ``report_list`` below - confirm against the code that fills the matrix.
stat_mat_all = np.zeros((len(classifiers), 10))

# Names of the reported statistics ("orig" and "psa" variants).
report_list = [
    'train_roc_orig', 'train_p@n_orig',
    'train_roc_psa', 'train_p@n_psa',
    'test_time_orig', 'test_roc_orig', 'test_p@n_orig',
    'test_time_psa', 'test_roc_psa', 'test_p@n_psa',
]

# Short labels, listed in the same order as the entries of ``classifiers``.
classifier_names = [
    'ABOD', 'CBLOF', 'FB', 'HBOS', 'IF', 'KNN',
    'AKNN', 'LOF', 'MCD', 'OCSVM', 'PCA',
]
for j in range(n_iter):
stat_mat = np.zeros([len(classifiers), 10])
for i, (clf_name, clf) in enumerate(classifiers.items()):
################## original version
PCA(contamination=contamination),
OCSVM(contamination=contamination),
KNN(n_neighbors=5, contamination=contamination),
KNN(n_neighbors=15, contamination=contamination),
KNN(n_neighbors=25, contamination=contamination),
KNN(n_neighbors=35, contamination=contamination),
KNN(n_neighbors=45, contamination=contamination),
IForest(n_estimators=50, contamination=contamination),
IForest(n_estimators=100, contamination=contamination),
LOF(n_neighbors=5, contamination=contamination),
LOF(n_neighbors=15, contamination=contamination),
LOF(n_neighbors=25, contamination=contamination),
LOF(n_neighbors=35, contamination=contamination),
LOF(n_neighbors=45, contamination=contamination),
HBOS(contamination=contamination),
PCA(contamination=contamination),
OCSVM(contamination=contamination),
KNN(n_neighbors=5, contamination=contamination),
KNN(n_neighbors=15, contamination=contamination),
KNN(n_neighbors=25, contamination=contamination),
KNN(n_neighbors=35, contamination=contamination),
KNN(n_neighbors=45, contamination=contamination),
IForest(n_estimators=50, contamination=contamination),
IForest(n_estimators=100, contamination=contamination),
LOF(n_neighbors=5, contamination=contamination),
LOF(n_neighbors=15, contamination=contamination),
LOF(n_neighbors=25, contamination=contamination),
LOF(n_neighbors=35, contamination=contamination),
LOF(n_neighbors=45, contamination=contamination),
HBOS(contamination=contamination),
PCA(contamination=contamination),
OCSVM(contamination=contamination),
contamination=outliers_fraction,
check_estimator=False,
random_state=random_state),
'Feature Bagging': FeatureBagging(contamination=outliers_fraction,
random_state=random_state),
'Histogram-base Outlier Detection (HBOS)': HBOS(
contamination=outliers_fraction),
'Isolation Forest': IForest(contamination=outliers_fraction,
random_state=random_state),
'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
'Local Outlier Factor (LOF)': LOF(
contamination=outliers_fraction),
'Minimum Covariance Determinant (MCD)': MCD(
contamination=outliers_fraction, random_state=random_state),
'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
'Principal Component Analysis (PCA)': PCA(
contamination=outliers_fraction, random_state=random_state),
}
# Fixed integer position (0-9) for every detector display name, assigned in
# listing order.
# NOTE(review): the second key has no "(CBLOF)" suffix, unlike the other
# abbreviated names - confirm it matches the key used for that detector in
# the ``classifiers`` dictionary, otherwise a lookup by name will fail.
classifiers_indices = {
    label: position
    for position, label in enumerate((
        'Angle-based Outlier Detector (ABOD)',
        'Cluster-based Local Outlier Factor',
        'Feature Bagging',
        'Histogram-base Outlier Detection (HBOS)',
        'Isolation Forest',
        'K Nearest Neighbors (KNN)',
        'Local Outlier Factor (LOF)',
        'Minimum Covariance Determinant (MCD)',
        'One-class SVM (OCSVM)',
        'Principal Component Analysis (PCA)',
    ))
}
for clf_name, clf in classifiers.items():
def __init__(
        self,
        n_components=None,
        n_selected_components=None,
        contamination=0.1,
        copy=True,
        whiten=False,
        svd_solver='auto',
        tol=0.0,
        iterated_power='auto',
        random_state=None,
        weighted=True,
        standardization=True,
):
    """Store the detector configuration on the instance.

    ``contamination`` is forwarded to the parent class initializer; every
    other argument is kept unchanged as an attribute of the same name. No
    validation or computation is performed here.

    NOTE(review): ``n_components``, ``copy``, ``whiten``, ``svd_solver``,
    ``tol``, ``iterated_power`` and ``random_state`` look like the options
    of scikit-learn's PCA - confirm where they are consumed.
    """
    super(PCA, self).__init__(contamination=contamination)

    # Number of components to compute / to use.
    self.n_components = n_components
    self.n_selected_components = n_selected_components

    # Decomposition options.
    self.copy = copy
    self.whiten = whiten
    self.svd_solver = svd_solver
    self.tol = tol
    self.iterated_power = iterated_power
    self.random_state = random_state

    # Detector-specific switches.
    self.weighted = weighted
    self.standardization = standardization
'Isolation Forest': IForest(contamination=outliers_fraction,
random_state=random_state),
'K Nearest Neighbors (KNN)': KNN(
contamination=outliers_fraction),
'Average KNN': KNN(method='mean',
contamination=outliers_fraction),
# 'Median KNN': KNN(method='median',
# contamination=outliers_fraction),
'Local Outlier Factor (LOF)':
LOF(n_neighbors=35, contamination=outliers_fraction),
# 'Local Correlation Integral (LOCI)':
# LOCI(contamination=outliers_fraction),
'Minimum Covariance Determinant (MCD)': MCD(
contamination=outliers_fraction, random_state=random_state),
'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
'Principal Component Analysis (PCA)': PCA(
contamination=outliers_fraction, random_state=random_state),
# 'Stochastic Outlier Selection (SOS)': SOS(
# contamination=outliers_fraction),
'Locally Selective Combination (LSCP)': LSCP(
detector_list, contamination=outliers_fraction,
random_state=random_state),
# 'Connectivity-Based Outlier Factor (COF)':
# COF(n_neighbors=35, contamination=outliers_fraction),
# 'Subspace Outlier Detection (SOD)':
# SOD(contamination=outliers_fraction),
}
# Show all detectors: print one line per entry of ``classifiers`` with a
# 1-based position followed by the entry's key.
for i, clf in enumerate(classifiers):
    # Iterating the dictionary yields its keys, so ``clf`` is the display
    # name here, not the detector object.
    print('Model', i + 1, clf)