How to use the autogluon.TabularPrediction.Dataset function in autogluon

To help you get started, we've selected a few autogluon examples based on popular ways this function is used in public projects.

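At its core, task.Dataset loads a table from a local file path or a URL and returns an object that behaves like a pandas DataFrame, as the examples below show. A minimal sketch, assuming a placeholder file name my_data.csv:

from autogluon import TabularPrediction as task

# Load a table from a CSV file (placeholder path). The returned Dataset behaves
# like a pandas DataFrame, so the usual DataFrame API (head, columns, ...) applies.
data = task.Dataset(file_path='my_data.csv')
print(data.head())
print(data.columns)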

awslabs/autogluon · tests/unittests/test_tabularHPO.py (view on GitHub)

mx.random.seed(seed_val)
dataset = datasets[idx]
print("Evaluating Benchmark Dataset %s (%d of %d)" % (dataset['name'], idx+1, len(datasets)))
directory = dataset['name'] + "/"
train_file_path = directory + train_file
test_file_path = directory + test_file
if (not os.path.exists(train_file_path)) or (not os.path.exists(test_file_path)):
    # fetch files from s3:
    print("%s data not found locally, so fetching from %s" % (dataset['name'], dataset['url']))
    os.system("wget " + dataset['url'] + " -O temp.zip && unzip -o temp.zip && rm temp.zip")

savedir = directory + 'AutogluonOutput/'
shutil.rmtree(savedir, ignore_errors=True)  # delete AutoGluon output directory to ensure previous runs' information has been removed
label_column = dataset['label_column']
train_data = task.Dataset(file_path=train_file_path)
test_data = task.Dataset(file_path=test_file_path)
y_test = test_data[label_column]
test_data = test_data.drop(labels=[label_column], axis=1)
if fast_benchmark:
    train_data = train_data.head(subsample_size)  # subsample for fast_benchmark
predictor = None  # reset from the previous dataset
if fast_benchmark:
    predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir,
                         hyperparameter_tune=hyperparameter_tune, hyperparameters=hyperparameters,
                         time_limits=time_limits, num_trials=num_trials, verbosity=verbosity)
else:
    predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir,
                         hyperparameter_tune=hyperparameter_tune, verbosity=verbosity)
results = predictor.fit_summary(verbosity=0)
if predictor.problem_type != dataset['problem_type']:
    warnings.warn("For dataset %s: AutoGluon inferred problem_type = %s, but should = %s" % (dataset['name'], predictor.problem_type, dataset['problem_type']))
predictor = None  # delete the predictor here to test loading a previously-trained predictor from file
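The test drops its reference to the predictor so a follow-up step can verify reloading from disk. A minimal sketch of that continuation, assuming the savedir, test_data, and y_test variables from the snippet above (the calls mirror the load/predict/evaluate usage shown in the examples further down):

predictor = task.load(savedir)  # reload the previously-trained predictor from its output directory
y_pred = predictor.predict(test_data)  # predict on the label-free test features
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)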
awslabs/autogluon · tests/unittests/test_tabular.py (view on GitHub)

mx.random.seed(seed_val)
dataset = datasets[idx]
print("Evaluating Benchmark Dataset %s (%d of %d)" % (dataset['name'], idx+1, len(datasets)))
directory = dataset['name'] + "/"
train_file_path = directory + train_file
test_file_path = directory + test_file
if (not os.path.exists(train_file_path)) or (not os.path.exists(test_file_path)):
    # fetch files from s3:
    print("%s data not found locally, so fetching from %s" % (dataset['name'], dataset['url']))
    os.system("wget " + dataset['url'] + " -O temp.zip && unzip -o temp.zip && rm temp.zip")

savedir = directory + 'AutogluonOutput/'
shutil.rmtree(savedir, ignore_errors=True)  # delete AutoGluon output directory to ensure previous runs' information has been removed
label_column = dataset['label_column']
train_data = task.Dataset(file_path=train_file_path)
test_data = task.Dataset(file_path=test_file_path)
y_test = test_data[label_column]
test_data = test_data.drop(labels=[label_column], axis=1)
if fast_benchmark:
    train_data = train_data.head(subsample_size)  # subsample for fast_benchmark
predictor = None  # reset from the previous dataset
if fast_benchmark:
    predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir,
                         hyperparameter_tune=hyperparameter_tune, hyperparameters=hyperparameters,
                         time_limits=time_limits, num_trials=num_trials, verbosity=verbosity)
else:
    predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir,
                         hyperparameter_tune=hyperparameter_tune, verbosity=verbosity)
results = predictor.fit_summary(verbosity=0)
if predictor.problem_type != dataset['problem_type']:
    warnings.warn("For dataset %s: AutoGluon inferred problem_type = %s, but should = %s" % (dataset['name'], predictor.problem_type, dataset['problem_type']))
predictor = None  # delete the predictor here to test loading a previously-trained predictor from file
awslabs/autogluon · examples/tabular/example_simple_tabular.py (view on GitHub)
""" Example script for predicting columns of tables, demonstrating simple use-case """

from autogluon import TabularPrediction as task

# Training time:
train_data = task.Dataset(file_path='https://autogluon.s3-us-west-2.amazonaws.com/datasets/Inc/train.csv') # can be local CSV file as well, returns Pandas DataFrame
train_data = train_data.head(500) # subsample for faster demo
print(train_data.head())
label_column = 'class' # specifies which column we want to predict
savedir = 'ag_models/' # where to save trained models

predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir) # since tuning_data = None, automatically determines train/validation split
results = predictor.fit_summary() # display summary of models trained during fit()

# Inference time:
test_data = task.Dataset(file_path='https://autogluon.s3-us-west-2.amazonaws.com/datasets/Inc/test.csv') # another Pandas DataFrame
y_test = test_data[label_column]
test_data = test_data.drop(labels=[label_column],axis=1) # delete labels from test data since we wouldn't have them in practice
print(test_data.head())

predictor = task.load(savedir) # unnecessary here; we reload the predictor just to demonstrate how to load a previously-trained predictor from file
y_pred = predictor.predict(test_data)
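With the labels held out in y_test above, the predictions can also be scored; a minimal sketch using the same evaluate_predictions call demonstrated in the advanced example below:

perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)  # compare predictions against the held-out labels
print(perf)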
awslabs/autogluon · examples/tabular/example_advanced_tabular.py (view on GitHub)
""" Example script for predicting columns of tables, demonstrating more advanced usage of fit().
    Note that all settings demonstrated here are chosen purely to minimize demo runtime
    and do not represent wise choices to use in practice.
"""

import autogluon as ag
from autogluon import TabularPrediction as task

# Training time:
train_data = task.Dataset(file_path='https://autogluon.s3-us-west-2.amazonaws.com/datasets/Inc/train.csv') # can also be a local CSV file; returns a pandas DataFrame
train_data = train_data.head(100) # subsample for faster demo
print(train_data.head())
label_column = 'class' # specifies which column we want to predict
savedir = 'ag_hpo_models/' # where to save trained models

hyperparams = {'NN': {'num_epochs': 10, 'activation': 'relu', 'dropout_prob': ag.Real(0.0,0.5)}, 
               'GBM': {'num_boost_round': 1000, 'learning_rate': ag.Real(0.01,0.1,log=True)} }

predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir, 
                     hyperparameter_tune=True, hyperparameters=hyperparams, 
                     num_trials=5, time_limits=1*60, num_bagging_folds=0, stack_ensemble_levels=0) # since tuning_data = None, automatically determines train/validation split

results = predictor.fit_summary() # display detailed summary of fit() process

# Inference time:
test_data = task.Dataset(file_path='https://autogluon.s3-us-west-2.amazonaws.com/datasets/Inc/test.csv') # another Pandas DataFrame
print(test_data.head())

perf = predictor.evaluate(test_data) # shorthand way to evaluate our predictor if test labels are available

# Otherwise we make predictions and can evaluate them later:
y_test = test_data[label_column]
test_data = test_data.drop(labels=[label_column],axis=1) # Delete labels from test data since we wouldn't have them in practice
y_pred = predictor.predict(test_data)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
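Note that ag.Real(...) above defines a search space rather than a fixed value: with hyperparameter_tune=True, fit() samples candidate values from each space across num_trials trials, within the time_limits budget. A hypothetical variant of the search space is sketched below; ag.Int and ag.Categorical are assumptions about this legacy API, since only ag.Real appears in the example above:

import autogluon as ag

# Hypothetical alternative search space (a sketch, not from the original example).
# ag.Int and ag.Categorical are assumed counterparts of the ag.Real space shown above.
hyperparams = {'NN': {'num_epochs': 10, 'learning_rate': ag.Real(1e-4, 1e-2, log=True)},
               'GBM': {'num_boost_round': ag.Int(100, 1000), 'num_leaves': ag.Categorical(31, 63, 127)}}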