def test_cegb_affects_behavior(self):
    X = np.random.random((100, 5))
    X[:, [1, 3]] = 0
    y = np.random.random(100)
    names = ['col_%d' % i for i in range(5)]
    ds = lgb.Dataset(X, feature_name=names).construct()
    ds.set_label(y)
    base = lgb.Booster(train_set=ds)
    for k in range(10):
        base.update()
    with tempfile.NamedTemporaryFile() as f:
        basename = f.name
    base.save_model(basename)
    with open(basename, 'rt') as f:
        basetxt = f.read()
    # Set extremely harsh penalties, so CEGB will block most splits.
    cases = [{'cegb_penalty_feature_coupled': [50, 100, 10, 25, 30]},
             {'cegb_penalty_feature_lazy': [1, 2, 3, 4, 5]},
             {'cegb_penalty_split': 1}]
    for case in cases:
        booster = lgb.Booster(train_set=ds, params=case)
        for k in range(10):
            booster.update()
        with tempfile.NamedTemporaryFile() as f:
            casename = f.name
        booster.save_model(casename)
        with open(casename, 'rt') as f:
            casetxt = f.read()
        self.assertNotEqual(basetxt, casetxt)
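
# A minimal standalone sketch (not part of the test above) of the same idea
# through the regular training API: CEGB penalties are passed as ordinary
# parameters to lgb.train. The data and penalty values here are illustrative.
import numpy as np
import lightgbm as lgb

X = np.random.random((100, 5))
y = np.random.random(100)

params = {
    'objective': 'regression',
    'verbose': -1,
    # Harsh per-feature penalties, as in the test, so CEGB blocks most splits.
    'cegb_penalty_feature_coupled': [50, 100, 10, 25, 30],
}
bst = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=10)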
# Fragment from a different test: `bst`, `params`, `X_test`, and `y_test` are
# defined earlier in that test; the booster has been trained for 20 iterations.
self.assertEqual(bst.current_iteration(), 20)
self.assertEqual(bst.num_trees(), 20)
self.assertEqual(bst.num_model_per_iteration(), 1)
bst.save_model("model.txt")
pred_from_matr = bst.predict(X_test)
with tempfile.NamedTemporaryFile() as f:
    tname = f.name
with open(tname, "w+b") as f:
    dump_svmlight_file(X_test, y_test, f)
pred_from_file = bst.predict(tname)
os.remove(tname)
np.testing.assert_allclose(pred_from_matr, pred_from_file)
# check saved model persistence
bst = lgb.Booster(params, model_file="model.txt")
os.remove("model.txt")
pred_from_model_file = bst.predict(X_test)
# we need to check the consistency of the model file here, so test for exact equality
np.testing.assert_array_equal(pred_from_matr, pred_from_model_file)
# check that early stopping works; make it stop very early, so the scores should be very close to zero
pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5}
pred_early_stopping = bst.predict(X_test, **pred_parameter)
# the scores are likely to differ, but the sign of the prediction should be the same
np.testing.assert_array_equal(np.sign(pred_from_matr), np.sign(pred_early_stopping))
# test that the shape is checked during prediction
bad_X_test = X_test[:, 1:]
bad_shape_error_msg = "The number of features in data*"
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
                               bst.predict, bad_X_test)
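
# A self-contained sketch of the save/load round trip the fragment above
# verifies; the data and file name are made up for illustration.
import numpy as np
import lightgbm as lgb

X = np.random.random((100, 5))
y = np.random.random(100)
bst = lgb.train({'objective': 'regression', 'verbose': -1},
                lgb.Dataset(X, label=y), num_boost_round=20)
bst.save_model('model.txt')

# Reloading the text dump must reproduce the in-memory predictions exactly.
bst2 = lgb.Booster(model_file='model.txt')
np.testing.assert_array_equal(bst.predict(X), bst2.predict(X))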
def test_add_features_same_booster_behaviour(self):
    self.maxDiff = None
    X = np.random.random((100, 5))
    X[:, [1, 3]] = 0
    names = ['col_%d' % i for i in range(5)]
    for j in range(1, 5):
        d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
        d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
        d1.add_features_from(d2)
        d = lgb.Dataset(X, feature_name=names).construct()
        y = np.random.random(100)
        d1.set_label(y)
        d.set_label(y)
        b1 = lgb.Booster(train_set=d1)
        b = lgb.Booster(train_set=d)
        for k in range(10):
            b.update()
            b1.update()
        with tempfile.NamedTemporaryFile() as df:
            dname = df.name
        with tempfile.NamedTemporaryFile() as d1f:
            d1name = d1f.name
        b1.save_model(d1name)
        b.save_model(dname)
        with open(dname, 'rt') as df:
            dtxt = df.read()
        with open(d1name, 'rt') as d1f:
            d1txt = d1f.read()
        self.assertEqual(dtxt, d1txt)
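
# The add_features_from call in isolation, as a minimal sketch: both Datasets
# must be construct()-ed before their columns can be stitched together.
import numpy as np
import lightgbm as lgb

X = np.random.random((100, 5))
names = ['col_%d' % i for i in range(5)]
left = lgb.Dataset(X[:, :2], feature_name=names[:2]).construct()
right = lgb.Dataset(X[:, 2:], feature_name=names[2:]).construct()
left.add_features_from(right)  # left now holds all five columns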
# Fragment from the 'query' mode: vectorize one premise/question pair and score it.
vectorize_sample_x(X_data, 0, premise_shingles, question_shingles, xgb_relevancy_shingle2id)
y_pred = lgb_relevancy.predict(X_data)
print('{}\n\n'.format(y_pred[0]))

if run_mode == 'query2':
    # Manually check the model on questions typed into the console.
    # The list of premises is read from the given file.

    # Load the trained model data.
    with open(os.path.join(tmp_folder, config_filename), 'r') as f:
        model_config = json.load(f)

    tokenizer = PhraseSplitter.create_splitter(model_config['lemmatize'])

    lgb_relevancy = lightgbm.Booster(model_file=model_config['model_filename'])

    xgb_relevancy_shingle2id = model_config['shingle2id']
    xgb_relevancy_shingle_len = model_config['shingle_len']
    xgb_relevancy_nb_features = model_config['nb_features']
    xgb_relevancy_lemmalize = model_config['lemmatize']

    premises = []
    prompt = ':> '
    added_phrases = set()

    if task in 'relevancy partial_relevancy'.split():
        # Search for the best premise relevant to the entered question.
        prompt = 'question:> '
        if True:
            for fname in ['profile_facts_1.dat']:
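
# A condensed, self-contained sketch of the load-and-predict pattern used
# above; the config file name and feature layout are assumptions taken from
# the surrounding fragment.
import json
import numpy as np
import lightgbm

with open('lgb_relevancy.config', 'r') as f:
    model_config = json.load(f)

booster = lightgbm.Booster(model_file=model_config['model_filename'])

# One row per (premise, question) pair, with the training-time feature count.
X_data = np.zeros((1, model_config['nb_features']))
print(booster.predict(X_data)[0])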
def load(self, json_desc):
    self.library_version = json_desc.get("library_version", self.library_version)
    self.algorithm_name = json_desc.get("algorithm_name", self.algorithm_name)
    self.algorithm_short_name = json_desc.get(
        "algorithm_short_name", self.algorithm_short_name
    )
    self.uid = json_desc.get("uid", self.uid)
    self.model_file = json_desc.get("model_file", self.model_file)
    self.model_file_path = json_desc.get("model_file_path", self.model_file_path)
    self.params = json_desc.get("params", self.params)

    log.debug("LightgbmLearner load model from %s" % self.model_file_path)
    self.model = lgb.Booster(model_file=self.model_file_path)
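
# For reference, a hypothetical json_desc this loader would accept; the keys
# mirror exactly what load() reads above, and every value is a placeholder.
json_desc = {
    "library_version": "3.3.2",
    "algorithm_name": "LightGBM",
    "algorithm_short_name": "LightGBM",
    "uid": "some-uid",
    "model_file": "model.bin",
    "model_file_path": "/tmp/model.bin",
    "params": {"objective": "binary"},
}
# learner.load(json_desc)  # assuming `learner` is an already-built LightgbmLearner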
def executeLightGBMModel(params, model=None):
    global lightGBMModel
    if model == 'textextraction':
        if lightGBMModel is None:
            # Lazily load the model once and cache it in the module-level global.
            lightGBMModel = lgb.Booster(model_file='LightGBM_model_text_extraction.txt')
        vectorKeys = [
            # They are in this order for a reason - that's what was in our training data file.
            'layer_0.max_depth',
            'layer_0.min_data_in_leaf',
            'layer_0.boosting_rounds',
            'layer_1.input_window',
            'layer_0.num_leaves',
            'layer_1.min_data_in_leaf',
            'layer_1.boosting_rounds',
            'layer_1.learning_rate',
            'layer_1.num_leaves',
            'layer_0.bagging_fraction',
            'layer_1.max_depth',
            'layer_0.learning_rate',
            'layer_0.input_window',
            'layer_0.feature_fraction']
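
# The ordering comment above matters for inference: the booster was trained on
# columns in exactly this order, so the input vector has to follow vectorKeys.
# A minimal sketch, assuming `params` is a flat dict keyed by the same names:
import numpy as np

def build_vector(params, vectorKeys):
    # Keep the training-time column order; collecting keys from the dict
    # directly could silently scramble the features.
    return np.array([[params[key] for key in vectorKeys]])

# score = lightGBMModel.predict(build_vector(params, vectorKeys))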
def _load_models(self):
    models = []
    path = '/assets/models'
    # localize model artifacts for all folds
    abs_path = pkg_resources.resource_filename('backend', path)
    artifacts = os.listdir(abs_path)
    if len(artifacts) != self._n_folds:
        raise ValueError('Number of model artifacts does not match n_folds')
    for artifact in artifacts:
        # load model from artifact
        arti_path = os.path.join(abs_path, artifact)
        clf = lgb.Booster(model_file=arti_path)
        models.append(clf)
    return models
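
# A plausible consumer of these per-fold boosters, sketched under the
# assumption that fold predictions are simply averaged:
import numpy as np

def predict_mean(models, X):
    # Ensemble the k-fold boosters by averaging their raw predictions.
    return np.mean([m.predict(X) for m in models], axis=0)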
def load(self, models_folder):
    self.logger.info('Loading LGB_ReqInterpretation model files')
    with open(os.path.join(models_folder, 'lgb_req_interpretation.config'), 'r') as f:
        self.model_config = json.load(f)

    model_path = os.path.join(models_folder, os.path.basename(self.model_config['model_filename']))
    self.model = lightgbm.Booster(model_file=model_path)

    vectorizer_path = os.path.join(models_folder, os.path.basename(self.model_config['vectorizer_filename']))
    with open(vectorizer_path, 'rb') as f:
        self.vectorizer = pickle.load(f)

    self.no_expansion_phrases = set(self.model_config['no_expansion_phrases'])
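
# A hedged sketch of inference with the loaded pieces, assuming the pickled
# vectorizer is a fitted scikit-learn text vectorizer (Booster.predict accepts
# its sparse output directly):
def predict_scores(self, phrases):
    # Vectorize the raw phrases and score them with the restored booster.
    X = self.vectorizer.transform(phrases)
    return self.model.predict(X)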
def __init__(self):
    scalingModelData = json.loads(pkg_resources.resource_string(__name__, "../atpe_models/scaling_model.json"))
    self.featureScalingModels = {}
    for key in self.atpeModelFeatureKeys:
        self.featureScalingModels[key] = sklearn.preprocessing.StandardScaler()
        self.featureScalingModels[key].scale_ = numpy.array(scalingModelData[key]['scales'])
        self.featureScalingModels[key].mean_ = numpy.array(scalingModelData[key]['means'])
        self.featureScalingModels[key].var_ = numpy.array(scalingModelData[key]['variances'])

    self.parameterModels = {}
    self.parameterModelConfigurations = {}
    for param in self.atpeParameters:
        modelData = pkg_resources.resource_string(__name__, "../atpe_models/model-" + param + '.txt')
        with hypermax.file_utils.ClosedNamedTempFile(modelData) as model_file_name:
            self.parameterModels[param] = lightgbm.Booster(model_file=model_file_name)

        configString = pkg_resources.resource_string(__name__, "../atpe_models/model-" + param + '-configuration.json')
        data = json.loads(configString)
        self.parameterModelConfigurations[param] = data

    self.lastATPEParameters = None
    self.lastLockedParameters = []
    self.atpeParamDetails = None
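
# The scaler-restoration trick above, in isolation: a fitted StandardScaler is
# rebuilt from saved statistics without calling fit(). The numbers here are
# placeholders, not values from the real scaling_model.json.
import numpy
import sklearn.preprocessing

saved = {'scales': [2.0, 0.5], 'means': [10.0, -1.0], 'variances': [4.0, 0.25]}

scaler = sklearn.preprocessing.StandardScaler()
scaler.scale_ = numpy.array(saved['scales'])
scaler.mean_ = numpy.array(saved['means'])
scaler.var_ = numpy.array(saved['variances'])

# transform() now behaves exactly as if the scaler had been fit on the data.
print(scaler.transform(numpy.array([[12.0, -0.5]])))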