# ----- DataExtractor phase -----
from mindsdb.config import CONFIG
from mindsdb.libs.constants.mindsdb import *
from mindsdb.libs.phases.base_module import BaseModule
from mindsdb.libs.data_types.mindsdb_logger import log
from mindsdb.libs.helpers.text_helpers import hashtext
from mindsdb.external_libs.stats import calculate_sample_size

import random
import traceback
import pandas
import numpy as np


class DataExtractor(BaseModule):

    phase_name = PHASE_DATA_EXTRACTOR

    def _get_data_frame_from_when_conditions(self):
        """
        Build a data frame out of the 'when conditions' that were passed in with the predict statement.

        :return: a pandas data frame with one row per when condition
        """
        columns = self.transaction.lmd['columns']
        when_conditions = self.transaction.hmd['model_when_conditions']

        when_conditions_list = []
        # build a list of tuples of the form (ValueForField1, ValueForField2, ..., ValueForFieldN), ...
        for when_condition in when_conditions:
            cond_list = [None] * len(columns)  # empty list with a blank slot for each column's value
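# A minimal sketch (not from the MindsDB source) of what the rest of
# _get_data_frame_from_when_conditions presumably does: each condition dict is
# mapped onto the known column order, then the tuples become a pandas data frame.
import pandas

def when_conditions_to_df(columns, when_conditions):
    rows = []
    for cond in when_conditions:
        row = [None] * len(columns)
        for col, value in cond.items():
            row[columns.index(col)] = value  # place each value in its column's slot
        rows.append(tuple(row))
    return pandas.DataFrame(rows, columns=columns)

# Example: predict with two partial conditions over three known columns.
df = when_conditions_to_df(['a', 'b', 'c'], [{'a': 1}, {'b': 2, 'c': 3}])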
# ----- ModelTrainer phase -----
from __future__ import unicode_literals, print_function, division

from mindsdb.config import CONFIG
from mindsdb.libs.constants.mindsdb import *
from mindsdb.libs.phases.base_module import BaseModule
from mindsdb.libs.workers.train import TrainWorker
from mindsdb.libs.data_types.transaction_metadata import TransactionMetadata

import _thread
import time


class ModelTrainer(BaseModule):

    phase_name = PHASE_MODEL_TRAINER

    def run(self):
        """
        Run the training process; we could perhaps iterate over all hyperparameters here and spin off model variations.
        TODO: check out the RISELab distributed ML projects for this

        :return: None
        """
        model_name = self.transaction.persistent_model_metadata.model_name
        train_meta_data = self.transaction.train_metadata  # type: TransactionMetadata
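# The _thread and time imports above suggest a spawn-and-poll pattern. A hedged
# sketch of that pattern follows; the worker callable and its arguments are
# hypothetical, since the real TrainWorker signature is not shown in this snippet.
import _thread
import time

def run_training_in_background(worker_fn, poll_seconds=1):
    state = {'done': False}

    def _target():
        worker_fn()           # long-running training job
        state['done'] = True

    _thread.start_new_thread(_target, ())
    while not state['done']:  # main thread polls until the worker finishes
        time.sleep(poll_seconds)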
# ----- DataEncoder phase -----
from mindsdb.config import CONFIG
from mindsdb.libs.constants.mindsdb import *
from mindsdb.libs.phases.base_module import BaseModule
from collections import OrderedDict
from mindsdb.libs.workers.train import TrainWorker
from bson.objectid import ObjectId

import _thread
import time


class DataEncoder(BaseModule):

    phase_name = PHASE_DATA_ENCODER

    def run(self):
        """
        Run the encoding process; we could perhaps iterate over all hyperparameters here and spin off model variations.
        TODO: check out the RISELab distributed ML projects for this

        :return: None
        """
        model_name = self.transaction.model_metadata[KEY_MODEL_NAME]
        model_stats = self.session.mongo.mindsdb.model_stats
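# model_stats above is a Mongo collection handle. A small sketch (assumed, not
# from this snippet) of how a stats document might be read back with standard
# pymongo calls; the {'model_name': ...} document layout is a guess.
def load_model_stats(model_stats_collection, model_name):
    doc = model_stats_collection.find_one({'model_name': model_name})
    if doc is None:
        raise ValueError('no stats stored for model: {0}'.format(model_name))
    return doc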
# ----- StatsLoader phase -----
from mindsdb.libs.constants.mindsdb import *
from mindsdb.libs.phases.base_module import BaseModule


class StatsLoader(BaseModule):

    phase_name = PHASE_STATS_GENERATOR

    def run(self):
        self.transaction.persistent_model_metadata = self.transaction.persistent_model_metadata.find_one(
            self.transaction.persistent_model_metadata.getPkey())

        # load the most accurate model
        info = self.transaction.persistent_ml_model_info.find(
            {'model_name': self.transaction.metadata.model_name},
            order_by=[('r_squared', -1)])

        if info is not None and len(info) > 0:
            self.transaction.persistent_ml_model_info = info[0]
        else:
            self.log.error('No model found for this statement, please check whether model {model_name} was trained'.format(
                model_name=self.transaction.metadata.model_name))
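# The query above sorts descending on r_squared so that info[0] is the best
# fit. A plain-Python sketch of that same selection:
def best_model(model_infos):
    # max() over r_squared mirrors order_by=[('r_squared', -1)] followed by [0]
    return max(model_infos, key=lambda m: m['r_squared']) if model_infos else None

best_model([{'r_squared': 0.81}, {'r_squared': 0.93}])  # -> the 0.93 entry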
# ----- DataSplitter phase -----
from mindsdb.config import CONFIG
from mindsdb.libs.constants.mindsdb import *
from mindsdb.libs.phases.base_module import BaseModule
from mindsdb.libs.data_types.mindsdb_logger import log


class DataSplitter(BaseModule):

    def run(self):
        group_by = self.transaction.lmd['model_group_by']
        if group_by is None or len(group_by) == 0:
            group_by = []
            for col in self.transaction.lmd['predict_columns']:
                if self.transaction.lmd['column_stats'][col]['data_type'] == DATA_TYPES.CATEGORICAL:
                    group_by.append(col)

        if len(group_by) > 0:
            self.transaction.input_data.data_frame = self.transaction.input_data.data_frame.sort_values(group_by)

        KEY_NO_GROUP_BY = '{PLEASE_DONT_TELL_ME_ANYONE_WOULD_CALL_A_COLUMN_THIS}##ALL_ROWS_NO_GROUP_BY##{PLEASE_DONT_TELL_ME_ANYONE_WOULD_CALL_A_COLUMN_THIS}'

        # collect, for each group-by key, the indexes of all the rows that belong to that group
        all_indexes = {}
        train_indexes = {}
        test_indexes = {}
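# A minimal sketch (assumptions: plain pandas, a fixed 80/20 split) of how the
# three index maps above could be filled: group rows by the group-by columns,
# then slice each group's row indexes into train and test portions.
import pandas as pd

def split_by_group(df, group_by, train_fraction=0.8):
    all_indexes, train_indexes, test_indexes = {}, {}, {}
    # with no group-by columns, every row falls under one placeholder key
    groups = df.groupby(group_by).groups if group_by else {'ALL_ROWS_NO_GROUP_BY': df.index}
    for key, idx in groups.items():
        idx = list(idx)
        cut = int(len(idx) * train_fraction)
        all_indexes[key] = idx
        train_indexes[key] = idx[:cut]
        test_indexes[key] = idx[cut:]
    return all_indexes, train_indexes, test_indexes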
# ----- DataDevectorizer phase -----
"""
*******************************************************
* This file is part of MindsDB Server.
*
* MindsDB Server can not be copied and/or distributed without the express
* permission of MindsDB Inc
*******************************************************
"""

import numpy

from mindsdb.config import *
from mindsdb.libs.constants.mindsdb import *
from mindsdb.libs.phases.base_module import BaseModule
from collections import OrderedDict
from mindsdb.libs.helpers.norm_denorm_helpers import denorm


class DataDevectorizer(BaseModule):

    phase_name = PHASE_DATA_DEVECTORIZATION

    def run(self):
        result = []

        # NOTE: we only use this model in PREDICT
        for group in self.transaction.model_data.predict_set:
            for column in self.transaction.model_data.predict_set[group]:
                column_results = []
                for value in self.transaction.model_data.predict_set[group][column]:
                    stats = self.transaction.model_stats[column]
                    denormed = denorm(value=value, cell_stats=stats)
                    column_results.append(denormed)
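# denorm() is imported from mindsdb's norm_denorm_helpers, whose internals are
# not shown here. As an illustration only, a denormalizer for numeric cells
# could invert min-max scaling using the column stats:
def denorm_sketch(value, cell_stats):
    # assumption: cell_stats carries the column's observed 'min' and 'max'
    return value * (cell_stats['max'] - cell_stats['min']) + cell_stats['min']

denorm_sketch(0.5, {'min': 10, 'max': 30})  # -> 20.0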
# ----- StatsGenerator phase -----
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import MiniBatchKMeans
import imagehash
from PIL import Image

from mindsdb.config import CONFIG
from mindsdb.libs.constants.mindsdb import *
from mindsdb.libs.phases.base_module import BaseModule
from mindsdb.libs.helpers.text_helpers import splitRecursive, clean_float, cast_string_to_python_type
from mindsdb.external_libs.stats import calculate_sample_size


class StatsGenerator(BaseModule):
    """
    The stats generator phase is responsible for generating the insights we need about the data in order to vectorize it.
    Additionally, the stats generator provides the user with some extra meaningful information about their data,
    though this functionality may be moved to a different step (after vectorization) in the future.
    """

    phase_name = PHASE_STATS_GENERATOR

    def _get_file_type(self, potential_path):
        # a string can only be a file path if it contains some path separator
        could_be_fp = False
        for char in ('/', '\\', ':\\'):
            if char in potential_path:
                could_be_fp = True

        if not could_be_fp:
            return False
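# calculate_sample_size (imported above from mindsdb.external_libs.stats) is
# presumably a standard statistical sample-size estimate. As a sketch only,
# Cochran's formula with a finite-population correction looks like this:
import math

def cochran_sample_size(population_size, margin_of_error=0.01, z_score=2.58, p=0.5):
    # z_score 2.58 ~ 99% confidence; p = 0.5 maximizes the required sample
    n0 = (z_score ** 2) * p * (1 - p) / (margin_of_error ** 2)
    # correct for a finite population
    return math.ceil(n0 / (1 + (n0 - 1) / population_size))

cochran_sample_size(100000)  # -> 14267 rows needed out of 100,000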
# ----- ModelInterface phase -----
from mindsdb.libs.phases.base_module import BaseModule
from mindsdb.libs.constants.mindsdb import *
import datetime


class ModelInterface(BaseModule):

    def run(self, mode='train'):
        # the backends are optional dependencies, so import failures are only warnings
        try:
            from mindsdb.libs.backends.ludwig import LudwigBackend
        except ImportError as e:
            self.transaction.log.warning(e)

        try:
            from mindsdb.libs.backends.lightwood import LightwoodBackend
        except ImportError as e:
            self.transaction.log.warning(e)

        if self.transaction.hmd['model_backend'] == 'ludwig':
            self.transaction.model_backend = LudwigBackend(self.transaction)
        elif self.transaction.hmd['model_backend'] == 'lightwood':
            self.transaction.model_backend = LightwoodBackend(self.transaction)
        else:
            # assumption: any other value is reported as an unknown backend
            # (the original snippet is cut off at this branch)
            self.transaction.log.error('Unknown model backend: {0}'.format(self.transaction.hmd['model_backend']))
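# The if/elif chain above is a small dispatch table. An equivalent registry
# sketch, with hypothetical names that are not MindsDB API:
def pick_backend(name, transaction, registry):
    try:
        return registry[name](transaction)  # construct the chosen backend
    except KeyError:
        raise ValueError('unknown model backend: {0}'.format(name))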
# ----- ModelAnalyzer phase -----
from mindsdb.libs.helpers.general_helpers import pickle_obj
from mindsdb.libs.constants.mindsdb import *
from mindsdb.libs.phases.base_module import BaseModule
from mindsdb.libs.helpers.probabilistic_validator import ProbabilisticValidator
from mindsdb.libs.phases.model_analyzer.helpers.column_evaluator import ColumnEvaluator

import pandas as pd
import numpy as np


class ModelAnalyzer(BaseModule):

    def run(self):
        """
        Runs the model on the validation set in order to fit a probabilistic model that will evaluate the accuracy of future predictions.
        """
        output_columns = self.transaction.lmd['predict_columns']
        input_columns = [col for col in self.transaction.lmd['columns']
                         if col not in output_columns and col not in self.transaction.lmd['columns_to_ignore']]

        # test some hypotheses about our columns
        if self.transaction.lmd['disable_optional_analysis'] is False:
            column_evaluator = ColumnEvaluator(self.transaction)
            column_importances, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution = column_evaluator.get_column_importance(
                model=self.transaction.model_backend,
                output_columns=output_columns,
                input_columns=input_columns,
                full_dataset=self.transaction.input_data.validation_df,
                stats=self.transaction.lmd['column_stats'])

            self.transaction.lmd['column_importances'] = column_importances
            self.transaction.lmd['columns_buckets_importances'] = buckets_stats
            self.transaction.lmd['columnless_prediction_distribution'] = columnless_prediction_distribution
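# ColumnEvaluator's internals are not shown here. A common way to estimate
# column importances, sketched under that assumption, is permutation
# importance: shuffle one input column at a time and measure the accuracy drop.
import numpy as np

def permutation_importance(predict_fn, df, input_columns, target, score_fn):
    base = score_fn(predict_fn(df), df[target])
    importances = {}
    for col in input_columns:
        shuffled = df.copy()
        shuffled[col] = np.random.permutation(shuffled[col].values)
        importances[col] = base - score_fn(predict_fn(shuffled), df[target])
    return importances  # larger drop in score => more important column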
# ----- DataVectorizer phase -----
import copy
import numpy as np
import itertools
import logging
import traceback

from mindsdb.libs.constants.mindsdb import *
from mindsdb.libs.phases.base_module import BaseModule
from collections import OrderedDict
from mindsdb.libs.helpers.norm_denorm_helpers import norm, norm_buckets
from mindsdb.libs.helpers.text_helpers import hashtext, cleanfloat, tryCastToNumber
from mindsdb.libs.data_types.transaction_metadata import TransactionMetadata


class DataVectorizer(BaseModule):

    phase_name = PHASE_DATA_VECTORIZATION

    def _getRowExtraVector(self, ret, column_name, col_row_index, distances):
        predict_columns = self.train_meta_data.model_predict_columns

        desired_total = self.train_meta_data.window_size
        batch_height = len(ret[column_name])
        remaining_row_count = batch_height - (col_row_index + 1)

        # take as many following rows as the window allows, then pad the rest
        harvest_count = desired_total if desired_total < remaining_row_count else remaining_row_count
        empty_count = desired_total - harvest_count
        empty_vector_len = (
            len(ret[column_name][col_row_index])
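# A worked example of the windowing arithmetic above, as a sketch: with a
# window_size of 5, a batch of 8 rows, and the current row at index 5, only
# 2 rows remain below it, so 2 are harvested and 3 slots are left empty.
def window_counts(window_size, batch_height, row_index):
    remaining = batch_height - (row_index + 1)
    harvest = min(window_size, remaining)
    return harvest, window_size - harvest  # (rows to copy, rows to zero-pad)

window_counts(5, 8, 5)  # -> (2, 3)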