("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
]:
X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
group_cols = [date_col, group, new_col]
new_name = 'flights_%s_per_%d_min' % (name, timeslice_mins)
flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
flights.key = group_cols
cols_to_keep.append(new_name)
X = X[:, :, dt.join(flights)]
# select flights leaving from SFO only
X = X[dt.f['Origin'] == 'SFO', :]
# Fill NaNs in DepDelay column
X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0
# create binary target column
depdelay_threshold_mins = 15
target = 'DepDelay%dm' % depdelay_threshold_mins
X[:, target] = dt.f['DepDelay'] > depdelay_threshold_mins
cols_to_keep.extend([
    target,
    'Year',
    'Month',
    'DayofMonth',
    'DayOfWeek',
    'CRSDepTime',
    'UniqueCarrier',
    'FlightNum',
    'TailNum',
    'CRSElapsedTime',

def test_assign_string_columns():
    DT = dt.Frame(A=["One", "two", "three", None, "five"])
    # assign to the rows of A selected with isna()
    DT[dt.isna(f.A), f.A] = dt.Frame(["FOUR"])
    assert_equals(DT, dt.Frame(A=["One", "two", "three", "FOUR", "five"]))

def test_rows_isna(df1):
    from datatable import isna
    # keep only the rows where column A is NA
    dt1 = df1[isna(f.A), :]
    frame_integrity_check(dt1)
    assert dt1.names == df1.names
    assert dt1.to_list() == [[None, None], [None, 8]]

if self.num_classes >= 2:
    model = SVC(kernel='linear', probability=True, random_state=self.random_state)
    lb = LabelEncoder()
    lb.fit(self.labels)
    y = lb.transform(y)
else:
    model = SVR(kernel='linear')
# impute missing values with the per-column mean before converting to numpy
self.means = dict()
for col in X.names:
    XX = X[:, col]
    self.means[col] = XX.mean1()
    if np.isnan(self.means[col]):
        self.means[col] = 0
    XX.replace(None, self.means[col])
    X[:, col] = XX
    assert X[dt.isna(dt.f[col]), col].nrows == 0
X = X.to_numpy()
model.fit(X, y, sample_weight=sample_weight)
self.set_model_properties(model=model,
                          features=orig_cols,
                          importances=abs(model.coef_[0]),
                          iterations=0)
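
# The mean-imputation loop above recurs in several of these snippets. The block
# below is a minimal, self-contained sketch of the same pattern on a toy frame;
# the frame and column names are illustrative assumptions, not taken from the
# original recipes.
import datatable as dt
import numpy as np

X_demo = dt.Frame(a=[1.0, None, 3.0], b=[None, 2.0, 4.0])
means = dict()
for col in X_demo.names:
    XX = X_demo[:, col]
    means[col] = XX.mean1()                      # scalar mean of a one-column frame
    if means[col] is None or np.isnan(means[col]):
        means[col] = 0
    XX.replace(None, means[col])                 # in-place NA replacement
    X_demo[:, col] = XX
    assert X_demo[dt.isna(dt.f[col]), col].nrows == 0   # no NAs remain
print(X_demo.to_numpy())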

# (regression branch: the matching classification branch is not part of this fragment)
else:
    model = als.FMRegression(n_iter=self.params["n_iter"], init_stdev=self.params["init_stdev"],
                             rank=self.params["rank"], l2_reg_w=self.params["l2_reg_w"],
                             l2_reg_V=self.params["l2_reg_V"], random_state=self.random_state)
self.means = dict()
self.standard_scaler = StandardScaler()
for col in X.names:
    XX = X[:, col]
    self.means[col] = XX.mean1()
    if np.isnan(self.means[col]):
        self.means[col] = 0
    XX.replace(None, self.means[col])
    X[:, col] = XX
    assert X[dt.isna(dt.f[col]), col].nrows == 0
X = X.to_numpy()
X = self.standard_scaler.fit_transform(X)
X = csr_matrix(X)  # fastFM requires a scipy sparse matrix
model.fit(X, y)
importances = np.array(abs(model.w_))
self.set_model_properties(model=model,
                          features=orig_cols,
                          importances=importances.tolist(),  # abs(model.coef_[0])
                          iterations=0)

    # (tail of the classification branch; the matching `if` is not part of this fragment)
    y = lb.transform(y)
else:
    feature_model = NuSVR(kernel='linear', nu=self.params['nu'])
    model = NuSVR(nu=self.params['nu'], kernel=self.params['kernel'],
                  degree=self.params['degree'])
self.means = dict()
for col in X.names:
    XX = X[:, col]
    self.means[col] = XX.mean1()
    if self.means[col] is None:
        self.means[col] = 0
    XX.replace(None, self.means[col])
    X[:, col] = XX
    assert X[dt.isna(dt.f[col]), col].nrows == 0
X = X.to_numpy()
# nu is sometimes infeasible for NuSVR;
# do a quaternary search on both sides of the selected nu until fit() succeeds
valid_nu = None
while valid_nu is None:
    try:
        model.fit(X, y)
        valid_nu = self.params['nu']
    except Exception:
        if self.params['nu'] > 0.5:
            self.params['nu'] = 1.0 - self.params['nu']
        else:
            self.params['nu'] = (4.0 - 3.0 * self.params['nu']) / 4.0

if self.num_classes >= 2:
    model = SVC(C=self.params["C"], kernel=self.params["kernel"], probability=True,
                random_state=self.random_state)
    lb = LabelEncoder()
    lb.fit(self.labels)
    y = lb.transform(y)
else:
    model = SVR(C=self.params["C"], kernel=self.params["kernel"], epsilon=self.params["epsilon"])
self.means = dict()
self.scaler = StandardScaler()
for col in X.names:
    XX = X[:, col]
    self.means[col] = XX.mean1()
    if np.isnan(self.means[col]):
        self.means[col] = 0
    XX.replace(None, self.means[col])
    X[:, col] = XX
    assert X[dt.isna(dt.f[col]), col].nrows == 0
X = X.to_numpy()
X = self.scaler.fit_transform(X)
if self.num_classes >= 2:
    feature_model.fit(X, y, sample_weight=sample_weight)
    model.fit(X, y, sample_weight=sample_weight)
else:
    feature_model.fit(X, y)
    model.fit(X, y)
# feature_model (defined outside this fragment) supplies the coefficient-based importances
importances = np.array(abs(feature_model.coef_))
self.set_model_properties(model=model,
                          features=orig_cols,
                          importances=importances.tolist(),  # abs(model.coef_[0])
                          iterations=0)

def transform(self, X: dt.Frame):
    if X.ncols == 0:
        return np.zeros(X.nrows)
    return X[:, dt.sum([dt.isna(dt.f[x]) for x in range(X.ncols)])]

def transform(self, X: dt.Frame):
    try:
        X = dt.Frame(X)
        X.names = ['zip_key']
        X = X[:, str('zip_key')]
        # look up the property once per unique, non-missing zip code
        zip_list = dt.unique(X[~dt.isna(dt.f.zip_key), 0]).to_list()[0]
        zip_features = [self.get_zipcode_property(self.parse_zipcode(x)) for x in zip_list]
        X_g = dt.Frame({"zip_key": zip_list, self.get_property_name(): zip_features})
        X_g.key = 'zip_key'
        # join the per-zip values back onto every row, then drop the key column
        X_result = X[:, :, dt.join(X_g)]
        return X_result[:, 1:]
    except Exception:
        return np.zeros(X.shape[0])
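
# Both the zip-code transform above and the flight-count snippet near the top use
# datatable's keyed-join pattern. The block below is a minimal, self-contained
# sketch of that pattern; the frames and column names are illustrative
# assumptions, not part of the original transformers.
import datatable as dt

DT = dt.Frame(zip_key=["94105", "10001", "94105"])
lookup = dt.Frame(zip_key=["94105", "10001"], median_income=[120000, 85000])
lookup.key = "zip_key"                  # a keyed frame can be used as a join target
joined = DT[:, :, dt.join(lookup)]      # attaches median_income to every row of DT
print(joined.to_list())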