Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Encode the calendar date as one integer: Year * 10000 + Month * 100 +
# DayofMonth (YYYYMMDD for a four-digit year).
# NOTE(review): `date_col` is assigned outside this excerpt -- presumably
# 'Date', matching the entry in `cols_to_keep` below; confirm.
X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f['DayofMonth']
# Names of the columns to retain; every feature created below is appended.
cols_to_keep = ['Date']
# add number of flights in/out for each airport per given interval
timeslice_mins = 60
# Each tuple is: (direction tag used in the new feature name,
#                 name of the derived time-bucket column,
#                 source scheduled-time column,
#                 airport column to group by).
for name, new_col, col, group in [
("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
]:
# Bucket the scheduled time by integer division with the slice length.
# NOTE(review): if the CRS*Time columns are HHMM-encoded rather than
# minutes since midnight, `// 60` does not yield hourly buckets -- confirm.
X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
group_cols = [date_col, group, new_col]
new_name = 'flights_%s' % name
# Row count per (date, airport, time bucket) group, stored under `new_name`.
flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
# Key the aggregate on the grouping columns so the join below can match on them.
flights.key = group_cols
cols_to_keep.append(new_name)
# Attach the per-group count to every row of X.
X = X[:, :, dt.join(flights)]
# Fill NaNs with 0s
X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0
cols_to_keep.extend([
'DepDelay',
'Year',
'Month',
'DayofMonth',
'DayOfWeek',
'CRSDepTime',
'UniqueCarrier',
'FlightNum',
'TailNum',
'CRSElapsedTime',
'Origin',
'Dest',
# NOTE(review): the header of the enclosing definition is not part of this
# excerpt; `node`, `self` and `_header` come from outside it.
node.generate_c()
# Assemble the generated C source as one string, in this order: the shared
# header, the extern declarations, the global declarations, and then the
# function bodies stored in `self._functions`.
out = _header
out += "// Extern declarations\n"
out += self._extern_declarations
out += "\n\n"
out += "// Global variables\n"
out += self._global_declarations
out += "\n"
out += "\n\n\n"
# Only the dict's values (the function bodies) are emitted; keys are unused here.
for fnbody in self._functions.values():
out += fnbody
out += "\n\n"
return out
# Sizes of the platform's C integer types as reported by the core module
# (presumably in bytes, given that they are compared against 2, 4 and 8).
sz_sint, sz_int, sz_lint, sz_llint, sz_sizet = core.get_integer_sizes()

# For every fixed width pick the first C integer type that has exactly that
# size; an empty string means no suitable type was found.
t16 = ("int" if sz_int == 2 else
       "short int" if sz_sint == 2 else "")
t32 = ("int" if sz_int == 4 else
       "long int" if sz_lint == 4 else "")
t64 = ("int" if sz_int == 8 else
       "long int" if sz_lint == 8 else
       "long long int" if sz_llint == 8 else "")
# size_t is mapped onto the 32- or 64-bit type chosen above.
tsz = (t32 if sz_sizet == 4 else
       t64 if sz_sizet == 8 else "")
if not (t16 and t32 and t64 and tsz):
    raise RuntimeError("Invalid integer sizes: short int(%d), int(%d), "
                       "long int(%d), long long int(%d), size_t(%d)"
                       % (sz_sint, sz_int, sz_lint, sz_llint, sz_sizet))

# C typedefs for the fixed-width integer types used by the generated code.
# Each type is declared exactly once.
decl_sizes = "\n".join([
    "typedef signed char int8_t;",
    "typedef %s int16_t;" % t16,
    "typedef %s int32_t;" % t32,
    "typedef %s int64_t;" % t64,
    "typedef unsigned char uint8_t;",
    "typedef unsigned %s uint16_t;" % t16,
    "typedef unsigned %s uint32_t;" % t32,
    "typedef unsigned %s uint64_t;" % t64,
    "typedef unsigned %s size_t;" % tsz,
])

# Values returned by the core module for its internal functions; the
# unpacking order must match the order in which the core module returns them.
(ptr_dt_malloc,
 ptr_dt_realloc,
 ptr_dt_free,
 # ptr_rowindex_from_filterfn32,
 ptr_dt_column_data,
 ptr_dt_unpack_slicerowindex,
 ptr_dt_unpack_arrayrowindex) = core.get_internal_function_ptrs()
_header = """
/**
* This code is auto-generated by context.py
**/
// Integer types
%s
#define NULL ((void*)0)
// External functions
typedef void* (*ptr_0)(size_t);
typedef void* (*ptr_1)(void*, size_t);
typedef void (*ptr_2)(void*);
typedef void* (*ptr_3)(void*, int64_t, int);
typedef void* (*ptr_4)(void*, int64_t);
("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
]:
# NOTE(review): the `for` header that binds `name`, `new_col`, `col` and
# `group` is cut off above this excerpt.
# Bucket the scheduled time by integer division with the slice length.
# NOTE(review): if the CRS*Time columns are HHMM-encoded rather than
# minutes since midnight, `// timeslice_mins` does not yield true
# time-of-day buckets -- confirm.
X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
group_cols = [date_col, group, new_col]
# Feature name carries both the direction tag and the slice length.
new_name = 'flights_%s_per_%d_min' % (name, timeslice_mins)
# Row count per (date, airport, time bucket) group, stored under `new_name`.
flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
# Key the aggregate on the grouping columns so the join below can match on them.
flights.key = group_cols
cols_to_keep.append(new_name)
# Attach the per-group count to every row of X.
X = X[:, :, dt.join(flights)]
# select flights leaving from SFO only
X = X[dt.f['Origin'] == 'SFO', :]
# Fill NaNs in DepDelay column
# Missing delays become 0, so those rows fall below the threshold used for
# the target computed below.
X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0
# create binary target column
depdelay_threshold_mins = 15
# The column name embeds the threshold, e.g. 'DepDelay15m'.
target = 'DepDelay%dm' % depdelay_threshold_mins
# True where the departure delay is strictly greater than the threshold.
X[:, target] = dt.f['DepDelay'] > depdelay_threshold_mins
cols_to_keep.extend([
target,
'Year',
'Month',
'DayofMonth',
'DayOfWeek',
'CRSDepTime',
'UniqueCarrier',
'FlightNum',
'TailNum',
'CRSElapsedTime',
# add date
date_col = 'Date'
# Encode the calendar date as one integer: Year * 10000 + Month * 100 +
# DayofMonth (YYYYMMDD for a four-digit year).
X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f['DayofMonth']
# Names of the columns to retain; every feature created below is appended.
cols_to_keep = ['Date']
# add number of flights in/out for each airport per given interval
timeslice_mins = 60
# Each tuple is: (direction tag used in the new feature name,
#                 name of the derived time-bucket column,
#                 source scheduled-time column,
#                 airport column to group by).
for name, new_col, col, group in [
("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
]:
# Bucket the scheduled time by integer division with the slice length.
# NOTE(review): if the CRS*Time columns are HHMM-encoded rather than
# minutes since midnight, `// 60` does not yield hourly buckets -- confirm.
X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
group_cols = [date_col, group, new_col]
new_name = 'flights_%s' % name
# Row count per (date, airport, time bucket) group, stored under `new_name`.
flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
# Key the aggregate on the grouping columns so the join below can match on them.
flights.key = group_cols
cols_to_keep.append(new_name)
# Attach the per-group count to every row of X.
X = X[:, :, dt.join(flights)]
# Fill NaNs with 0s
X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0
cols_to_keep.extend([
'DepDelay',
'Year',
'Month',
'DayofMonth',
'DayOfWeek',
'CRSDepTime',
'UniqueCarrier',
'FlightNum',
'TailNum',
# NOTE(review): the `def` line of this test is not part of the excerpt;
# `epsilon` and `assert_equals` are defined outside it.
ft = Ftrl(alpha = 0.1, nepochs = 10000, model_type = "binomial")
# Step 1: train on data where only one label ("odd") is present.
df_train_odd = dt.Frame([[1, 3, 7, 5, 9]])
df_target_odd = dt.Frame([["odd", "odd", "odd", "odd", "odd"]])
ft.fit(df_train_odd, df_target_odd)
# After the first fit the only known label is "odd", with id 0.
assert_equals(ft.labels, dt.Frame([["odd"], [0]], names = ["label", "id"]))
# Step 2: a target that introduces two new labels ("even" and "none") at once
# is expected to be rejected with a ValueError.
df_train_wrong = dt.Frame([[2, 4, None, 6]])
df_target_wrong = dt.Frame([["even", "even", "none", "even"]])
with pytest.raises(ValueError) as e:
ft.fit(df_train_wrong, df_target_wrong)
assert ("Got two new labels in the target column, however, positive "
"label is already set"
== str(e.value))
# Step 3: continue training with exactly one new label ("even").
df_train_even_odd = dt.Frame([[2, 1, 8, 3]])
df_target_even_odd = dt.Frame([["even", "odd", "even", "odd"]])
ft.fit(df_train_even_odd, df_target_even_odd)
# The new label "even" is expected to get id 1, while "odd" keeps id 0.
assert_equals(ft.labels, dt.Frame([["even", "odd"], [1, 0]], names = ["label", "id"]))
# Predictions on the all-"odd" training data: "odd" should be close to 1 and
# "even" close to 0, within `epsilon`.
p = ft.predict(df_train_odd)
p_dict = p.to_dict()
delta_odd = [abs(i - j) for i, j in zip(p_dict["odd"], [1, 1, 1, 1, 1])]
delta_even = [abs(i - j) for i, j in zip(p_dict["even"], [0, 0, 0, 0, 0])]
assert ft.model_type_trained == "binomial"
assert max(delta_odd) < epsilon
assert max(delta_even) < epsilon
# Predictions on the mixed data.
# NOTE(review): the deltas below are computed but not asserted within this
# excerpt -- the matching `max(...) < epsilon` checks may have been cut off.
p = ft.predict(df_train_even_odd)
p_dict = p.to_dict()
delta_even = [abs(i - j) for i, j in zip(p_dict["even"], [1, 0, 1, 0])]
delta_odd = [abs(i - j) for i, j in zip(p_dict["odd"], [0, 1, 0, 1])]
assert ft.model_type_trained == "binomial"
def test_ftrl_fit_predict_nones():
    """Check that an Ftrl model accepts ``None`` for fitting and predicting.

    ``fit(None, None)`` must complete without raising, and a subsequent
    ``predict(None)`` must return ``None``.
    """
    ft = Ftrl()
    ft.fit(None, None)
    df_target = ft.predict(None)
    # Identity check: the result must be the None singleton itself, not
    # merely something that compares equal to it.
    assert df_target is None
def test_ftrl_wrong_validation_target_type():
    """Check that mismatched training/validation target types are rejected.

    The training target is an integer column while the validation target is
    a string column; ``fit`` is expected to raise ``TypeError`` with a
    message naming both logical types.
    """
    nepochs = 1234
    nbins = 78
    ft = Ftrl(alpha = 0.5, nbins = nbins, nepochs = nepochs)
    r = range(ft.nbins)
    df_X = dt.Frame(r)
    df_y = dt.Frame(r)
    # The validation features reuse the training frame; only the validation
    # target differs in type.
    df_X_val = df_X
    df_y_val = dt.Frame(["Some string data" for _ in r])
    with pytest.raises(TypeError) as e:
        ft.fit(df_X, df_y, df_X_val, df_y_val,
               nepochs_validation = 0)
    # Checked after the `with` block: nothing following the raising call
    # inside the block would ever execute.
    assert ("Training and validation target columns must have the same ltype, "
            "got: `integer` and `string`" == str(e.value))
@pytest.mark.parametrize('target',
                         [[True, False],
                          ["yes", "no"],
                          [20, 10],
                          [0.5, -0.5]])
def test_ftrl_fit_predict_bool_binomial(target):
    """Fit a binomial Ftrl model on a two-row boolean feature and check it.

    For every parametrized two-value `target` the trained model must expose
    the sorted target values as its labels, report the "binomial" model
    type, and reproduce the training data within `epsilon`: row 0 gets
    probability close to 1 in prediction column 1 and close to 0 in
    column 0, while row 1 gets the opposite.
    """
    model = Ftrl(alpha = 0.1, nepochs = 10000, model_type = "binomial")
    features = dt.Frame([True, False])
    labels = dt.Frame(target)
    model.fit(features, labels)
    predicted = model.predict(features)

    assert model.labels[:, 0].to_list() == [sorted(target)]
    assert model.model_type_trained == "binomial"

    # Prediction column 1: close to 1 for row 0, close to 0 for row 1.
    assert 1 - epsilon <= predicted[0, 1] <= 1
    assert 0 <= predicted[1, 1] < epsilon
    # Prediction column 0: the complementary probabilities.
    assert 0 <= predicted[0, 0] < epsilon
    assert 1 - epsilon <= predicted[1, 0] <= 1
def test_ftrl_early_stopping_multinomial():
nepochs = 2000
ft = Ftrl(alpha = 0.2, nepochs = nepochs, double_precision = True)
labels = ["blue", "green", "red"]
df_train = dt.Frame(["cucumber", None, "shift", "sky", "day", "orange",
"ocean"])
df_target = dt.Frame(["green", "red", "red", "blue", "green", None,
"blue"])
res = ft.fit(df_train, df_target, df_train[:4, :], df_target[:4, :],
nepochs_validation = 1, validation_error = 1e-3)
frame_integrity_check(ft.model)
p = ft.predict(df_train)
frame_integrity_check(p)
p_none = 1/p.ncols
p_dict = p.to_dict()
p_list = p.to_list()
sum_p =[sum(row) for row in zip(*p_list)]
delta_sum = [abs(i - j) for i, j in zip(sum_p, [1] * 5)]