X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f['DayofMonth']
cols_to_keep = ['Date']
# add number of flights in/out for each airport per given interval
timeslice_mins = 60
for name, new_col, col, group in [
        ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
        ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")]:
    X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
    group_cols = [date_col, group, new_col]
    new_name = 'flights_%s' % name
    flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
    flights.key = group_cols
    cols_to_keep.append(new_name)
    X = X[:, :, dt.join(flights)]
# Fill NaNs with 0s
X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0
cols_to_keep.extend([
    'DepDelay',
    'Year',
    'Month',
    'DayofMonth',
    'DayOfWeek',
    'CRSDepTime',
    'UniqueCarrier',
    'FlightNum',
    'TailNum',
    'CRSElapsedTime',
    'Origin',
    'Dest',
])
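
# --- Illustration (not part of the snippet above): the aggregate -> key -> join
# idiom used there, on a tiny self-contained frame. The column names
# Airport/Slot are made up for this sketch.
import datatable as dt

X = dt.Frame(Airport=["JFK", "JFK", "LAX"], Slot=[1, 1, 2])
counts = X[:, {"n": dt.count()}, dt.by("Airport", "Slot")]
counts.key = ["Airport", "Slot"]      # dt.join() requires a keyed frame
X = X[:, :, dt.join(counts)]          # left join on the key columns
print(X)                              # every row now carries its group count "n"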
        keys = list(set(dt.Frame(keys, stype=st).to_list()[0]))
    else:
        keys = list(set(keys))
else:
    l = int(random.expovariate(0.05)) + 1
    keys = list(set(random_string(l) for _ in range(nkeys)))
nkeys = len(keys)
# Build a keyed lookup frame mapping KEY -> VAL
dkey = dt.Frame(KEY=keys, VAL=range(nkeys), stypes={"KEY": st})
dkey.key = "KEY"
keys, vals = dkey.to_list()
# Draw a random sample of keys and compute the expected lookup in pure Python
main = [random.choice(keys) for i in range(ndata)]
dmain = dt.Frame(KEY=main, stype=st)
res = [vals[keys.index(main[i])] for i in range(ndata)]
# The join must reproduce the Python-side lookup exactly
djoined = dmain[:, :, join(dkey)]
frame_integrity_check(djoined)
assert djoined.shape == (ndata, 2)
assert djoined.names == ("KEY", "VAL")
assert djoined.to_list() == [main, res]
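
# Note: frame_integrity_check() and random_string() above are helpers from
# datatable's own test suite, not public API.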
def test_join_missing_levels():
    d0 = dt.Frame(A=[1, 2, 3])
    d1 = dt.Frame(A=[1, 2], K=[True, False])
    d1.key = "A"
    res = d0[:, :, join(d1)]
    frame_integrity_check(res)
    # Keys absent from the joined frame fill with None
    assert res.to_list() == [[1, 2, 3], [True, False, None]]
def join_self(self):
    ncols = self.ncols
    if self.nkeys:
        self.df = self.df[:, :, join(self.df)]
    else:
        with pytest.raises(ValueError, match="The join frame is not keyed"):
            self.df = self.df[:, :, join(self.df)]
        return False
    s = slice(self.nkeys, ncols)
    join_data = copy.deepcopy(self.data[s])
    join_types = self.types[s].copy()
    join_names = self.names[s].copy()
    self.data += join_data
    self.types += join_types
    self.names += join_names
    self.nkeys = 0
    self.dedup_names()
    return True
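
# Note: join_self() is from a fuzz-style test driver. When the frame is keyed
# it joins the frame to itself and mirrors the expected result in self.data,
# self.types, and self.names; when it is not keyed, it asserts that datatable
# rejects the join with "The join frame is not keyed".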
def test_assign_from_joined_frame():
    DT = dt.Frame(A=range(5))
    JDT = dt.Frame(A=[1, 2, 3], B=['a', 'b', 'c'])
    JDT.key = 'A'
    DT[:, "Z", join(JDT)] = g.B
    assert_equals(DT, dt.Frame(A=range(5), Z=[None, 'a', 'b', 'c', None]))
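
# Note: `g` is datatable's namespace for columns of the joined frame
# (`from datatable import f, g`); `g.B` reads column B from JDT, while `f`
# would address columns of DT itself. Rows of DT with no match in JDT
# receive None.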
print("MyIEEEGroupBysTransformers name {} {}".format(
self._output_feature_names, self._feature_desc
)
)
return X[:, dt.f["col_cnt"] / dt.f["daily_cnt"]]
elif self.group_type == 'hour':
X = dt.Frame(X[:, self.group_col])
X[:, 'date'] = dt.Frame(ieee_datetime.dt.strftime('%Y%m%d_%H').values)
# Compute daily counts
hourly_cnt = X[:, {"hourly_cnt": dt.count()}, dt.by("date")]
hourly_cnt.key = ["date"]
X = X[:, :, dt.join(hourly_cnt)]
# Compute card count
col_cnt = X[:, {"col_cnt": dt.count()}, dt.by(*["date", self.group_col])]
col_cnt.key = ["date", self.group_col]
X = X[:, :, dt.join(col_cnt)]
self._output_feature_names = ["IEEEGroupBys_{}_{}".format(self.group_col, self.group_type)]
self._feature_desc = ["IEEEGroupBys_{}_{}".format(self.group_col, self.group_type)]
print('=' * 50)
print("MyIEEEGroupBysTransformers name {} {}".format(
self._output_feature_names, self._feature_desc
)
)
return X[:, dt.f["col_cnt"] / dt.f["hourly_cnt"]]
return X[:, dt.f["col_cnt"] / dt.f["daily_cnt"]]
elif self.group_type == 'hour':
X = dt.Frame(X[:, self.group_col])
X[:, 'date'] = dt.Frame(ieee_datetime.dt.strftime('%Y%m%d_%H').values)
# Compute daily counts
hourly_cnt = X[:, {"hourly_cnt": dt.count()}, dt.by("date")]
hourly_cnt.key = ["date"]
X = X[:, :, dt.join(hourly_cnt)]
# Compute card count
col_cnt = X[:, {"col_cnt": dt.count()}, dt.by(*["date", self.group_col])]
col_cnt.key = ["date", self.group_col]
X = X[:, :, dt.join(col_cnt)]
self._output_feature_names = ["IEEEGroupBys_{}_{}".format(self.group_col, self.group_type)]
self._feature_desc = ["IEEEGroupBys_{}_{}".format(self.group_col, self.group_type)]
print('=' * 50)
print("MyIEEEGroupBysTransformers name {} {}".format(
self._output_feature_names, self._feature_desc
)
)
return X[:, dt.f["col_cnt"] / dt.f["hourly_cnt"]]
    else:
        print('=' * 50)
        print("MyIEEEGroupBysTransformers ERROR {} {}".format(
            self.group_col, self.group_type))
y.key = 'id1'   # datatable joins on the keyed column(s); there is no on= argument
ans = x[:, :, join(y)][isfinite(f.v2), :]   # keep matched rows only (inner-join semantics)
tmp = ans.copy(deep=True)   # ensure join results materialized #141
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v2)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
del ans, y, tmp
gc.collect()
# Second timed run of the same query
y = small.copy(deep=True)
t_start = timeit.default_timer()
y.key = 'id1'
ans = x[:, :, join(y)][isfinite(f.v2), :]   # same inner join as run 1
tmp = ans.copy(deep=True)   # ensure join results materialized
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v2)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans, y, tmp
question = "medium inner on int" # q2
gc.collect()
y = medium.copy(deep=True)
t_start = timeit.default_timer()
def predict(self, X, **kwargs):
    if self.tgc is None or not all([x in X.names for x in self.tgc]):
        return np.ones(X.shape[0]) * self.nan_value
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    # Datatable code
    if len(tgc_wo_time) > 0:
        # Join the average per group to the input dataframe
        self.group_means.key = tgc_wo_time
        # Predictions for unknown tgc will be None in DT
        yhat_dt = X[:, :, dt.join(self.group_means)][:, "yhat"]
        # In DT missing values after the join are None;
        # cast to float64 to replace None with nan_value
        yhat_dt.replace(None, np.float64(self.nan_value))
        return yhat_dt.to_numpy()[:, 0]
    else:
        # If no groups are available then just return the target average
        return np.full((X.shape[0], 1), self.nan_value)
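
# --- Illustration (not part of the method above): Frame.replace(None, value)
# fills missing entries in place, which is how the unmatched rows produced by
# the join are mapped to nan_value.
import datatable as dt
import numpy as np

yhat = dt.Frame(yhat=[0.5, None, 0.25])
yhat.replace(None, np.float64(-1.0))   # fill the None left by unmatched keys
print(yhat.to_numpy()[:, 0])           # -> [ 0.5  -1.    0.25]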
def make_datatable(dt, rows, select, groupby=None, join=None, sort=None,
                   engine=None, mode=None, replacement=None):
    """
    Implementation of the `Frame.__call__()` method.

    This is the "main" function in the module; it is responsible for
    evaluating various transformations when they are applied to a target
    Frame.
    """
    if isinstance(groupby, datatable.join):
        join = groupby
        groupby = None
    update_mode = mode == "update"
    delete_mode = mode == "delete"
    jframe = join.joinframe if join else None
    with f.bind_datatable(dt), g.bind_datatable(jframe):
        ee = make_engine(engine, dt, jframe)
        ee.rowindex = dt.internal.rowindex
        rowsnode = make_rowfilter(rows, ee)
        grbynode = make_groupby(groupby, ee)
        colsnode = make_columnset(select, ee, update_mode)
        sortnode = make_sort(sort, ee)
        if join:
            join.execute(ee)
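
# Note: make_datatable() above appears to be internal plumbing from an older
# datatable release (the Python-side expression evaluation engine); user code
# reaches it through DT[i, j, ...] rather than by calling it directly.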