Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# add date
# Encode the calendar date as one integer, Year*10000 + Month*100 + DayofMonth
# (i.e. YYYYMMDD), and store it in a new 'Date' column of X.
date_col = 'Date'
X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f['DayofMonth']
# Running list of column names; the count columns created below are appended.
cols_to_keep = ['Date']
# add number of flights in/out for each airport per given interval
# Divisor used to bucket the scheduled-time columns in the loop below.
# NOTE(review): if CRSDepTime/CRSArrTime are stored as HHMM integers rather than
# minutes, integer division by 60 does not give hourly buckets - confirm encoding.
timeslice_mins = 60
# Each tuple is (suffix for the count column, name of the bucket column to
# create, source time column, airport column to group by).
for name, new_col, col, group in [
("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
]:
# Bucket the time column by integer division.
X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
group_cols = [date_col, group, new_col]
new_name = 'flights_%s' % name
# Count rows per (date, airport, bucket) combination.
flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
# Key the counts frame on the grouping columns so it can be joined back onto X.
flights.key = group_cols
cols_to_keep.append(new_name)
# Attach the per-group count to every row of X.
X = X[:, :, dt.join(flights)]
# Fill NaNs with 0s
# Only the 'DepDelay' column is filled here.
X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0
cols_to_keep.extend([
'DepDelay',
'Year',
'Month',
'DayofMonth',
'DayOfWeek',
'CRSDepTime',
'UniqueCarrier',
'FlightNum',
'TailNum',
def test_issue_2242(seed):
    """Check a grouped expression combining ``count()`` and ``sum()``.

    Builds a 25000-row frame with a random integer column ``AGE`` (1..50)
    and a random boolean column ``PAY``, evaluates
    ``log((count() + 1) / (sum(PAY) + 0.5) - 1)`` per ``AGE`` group, and
    verifies that there is one row per age and that every computed value
    is a float.

    :param seed: value used to seed ``random`` so a failing run can be
        reproduced with the same data.
    """
    # Seed the generator: without this the `seed` argument was ignored and
    # the randomly generated input could not be reproduced.
    random.seed(seed)
    n = 25000
    X = dt.Frame(AGE=[random.randint(1, 50) for i in range(n)],
                 PAY=[random.choice([True, False]) for i in range(n)])
    RES = X[:, dt.math.log((count() + 1)/(sum(f.PAY) + 0.5) - 1), by(f.AGE)]
    # One output row per distinct AGE; columns are the group key and the result.
    assert RES.shape == (50, 2)
    data = RES.to_list()
    assert data[0] == list(range(1, 51))
    assert all(isinstance(x, float) for x in data[1])
def test_groups_large2_str(n, seed):
    """Group a random string column by itself and check the group count.

    A zero ``n`` is replaced by a size drawn from an exponential
    distribution (redrawn until non-zero). The column holds ``n``
    hexadecimal strings built from 6 random bits each; the grouped frame
    must have one row per distinct string.
    """
    random.seed(seed)
    while n == 0:
        n = int(random.expovariate(0.0005))
    values = [format(random.getrandbits(6), "x") for _ in range(n)]
    source = dt.Frame({"A": values})
    grouped = source[:, count(), by("A")]
    frame_integrity_check(grouped)
    distinct = set(values)
    assert grouped.nrows == len(distinct)
def test_count_2d_array_integer():
    """``count()`` on a plain list holding two integer lists returns 2."""
    first = [9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1]
    second = [0, 1, 0, 5, 3, 8, 1, 0, 2, 5, None, 8, 1]
    result = count([first, second])
    assert result == 2
def test_group_empty_frame3():
    """Grouping an empty float32 column and counting it yields a 0x2 frame.

    The result keeps the source column's stype for the key column and
    uses int64 for the count column.
    """
    empty = dt.Frame(A=[], stype=dt.float32)
    grouped = empty[:, count(f.A), by(f.A)]
    frame_integrity_check(grouped)
    expected_shape = (0, 2)
    assert grouped.shape == expected_shape
    expected_stypes = (empty.stype, dt.int64)
    assert grouped.stypes == expected_stypes
def test_count_dt_groupby_string():
    """Per-group ``count(f.C0)`` and ``count()`` on a string column with NAs.

    The expected data below shows the NA group reporting 0 for
    ``count(f.C0)`` and 4 for the bare ``count()``.
    """
    colours = [None, "blue", "green", "indico", None, None, "orange",
               "red", "violet", "yellow", "green", None, "blue"]
    source = dt.Frame(colours)
    reduced = source[:, [count(f.C0), count()], "C0"]
    frame_integrity_check(reduced)

    expected_keys = [None, "blue", "green", "indico", "orange",
                     "red", "violet", "yellow"]
    expected_non_na = [0, 2, 2, 1, 1, 1, 1, 1]
    expected_rows = [4, 2, 2, 1, 1, 1, 1, 1]

    assert reduced.shape == (8, 3)
    assert reduced.ltypes == (ltype.str, ltype.int, ltype.int,)
    assert reduced.to_list() == [expected_keys, expected_non_na, expected_rows]
def test_count_dt_integer_large(numpy):
    """``count()`` over a 12345678-element int32 column returns its length.

    :param numpy: the numpy module, supplied by the caller.
    """
    n = 12345678
    upper_bound = 2**20
    values = numpy.random.randint(upper_bound, size=n, dtype=numpy.int32)
    reduced = dt.Frame(values)[:, count()]
    assert reduced.shape == (1, 1)
    assert reduced.ltypes == (ltype.int,)
    assert reduced.to_list() == [[n]]
def test_count_with_i():
    """``count()`` must honour the row selector of a 100-row frame."""
    # See issue 1316
    frame = dt.Frame(A=range(100))
    # (row selector, number of rows it selects out of 100)
    cases = [
        (slice(None, 5), 5),
        (slice(-12, None), 12),
        (slice(None, None, 3), 34),
    ]
    for rows, expected in cases:
        assert frame[rows, count()][0, 0] == expected
def test_count_2d_dt_integer():
    """Column-wise and bare ``count()`` on a two-column integer frame.

    Expected counts are 10 for C0, 12 for C1, and 13 for the bare
    ``count()``; each input column has 13 entries.
    """
    col0 = [9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1]
    col1 = [0, 1, 0, 5, 3, 8, 1, 0, 2, 5, None, 8, 1]
    source = dt.Frame([col0, col1])
    reducers = [count(f.C0), count(f.C1), count()]
    reduced = source[:, reducers]
    frame_integrity_check(reduced)
    assert reduced.shape == (1, 3)
    assert reduced.ltypes == (ltype.int, ltype.int, ltype.int)
    assert reduced.to_list() == [[10], [12], [13]]
)
return X[:, dt.f["col_cnt"] / dt.f["daily_cnt"]]
elif self.group_type == 'hour':
X = dt.Frame(X[:, self.group_col])
X[:, 'date'] = dt.Frame(ieee_datetime.dt.strftime('%Y%m%d_%H').values)
# Compute daily counts
hourly_cnt = X[:, {"hourly_cnt": dt.count()}, dt.by("date")]
hourly_cnt.key = ["date"]
X = X[:, :, dt.join(hourly_cnt)]
# Compute card count
col_cnt = X[:, {"col_cnt": dt.count()}, dt.by(*["date", self.group_col])]
col_cnt.key = ["date", self.group_col]
X = X[:, :, dt.join(col_cnt)]
self._output_feature_names = ["IEEEGroupBys_{}_{}".format(self.group_col, self.group_type)]
self._feature_desc = ["IEEEGroupBys_{}_{}".format(self.group_col, self.group_type)]
print('=' * 50)
print("MyIEEEGroupBysTransformers name {} {}".format(
self._output_feature_names, self._feature_desc
)
)
return X[:, dt.f["col_cnt"] / dt.f["hourly_cnt"]]
else: