def test_replace_str64():
    Y = dt.Frame([["BLSD", "RY", "IO OUSEVOUY", "@"], [3, 4, 1, 2]],
                 names=["A", "B"], stypes=["str64", "int32"])
    Y[f.B < 100, f.A] = "*"
    frame_integrity_check(Y)
    assert Y.stypes == (dt.str64, dt.int32)
    assert Y.to_list() == [["*"] * 4, [3, 4, 1, 2]]
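
# A minimal sketch (not part of the original suite) of the same
# conditional-assignment idiom, on a hypothetical frame:
def test_replace_str_sketch():
    DT = dt.Frame(A=["x", "y", "z"], B=[1, 150, 2])
    DT[f.B < 100, "A"] = "*"   # only rows where B < 100 are overwritten
    assert DT.to_list() == [["*", "y", "*"], [1, 150, 2]]
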
def test_rows_less_than_or_equal(df1):
    dt1 = df1[f.A <= f.B, :]
    frame_integrity_check(dt1)
    assert dt1.names == df1.names
    assert dt1.to_list() == [[0, 1, 3, 4, None, 9], [3, 2, 3, 4, None, 9]]
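
# A companion sketch (hypothetical, fixture-free) of the same row-filter
# idiom used by test_rows_less_than_or_equal:
def test_rows_filter_sketch():
    DT = dt.Frame(A=[0, 5, 3], B=[3, 2, 3])
    assert DT[f.A <= f.B, :].to_list() == [[0, 3], [3, 3]]
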
    lb = LabelEncoder()
    lb.fit(self.labels)
    y = lb.transform(y)
else:
    model = KNeighborsRegressor(n_neighbors=self.params['n_neighbors'], metric=self.params['metric'],
                                weights=self.params['weights'], n_jobs=self.params['n_jobs'])
self.means = dict()
self.standard_scaler = StandardScaler()
# impute missing values with per-column means (0 for all-NA columns)
for col in X.names:
    XX = X[:, col]
    self.means[col] = XX.mean1()
    if self.means[col] is None:
        self.means[col] = 0
    XX.replace(None, self.means[col])
    X[:, col] = XX
    assert X[dt.isna(dt.f[col]), col].nrows == 0
X = X.to_numpy()
X = self.standard_scaler.fit_transform(X)
feature_model.fit(X, y)
model.fit(X, y)
importances = np.array(abs(feature_model.coef_))
self.set_model_properties(model=model,
                          features=orig_cols,
                          importances=importances.tolist(),  # abs(model.coef_[0])
                          iterations=0)
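
# The imputation loop above in miniature: replace NAs with the column mean
# on a hypothetical two-column frame (a sketch, assuming datatable's
# Frame.mean1() and Frame.replace() exactly as used above):
import datatable as dt

DT = dt.Frame(a=[1.0, None, 3.0], b=[None, 4.0, None])
for c in DT.names:
    col = DT[:, c]
    m = col.mean1()
    if m is None:          # all-NA column
        m = 0
    col.replace(None, m)
    DT[:, c] = col
assert DT.to_list() == [[1.0, 2.0, 3.0], [4.0, 4.0, 4.0]]
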
cache = "TRUE"
wc_lines = subprocess.run(['wc', '-l', data_name], stdout=subprocess.PIPE).stdout.decode('utf-8').split(" ", 1)[0]
in_rows = int(wc_lines)-1
print("reading...")
question = "all rows" #1
gc.collect()
t_start = timeit.default_timer()
ans = dt.fread(data_name, show_progress=False)
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.v3)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
del ans
gc.collect()
t_start = timeit.default_timer()
ans = dt.fread(data_name, show_progress=False)
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.v3)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
del ans
gc.collect()
t_start = timeit.default_timer()
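
# The benchmark pattern above in miniature: cold read, checksum aggregation,
# then a warm repeat (a sketch; "data.csv" and column v3 are hypothetical):
import gc
import timeit
import datatable as dt
from datatable import f

for run in (1, 2):
    gc.collect()
    t0 = timeit.default_timer()
    DT = dt.fread("data.csv", show_progress=False)
    read_sec = timeit.default_timer() - t0
    t0 = timeit.default_timer()
    chk = DT[:, dt.sum(f.v3)]          # cheap aggregation used as a result checksum
    chk_sec = timeit.default_timer() - t0
    del DT
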
def transform(self, X: dt.Frame):
    try:
        # parse_ipaddress/get_ip_property are plain Python callables and cannot
        # run element-wise inside a datatable f-expression, so apply them via pandas
        return X.to_pandas().astype(str).iloc[:, 0].apply(
            lambda x: self.get_ip_property(self.parse_ipaddress(x)))
    except ValueError:
        return np.zeros(X.shape[0])
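
# A plausible shape for the two helpers transform() assumes, built on the
# standard-library ipaddress module (hypothetical; the originals are not
# part of this fragment):
import ipaddress

def parse_ipaddress(self, text: str):
    # raises ValueError on malformed input, matching the except clause above
    return ipaddress.ip_address(text.strip())

def get_ip_property(self, ip):
    return ip.is_private   # e.g. flag private-range addresses
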
def fit_transform(self, X: dt.Frame, y: np.array = None):
    target = '__internal_target__'
    X[:, target] = dt.Frame(y)
    target_is_numeric = X[:, target][:, [bool, int, float]].shape[1] > 0
    if target_is_numeric:
        self._group_means = X[:, dt.mean(dt.f[target]), dt.by(*self.input_feature_names)]
    else:
        X[:, target] = dt.Frame(
            LabelEncoder().fit_transform(X[:, target].to_pandas().iloc[:, 0].values).ravel())
        self._group_means = X[:, dt.median(dt.f[target]), dt.by(*self.input_feature_names)]
    del X[:, target]
    self._group_means.key = self.input_feature_names
    return self.transform(X)
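
# A minimal sketch of the matching transform() for this encoder, assuming the
# keyed _group_means frame built above (the original body is not shown):
def transform(self, X: dt.Frame):
    # the key on _group_means makes this a left join on input_feature_names;
    # the last column holds the per-group mean/median encoding
    return X[:, :, dt.join(self._group_means)][:, -1]
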
:return: this instance
"""
if not USE_DT:
    raise RuntimeError('this function requires the package "datatable" to be installed')
import datatable as dt
if not isinstance(tokendf, dt.Frame):
    raise ValueError('`tokendf` must be a datatable Frame object')
if not {'doc', 'position', 'token'}.issubset(pd_dt_colnames(tokendf)):
    raise ValueError('`tokendf` must contain the columns "doc", "position" and "token"')
# convert the big dataframe to a dict of per-document token frames for load_tokens
tokens = {}
for dl in dt.unique(tokendf[:, dt.f.doc]).to_list()[0]:
    doc_df = tokendf[dt.f.doc == dl, :]
    colnames = pd_dt_colnames(doc_df)
    colnames.remove('doc')
    tokens[dl] = doc_df[:, colnames]
return self.load_tokens(tokens)
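
# Hypothetical usage of the loop above: a flat token table split into one
# frame per document, keyed by the "doc" column:
import datatable as dt

tokendf = dt.Frame(doc=["a", "a", "b"],
                   position=[0, 1, 0],
                   token=["hello", "world", "hi"])
# after the loop: tokens == {"a": Frame(position, token), "b": Frame(position, token)}
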
big = dt.fread(src_jn_y[2])
print(x.nrows, flush=True)
print(small.nrows, flush=True)
print(medium.nrows, flush=True)
print(big.nrows, flush=True)
task_init = timeit.default_timer()
print("joining...", flush=True)
question = "small inner on int" # q1
gc.collect()
y = small.copy(deep=True)
t_start = timeit.default_timer()
y.key = 'id1'
ans = x[:, :, join(y)][isfinite(f.v2), :] # , on='id1'
tmp = ans.copy(deep=True) ## ensure join results materialized #141
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v2)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
del ans, y, tmp
gc.collect()
y = small.copy(deep=True)
t_start = timeit.default_timer()
y.key = 'id1'
ans = x[:, :, join(y)][isfinite(f.v2), :] # , on='id1'
tmp = ans.copy(deep=True)
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v2)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1), "v3": mean(f.v3)}, by(f.id3)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v3)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans
question = "mean v1:v3 by id4" # q4
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": mean(f.v1), "v2": mean(f.v2), "v3": mean(f.v3)}, by(f.id4)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v2), sum(f.v3)]]
chkt = timeit.default_timer() - t_start
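
# The by()-aggregation idiom above, checked on a tiny hypothetical frame:
import datatable as dt
from datatable import f, by, mean

DT = dt.Frame(id4=[1, 1, 2], v1=[1.0, 3.0, 5.0], v2=[2.0, 4.0, 6.0], v3=[1.0, 1.0, 1.0])
agg = DT[:, {"v1": mean(f.v1), "v2": mean(f.v2), "v3": mean(f.v3)}, by(f.id4)]
assert agg.to_list() == [[1, 2], [2.0, 5.0], [3.0, 6.0], [1.0, 1.0]]
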