def get_infer_dataframes(self, infer_idx, Y_pred):
distr = []
infer_val = []
Y_assign = Y_pred.data.numpy().argmax(axis=1)
domain_size = self.feat_dataset.var_to_domsize
# Need to map the inferred value index of the random variable to the actual value
# val_idx = val_id - 1 since val_id was numbered starting from 1 whereas
# val_idx starts at 0.
query = 'SELECT _vid_, val_id-1, rv_val FROM {pos_values}'.format(pos_values=AuxTables.pos_values.name)
pos_values = self.ds.engine.execute_query(query)
# dict mapping _vid_ --> val_idx --> value
vid_to_val = {}
for vid, val_idx, val in pos_values:
vid_to_val[vid] = vid_to_val.get(vid, {})
vid_to_val[vid][val_idx] = val
for idx in range(Y_pred.shape[0]):
vid = int(infer_idx[idx])
rv_distr = list(Y_pred[idx].data.numpy())
rv_val_idx = int(Y_assign[idx])
rv_val = vid_to_val[vid][rv_val_idx]
rv_prob = Y_pred[idx].data.numpy().max()
d_size = domain_size[vid]
distr.append({'_vid_': vid, 'distribution':[str(p) for p in rv_distr[:d_size]]})
infer_val.append({'_vid_': vid, 'inferred_val_idx': rv_val_idx, 'inferred_val': rv_val, 'prob':rv_prob})
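# --- Illustrative sketch (not part of the snippet above): the argmax + vid_to_val
# lookup on toy data. NumPy is used instead of a torch tensor, and the VIDs and
# values below are made up for demonstration.
import numpy as np

# Toy prediction matrix: 2 cells (VIDs 0 and 1), max domain size 3.
toy_Y_pred = np.array([[0.1, 0.7, 0.2],
                       [0.6, 0.3, 0.1]])
toy_Y_assign = toy_Y_pred.argmax(axis=1)          # most likely value index per cell -> [1, 0]

# Hypothetical _vid_ -> val_idx -> value map, like the one built from pos_values.
toy_vid_to_val = {0: {0: 'NY', 1: 'CA', 2: 'TX'},
                  1: {0: 'red', 1: 'blue', 2: 'green'}}

for vid in range(toy_Y_pred.shape[0]):
    rv_val_idx = int(toy_Y_assign[vid])
    print(vid, toy_vid_to_val[vid][rv_val_idx], toy_Y_pred[vid].max())
# 0 CA 0.7
# 1 red 0.6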
We also distinguish between repairs on correct cells and repairs on
incorrect cells (correct cells are cells where init == ground truth).
"""
query = """
SELECT
(t1.init_value = t3._value_) AS is_correct,
count(*)
FROM {} as t1, {} as t2, {} as t3
WHERE t1._tid_ = t2._tid_
AND t1.attribute = t2.attribute
AND t1.init_value != t2.rv_value
AND t1._tid_ = t3._tid_
AND t1.attribute = t3._attribute_
GROUP BY is_correct
""".format(AuxTables.cell_domain.name,
AuxTables.inf_values_dom.name,
self.clean_data.name)
res = self.ds.engine.execute_query(query)
# Memoize the number of repairs on correct cells and incorrect cells.
# Since we do a GROUP BY we need to check which row of the result
# corresponds to the correct/incorrect counts.
self.total_repairs_grdt_correct, self.total_repairs_grdt_incorrect = 0, 0
self.total_repairs_grdt = 0
if not res:
return
if res[0][0]:
correct_idx, incorrect_idx = 0, 1
else:
correct_idx, incorrect_idx = 1, 0
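# --- Illustrative sketch (not part of the snippet above): how the grouped
# (is_correct, count) rows might be unpacked into the memoized totals. The rows
# below are made up; the real method continues with the result of execute_query.
res = [(True, 40), (False, 7)]

total_repairs_grdt_correct, total_repairs_grdt_incorrect = 0.0, 0.0
if res:
    # The first row tells us whether the correct or the incorrect bucket comes first.
    if res[0][0]:
        correct_idx, incorrect_idx = 0, 1
    else:
        correct_idx, incorrect_idx = 1, 0
    if correct_idx < len(res):
        total_repairs_grdt_correct = float(res[correct_idx][1])
    if incorrect_idx < len(res):
        total_repairs_grdt_incorrect = float(res[incorrect_idx][1])
total_repairs_grdt = total_repairs_grdt_correct + total_repairs_grdt_incorrect
print(total_repairs_grdt_correct, total_repairs_grdt_incorrect, total_repairs_grdt)  # 40.0 7.0 47.0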
def compute_correct_repairs(self):
"""
compute_correct_repairs memoizes the number of error cells
that were correctly inferred.
This value is always less than or equal to the total number of errors (see
compute_total_errors).
"""
queries = []
correct_repairs = 0.0
for attr in self.ds.get_attributes():
query = correct_repairs_template.substitute(init_table=self.ds.raw_data.name, grdt_table=self.clean_data.name,
attr=attr, inf_dom=AuxTables.inf_values_dom.name)
queries.append(query)
results = self.ds.engine.execute_queries(queries)
for res in results:
correct_repairs += float(res[0][0])
self.correct_repairs = correct_repairs
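# --- Illustrative sketch (not part of the snippet above): the per-attribute
# query fan-out with string.Template. The template text, table name, and
# attribute names are made up; the real correct_repairs_template is defined
# elsewhere in this module.
from string import Template

toy_template = Template("SELECT count(*) FROM $grdt_table WHERE _attribute_ = '$attr'")
toy_queries = [toy_template.substitute(grdt_table="hospital_clean", attr=attr)
               for attr in ["City", "State", "ZipCode"]]
for q in toy_queries:
    print(q)
# Each query's single count would then be summed into correct_repairs.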
def compute_total_repairs(self):
"""
compute_total_repairs memoizes the number of repairs:
the number of cells that were inferred and whose inferred value
differs from the initial value.
"""
query = "SELECT count(*) FROM " \
" (SELECT _vid_ " \
" FROM {} as t1, {} as t2 " \
" WHERE t1._tid_ = t2._tid_ " \
" AND t1.attribute = t2.attribute " \
" AND t1.init_value != t2.rv_value) AS t".format(AuxTables.cell_domain.name,
AuxTables.inf_values_dom.name)
res = self.ds.engine.execute_query(query)
self.total_repairs = float(res[0][0])
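# --- Illustrative sketch (not part of the snippet above): one common way the
# memoized counts feed into evaluation metrics. The function names and numbers
# below are made up; they are not part of the class above.
def repair_precision(correct_repairs, total_repairs):
    # Fraction of performed repairs that agree with the ground truth.
    return 1.0 if total_repairs == 0 else correct_repairs / total_repairs

def repair_recall(correct_repairs, total_errors):
    # Fraction of erroneous cells that were repaired correctly.
    return 1.0 if total_errors == 0 else correct_repairs / total_errors

print(repair_precision(40.0, 47.0))  # ~0.851
print(repair_recall(40.0, 60.0))     # ~0.667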
from
{cell_domain} as t1,
{clean_data} as t2
left join {dk_cells} as t3 on t2._tid_ = t3._tid_ and t2._attribute_ = t3.attribute
left join {inf_values_dom} as t4 on t2._tid_ = t4._tid_ and t2._attribute_ = t4.attribute where t1._tid_ = t2._tid_ and t1.attribute = t2._attribute_
group by
clean,
status,
inferred,
init_eq_grdth,
init_eq_infer,
wl_eq_init,
wl_eq_grdth,
wl_eq_infer,
infer_eq_grdth
""".format(cell_domain=AuxTables.cell_domain.name,
clean_data=self.clean_data.name,
dk_cells=AuxTables.dk_cells.name,
inf_values_dom=AuxTables.inf_values_dom.name)
res = self.ds.engine.execute_query(query)
df_stats = pd.DataFrame(res,
columns=["is_clean", "cell_status", "is_inferred",
"init = grdth", "init = inferred",
"w. label = init", "w. label = grdth", "w. label = inferred",
"infer = grdth", "count"])
df_stats = df_stats.sort_values(list(df_stats.columns)).reset_index(drop=True)
logging.debug("weak label statistics:")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', len(df_stats))
pd.set_option('display.max_colwidth', None)
logging.debug("%s", df_stats)
pd.reset_option('display.max_columns')
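# --- Illustrative sketch (not part of the snippet above): the same boolean-flag
# cross-tab can be reproduced with pandas groupby + size on toy data. The column
# names mirror a subset of the GROUP BY columns; the rows are made up.
import pandas as pd

toy_df = pd.DataFrame({
    "clean":          [True, True, False, False, False],
    "inferred":       [False, False, True, True, True],
    "init_eq_grdth":  [True, True, False, False, True],
    "infer_eq_grdth": [False, False, True, False, True],
})
toy_stats = (toy_df.groupby(list(toy_df.columns)).size()
             .reset_index(name="count")
             .sort_values(list(toy_df.columns))
             .reset_index(drop=True))
print(toy_stats)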
def create_tensor(self):
"""
For each unique VID (cell), returns the co-occurrence probability between
each possible domain value for this VID and the initial/raw values for the
corresponding entity/tuple of this cell.
:return: torch.Tensor of shape (# of VIDs) X (max domain) X (# of attributes)
where tensor[i][j][k] contains the co-occur probability between the j-th domain value
of the i-th random variable (VID) and the initial/raw value of the k-th
attribute for the corresponding entity.
"""
# Iterate over tuples in domain
tensors = []
# Sort the cell domain by VID so the output tensor rows follow the random variable order
t = self.ds.aux_table[AuxTables.cell_domain]
sorted_domain = t.df.reset_index().sort_values(by=['_vid_'])[['_tid_', 'attribute', '_vid_', 'domain']]
records = sorted_domain.to_records()
for row in tqdm(list(records)):
# Get the initial tuple for this cell's entity from the raw dataset
tid = row['_tid_']
tup = self.raw_data_dict[tid]
feat_tensor = self.gen_feat_tensor(row, tup)
tensors.append(feat_tensor)
combined = torch.cat(tensors)
return combined
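# --- Illustrative sketch (not part of the snippet above): how torch.cat stacks
# the per-cell tensors, assuming each gen_feat_tensor call returns a tensor of
# shape (1, max domain, # attributes) as the docstring's output shape implies.
import torch

toy_tensors = [torch.rand(1, 4, 2) for _ in range(3)]   # 3 VIDs, max domain 4, 2 attributes
toy_combined = torch.cat(toy_tensors)
print(toy_combined.shape)                                # torch.Size([3, 4, 2])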
"""
store_domains stores the 'domain' DataFrame as the 'cell_domain'
auxiliary table as well as generates the 'pos_values' auxiliary table,
a long-format of the domain values, in Postgres.
pos_values schema:
_tid_: entity/tuple ID
_cid_: cell ID
_vid_: random variable ID (all cells with more than 1 domain value)
attribute: attribute name
rv_val: a possible (candidate) value for the cell
val_id: 1-based index of rv_val within the cell's domain
"""
if domain.empty:
raise Exception("ERROR: Generated domain is empty.")
else:
self.ds.generate_aux_table(AuxTables.cell_domain, domain, store=True, index_attrs=['_vid_'])
self.ds.aux_table[AuxTables.cell_domain].create_db_index(self.ds.engine, ['_tid_'])
self.ds.aux_table[AuxTables.cell_domain].create_db_index(self.ds.engine, ['_cid_'])
query = "SELECT _vid_, _cid_, _tid_, attribute, a.rv_val, a.val_id from %s , unnest(string_to_array(regexp_replace(domain,\'[{\"\"}]\',\'\',\'gi\'),\'|||\')) WITH ORDINALITY a(rv_val,val_id)" % AuxTables.cell_domain.name
self.ds.generate_aux_table_sql(AuxTables.pos_values, query, index_attrs=['_tid_', 'attribute'])
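# --- Illustrative sketch (not part of the snippet above): what the Postgres
# unnest(string_to_array(...)) WITH ORDINALITY step produces for one cell_domain
# row, assuming domain values are stored as a '|||'-delimited string. The values
# below are made up.
toy_domain = 'NY|||CA|||TX'
toy_pos_values = [(val_id, rv_val)
                  for val_id, rv_val in enumerate(toy_domain.split('|||'), start=1)]
print(toy_pos_values)   # [(1, 'NY'), (2, 'CA'), (3, 'TX')]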
def create_tensor(self):
query = 'SELECT _vid_, attribute, init_index FROM %s ORDER BY _vid_' % AuxTables.cell_domain.name
results = self.ds.engine.execute_query(query)
map_input = []
for res in results:
map_input.append((res[0], self.attr_to_idx[res[1]], res[2]))
tensors = self.pool.map(partial(gen_feat_tensor, classes=self.classes, total_attrs=self.total_attrs), map_input)
combined = torch.cat(tensors)
return combined
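# --- Illustrative sketch (not part of the snippet above): the Pool.map +
# functools.partial pattern used above, with a toy stand-in featurizer (NOT the
# real gen_feat_tensor). Inputs and shapes are made up.
from functools import partial
from multiprocessing import Pool

import torch

def toy_feat_tensor(inp, classes, total_attrs):
    # One-hot the cell's initial value index at its attribute position.
    vid, attr_idx, init_idx = inp
    t = torch.zeros(1, classes, total_attrs)
    t[0, init_idx, attr_idx] = 1.0
    return t

if __name__ == '__main__':
    toy_map_input = [(0, 0, 2), (1, 1, 0), (2, 0, 1)]   # (_vid_, attribute index, init_index)
    with Pool(2) as pool:
        toy_tensors = pool.map(partial(toy_feat_tensor, classes=4, total_attrs=2), toy_map_input)
    print(torch.cat(toy_tensors).shape)                 # torch.Size([3, 4, 2])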