@check_empty(default=None)
def getGiniCoef(self, nodeType='root', eventTypes=None, content_field="root"):
'''
Wrapper function to calculate the Gini coefficient for the main data frame (self.main_df).
Question #6,14,26
Input: nodeType - type of node to calculate the Gini coefficient over. Options: user or repo (case sensitive)
eventTypes - a list of event types to include in the calculation
content_field - content column passed through to getGiniCoefHelper (default "root")
Output: g - Gini coefficient
'''
return self.getGiniCoefHelper(self.main_df, nodeType, eventTypes, content_field)
@check_empty(default=None)
@check_root_only(default=None)
def cascade_collection_participation_palma(self, community_grouper=None):
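"""
Palma ratio of user participation (event counts per user) across the full collection of cascades.
If community_grouper is a column in the data frame, returns a dict of Palma ratios keyed by community.
"""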
if not community_grouper:
all_node_users = self.main_df[self.user_col].values
return palma_ratio(list(Counter(all_node_users).values()))
elif community_grouper in self.main_df.columns:
meas = {}
for community in self.main_df[community_grouper].unique():
all_node_users = self.main_df[self.main_df[community_grouper] == community][self.user_col].values
meas[community] = palma_ratio(list(Counter(all_node_users).values()))
return meas
else:
return None
@check_empty(default=None)
@check_root_only(default=0)
def get_cascade_lifetime(self, granularity="D"):
"""
:param granularity: "s", "m", "H", "D" [seconds/minutes/hours/days]
"""
try:
lifetime = (max(self.main_df[self.timestamp_col]) - min(self.main_df[self.timestamp_col])).total_seconds()
except AttributeError:
# fall back when the difference is not a Timedelta (e.g. timestamps are already numeric seconds)
lifetime = (max(self.main_df[self.timestamp_col]) - min(self.main_df[self.timestamp_col]))
if granularity in ["m", "H", "D"]:
lifetime /= 60
if granularity in ["H", "D"]:
lifetime /= 60
if granularity == "D":
lifetime /= 24
return lifetime
@check_empty(default=None)
def get_depth_based_measurements(self):
"""
:return: pandas dataframe with "breadth", "size", "structural_virality", "unique_nodes", "new_node_ratio" at each depth
"""
self.main_df["depth"] = -1
self.reset_cascade()
self.cascade.set_root_node(self.main_df)
self.main_df.loc[self.main_df[self.node_col] == self.cascade.root_node, 'depth'] = 0
seed_nodes = [self.cascade.root_node]
depth = 1
while len(seed_nodes) > 0:
self.main_df.loc[(self.main_df[self.parent_node_col].isin(seed_nodes)) & (
self.main_df[self.node_col] != self.main_df[self.parent_node_col]), 'depth'] = depth
seed_nodes = self.main_df[(self.main_df[self.parent_node_col].isin(seed_nodes)) & (
self.main_df[self.node_col] != self.main_df[self.parent_node_col])][self.node_col].values
depth += 1
@check_empty(default=None)
@check_root_only(default=None)
def cascade_participation_palma(self):
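"""
Palma ratio of the participation counts returned by self.node_participation() for a single cascade.
"""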
return palma_ratio(self.node_participation())
@check_empty(default=None)
def get_cascades_distribution_measurements(self):
"""
:return: pandas dataframe with the cascade identifier ("rootID"), "communityID", "depth", "size", "breadth", "structural_virality" and "lifetime" for each cascade in the population
"""
cascades_distribution_measurements = []
for cascade_identifier, scm in self.scms.items():
cascades_distribution_measurements.append([cascade_identifier,
scm.community,
scm.cascade.get_cascade_depth(),
scm.cascade.get_cascade_size(),
scm.cascade.get_cascade_breadth(),
scm.cascade.get_cascade_structural_virality(),
scm.cascade.get_cascade_lifetime()
])
cols = ["rootID", "communityID", "depth", "size", "breadth", "structural_virality", "lifetime"]
@check_empty(default=None)
def cascade_collection_initialization_gini(self, community_grouper=None):
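"""
Gini coefficient of cascade initialization: how unevenly root posts are distributed across users.
If community_grouper is a column in the data frame, returns a dict of Gini coefficients keyed by community.
"""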
if not community_grouper:
root_node_users = self.main_df[self.main_df[self.node_col] == self.main_df[self.root_node_col]][
self.user_col].values
return pysal.inequality.gini.Gini(list(Counter(root_node_users).values())).g
elif community_grouper in self.main_df.columns:
meas = {}
for community in self.main_df[community_grouper].unique():
root_node_users = self.main_df[(self.main_df[self.node_col] == self.main_df[self.root_node_col]) & (
self.main_df[community_grouper] == community)][self.user_col].values
meas[community] = pysal.inequality.gini.Gini(list(Counter(root_node_users).values())).g
return meas
else:
return None
@check_empty(default=None)
def cascade_collection_participation_gini(self, community_grouper=None):
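"""
Gini coefficient of user participation (event counts per user) across the full collection of cascades.
If community_grouper is a column in the data frame, returns a dict of Gini coefficients keyed by community.
"""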
if not community_grouper:
all_node_users = self.main_df[self.user_col].values
return pysal.inequality.gini.Gini(list(Counter(all_node_users).values())).g
elif community_grouper in self.main_df.columns:
meas = {}
for community in self.main_df[community_grouper].unique():
all_node_users = self.main_df[self.main_df[community_grouper] == community][self.user_col].values
meas[community] = pysal.inequality.gini.Gini(list(Counter(all_node_users).values())).g
return meas
else:
return None
@check_empty(default=None)
@check_root_only(default=1.0)
def fraction_of_nodes_in_lcc(self, community_grouper=None):
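"""
Fraction of all nodes that belong to the largest cascade (largest connected component),
computed overall or per community when community_grouper is a column in the data frame.
"""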
if not community_grouper:
return max([scm.cascade.get_cascade_size() for scm in self.scms.values()]) / len(self.main_df)
elif community_grouper in self.main_df.columns:
meas = {}
for community in self.main_df[community_grouper].unique():
meas[community] = max(
[scm.cascade.get_cascade_size() for scm in self.scms.values() if scm.community == community]) / len(
self.main_df[self.main_df[community_grouper] == community])
return meas
else:
return None
@check_empty(default=None)
def get_community_users_count_timeseries(self, time_granularity="M", community_grouper=None):
"""
:param time_granularity: "Y", "M", "D", "H" [years/months/days/hours]
:param community_grouper: column that indicates a community, e.g. communityID, keyword
:return: pandas dataframe with the number of unique users who participate in each time interval, per community
"""
temporal_measurements = []
if community_grouper in self.main_df.columns:
for community_identifier, community_df in self.main_df.groupby(community_grouper):
cumul_df = None
for ts, df in community_df.set_index(self.timestamp_col).groupby(pd.Grouper(freq=time_granularity),
sort=True):
if cumul_df is None:
cumul_df = df.copy()
old_unique_users_count = 0